Line termination change and old code.
This commit is contained in:
		
							
								
								
									
										258
									
								
								.Rprofile
									
									
									
									
									
								
							
							
						
						
									
										258
									
								
								.Rprofile
									
									
									
									
									
								
							@@ -1,129 +1,129 @@
 | 
			
		||||
# .Rprofile
 | 
			
		||||
#
 | 
			
		||||
# This script is automatically executed on startup
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
init <- function() {
  # First-run helper, meant to be called by the user from the console.
  # Creates a personal scratch script "myScript.R" from the template ".tmp.R"
  # (if it does not exist yet), prints the instructions for setting up the
  # personal profile file, and opens "ABC-units.R" in the editor.
  #
  # Value: NULL, invisibly. The function is called for its side effects.

  # Create a local copy of myScript.R if not done yet.
  if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
    # file.copy() returns FALSE if the copy failed: only announce the new
    # file if it was really created.
    if (file.copy(".tmp.R", "myScript.R")) {
      cat("A new file \"myScript.R\" was created. You can use it for\n")
      cat("notes and code experiments.\n\n")
    } else {
      warning("Could not copy \".tmp.R\" to \"myScript.R\".", call. = FALSE)
    }
  }

  cat("\n\n")
  cat("Please open the file \".myProfile.R\" (click on the file-name in the\n")
  cat("\"files\" pane), edit it and save it.\n")
  cat("Then click the checkbox, and use the More -> Move... dialogue\n")
  cat("to move it into the \"myScripts\" folder.\n\n")

  file.edit("ABC-units.R")
  return(invisible(NULL))
}
 | 
			
		||||
 | 
			
		||||
if (! file.exists("./myScripts/.myProfile.R")) {
  # No local profile yet: print the welcome banner that points to init().
  cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
  cat("    =================")
  cat("\n\n")
  cat("        WELCOME !\n")
  cat("\n")
  cat("  Type  'init()'  to begin\n\n")
  cat("\n")
  cat("    =================")
  cat("\n\n")

} else {  # local profile exists ... validate state:
  cat("\n\nLoading local functions ...")

  source(".utilities.R")  # local profile appears sane, source utilities
  source("./myScripts/.myProfile.R")

  if (! exists("myEMail")) {  # ... has eMail been defined?
    cat("ERROR !\n")
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("the variable \"myEMail\" was not loaded.\n")
    cat("Please contact your instructor to continue.\n\n")
  }

  if (! exists("myStudentNumber")) {  # ... has the Student Number been defined?
    cat("ERROR !\n")
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("the variable \"myStudentNumber\" was not loaded.\n")
    cat("Please contact your instructor to continue.\n\n")

  # Only validate the number if it exists - otherwise grepl() would stop the
  # startup script with "object not found" right after the message above.
  # The alternation is grouped so that "^" and "$" apply to BOTH patterns:
  # "100" + 7 characters, or "99" + 7 characters, and nothing else.
  } else if (! grepl("^(100.{7}|99.{7})$", as.character(myStudentNumber))) {
    cat("ERROR !\n")                 # is the Student Number valid?
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("your Student Number could not be validated.\n")
    cat("Please examine the file \"./myScripts/.myProfile.R\"\n")
    cat(" and fix the problem or contact your instructor to continue.\n\n")
  }

  # If MYSPE has not yet been defined, define it now and write it into the
  # profile. MYSPE is derived from the student number, so this step is
  # skipped if the student number is missing.
  if (! exists("MYSPE") && exists("myStudentNumber")) {
    MYSPE <- getMYSPE(myStudentNumber)  # ... define it for immediate use

    prf <- readLines("./myScripts/.myProfile.R")
    iNum <- grep("^\\s*myStudentNumber\\s*<-", prf)
    # Insert after the first myStudentNumber assignment. If no such line is
    # found, append at the end instead of failing on an empty index.
    iNum <- if (length(iNum) > 0) iNum[1] else length(prf)
    # append() also handles the case where the assignment is the last line
    # of the file (indexing with (n+1):n would write an NA line and
    # duplicate the last line).
    out <- append(prf, sprintf("MYSPE <- \"%s\" ", MYSPE), after = iNum)
    writeLines(out, "./myScripts/.myProfile.R")

    cat("\n")
    cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n",
                MYSPE))
    rm(prf, iNum, out)                  # cleanup
  }
  cat("... done.\n\n")
}
 | 
			
		||||
 | 
			
		||||
# Warn if data frames would convert strings to factors by default.
# default.stringsAsFactors() is deprecated in current versions of R, so the
# option is read directly. If the option is unset, the default is the one of
# the running R version: TRUE before R 4.0.0, FALSE from then on.
if (isTRUE(getOption("stringsAsFactors", getRversion() < "4.0.0"))) {
  cat("WARNING.\n")
  cat("========\n")
  cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n")
  cat("This will break some of the code.\n")
  cat("Please contact your instructor to troubleshoot and fix this issue.\n")
  cat("\n")
}
 | 
			
		||||
 | 
			
		||||
# errText: a named list of multi-line help messages, one element per problem
# with the local profile ("noProfileFile", "noStudentNumber").
errText <- list()
errText[["noProfileFile"]] <- '
Your PROFILE FILE does not exist. This problem must be fixed to continue.

  The code expects the file "./myScripts/.myProfile.R" to exist and to
  contain your correct eMail address and student number. Detailed
  instructions were given when you first ran the init() command.

  Try running init() again and follow the instructions. Reload your RStudio
  session and start over with this file.

  If this does not fix the problem, ask for help.
'

errText[["noStudentNumber"]] <- '
Your STUDENT NUMBER has not been defined. This problem must be fixed to continue.

  The code expects the file "./myScripts/.myProfile.R" to exist and to
  contain your correct eMail address and student number. This file gets
  sourced when you start a new R-session, but since you see this error
  message there was a problem.

  Perhaps you need to restart your R-session. Try closing the RStudio
  project and reopening it from the File > Recent Projects menu.

  Perhaps there was a syntax error in your file. Then not all the
  instructions in the file are executed. Check the file: is your
  email perhaps not defined? Or did you type it without quotation
  marks?

  Try fixing problems, and then restart R as described above.

  If none of this fixes the problem, ask for help.
'
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# .Rprofile
 | 
			
		||||
#
 | 
			
		||||
# This script is automatically executed on startup
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
init <- function() {
  # First-run helper, meant to be called by the user from the console.
  # Creates a personal scratch script "myScript.R" from the template ".tmp.R"
  # (if it does not exist yet), prints the instructions for setting up the
  # personal profile file, and opens "ABC-units.R" in the editor.
  #
  # Value: NULL, invisibly. The function is called for its side effects.

  # Create a local copy of myScript.R if not done yet.
  if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
    # file.copy() returns FALSE if the copy failed: only announce the new
    # file if it was really created.
    if (file.copy(".tmp.R", "myScript.R")) {
      cat("A new file \"myScript.R\" was created. You can use it for\n")
      cat("notes and code experiments.\n\n")
    } else {
      warning("Could not copy \".tmp.R\" to \"myScript.R\".", call. = FALSE)
    }
  }

  cat("\n\n")
  cat("Please open the file \".myProfile.R\" (click on the file-name in the\n")
  cat("\"files\" pane), edit it and save it.\n")
  cat("Then click the checkbox, and use the More -> Move... dialogue\n")
  cat("to move it into the \"myScripts\" folder.\n\n")

  file.edit("ABC-units.R")
  return(invisible(NULL))
}
 | 
			
		||||
 | 
			
		||||
if (! file.exists("./myScripts/.myProfile.R")) {
  # No local profile yet: print the welcome banner that points to init().
  cat("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
  cat("    =================")
  cat("\n\n")
  cat("        WELCOME !\n")
  cat("\n")
  cat("  Type  'init()'  to begin\n\n")
  cat("\n")
  cat("    =================")
  cat("\n\n")

} else {  # local profile exists ... validate state:
  cat("\n\nLoading local functions ...")

  source(".utilities.R")  # local profile appears sane, source utilities
  source("./myScripts/.myProfile.R")

  if (! exists("myEMail")) {  # ... has eMail been defined?
    cat("ERROR !\n")
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("the variable \"myEMail\" was not loaded.\n")
    cat("Please contact your instructor to continue.\n\n")
  }

  if (! exists("myStudentNumber")) {  # ... has the Student Number been defined?
    cat("ERROR !\n")
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("the variable \"myStudentNumber\" was not loaded.\n")
    cat("Please contact your instructor to continue.\n\n")

  # Only validate the number if it exists - otherwise grepl() would stop the
  # startup script with "object not found" right after the message above.
  # The alternation is grouped so that "^" and "$" apply to BOTH patterns:
  # "100" + 7 characters, or "99" + 7 characters, and nothing else.
  } else if (! grepl("^(100.{7}|99.{7})$", as.character(myStudentNumber))) {
    cat("ERROR !\n")                 # is the Student Number valid?
    cat("=======\n")
    cat("The file \"./myScripts/.myProfile.R\" exists, but\n")
    cat("your Student Number could not be validated.\n")
    cat("Please examine the file \"./myScripts/.myProfile.R\"\n")
    cat(" and fix the problem or contact your instructor to continue.\n\n")
  }

  # If MYSPE has not yet been defined, define it now and write it into the
  # profile. MYSPE is derived from the student number, so this step is
  # skipped if the student number is missing.
  if (! exists("MYSPE") && exists("myStudentNumber")) {
    MYSPE <- getMYSPE(myStudentNumber)  # ... define it for immediate use

    prf <- readLines("./myScripts/.myProfile.R")
    iNum <- grep("^\\s*myStudentNumber\\s*<-", prf)
    # Insert after the first myStudentNumber assignment. If no such line is
    # found, append at the end instead of failing on an empty index.
    iNum <- if (length(iNum) > 0) iNum[1] else length(prf)
    # append() also handles the case where the assignment is the last line
    # of the file (indexing with (n+1):n would write an NA line and
    # duplicate the last line).
    out <- append(prf, sprintf("MYSPE <- \"%s\" ", MYSPE), after = iNum)
    writeLines(out, "./myScripts/.myProfile.R")

    cat("\n")
    cat(sprintf("MYSPE (%s) was added to \"./myScripts/.myProfile.R\"\n\n",
                MYSPE))
    rm(prf, iNum, out)                  # cleanup
  }
  cat("... done.\n\n")
}
 | 
			
		||||
 | 
			
		||||
# Warn if data frames would convert strings to factors by default.
# default.stringsAsFactors() is deprecated in current versions of R, so the
# option is read directly. If the option is unset, the default is the one of
# the running R version: TRUE before R 4.0.0, FALSE from then on.
if (isTRUE(getOption("stringsAsFactors", getRversion() < "4.0.0"))) {
  cat("WARNING.\n")
  cat("========\n")
  cat("Your default \"stringsAsFactors\" parameter is set to \"TRUE\".\n")
  cat("This will break some of the code.\n")
  cat("Please contact your instructor to troubleshoot and fix this issue.\n")
  cat("\n")
}
 | 
			
		||||
 | 
			
		||||
# errText: a named list of multi-line help messages, one element per problem
# with the local profile ("noProfileFile", "noStudentNumber").
errText <- list()
errText[["noProfileFile"]] <- '
Your PROFILE FILE does not exist. This problem must be fixed to continue.

  The code expects the file "./myScripts/.myProfile.R" to exist and to
  contain your correct eMail address and student number. Detailed
  instructions were given when you first ran the init() command.

  Try running init() again and follow the instructions. Reload your RStudio
  session and start over with this file.

  If this does not fix the problem, ask for help.
'

errText[["noStudentNumber"]] <- '
Your STUDENT NUMBER has not been defined. This problem must be fixed to continue.

  The code expects the file "./myScripts/.myProfile.R" to exist and to
  contain your correct eMail address and student number. This file gets
  sourced when you start a new R-session, but since you see this error
  message there was a problem.

  Perhaps you need to restart your R-session. Try closing the RStudio
  project and reopening it from the File > Recent Projects menu.

  Perhaps there was a syntax error in your file. Then not all the
  instructions in the file are executed. Check the file: is your
  email perhaps not defined? Or did you type it without quotation
  marks?

  Try fixing problems, and then restart R as described above.

  If none of this fixes the problem, ask for help.
'
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										88
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										88
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -1,44 +1,44 @@
 | 
			
		||||
# Miscellaneous
 | 
			
		||||
.Ds_store
 | 
			
		||||
instructor/
 | 
			
		||||
dev/
 | 
			
		||||
# myScripts/ # We don't want to ignore this so we can save our work to our own fork.
 | 
			
		||||
 | 
			
		||||
# History files
 | 
			
		||||
.Rhistory
 | 
			
		||||
.Rapp.history
 | 
			
		||||
 | 
			
		||||
# Session Data files
 | 
			
		||||
# .RData
 | 
			
		||||
 | 
			
		||||
# Files produced in assignments
 | 
			
		||||
data/APSESphyloSet.mfa
 | 
			
		||||
data/APSEStreeRproml.rds
 | 
			
		||||
 | 
			
		||||
# Example code in package build process
 | 
			
		||||
*-Ex.R
 | 
			
		||||
 | 
			
		||||
# Output files from R CMD build
 | 
			
		||||
/*.tar.gz
 | 
			
		||||
 | 
			
		||||
# Output files from R CMD check
 | 
			
		||||
/*.Rcheck/
 | 
			
		||||
 | 
			
		||||
# RStudio files
 | 
			
		||||
.Rproj.user/
 | 
			
		||||
 | 
			
		||||
# produced vignettes
 | 
			
		||||
vignettes/*.html
 | 
			
		||||
vignettes/*.pdf
 | 
			
		||||
 | 
			
		||||
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
 | 
			
		||||
.httr-oauth
 | 
			
		||||
 | 
			
		||||
# knitr and R markdown default cache directories
 | 
			
		||||
/*_cache/
 | 
			
		||||
/cache/
 | 
			
		||||
 | 
			
		||||
# Temporary files created by R markdown
 | 
			
		||||
*.utf8.md
 | 
			
		||||
*.knit.md
 | 
			
		||||
.Rproj.user
 | 
			
		||||
# Miscellaneous
 | 
			
		||||
.Ds_store
 | 
			
		||||
instructor/
 | 
			
		||||
dev/
 | 
			
		||||
# myScripts/ # We don't want to ignore this so we can save our work to our own fork.
 | 
			
		||||
 | 
			
		||||
# History files
 | 
			
		||||
.Rhistory
 | 
			
		||||
.Rapp.history
 | 
			
		||||
 | 
			
		||||
# Session Data files
 | 
			
		||||
# .RData
 | 
			
		||||
 | 
			
		||||
# Files produced in assignments
 | 
			
		||||
data/APSESphyloSet.mfa
 | 
			
		||||
data/APSEStreeRproml.rds
 | 
			
		||||
 | 
			
		||||
# Example code in package build process
 | 
			
		||||
*-Ex.R
 | 
			
		||||
 | 
			
		||||
# Output files from R CMD build
 | 
			
		||||
/*.tar.gz
 | 
			
		||||
 | 
			
		||||
# Output files from R CMD check
 | 
			
		||||
/*.Rcheck/
 | 
			
		||||
 | 
			
		||||
# RStudio files
 | 
			
		||||
.Rproj.user/
 | 
			
		||||
 | 
			
		||||
# produced vignettes
 | 
			
		||||
vignettes/*.html
 | 
			
		||||
vignettes/*.pdf
 | 
			
		||||
 | 
			
		||||
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
 | 
			
		||||
.httr-oauth
 | 
			
		||||
 | 
			
		||||
# knitr and R markdown default cache directories
 | 
			
		||||
/*_cache/
 | 
			
		||||
/cache/
 | 
			
		||||
 | 
			
		||||
# Temporary files created by R markdown
 | 
			
		||||
*.utf8.md
 | 
			
		||||
*.knit.md
 | 
			
		||||
.Rproj.user
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										76
									
								
								.tmp.R
									
									
									
									
									
								
							
							
						
						
									
										76
									
								
								.tmp.R
									
									
									
									
									
								
							@@ -1,38 +1,38 @@
 | 
			
		||||
# myScript.R
 | 
			
		||||
#
 | 
			
		||||
# --- As you work with this file, you can delete the instructions below --------
 | 
			
		||||
# Write your notes and code experiments into this document. Save it
 | 
			
		||||
# from time to time - however I recommend that you do not _commit_
 | 
			
		||||
# your saved version.
 | 
			
		||||
#
 | 
			
		||||
# As long as you do not _commit_ this script to version control,
 | 
			
		||||
# you can _pull_ updated versions of the entire project from GitHub
 | 
			
		||||
# by using the RStudio version control interface. However, once
 | 
			
		||||
# you _commit_ any file in your local version, RStudio will require
 | 
			
		||||
# you to resolve conflicts before you can _pull_ updates.
 | 
			
		||||
# --- As you work with this file, you can delete the instructions above --------
 | 
			
		||||
#
 | 
			
		||||
## Purpose: <...>
 | 
			
		||||
#
 | 
			
		||||
# Version: <...>
 | 
			
		||||
#
 | 
			
		||||
# Date:    <...>
 | 
			
		||||
# Author:  <Name> (<namee@mail.utoronto.ca>)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#
 | 
			
		||||
#   <number>    <Features>
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#   <...>
 | 
			
		||||
#
 | 
			
		||||
# ====================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 | 
			
		||||
# myScript.R
 | 
			
		||||
#
 | 
			
		||||
# --- As you work with this file, you can delete the instructions below --------
 | 
			
		||||
# Write your notes and code experiments into this document. Save it
 | 
			
		||||
# from time to time - however I recommend that you do not _commit_
 | 
			
		||||
# your saved version.
 | 
			
		||||
#
 | 
			
		||||
# As long as you do not _commit_ this script to version control,
 | 
			
		||||
# you can _pull_ updated versions of the entire project from GitHub
 | 
			
		||||
# by using the RStudio version control interface. However, once
 | 
			
		||||
# you _commit_ any file in your local version, RStudio will require
 | 
			
		||||
# you to resolve conflicts before you can _pull_ updates.
 | 
			
		||||
# --- As you work with this file, you can delete the instructions above --------
 | 
			
		||||
#
 | 
			
		||||
## Purpose: <...>
 | 
			
		||||
#
 | 
			
		||||
# Version: <...>
 | 
			
		||||
#
 | 
			
		||||
# Date:    <...>
 | 
			
		||||
# Author:  <Name> (<namee@mail.utoronto.ca>)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#
 | 
			
		||||
#   <number>    <Features>
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#   <...>
 | 
			
		||||
#
 | 
			
		||||
# ====================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										1308
									
								
								.utilities.R
									
									
									
									
									
								
							
							
						
						
									
										1308
									
								
								.utilities.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,257 +1,257 @@
 | 
			
		||||
# 2021-10-12_In-Class_exploration.R
 | 
			
		||||
#
 | 
			
		||||
#         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D =====
 | 
			
		||||
#
 | 
			
		||||
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
 | 
			
		||||
# Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu
 | 
			
		||||
# Scribe:     boris.steipe@utoronto.ca
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
#
 | 
			
		||||
# In our last session we explored some properties of amino acids and noted that
 | 
			
		||||
# we can arrange them in a scatter-plot according to some properties. But can
 | 
			
		||||
# we also arrange them according to generic properties, i.e. taking all
 | 
			
		||||
# published property scales into account? We will try to use all tables from
 | 
			
		||||
# the seqinr package.
 | 
			
		||||
 | 
			
		||||
# First we load the package - this makes all datasets immediately available and
 | 
			
		||||
# we don't have to load them one by one.
 | 
			
		||||
 | 
			
		||||
library(seqinr)
 | 
			
		||||
 | 
			
		||||
# Determine what datasets are available
 | 
			
		||||
#
 | 
			
		||||
# Using "find in topic" ... "amino acid"
 | 
			
		||||
data(aacost)
 | 
			
		||||
data(aaindex)
 | 
			
		||||
data(pK)
 | 
			
		||||
 | 
			
		||||
# We note that datasets may be sorted in different ways: for example
 | 
			
		||||
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
 | 
			
		||||
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
 | 
			
		||||
# acids are sorted in the same way.
 | 
			
		||||
 | 
			
		||||
# Build a datastructure ...
 | 
			
		||||
# rows: amino acids
 | 
			
		||||
# columns: properties
 | 
			
		||||
 | 
			
		||||
# Are all lists in aaindex organized in the same way?
 | 
			
		||||
 | 
			
		||||
refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item
 | 
			
		||||
                                  # index as a reference list
 | 
			
		||||
 | 
			
		||||
# Loop over each list in aaindex
 | 
			
		||||
for (i in seq_along(aaindex)) {
  #   get the names of the I-vector
  x <- names(aaindex[[i]]$I)

  #   compare with the names of our reference list
  #   the == and != operators are vectorized. Applying them to two vectors
  #   gives TRUE or FALSE for each pair of elements. any() or all() can be
  #   applied to logical vectors to analyse them and return a single result.
  #   if (...) conditions evaluate only a single value and will throw a
  #   warning if there is more than one.
  #
  #   The lengths are compared first: if an I-vector had no names at all,
  #   x != refNames would be empty, any() would return FALSE, and the
  #   problem would go unnoticed.
  if (length(x) != length(refNames) || any(x != refNames)) {
    # There was at least one not-equal pair - so: complain
    print(sprintf("Problem in list %d: names don't match", i))
  }
}
 | 
			
		||||
 | 
			
		||||
# If we get here without identifying problems, it means all pairs of
 | 
			
		||||
# rownames match throughout the aaindex list.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Next: what is the correct syntax to add one vector (the "I" vector of
 | 
			
		||||
# one of the list elements) to our dataframe?
 | 
			
		||||
aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
 | 
			
		||||
aaData[,2] <- aaindex[[2]]$I            # ... add the second index
 | 
			
		||||
 | 
			
		||||
str(aaData)  # Confirm: we now have a two-column dataframe
 | 
			
		||||
 | 
			
		||||
# Next: add the rest ...
 | 
			
		||||
for (i in 3:length(aaindex)) {
 | 
			
		||||
  #   get the I-vector and write it into our dataframe
 | 
			
		||||
  aaData[,i] <- aaindex[[i]]$I
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Sanity check
 | 
			
		||||
plot(aaData[,37], aaData[,544])  # plot two arbitrary indices against each other
 | 
			
		||||
 | 
			
		||||
# Looks good.
 | 
			
		||||
 | 
			
		||||
# We finished building our data structure ... but let's add the aacost table
 | 
			
		||||
# aacost is ordered differently:
 | 
			
		||||
rownames(aaData)
 | 
			
		||||
aacost[ , 1]
 | 
			
		||||
 | 
			
		||||
# using order(), applied to aacost - ordering the column with column-name
 | 
			
		||||
# "aaa"
 | 
			
		||||
sel <- order(aacost[ , "aaa"])  # alphabetic ordering of three-letter codes
 | 
			
		||||
aacost[sel, "aaa"] # applying the order vector sorts the column
 | 
			
		||||
 | 
			
		||||
# Is this the same order as refNames?
 | 
			
		||||
refNames == aacost[sel, "aaa"]  # Yes!
 | 
			
		||||
 | 
			
		||||
# add the data from column "tot" (i.e. total metabolic cost) after the
 | 
			
		||||
# last column of aaData
 | 
			
		||||
aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
 | 
			
		||||
 | 
			
		||||
# Done.
 | 
			
		||||
str(aaData)  # A dataframe with 20 rows and 545 columns
 | 
			
		||||
 | 
			
		||||
# To answer the question "Which amino acids are similar to each other?" we
 | 
			
		||||
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
 | 
			
		||||
# we will succumb to the "Curse of Dimensionality":
 | 
			
		||||
#
 | 
			
		||||
#    "in high dimensional data, however, all objects appear
 | 
			
		||||
#     to be sparse and dissimilar in many ways..."
 | 
			
		||||
#                   https://en.wikipedia.org/wiki/Curse_of_dimensionality
 | 
			
		||||
#
 | 
			
		||||
# A classic way to do this is Principal Component Analysis (PCA) ...
 | 
			
		||||
# (Principal components analysis)
 | 
			
		||||
#
 | 
			
		||||
# PCA expects objects in columns, properties in rows. Therefore we need to
 | 
			
		||||
# transpose our dataset:
 | 
			
		||||
 | 
			
		||||
aaPCA <- prcomp(t(aaData))
 | 
			
		||||
 | 
			
		||||
# This creates an error, because some of our indices contain NA values!
 | 
			
		||||
# Which indices are these?
 | 
			
		||||
 | 
			
		||||
# We create a vector "sel" for which we check whether any element in each
 | 
			
		||||
# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
 | 
			
		||||
# then use this vector to subset our dataframe.
 | 
			
		||||
 | 
			
		||||
# Preallocate one flag per column instead of growing the vector with c()
# in every iteration.
sel <- logical(ncol(aaData))

for (i in seq_len(ncol(aaData))) {  # for each index
  sel[i] <- ! anyNA(aaData[ , i])   #   TRUE if the column has no NA value,
}                                   #   FALSE if there is any NA value

# Done. sel now subsets only the NA-free columns
ncol(aaData) - sum(sel)             # 13 columns excluded
 | 
			
		||||
 | 
			
		||||
# Do the PCA ... use the prcomp() function
 | 
			
		||||
aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set
 | 
			
		||||
 | 
			
		||||
str(aaPCA)   # structure of the result
 | 
			
		||||
 | 
			
		||||
plot(aaPCA)                         # plot the contributions of the
 | 
			
		||||
                                    # components to the variance
 | 
			
		||||
 | 
			
		||||
plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC
 | 
			
		||||
     aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame
 | 
			
		||||
     type ="n")                     # just to set up the coordinate system
 | 
			
		||||
 | 
			
		||||
text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into
 | 
			
		||||
     aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions
 | 
			
		||||
     labels = rownames(aaPCA$rotation))
 | 
			
		||||
 | 
			
		||||
# PCA results are sensitive to the absolute numeric value of the features that
 | 
			
		||||
# we are comparing. The prcomp() function has an option scale. = TRUE that
 | 
			
		||||
# scales each row of features so that the variance of the value is 1.0  This
 | 
			
		||||
# ensures that each feature is given approximately equal weight
 | 
			
		||||
 | 
			
		||||
aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
 | 
			
		||||
 | 
			
		||||
plot(aaPCA)
 | 
			
		||||
 | 
			
		||||
plot(aaPCA$rotation[ , 1],
 | 
			
		||||
     aaPCA$rotation[ , 2],
 | 
			
		||||
     type ="n")
 | 
			
		||||
text(aaPCA$rotation[ , 1],
 | 
			
		||||
     aaPCA$rotation[ , 2],
 | 
			
		||||
     labels = rownames(aaPCA$rotation))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Next we try to identify what the PCs correspond to. We see whether there are
 | 
			
		||||
# specific features that are highly correlated with the PCs
 | 
			
		||||
 | 
			
		||||
# ==== Rotation 1 ===================
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
(PC1 <- aaPCA$rotation[ , 1])  # Assign PC1
 | 
			
		||||
 | 
			
		||||
# The function cor() calculates Pearson coefficients of correlation
 | 
			
		||||
cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Iterate over all columns and calculate correlations
 | 
			
		||||
# One correlation per column of aaData; preallocated so the vector does not
# have to grow inside the loop. Columns that contain NA values give NA.
cors <- rep(NA_real_, ncol(aaData))

for (i in seq_len(ncol(aaData))) {
  cors[i] <- cor(PC1, aaData[ , i])
}
 | 
			
		||||
 | 
			
		||||
summary(cors)
 | 
			
		||||
#    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
 | 
			
		||||
# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
 | 
			
		||||
#
 | 
			
		||||
#  The max correlation is ~0.6. That is not very high. Which index is it?
 | 
			
		||||
 | 
			
		||||
which(cors == max(cors, na.rm = TRUE))
 | 
			
		||||
 | 
			
		||||
aaindex[[504]]   # Linker propensity ???
 | 
			
		||||
 | 
			
		||||
cor(PC1, aaindex[[504]]$I) # Did we get the right index?
 | 
			
		||||
 | 
			
		||||
# Plot this ...
 | 
			
		||||
plot(aaPCA$rotation[ , 1],
 | 
			
		||||
     aaindex[[504]]$I,
 | 
			
		||||
     type ="n")
 | 
			
		||||
text(aaPCA$rotation[ , 1],
 | 
			
		||||
     aaindex[[504]]$I,
 | 
			
		||||
     labels = rownames(aaPCA$rotation))
 | 
			
		||||
 | 
			
		||||
# This is essentially a random correlation but for Cysteine ...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==== Rotation 2 ===================
 | 
			
		||||
#
 | 
			
		||||
# same process
 | 
			
		||||
PC2 <- aaPCA$rotation[ , 2]
 | 
			
		||||
 | 
			
		||||
# One correlation per column of aaData; preallocated so the vector does not
# have to grow inside the loop. Columns that contain NA values give NA.
cors2 <- rep(NA_real_, ncol(aaData))

for (i in seq_len(ncol(aaData))) {
  cors2[i] <- cor(PC2, aaData[ , i])
}
 | 
			
		||||
 | 
			
		||||
summary(cors2)
 | 
			
		||||
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
 | 
			
		||||
# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13
 | 
			
		||||
 | 
			
		||||
# Here we have quite strong correlations
 | 
			
		||||
 | 
			
		||||
which(cors2 == max(cors2, na.rm = TRUE))
 | 
			
		||||
 | 
			
		||||
aaindex[[148]]
 | 
			
		||||
 | 
			
		||||
# this index itself is correlated with many other indices
 | 
			
		||||
 | 
			
		||||
cor(PC2, aaindex[[148]]$I)   # confirm that we have the right index
 | 
			
		||||
 | 
			
		||||
# Plot this too...
 | 
			
		||||
plot(aaPCA$rotation[ , 2],
 | 
			
		||||
     aaindex[[148]]$I,
 | 
			
		||||
     type ="n")
 | 
			
		||||
text(aaPCA$rotation[ , 2],
 | 
			
		||||
     aaindex[[148]]$I,
 | 
			
		||||
     labels = rownames(aaPCA$rotation))
 | 
			
		||||
 | 
			
		||||
# This correlates well with hydrophobicity measures. In this case the
 | 
			
		||||
# PC is to a certain degree interpretable - but this is not always the case
 | 
			
		||||
# with PCA (see the example of the first PC).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# 2021-10-12_In-Class_exploration.R
 | 
			
		||||
#
 | 
			
		||||
#         =====  T H E   E V E N   B E T T E R   A M I N O   A C I D =====
 | 
			
		||||
#
 | 
			
		||||
# Code and comments for BCH441 in-class exploration, Tuesday, 2021-10-12
 | 
			
		||||
# Explorers:  Jocelyn Nurtanto, Yuzi Li, and  Jerry Gu
 | 
			
		||||
# Scribe:     boris.steipe@utoronto.ca
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
#
 | 
			
		||||
# In our last session we explored some properties of amino acids and noted that
 | 
			
		||||
# we can arrange them in a scatter-plot according to some properties. But can
 | 
			
		||||
# we also arrange them according to generic properties, i.e. taking all
 | 
			
		||||
# published property scales into account? We will try to use all tables from
 | 
			
		||||
# the seqinr package.
 | 
			
		||||
 | 
			
		||||
# First we load the package - this makes all datasets immediately available and
 | 
			
		||||
# we don't have to load them one by one.
 | 
			
		||||
 | 
			
		||||
library(seqinr)
 | 
			
		||||
 | 
			
		||||
# Determine what datasets are available
 | 
			
		||||
#
 | 
			
		||||
# Using "find in topic" ... "amino acid"
 | 
			
		||||
data(aacost)
 | 
			
		||||
data(aaindex)
 | 
			
		||||
data(pK)
 | 
			
		||||
 | 
			
		||||
# We note that datasets may be sorted in different ways: for example
 | 
			
		||||
# alphabetically by one letter code (A, C, D, E, ...) or three-letter code (Ala,
 | 
			
		||||
# Arg, Asn, Asp, ...) - this means we need to ensure and validate that amino
 | 
			
		||||
# acids are sorted in the same way.
 | 
			
		||||
 | 
			
		||||
# Build a datastructure ...
 | 
			
		||||
# rows: amino acids
 | 
			
		||||
# columns: properties
 | 
			
		||||
 | 
			
		||||
# Are all lists in aaindex organized in the same way?
 | 
			
		||||
 | 
			
		||||
refNames <- names(aaindex[[1]]$I) # Take the rownames of the first list item
 | 
			
		||||
                                  # index as a reference list
 | 
			
		||||
 | 
			
		||||
# Loop over each list in aaindex
 | 
			
		||||
for (i in seq_along(aaindex)) {
  # Get the names attached to the I-vector of the i-th index.
  x <- names(aaindex[[i]]$I)

  # Compare with the names of our reference list. if (...) conditions evaluate
  # only a single value, so we need one TRUE or FALSE for the whole vector.
  # The == and != operators are vectorized and would have to be collapsed with
  # any() or all() - but any(x != refNames) silently reports "no problem" if
  # an index has no names at all (the comparison is then empty), recycles the
  # shorter vector if the lengths differ, and fails on NA. identical()
  # compares the two vectors as a whole and always returns a single value.
  if (! identical(x, refNames)) {
    # The names are not exactly the reference names - so: complain
    print(sprintf("Problem in list %d: names don't match", i))
  }
}
 | 
			
		||||
 | 
			
		||||
# If we get here without identifying problems, it means all pairs of
 | 
			
		||||
# rownames match throughout the aaindex list.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Next: what is the correct syntax to add one vector (the "I" vector of
 | 
			
		||||
# one of the list elements) to our dataframe?
 | 
			
		||||
aaData <- as.data.frame(aaindex[[1]]$I) # Make a dataframe from the first index
 | 
			
		||||
aaData[,2] <- aaindex[[2]]$I            # ... add the second index
 | 
			
		||||
 | 
			
		||||
str(aaData)  # Confirm: we now have a two-column dataframe
 | 
			
		||||
 | 
			
		||||
# Next: add the rest ...
 | 
			
		||||
# Columns 1 and 2 already exist, so we continue with the third index.
# seq_along(...)[-(1:2)] gives 3, 4, ..., length(aaindex) and is empty if there
# are fewer than three indices; 3:length(aaindex) would count down instead.
for (i in seq_along(aaindex)[-(1:2)]) {
  #   get the I-vector and write it into column i of our dataframe
  aaData[,i] <- aaindex[[i]]$I
}
 | 
			
		||||
 | 
			
		||||
# Sanity check
 | 
			
		||||
plot(aaData[,37], aaData[,544])  # plot two arbitrary indices against each other
 | 
			
		||||
 | 
			
		||||
# Looks good.
 | 
			
		||||
 | 
			
		||||
# We finished building our data structure ... but let's add the aacost table
 | 
			
		||||
# aacost is ordered differently:
 | 
			
		||||
rownames(aaData)
 | 
			
		||||
aacost[ , 1]
 | 
			
		||||
 | 
			
		||||
# using order(), applied to aacost - ordering the column with column-name
 | 
			
		||||
# "aaa"
 | 
			
		||||
sel <- order(aacost[ , "aaa"])  # alphabetic ordering of three-letter codes
 | 
			
		||||
aacost[sel, "aaa"] # applying the order vector sorts the column
 | 
			
		||||
 | 
			
		||||
# Is this the same order as refNames?
 | 
			
		||||
refNames == aacost[sel, "aaa"]  # Yes!
 | 
			
		||||
 | 
			
		||||
# add the data from column "tot" (i.e. total metabolic cost) after the
 | 
			
		||||
# last column of aaData
 | 
			
		||||
aaData[ , length(aaindex) + 1] <- aacost[sel, "tot"]
 | 
			
		||||
 | 
			
		||||
# Done.
 | 
			
		||||
str(aaData)  # A dataframe with 20 rows and 545 columns
 | 
			
		||||
 | 
			
		||||
# To answer the question "Which amino acids are similar to each other?" we
 | 
			
		||||
# need to reduce this 545-dimensional dataset to fewer dimensions, otherwise
 | 
			
		||||
# we will succumb to the "Curse of Dimensionality":
 | 
			
		||||
#
 | 
			
		||||
#    "in high dimensional data, however, all objects appear
 | 
			
		||||
#     to be sparse and dissimilar in many ways..."
 | 
			
		||||
#                   https://en.wikipedia.org/wiki/Curse_of_dimensionality
 | 
			
		||||
#
 | 
			
		||||
# A classic way to do this is Principal Component Analysis (PCA) ...
 | 
			
		||||
# (Principal components analysis)
 | 
			
		||||
#
 | 
			
		||||
# PCA expects objects in columns, properties in rows. Therefore we need to
 | 
			
		||||
# transpose our dataset:
 | 
			
		||||
 | 
			
		||||
aaPCA <- prcomp(t(aaData))
 | 
			
		||||
 | 
			
		||||
# This creates an error, because some of our indices contain NA values!
 | 
			
		||||
# Which indices are these?
 | 
			
		||||
 | 
			
		||||
# We create a vector "sel" for which we check whether any element in each
 | 
			
		||||
# column is NA, and write FALSE if we encounter an NA, TRUE otherwise. We can
 | 
			
		||||
# then use this vector to subset our dataframe.
 | 
			
		||||
 | 
			
		||||
# anyNA() is TRUE if a column holds at least one NA value. vapply() applies it
# to every column of the dataframe and guarantees one logical value per column,
# so the whole vector is built in one step instead of being grown with c()
# inside a loop. Negating the result gives TRUE for NA-free columns and FALSE
# for columns with NA; unname() drops the column names so that sel is a plain
# logical vector.
sel <- ! unname(vapply(aaData, anyNA, logical(1)))
 | 
			
		||||
 | 
			
		||||
# Done. sel now subsets only the NA-free columns
 | 
			
		||||
545 - sum(sel)                      # 13 columns excluded
 | 
			
		||||
 | 
			
		||||
# Do the PCA ... use the prcomp() function
 | 
			
		||||
aaPCA <- prcomp(t(aaData[ ,sel]))   # PCA of the transposed, selected data set
 | 
			
		||||
 | 
			
		||||
str(aaPCA)   # structure of the result
 | 
			
		||||
 | 
			
		||||
plot(aaPCA)                         # plot the contributions of the
 | 
			
		||||
                                    # components to the variance
 | 
			
		||||
 | 
			
		||||
plot(aaPCA$rotation[ , 1],          # plot the first PC against the second PC
 | 
			
		||||
     aaPCA$rotation[ , 2],          # in a scatterplot, in an empty frame
 | 
			
		||||
     type ="n")                     # just to set up the coordinate system
 | 
			
		||||
 | 
			
		||||
text(aaPCA$rotation[ , 1],          # plot the names of the amino acids into
 | 
			
		||||
     aaPCA$rotation[ , 2],          # their respective (PC1, PC2) positions
 | 
			
		||||
     labels = rownames(aaPCA$rotation))
 | 
			
		||||
 | 
			
		||||
# PCA results are sensitive to the absolute numeric value of the features that
 | 
			
		||||
# we are comparing. The prcomp() function has an option scale. = TRUE that
 | 
			
		||||
# scales each row of features so that the variance of the value is 1.0  This
 | 
			
		||||
# ensures that each feature is given approximately equal weight
 | 
			
		||||
 | 
			
		||||
aaPCA <- prcomp(t(aaData[ ,sel]), scale. = TRUE)
 | 
			
		||||
 | 
			
		||||
plot(aaPCA)
 | 
			
		||||
 | 
			
		||||
plot(aaPCA$rotation[ , 1],
 | 
			
		||||
     aaPCA$rotation[ , 2],
 | 
			
		||||
     type ="n")
 | 
			
		||||
text(aaPCA$rotation[ , 1],
 | 
			
		||||
     aaPCA$rotation[ , 2],
 | 
			
		||||
     labels = rownames(aaPCA$rotation))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Next we try to identify what the PCs correspond to. We see whether there are
 | 
			
		||||
# specific features that are highly correlated with the PCs
 | 
			
		||||
 | 
			
		||||
# ==== Rotation 1 ===================
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
(PC1 <- aaPCA$rotation[ , 1])  # Assign PC1
 | 
			
		||||
 | 
			
		||||
# The function cor() calculates Pearson coefficients of correlation
 | 
			
		||||
cor(PC1, aaData[ , 37]) # e.g. correlate PC1 against index 37
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Iterate over all columns and calculate correlations
 | 
			
		||||
# Correlate PC1 with every column of aaData. The result vector is preallocated
# to its final length, and seq_len() is used instead of 1:ncol() so that a
# data frame without columns yields an empty result rather than a loop over
# c(1, 0). cor() returns NA for columns that contain NA values.
cors <- numeric(ncol(aaData))

for (i in seq_len(ncol(aaData))) {
  cors[i] <- cor(PC1, aaData[ , i])
}
 | 
			
		||||
 | 
			
		||||
summary(cors)
 | 
			
		||||
#    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
 | 
			
		||||
# -0.54072 -0.13703  0.05654  0.03729  0.21349  0.59589       13
 | 
			
		||||
#
 | 
			
		||||
#  The max correlation is ~0.6. That is not very high. Which index is it?
 | 
			
		||||
 | 
			
		||||
which(cors == max(cors, na.rm = TRUE))
 | 
			
		||||
 | 
			
		||||
aaindex[[504]]   # Linker propensity ???
 | 
			
		||||
 | 
			
		||||
cor(PC1, aaindex[[504]]$I) # Did we get the right index?
 | 
			
		||||
 | 
			
		||||
# Plot this ...
 | 
			
		||||
plot(aaPCA$rotation[ , 1],
 | 
			
		||||
     aaindex[[504]]$I,
 | 
			
		||||
     type ="n")
 | 
			
		||||
text(aaPCA$rotation[ , 1],
 | 
			
		||||
     aaindex[[504]]$I,
 | 
			
		||||
     labels = rownames(aaPCA$rotation))
 | 
			
		||||
 | 
			
		||||
# This is essentially a random correlation but for Cysteine ...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==== Rotation 2 ===================
 | 
			
		||||
#
 | 
			
		||||
# same process
 | 
			
		||||
PC2 <- aaPCA$rotation[ , 2]
 | 
			
		||||
 | 
			
		||||
# Correlate PC2 with every column of aaData. The result vector is preallocated
# to its final length, and seq_len() is used instead of 1:ncol() so that a
# data frame without columns yields an empty result rather than a loop over
# c(1, 0). cor() returns NA for columns that contain NA values.
cors2 <- numeric(ncol(aaData))

for (i in seq_len(ncol(aaData))) {
  cors2[i] <- cor(PC2, aaData[ , i])
}
 | 
			
		||||
 | 
			
		||||
summary(cors2)
 | 
			
		||||
#     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's
 | 
			
		||||
# -0.95214 -0.56067 -0.12817 -0.05787  0.43046  0.94346       13
 | 
			
		||||
 | 
			
		||||
# Here we have quite strong correlations
 | 
			
		||||
 | 
			
		||||
which(cors2 == max(cors2, na.rm = TRUE))
 | 
			
		||||
 | 
			
		||||
aaindex[[148]]
 | 
			
		||||
 | 
			
		||||
# this index itself is correlated with many other indices
 | 
			
		||||
 | 
			
		||||
cor(PC2, aaindex[[148]]$I)   # confirm that we have the right index
 | 
			
		||||
 | 
			
		||||
# Plot this too...
 | 
			
		||||
plot(aaPCA$rotation[ , 2],
 | 
			
		||||
     aaindex[[148]]$I,
 | 
			
		||||
     type ="n")
 | 
			
		||||
text(aaPCA$rotation[ , 2],
 | 
			
		||||
     aaindex[[148]]$I,
 | 
			
		||||
     labels = rownames(aaPCA$rotation))
 | 
			
		||||
 | 
			
		||||
# This correlates well with hydrophobicity measures. In this case the
 | 
			
		||||
# PC is to a certain degree interpretable - but this is not always the case
 | 
			
		||||
# with PCA (see the example of the first PC).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,161 +1,161 @@
 | 
			
		||||
# tocID <- "ABC-Install_all_packages.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              Installing all packages in this course
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.0
 | 
			
		||||
#
 | 
			
		||||
# Date:     2021  10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.0    New code
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                          Line
 | 
			
		||||
#TOC> ----------------------------------------------
 | 
			
		||||
#TOC>   1        Packages                         33
 | 
			
		||||
#TOC>   2        CRAN packages                    98
 | 
			
		||||
#TOC>   3        Bioconductor packages           127
 | 
			
		||||
#TOC>   4        Other package sources           142
 | 
			
		||||
#TOC>   5        Updating packages               148
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Packages  ============================================================
 | 
			
		||||
 | 
			
		||||
# Much of R's functionality is contributed in packages: bundles of R scripts
 | 
			
		||||
# or code in other languages, pre-configured objects, and datasets. Making this
 | 
			
		||||
# functionality available is often done by issuing a library(<package-name>)
 | 
			
		||||
# command, however this is not the preferred way, since it may override other
 | 
			
		||||
# R functions and it makes it harder to understand where the source code of
 | 
			
		||||
# a particular function is located. In this course we call the function name
 | 
			
		||||
# prefixed with the package name and two colons:
 | 
			
		||||
#   <package-name>::<function-name>()
 | 
			
		||||
# This is the preferred way, since it is explicit.
 | 
			
		||||
#
 | 
			
		||||
# Regardless of which idiom one uses to call the actual function, the package
 | 
			
		||||
#  needs to be "installed" first, i.e. the code must have been downloaded
 | 
			
		||||
# from CRAN, or using the BiocManager::install() function.
 | 
			
		||||
#
 | 
			
		||||
# This script contains download commands for all packages that are used in the
 | 
			
		||||
# course. You can execute the script line by line (or even source the entire
 | 
			
		||||
# script) to make sure all packages can be installed on your computer. Just
 | 
			
		||||
# one reminder: if you are ever asked to install from source, the correct
 | 
			
		||||
# answer is usually "no" - except if you really know what you are doing and why.
 | 
			
		||||
#
 | 
			
		||||
# Once packages are installed you can get additional information about
 | 
			
		||||
# the contents of a package with the commands:
 | 
			
		||||
#  library(help=<package-name>)       # basic information
 | 
			
		||||
#  browseVignettes("<package-name>")  # available vignettes
 | 
			
		||||
#  data(package = "<package-name>")   # available datasets
 | 
			
		||||
#
 | 
			
		||||
#  ... and you can load data sets with:
 | 
			
		||||
#  data(<data-set-name>, package = "<package-name>")
 | 
			
		||||
#
 | 
			
		||||
#  All packages here are installed only when they have not been installed
 | 
			
		||||
#  before, using the following idiom:
 | 
			
		||||
#
 | 
			
		||||
#     if (! requireNamespace("<package-name>", quietly=TRUE)) {
 | 
			
		||||
#       install.packages("<package-name>")
 | 
			
		||||
#     }
 | 
			
		||||
#
 | 
			
		||||
#  ... or its BiocManager::install() equivalent:
 | 
			
		||||
#
 | 
			
		||||
# if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) {
 | 
			
		||||
#   BiocManager::install("<bioconductor-package-name>")
 | 
			
		||||
# }
 | 
			
		||||
#
 | 
			
		||||
#  If you want to _force_ a re-installation of the package, simply issue
 | 
			
		||||
#  the install.packages("<package-name>") command on its own. For compactness
 | 
			
		||||
#  we wrap the idiom into a function, which can also switch between CRAN
 | 
			
		||||
#  and BIOconductor sources:
 | 
			
		||||
 | 
			
		||||
installIfNeeded <- function(package, s = "CRAN") {
  # Purpose:
  #   Install a package only if its namespace can not be loaded yet.
  # Parameters:
  #   package: chr  name of the package
  #   s:       chr  package source, either "CRAN" or "BIO" (Bioconductor)
  # Value:
  #   Called for its side effect of installing the package. Stops with an
  #   error if s is neither "CRAN" nor "BIO".

  fromCRAN <- s == "CRAN"

  # Reject unknown sources before anything is checked or installed.
  if (! fromCRAN && s != "BIO") {
    stop(sprintf("Unknown source \"%s\".", s))
  }

  # Bioconductor packages are installed via BiocManager, which itself comes
  # from CRAN and may have to be installed first.
  if (! fromCRAN) {
    if (! requireNamespace("BiocManager", quietly=TRUE)) {
      install.packages("BiocManager")
    }
  }

  # Nothing to do if the package is already available.
  if (requireNamespace(package, quietly=TRUE)) {
    return(invisible(NULL))
  }

  if (fromCRAN) {
    install.packages(package)
  } else {
    BiocManager::install(package)
  }
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  CRAN packages  =======================================================
 | 
			
		||||
 | 
			
		||||
installIfNeeded("ape")
 | 
			
		||||
installIfNeeded("BiocManager")
 | 
			
		||||
installIfNeeded("bio3d")
 | 
			
		||||
installIfNeeded("evd")
 | 
			
		||||
installIfNeeded("ggseqlogo")
 | 
			
		||||
installIfNeeded("ggtern")
 | 
			
		||||
installIfNeeded("hexbin")
 | 
			
		||||
installIfNeeded("httr")
 | 
			
		||||
installIfNeeded("igraph")
 | 
			
		||||
installIfNeeded("jsonlite")
 | 
			
		||||
installIfNeeded("magrittr")
 | 
			
		||||
installIfNeeded("MASS")
 | 
			
		||||
installIfNeeded("microbenchmark")
 | 
			
		||||
installIfNeeded("phangorn")
 | 
			
		||||
installIfNeeded("plotly")
 | 
			
		||||
installIfNeeded("plotrix")
 | 
			
		||||
installIfNeeded("profvis")
 | 
			
		||||
installIfNeeded("robustbase")
 | 
			
		||||
installIfNeeded("RColorBrewer")
 | 
			
		||||
installIfNeeded("Rphylip")
 | 
			
		||||
installIfNeeded("rvest")
 | 
			
		||||
installIfNeeded("seqinr")
 | 
			
		||||
installIfNeeded("stringi")
 | 
			
		||||
installIfNeeded("taxize")
 | 
			
		||||
installIfNeeded("testthat")
 | 
			
		||||
installIfNeeded("xml2")
 | 
			
		||||
 | 
			
		||||
# =    3  Bioconductor packages  ===============================================
 | 
			
		||||
 | 
			
		||||
installIfNeeded("Biobase",       s = "BIO")
 | 
			
		||||
installIfNeeded("biomaRt",       s = "BIO")
 | 
			
		||||
installIfNeeded("Biostrings",    s = "BIO")
 | 
			
		||||
installIfNeeded("DECIPHER",      s = "BIO")
 | 
			
		||||
installIfNeeded("GEOquery",      s = "BIO")
 | 
			
		||||
installIfNeeded("GOSim",         s = "BIO")
 | 
			
		||||
installIfNeeded("limma",         s = "BIO")
 | 
			
		||||
installIfNeeded("msa",           s = "BIO")
 | 
			
		||||
installIfNeeded("org.Sc.sgd.db", s = "BIO")
 | 
			
		||||
installIfNeeded("prada",         s = "BIO")
 | 
			
		||||
installIfNeeded("topGO",         s = "BIO")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Other package sources  ===============================================
 | 
			
		||||
 | 
			
		||||
# Using sources other than CRAN or Bioconductor to download general-purpose
 | 
			
		||||
# programs that run on your computer is not generally recommended.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  Updating packages  ===================================================
 | 
			
		||||
 | 
			
		||||
# From time to time, update CRAN packages with the following command ...
 | 
			
		||||
 | 
			
		||||
update.packages()
 | 
			
		||||
 | 
			
		||||
# ... and also update Bioconductor packages as follows:
 | 
			
		||||
 | 
			
		||||
BiocManager::install()
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "ABC-Install_all_packages.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              Installing all packages in this course
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.0
 | 
			
		||||
#
 | 
			
		||||
# Date:     2021  10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.0    New code
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                          Line
 | 
			
		||||
#TOC> ----------------------------------------------
 | 
			
		||||
#TOC>   1        Packages                         33
 | 
			
		||||
#TOC>   2        CRAN packages                    98
 | 
			
		||||
#TOC>   3        Bioconductor packages           127
 | 
			
		||||
#TOC>   4        Other package sources           142
 | 
			
		||||
#TOC>   5        Updating packages               148
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Packages  ============================================================
 | 
			
		||||
 | 
			
		||||
# Much of R's functionality is contributed in packages: bundles of R scripts
 | 
			
		||||
# or code in other languages, pre-configured objects, and datasets. Making this
 | 
			
		||||
# functionality available is often done by issuing a library(<package-name>)
 | 
			
		||||
# command, however this is not the preferred way, since it may override other
 | 
			
		||||
# R functions and it makes it harder to understand where the source code of
 | 
			
		||||
# a particular function is located. In this course we call the function name
 | 
			
		||||
# prefixed with the package name and two colons:
 | 
			
		||||
#   <package-name>::<function-name>()
 | 
			
		||||
# This is the preferred way, since it is explicit.
 | 
			
		||||
#
 | 
			
		||||
# Regardless of which idiom one uses to call the actual function, the package
 | 
			
		||||
#  needs to be "installed" first, i.e. the code must have been downloaded
 | 
			
		||||
# from CRAN, or using the BiocManager::install() function.
 | 
			
		||||
#
 | 
			
		||||
# This script contains download commands for all packages that are used in the
 | 
			
		||||
# course. You can execute the script line by line (or even source the entire
 | 
			
		||||
# script) to make sure all packages can be installed on your computer. Just
 | 
			
		||||
# one reminder: if you are ever asked to install from source, the correct
 | 
			
		||||
# answer is usually "no" - except if you really know what you are doing and why.
 | 
			
		||||
#
 | 
			
		||||
# Once packages are installed you can get additional information about
 | 
			
		||||
# the contents of a package with the commands:
 | 
			
		||||
#  library(help=<package-name>)       # basic information
 | 
			
		||||
#  browseVignettes("<package-name>")  # available vignettes
 | 
			
		||||
#  data(package = "<package-name>")   # available datasets
 | 
			
		||||
#
 | 
			
		||||
#  ... and you can load data sets with:
 | 
			
		||||
#  data(<data-set-name>, package = "<package-name>")
 | 
			
		||||
#
 | 
			
		||||
#  All packages here are installed only when they have not been installed
 | 
			
		||||
#  before, using the following idiom:
 | 
			
		||||
#
 | 
			
		||||
#     if (! requireNamespace("<package-name>", quietly=TRUE)) {
 | 
			
		||||
#       install.packages("<package-name>")
 | 
			
		||||
#     }
 | 
			
		||||
#
 | 
			
		||||
#  ... or its BiocManager::install() equivalent:
 | 
			
		||||
#
 | 
			
		||||
# if (! requireNamespace("<bioconductor-package-name>", quietly=TRUE)) {
 | 
			
		||||
#   BiocManager::install("<bioconductor-package-name>")
 | 
			
		||||
# }
 | 
			
		||||
#
 | 
			
		||||
#  If you want to _force_ a re-installation of the package, simply issue
 | 
			
		||||
#  the install.packages("<package-name>") command on its own. For compactness
 | 
			
		||||
#  we wrap the idiom into a function, which can also switch between CRAN
 | 
			
		||||
#  and BIOconductor sources:
 | 
			
		||||
 | 
			
		||||
installIfNeeded <- function(package, s = "CRAN") {
  # Purpose:
  #   Install a package only if its namespace can not be loaded yet.
  # Parameters:
  #   package: chr  name of the package
  #   s:       chr  package source, either "CRAN" or "BIO" (Bioconductor)
  # Value:
  #   Called for its side effect of installing the package. Stops with an
  #   error if s is neither "CRAN" nor "BIO".

  fromCRAN <- s == "CRAN"

  # Reject unknown sources before anything is checked or installed.
  if (! fromCRAN && s != "BIO") {
    stop(sprintf("Unknown source \"%s\".", s))
  }

  # Bioconductor packages are installed via BiocManager, which itself comes
  # from CRAN and may have to be installed first.
  if (! fromCRAN) {
    if (! requireNamespace("BiocManager", quietly=TRUE)) {
      install.packages("BiocManager")
    }
  }

  # Nothing to do if the package is already available.
  if (requireNamespace(package, quietly=TRUE)) {
    return(invisible(NULL))
  }

  if (fromCRAN) {
    install.packages(package)
  } else {
    BiocManager::install(package)
  }
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  CRAN packages  =======================================================
 | 
			
		||||
 | 
			
		||||
installIfNeeded("ape")
 | 
			
		||||
installIfNeeded("BiocManager")
 | 
			
		||||
installIfNeeded("bio3d")
 | 
			
		||||
installIfNeeded("evd")
 | 
			
		||||
installIfNeeded("ggseqlogo")
 | 
			
		||||
installIfNeeded("ggtern")
 | 
			
		||||
installIfNeeded("hexbin")
 | 
			
		||||
installIfNeeded("httr")
 | 
			
		||||
installIfNeeded("igraph")
 | 
			
		||||
installIfNeeded("jsonlite")
 | 
			
		||||
installIfNeeded("magrittr")
 | 
			
		||||
installIfNeeded("MASS")
 | 
			
		||||
installIfNeeded("microbenchmark")
 | 
			
		||||
installIfNeeded("phangorn")
 | 
			
		||||
installIfNeeded("plotly")
 | 
			
		||||
installIfNeeded("plotrix")
 | 
			
		||||
installIfNeeded("profvis")
 | 
			
		||||
installIfNeeded("robustbase")
 | 
			
		||||
installIfNeeded("RColorBrewer")
 | 
			
		||||
installIfNeeded("Rphylip")
 | 
			
		||||
installIfNeeded("rvest")
 | 
			
		||||
installIfNeeded("seqinr")
 | 
			
		||||
installIfNeeded("stringi")
 | 
			
		||||
installIfNeeded("taxize")
 | 
			
		||||
installIfNeeded("testthat")
 | 
			
		||||
installIfNeeded("xml2")
 | 
			
		||||
 | 
			
		||||
# =    3  Bioconductor packages  ===============================================
 | 
			
		||||
 | 
			
		||||
installIfNeeded("Biobase",       s = "BIO")
 | 
			
		||||
installIfNeeded("biomaRt",       s = "BIO")
 | 
			
		||||
installIfNeeded("Biostrings",    s = "BIO")
 | 
			
		||||
installIfNeeded("DECIPHER",      s = "BIO")
 | 
			
		||||
installIfNeeded("GEOquery",      s = "BIO")
 | 
			
		||||
installIfNeeded("GOSim",         s = "BIO")
 | 
			
		||||
installIfNeeded("limma",         s = "BIO")
 | 
			
		||||
installIfNeeded("msa",           s = "BIO")
 | 
			
		||||
installIfNeeded("org.Sc.sgd.db", s = "BIO")
 | 
			
		||||
installIfNeeded("prada",         s = "BIO")
 | 
			
		||||
installIfNeeded("topGO",         s = "BIO")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Other package sources  ===============================================
 | 
			
		||||
 | 
			
		||||
# Using sources other than CRAN or Bioconductor to download general-purpose
 | 
			
		||||
# programs that run on your computer is not generally recommended.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  Updating packages  ===================================================
 | 
			
		||||
 | 
			
		||||
# From time to time, update CRAN packages with the following command ...
 | 
			
		||||
 | 
			
		||||
update.packages()
 | 
			
		||||
 | 
			
		||||
# ... and also update Bioconductor packages as follows:
 | 
			
		||||
 | 
			
		||||
BiocManager::install()
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,100 +1,100 @@
 | 
			
		||||
# addSACCE_APSESproteins.R
 | 
			
		||||
# Adds the Saccharomyces cerevisiae APSES proteins to myDB
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
myDB$protein <-
 | 
			
		||||
    rbind(myDB$protein,
 | 
			
		||||
          data.frame(
 | 
			
		||||
              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
 | 
			
		||||
              name = "SWI4_SACCE",
 | 
			
		||||
              RefSeqID = "NP_011036",
 | 
			
		||||
              UniProtID = "P25302",
 | 
			
		||||
              taxonomy.ID = as.integer(4932),
 | 
			
		||||
              sequence = dbSanitizeSequence("
 | 
			
		||||
        1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
 | 
			
		||||
       61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
 | 
			
		||||
       121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
 | 
			
		||||
       181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
 | 
			
		||||
       241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
 | 
			
		||||
       301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
 | 
			
		||||
       361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
 | 
			
		||||
       421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
 | 
			
		||||
       481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
 | 
			
		||||
       541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
 | 
			
		||||
       601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
 | 
			
		||||
       661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
 | 
			
		||||
       721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
 | 
			
		||||
       781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
 | 
			
		||||
       841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
 | 
			
		||||
       901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
 | 
			
		||||
       961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
 | 
			
		||||
       1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
 | 
			
		||||
       1081 klddiekdlr ana"),
 | 
			
		||||
              stringsAsFactors = FALSE))
 | 
			
		||||
 | 
			
		||||
myDB$protein <-
 | 
			
		||||
    rbind(myDB$protein,
 | 
			
		||||
          data.frame(
 | 
			
		||||
              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
 | 
			
		||||
              name = "PHD1_SACCE",
 | 
			
		||||
              RefSeqID = "NP_012881",
 | 
			
		||||
              UniProtID = "P36093",
 | 
			
		||||
              taxonomy.ID = as.integer(4932),
 | 
			
		||||
              sequence = dbSanitizeSequence("
 | 
			
		||||
        1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
 | 
			
		||||
       61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
 | 
			
		||||
      121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
 | 
			
		||||
      181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
 | 
			
		||||
      241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
 | 
			
		||||
      301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
 | 
			
		||||
      361 aknels"),
 | 
			
		||||
              stringsAsFactors = FALSE))
 | 
			
		||||
 | 
			
		||||
myDB$protein <-
 | 
			
		||||
    rbind(myDB$protein,
 | 
			
		||||
          data.frame(
 | 
			
		||||
              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
 | 
			
		||||
              name = "SOK2_SACCE",
 | 
			
		||||
              RefSeqID = "NP_013729",
 | 
			
		||||
              UniProtID = "P53438",
 | 
			
		||||
              taxonomy.ID = as.integer(4932),
 | 
			
		||||
              sequence = dbSanitizeSequence("
 | 
			
		||||
        1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
 | 
			
		||||
       61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
 | 
			
		||||
      121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
 | 
			
		||||
      181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
 | 
			
		||||
      241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
 | 
			
		||||
      301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
 | 
			
		||||
      361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
 | 
			
		||||
      421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
 | 
			
		||||
      481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
 | 
			
		||||
      541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
 | 
			
		||||
      601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
 | 
			
		||||
      661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
 | 
			
		||||
      721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
 | 
			
		||||
      781 kkqek"),
 | 
			
		||||
              stringsAsFactors = FALSE))
 | 
			
		||||
 | 
			
		||||
myDB$protein <-
 | 
			
		||||
    rbind(myDB$protein,
 | 
			
		||||
          data.frame(
 | 
			
		||||
              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
 | 
			
		||||
              name = "XBP1_SACCE",
 | 
			
		||||
              RefSeqID = "NP_012165",
 | 
			
		||||
              UniProtID = "P40489",
 | 
			
		||||
              taxonomy.ID = as.integer(4932),
 | 
			
		||||
              sequence = dbSanitizeSequence("
 | 
			
		||||
        1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
 | 
			
		||||
       61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
 | 
			
		||||
      121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
 | 
			
		||||
      181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
 | 
			
		||||
      241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
 | 
			
		||||
      301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
 | 
			
		||||
      361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
 | 
			
		||||
      421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
 | 
			
		||||
      481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
 | 
			
		||||
      541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
 | 
			
		||||
      601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
 | 
			
		||||
              stringsAsFactors = FALSE))
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# addSACCE_APSESproteins.R
 | 
			
		||||
# Adds the Saccharomyces cerevisiae APSES proteins to myDB
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
myDB$protein <-
 | 
			
		||||
    rbind(myDB$protein,
 | 
			
		||||
          data.frame(
 | 
			
		||||
              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
 | 
			
		||||
              name = "SWI4_SACCE",
 | 
			
		||||
              RefSeqID = "NP_011036",
 | 
			
		||||
              UniProtID = "P25302",
 | 
			
		||||
              taxonomy.ID = as.integer(4932),
 | 
			
		||||
              sequence = dbSanitizeSequence("
 | 
			
		||||
        1 mpfdvlisnq kdntnhqnit pisksvllap hsnhpvieia tysetdvyec yirgfetkiv
 | 
			
		||||
       61 mrrtkddwin itqvfkiaqf sktkrtkile kesndmqhek vqggygrfqg twipldsakf
 | 
			
		||||
       121 lvnkyeiidp vvnsiltfqf dpnnpppkrs knsilrktsp gtkitspssy nktprkknss
 | 
			
		||||
       181 sstsatttaa nkkgkknasi nqpnpsplqn lvfqtpqqfq vnssmnimnn ndnhttmnfn
 | 
			
		||||
       241 ndtrhnlinn isnnsnqsti iqqqksihen sfnnnysatq kplqffpipt nlqnknvaln
 | 
			
		||||
       301 npnnndsnsy shnidnvins snnnnngnnn nliivpdgpm qsqqqqqhhh eyltnnfnhs
 | 
			
		||||
       361 mmdsitngns kkrrkklnqs neqqfynqqe kiqrhfklmk qpllwqsfqn pndhhneycd
 | 
			
		||||
       421 sngsnnnnnt vasngssiev fssnendnsm nmssrsmtpf sagntssqnk lenkmtdqey
 | 
			
		||||
       481 kqtiltilss erssdvdqal latlypapkn fninfeiddq ghtplhwata maniplikml
 | 
			
		||||
       541 itlnanalqc nklgfncitk sifynncyke nafdeiisil kiclitpdvn grlpfhylie
 | 
			
		||||
       601 lsvnksknpm iiksymdsii lslgqqdynl lkiclnyqdn igntplhlsa lnlnfevynr
 | 
			
		||||
       661 lvylgastdi lnldnespas imnkfntpag gsnsrnnntk adrklarnlp qknyyqqqqq
 | 
			
		||||
       721 qqqpqnnvki pkiiktqhpd kedstadvni aktdsevnes qylhsnqpns tnmntimedl
 | 
			
		||||
       781 sninsfvtss vikdikstps kilenspily rrrsqsisde kekakdnenq vekkkdplns
 | 
			
		||||
       841 vktampsles pssllpiqms plgkyskpls qqinklntkv sslqrimgee iknldnevve
 | 
			
		||||
       901 tessisnnkk rlitiahqie dafdsvsnkt pinsisdlqs riketsskln sekqnfiqsl
 | 
			
		||||
       961 eksqalklat ivqdeeskvd mntnssshpe kqedeepipk stsetsspkn tkadakfsnt
 | 
			
		||||
       1021 vqesydvnet lrlateltil qfkrrmttlk iseakskins svkldkyrnl igitienids
 | 
			
		||||
       1081 klddiekdlr ana"),
 | 
			
		||||
              stringsAsFactors = FALSE))
 | 
			
		||||
 | 
			
		||||
myDB$protein <-
 | 
			
		||||
    rbind(myDB$protein,
 | 
			
		||||
          data.frame(
 | 
			
		||||
              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
 | 
			
		||||
              name = "PHD1_SACCE",
 | 
			
		||||
              RefSeqID = "NP_012881",
 | 
			
		||||
              UniProtID = "P36093",
 | 
			
		||||
              taxonomy.ID = as.integer(4932),
 | 
			
		||||
              sequence = dbSanitizeSequence("
 | 
			
		||||
        1 myhvpemrlh yplvntqsna aitptrsydn tlpsfnelsh qstinlpfvq retpnayanv
 | 
			
		||||
       61 aqlatsptqa ksgyycryya vpfptypqqp qspyqqavlp yatipnsnfq pssfpvmavm
 | 
			
		||||
      121 ppevqfdgsf lntlhphtel ppiiqntndt svarpnnlks iaaasptvta ttrtpgvsst
 | 
			
		||||
      181 svlkprvitt mwedenticy qveangisvv rradnnming tkllnvtkmt rgrrdgilrs
 | 
			
		||||
      241 ekvrevvkig smhlkgvwip ferayilaqr eqildhlypl fvkdiesivd arkpsnkasl
 | 
			
		||||
      301 tpksspapik qepsdnkhei ateikpksid alsngastqg agelphlkin hidteaqtsr
 | 
			
		||||
      361 aknels"),
 | 
			
		||||
              stringsAsFactors = FALSE))
 | 
			
		||||
 | 
			
		||||
myDB$protein <-
 | 
			
		||||
    rbind(myDB$protein,
 | 
			
		||||
          data.frame(
 | 
			
		||||
              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
 | 
			
		||||
              name = "SOK2_SACCE",
 | 
			
		||||
              RefSeqID = "NP_013729",
 | 
			
		||||
              UniProtID = "P53438",
 | 
			
		||||
              taxonomy.ID = as.integer(4932),
 | 
			
		||||
              sequence = dbSanitizeSequence("
 | 
			
		||||
        1 mpignpintn diksnrmrqe snmsavsnse stigqstqqq qqqqqylgqs vqplmpvsyq
 | 
			
		||||
       61 yvvpeqwpyp qyyqqpqsqs qqqlqsqpqm yqvqesfqss gsdsnasnpp stsvgvpsna
 | 
			
		||||
      121 tatalpngsa ittkksnnst nisnnvpyyy yfpqmqaqqs maysypqayy yypangdgtt
 | 
			
		||||
      181 ngatpsvtsn qvqnpnlekt ystfeqqqqh qqqqqlqaqt ypaqppkign afskfsksgp
 | 
			
		||||
      241 psdsssgsms pnsnrtsrns nsisslaqqp pmsnypqpst yqypgfhkts sipnshspip
 | 
			
		||||
      301 prslttptqg ptsqngplsy nlpqvgllpp qqqqqvsply dgnsitppvk pstdqetylt
 | 
			
		||||
      361 anrhgvsdqq ydsmaktmns fqtttirhpm pliattnatg sntsgtsasi irprvtttmw
 | 
			
		||||
      421 edektlcyqv eangisvvrr adndmvngtk llnvtkmtrg rrdgilkaek irhvvkigsm
 | 
			
		||||
      481 hlkgvwipfe ralaiaqrek iadylyplfi rdiqsvlkqn npsndsssss sstgiksisp
 | 
			
		||||
      541 rtyyqpinny qnpngpsnis aaqltyssmn lnnkiipnns ipavstiaag ekplkkctmp
 | 
			
		||||
      601 nsnqleghti tnlqtlsatm pmkqqlmgni asplsyprna tmnsastlgi tpadskpltp
 | 
			
		||||
      661 sptttntnqs sesnvgsiht gitlprvese sashskwske adsgntvpdn qtlkeprssq
 | 
			
		||||
      721 lpisaltstd tdkiktstsd eatqpnepse aepvkesess ksqvdgagdv sneeiaaddt
 | 
			
		||||
      781 kkqek"),
 | 
			
		||||
              stringsAsFactors = FALSE))
 | 
			
		||||
 | 
			
		||||
myDB$protein <-
 | 
			
		||||
    rbind(myDB$protein,
 | 
			
		||||
          data.frame(
 | 
			
		||||
              ID = dbAutoincrement(myDB$protein$ID, ns = "ref"),
 | 
			
		||||
              name = "XBP1_SACCE",
 | 
			
		||||
              RefSeqID = "NP_012165",
 | 
			
		||||
              UniProtID = "P40489",
 | 
			
		||||
              taxonomy.ID = as.integer(4932),
 | 
			
		||||
              sequence = dbSanitizeSequence("
 | 
			
		||||
        1 mkypafsins dtvhltdnpl ddyqrlylvs vldrdsppas fsaglnirkv nykssiaaqf
 | 
			
		||||
       61 thpnfiisar dagngeeaaa qnvlncfeyq fpnlqtiqsl vheqtllsql assatphsal
 | 
			
		||||
      121 hlhdknilmg kiilpsrsnk tpvsasptkq ekkalstasr enatssltkn qqfkltkmdh
 | 
			
		||||
      181 nlindklinp nncviwshds gyvfmtgiwr lyqdvmkgli nlprgdsvst sqqqffckae
 | 
			
		||||
      241 fekilsfcfy nhssftsees ssvllsssts sppkrrtstg stfldanass sstsstqann
 | 
			
		||||
      301 yidfhwnnik pelrdlicqs ykdflinelg pdqidlpnln panftkrirg gyikiqgtwl
 | 
			
		||||
      361 pmeisrllcl rfcfpiryfl vpifgpdfpk dceswylahq nvtfassttg agaataataa
 | 
			
		||||
      421 antstnftst avarprqkpr prprqrstsm shskaqklvi edalpsfdsf venlglssnd
 | 
			
		||||
      481 knfikknskr qksstytsqt sspigprdpt vqilsnlasf ynthghrysy pgniyipqqr
 | 
			
		||||
      541 yslpppnqls spqrqlnyty dhihpvpsqy qsprhynvps spiapapptf pqpygddhyh
 | 
			
		||||
      601 flkyasevyk qqnqrpahnt ntnmdtsfsp rannslnnfk fktnskq"),
 | 
			
		||||
              stringsAsFactors = FALSE))
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										138
									
								
								ABC-units.R
									
									
									
									
									
								
							
							
						
						
									
										138
									
								
								ABC-units.R
									
									
									
									
									
								
							@@ -1,69 +1,69 @@
 | 
			
		||||
# ABC-units.R
 | 
			
		||||
#
 | 
			
		||||
# Purpose: A Bioinformatics Course: R code for learning units
 | 
			
		||||
#
 | 
			
		||||
# Version: 4.0
 | 
			
		||||
#
 | 
			
		||||
# Date:    2020  09  16
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
# V 4.0    2020 version
 | 
			
		||||
# V 3.0    2019 version
 | 
			
		||||
# V 2.0    2018 version
 | 
			
		||||
# V 1.0    2017 version
 | 
			
		||||
# V 0.1    First code
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 | 
			
		||||
#
 | 
			
		||||
# The R-scripts and datasets in this project will be continuously updated,
 | 
			
		||||
# and updates will be posted on GitHub. To bring your version into the latest
 | 
			
		||||
# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
 | 
			
		||||
# repository. However, this will overwrite locally edited versions of files.
 | 
			
		||||
 | 
			
		||||
# To edit code and experiment with it, for example to add your own comments and
 | 
			
		||||
# examples, save your edited version into the "myScripts" folder. Otherwise you
 | 
			
		||||
# may have problems with git when you update the project to a new version. It's
 | 
			
		||||
# good practice to change the filename, for example by prepending your initials.
 | 
			
		||||
# This helps distinguish the files you are working with e.g. in a list of
 | 
			
		||||
# recent files. For example if your name is Honjo Tasuku, your edited
 | 
			
		||||
# BIN-Sequence.R might be named HT-BIN-Sequence.R
 | 
			
		||||
 | 
			
		||||
# If you pull from github and get the following type of error ...
 | 
			
		||||
#     ---------------
 | 
			
		||||
#     error: Your local changes to the following files would be
 | 
			
		||||
#     overwritten by merge
 | 
			
		||||
#     ...
 | 
			
		||||
#     Please commit your changes or stash them before you can merge.
 | 
			
		||||
#     ---------------
 | 
			
		||||
# ... then, you need to bring the offending file into its original state.
 | 
			
		||||
# Open the Commit window, select the file, and click on the Revert button.
 | 
			
		||||
#
 | 
			
		||||
# When working with these scripts, DO NOT SIMPLY  source()  THESE FILES!
 | 
			
		||||
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
# Once you have typed and executed the function init(), you will find a file
 | 
			
		||||
# called myScript.R in the project directory.
 | 
			
		||||
#
 | 
			
		||||
# Open it: you can place all of your code-experiments and notes into that
 | 
			
		||||
# file. This will complement your "Course Journal". If you keep all contents in
 | 
			
		||||
# this one file, you can find everything by using the <cmd>-F find function. To
 | 
			
		||||
# cross-reference code in your journal, create section headings.
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
# The individual learning units' files can be opened by simply clicking on them
 | 
			
		||||
# in the File pane.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# ABC-units.R
 | 
			
		||||
#
 | 
			
		||||
# Purpose: A Bioinformatics Course: R code for learning units
 | 
			
		||||
#
 | 
			
		||||
# Version: 4.0
 | 
			
		||||
#
 | 
			
		||||
# Date:    2020  09  16
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
# V 4.0    2020 version
 | 
			
		||||
# V 3.0    2019 version
 | 
			
		||||
# V 2.0    2018 version
 | 
			
		||||
# V 1.0    2017 version
 | 
			
		||||
# V 0.1    First code
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 | 
			
		||||
#
 | 
			
		||||
# The R-scripts and datasets in this project will be continuously updated,
 | 
			
		||||
# and updates will be posted on GitHub. To bring your version into the latest
 | 
			
		||||
# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
 | 
			
		||||
# repository. However, this will overwrite locally edited versions of files.
 | 
			
		||||
 | 
			
		||||
# To edit code and experiment with it, for example to add your own comments and
 | 
			
		||||
# examples, save your edited version into the "myScripts" folder. Otherwise you
 | 
			
		||||
# may have problems with git when you update the project to a new version. It's
 | 
			
		||||
# good practice to change the filename, for example by prepending your initials.
 | 
			
		||||
# This helps distinguish the files you are working with e.g. in a list of
 | 
			
		||||
# recent files. For example if your name is Honjo Tasuku, your edited
 | 
			
		||||
# BIN-Sequence.R might be named HT-BIN-Sequence.R
 | 
			
		||||
 | 
			
		||||
# If you pull from github and get the following type of error ...
 | 
			
		||||
#     ---------------
 | 
			
		||||
#     error: Your local changes to the following files would be
 | 
			
		||||
#     overwritten by merge
 | 
			
		||||
#     ...
 | 
			
		||||
#     Please commit your changes or stash them before you can merge.
 | 
			
		||||
#     ---------------
 | 
			
		||||
# ... then, you need to bring the offending file into its original state.
 | 
			
		||||
# Open the Commit window, select the file, and click on the Revert button.
 | 
			
		||||
#
 | 
			
		||||
# When working with these scripts, DO NOT SIMPLY  source()  THESE FILES!
 | 
			
		||||
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
# Once you have typed and executed the function init(), you will find a file
 | 
			
		||||
# called myScript.R in the project directory.
 | 
			
		||||
#
 | 
			
		||||
# Open it: you can place all of your code-experiments and notes into that
 | 
			
		||||
# file. This will complement your "Course Journal". If you keep all contents in
 | 
			
		||||
# this one file, you can find everything by using the <cmd>-F find function. To
 | 
			
		||||
# cross-reference code in your journal, create section headings.
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
# The individual learning units' files can be opened by simply clicking on them
 | 
			
		||||
# in the File pane.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,16 +1,16 @@
 | 
			
		||||
Version: 1.0
 | 
			
		||||
 | 
			
		||||
RestoreWorkspace: No
 | 
			
		||||
SaveWorkspace: No
 | 
			
		||||
AlwaysSaveHistory: No
 | 
			
		||||
 | 
			
		||||
EnableCodeIndexing: Yes
 | 
			
		||||
UseSpacesForTab: Yes
 | 
			
		||||
NumSpacesForTab: 2
 | 
			
		||||
Encoding: UTF-8
 | 
			
		||||
 | 
			
		||||
RnwWeave: knitr
 | 
			
		||||
LaTeX: XeLaTeX
 | 
			
		||||
 | 
			
		||||
AutoAppendNewline: Yes
 | 
			
		||||
StripTrailingWhitespace: Yes
 | 
			
		||||
Version: 1.0
 | 
			
		||||
 | 
			
		||||
RestoreWorkspace: No
 | 
			
		||||
SaveWorkspace: No
 | 
			
		||||
AlwaysSaveHistory: No
 | 
			
		||||
 | 
			
		||||
EnableCodeIndexing: Yes
 | 
			
		||||
UseSpacesForTab: Yes
 | 
			
		||||
NumSpacesForTab: 2
 | 
			
		||||
Encoding: UTF-8
 | 
			
		||||
 | 
			
		||||
RnwWeave: knitr
 | 
			
		||||
LaTeX: XeLaTeX
 | 
			
		||||
 | 
			
		||||
AutoAppendNewline: Yes
 | 
			
		||||
StripTrailingWhitespace: Yes
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										222
									
								
								BIN-ALI-BLAST.R
									
									
									
									
									
								
							
							
						
						
									
										222
									
								
								BIN-ALI-BLAST.R
									
									
									
									
									
								
							@@ -1,111 +1,111 @@
 | 
			
		||||
# tocID <- "BIN-ALI-BLAST.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-ALI-BLAST unit.
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.3
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.3    2020 Maintenance
 | 
			
		||||
#           1.2    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.1    Fixed parsing logic.
 | 
			
		||||
#           1.0    First live version 2017.
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                               Line
 | 
			
		||||
#TOC> ---------------------------------------------------
 | 
			
		||||
#TOC>   1        Defining the APSES domain             45
 | 
			
		||||
#TOC>   2        Executing the BLAST search            75
 | 
			
		||||
#TOC>   3        Analysing results                     97
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Defining the APSES domain  ===========================================
 | 
			
		||||
 | 
			
		||||
# Load your protein database
 | 
			
		||||
source("makeProteinDB.R")
 | 
			
		||||
 | 
			
		||||
# Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You
 | 
			
		||||
# have entered this data into your database in the
 | 
			
		||||
# BIN-ALI-Optimal_sequence_alignment unit.)
 | 
			
		||||
 | 
			
		||||
( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct
 | 
			
		||||
                                                # name of the MYSPE orthologue
 | 
			
		||||
                                                # of Mbp1 in your protein
 | 
			
		||||
                                                # database, DON'T continue. We
 | 
			
		||||
                                                # need to fix this problem.
 | 
			
		||||
                                                # Get in touch.
 | 
			
		||||
 | 
			
		||||
(proID <- myDB$protein$ID[myDB$protein$name == myOrth])
 | 
			
		||||
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
 | 
			
		||||
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
 | 
			
		||||
                               myDB$annotation$featureID == ftrID])
 | 
			
		||||
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
 | 
			
		||||
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
 | 
			
		||||
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
 | 
			
		||||
                 start,
 | 
			
		||||
                 end))
 | 
			
		||||
 | 
			
		||||
# The MYSPE "apses" sequence is the sequence that we will use for our reverse
 | 
			
		||||
# BLAST search.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Executing the BLAST search  ==========================================
 | 
			
		||||
 | 
			
		||||
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
 | 
			
		||||
# through its Web API, and to parse results. Have a look at the script, then
 | 
			
		||||
# source it:
 | 
			
		||||
 | 
			
		||||
source("./scripts/BLAST.R")
 | 
			
		||||
 | 
			
		||||
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
 | 
			
		||||
# cerevisiae:
 | 
			
		||||
 | 
			
		||||
BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence
 | 
			
		||||
                     db = "refseq_protein",        # database to search in
 | 
			
		||||
                     nHits = 10,                   # presumably max. hits kept
 | 
			
		||||
                     E = 0.01,                     # presumably E-value cutoff
 | 
			
		||||
                     limits = "txid559292[ORGN]")  # S. cerevisiae S288c
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
length(BLASTresults$hits)  # There should be at least one hit there. Ask for
 | 
			
		||||
                           # advice in case this step fails.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Analysing results  ===================================================
 | 
			
		||||
 | 
			
		||||
(topHit <- BLASTresults$hits[[1]])   # Get the top hit
 | 
			
		||||
 | 
			
		||||
# What is the RefSeq ID of the top hit?
 | 
			
		||||
topHit$accession
 | 
			
		||||
 | 
			
		||||
# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
 | 
			
		||||
# domain. If it is not, ask me for advice.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-ALI-BLAST.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-ALI-BLAST unit.
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.3
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.3    2020 Maintenance
 | 
			
		||||
#           1.2    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.1    Fixed parsing logic.
 | 
			
		||||
#           1.0    First live version 2017.
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                               Line
 | 
			
		||||
#TOC> ---------------------------------------------------
 | 
			
		||||
#TOC>   1        Defining the APSES domain             45
 | 
			
		||||
#TOC>   2        Executing the BLAST search            75
 | 
			
		||||
#TOC>   3        Analysing results                     97
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Defining the APSES domain  ===========================================
 | 
			
		||||
 | 
			
		||||
# Load your protein database
 | 
			
		||||
source("./myScripts/makeProteinDB.R")  # your own script, kept in ./myScripts/
 | 
			
		||||
 | 
			
		||||
# Get the APSES domain sequence via your MBP1_MYSPE feature annotation. (You
 | 
			
		||||
# have entered this data into your database in the
 | 
			
		||||
# BIN-ALI-Optimal_sequence_alignment unit.)
 | 
			
		||||
 | 
			
		||||
( myOrth <- sprintf("MBP1_%s", biCode(MYSPE)) ) # If this is not the correct
 | 
			
		||||
                                                # name of the Mbp1 orthologue
 | 
			
		||||
                                                # in your protein
 | 
			
		||||
                                                # database, DON'T continue. We
 | 
			
		||||
                                                # need to fix this problem.
 | 
			
		||||
                                                # Get in touch.
 | 
			
		||||
 | 
			
		||||
(proID <- myDB$protein$ID[myDB$protein$name == myOrth])
 | 
			
		||||
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
 | 
			
		||||
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
 | 
			
		||||
                               myDB$annotation$featureID == ftrID])
 | 
			
		||||
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
 | 
			
		||||
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
 | 
			
		||||
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
 | 
			
		||||
                 start,
 | 
			
		||||
                 end))
 | 
			
		||||
 | 
			
		||||
# The MYSPE "apses" sequence is the sequence that we will use for our reverse
 | 
			
		||||
# BLAST search.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Executing the BLAST search  ==========================================
 | 
			
		||||
 | 
			
		||||
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
 | 
			
		||||
# through its Web API, and to parse results. Have a look at the script, then
 | 
			
		||||
# source it:
 | 
			
		||||
 | 
			
		||||
source("./scripts/BLAST.R")
 | 
			
		||||
 | 
			
		||||
# Use BLAST() to find the best match to the MYSPE APSES domain in Saccharomyces
 | 
			
		||||
# cerevisiae:
 | 
			
		||||
 | 
			
		||||
BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence
 | 
			
		||||
                     db = "refseq_protein",        # database to search in
 | 
			
		||||
                     nHits = 10,                   #
 | 
			
		||||
                     E = 0.01,                     #
 | 
			
		||||
                     limits = "txid559292[ORGN]")  # S. cerevisiae S288c
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
length(BLASTresults$hits)  # There should be at least one hit there. Ask for
 | 
			
		||||
                           # advice in case this step fails.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Analysing results  ===================================================
 | 
			
		||||
 | 
			
		||||
(topHit <- BLASTresults$hits[[1]])   # Get the top hit
 | 
			
		||||
 | 
			
		||||
# What is the refseq ID of the top hit?
 | 
			
		||||
topHit$accession
 | 
			
		||||
 | 
			
		||||
# If this is "NP_010227.1" you have confirmed the RBM of the MYSPE apses
 | 
			
		||||
# domain. If it is not, ask me for advice.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,195 +1,195 @@
 | 
			
		||||
# tocID <- "BIN-ALI-Dotplot.R"
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-ALI-Dotplot unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  0.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2019  01  07
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           0.2    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                  Line
 | 
			
		||||
#TOC> --------------------------------------
 | 
			
		||||
#TOC>   1        ___Section___            42
 | 
			
		||||
#TOC>   2        Tasks                   190
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  ___Section___  =======================================================
 | 
			
		||||
 | 
			
		||||
if (!requireNamespace("BiocManager", quietly=TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (!requireNamespace("Biostrings", quietly=TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
if (!requireNamespace("seqinr", quietly=TRUE)) {
 | 
			
		||||
  install.packages("seqinr")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's load BLOSUM62
 | 
			
		||||
data(BLOSUM62, package = "Biostrings")
 | 
			
		||||
 | 
			
		||||
# Now let's craft code for a dotplot. That's surprisingly simple. We build a
 | 
			
		||||
# matrix that has as many rows as one sequence, as many columns as another. Then
 | 
			
		||||
# we go through every cell of the matrix and enter the pairscore we encounter
 | 
			
		||||
# for the amino acid pair whose position corresponds to the row and column
 | 
			
		||||
# index. Finally we visualize the matrix in a plot.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# First we fetch our sequences and split them into single characters.
 | 
			
		||||
sel <- myDB$protein$name == "MBP1_SACCE"
 | 
			
		||||
MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
 | 
			
		||||
 | 
			
		||||
# Logical selector for the protein whose name is "MBP1_" followed by the
# biCode of MYSPE.
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
 | 
			
		||||
MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
 | 
			
		||||
 | 
			
		||||
# Check that we have two character vectors of the expected length.
 | 
			
		||||
str(MBP1_SACCE)
 | 
			
		||||
str(MBP1_MYSPE)
 | 
			
		||||
 | 
			
		||||
# How do we get the pairscore values? Consider: a single pair of amino acids can
 | 
			
		||||
# be obtained from sequence SACCE and MYSPE e.g. from position 13 and 21 ...
 | 
			
		||||
MBP1_SACCE[13]
 | 
			
		||||
MBP1_MYSPE[21]
 | 
			
		||||
 | 
			
		||||
# ... using these as subsetting expressions, we can pull the pairscore
 | 
			
		||||
# from the MDM
 | 
			
		||||
BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
 | 
			
		||||
 | 
			
		||||
# First we build an empty matrix that will hold all pairscores ...
 | 
			
		||||
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
 | 
			
		||||
                 nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
 | 
			
		||||
 | 
			
		||||
# ... then we loop over the sequences and store the scores in the matrix.
 | 
			
		||||
#
 | 
			
		||||
# Fill every cell [i, j] with the BLOSUM62 pairscore of residue i of MBP1_SACCE
# and residue j of MBP1_MYSPE. seq_along() is used instead of 1:length() so
# that an empty sequence gives zero iterations rather than the indices c(1, 0).
for (i in seq_along(MBP1_SACCE)) {
  for (j in seq_along(MBP1_MYSPE)) {
    dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
  }
}
 | 
			
		||||
 | 
			
		||||
# Even though this is a large matrix, this does not take much time ...
 | 
			
		||||
# Let's have a look at a small block of the values:
 | 
			
		||||
 | 
			
		||||
dotMat[1:10, 1:10]
 | 
			
		||||
 | 
			
		||||
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
 | 
			
		||||
# the matrix correspond to an amino acid from MBP1_MYSPE.
 | 
			
		||||
 | 
			
		||||
# To plot this, we use the image() function. Here, with default parameters.
 | 
			
		||||
 | 
			
		||||
image(dotMat)
 | 
			
		||||
 | 
			
		||||
# Be patient, this takes a few moments to render: more than 500,000 values.
 | 
			
		||||
# Nice.
 | 
			
		||||
# What do you expect?
 | 
			
		||||
# What would similar sequences look like?
 | 
			
		||||
# What do you see?
 | 
			
		||||
 | 
			
		||||
# You might notice a thin line of yellow along the diagonal, moving approximately
 | 
			
		||||
# from bottom left to top right, fading in and out of existence. This is the
 | 
			
		||||
# signature of extended sequence similarity.
 | 
			
		||||
 | 
			
		||||
# Let's magnify this a bit by looking at only the first 200 amino acids ...
 | 
			
		||||
image(dotMat[1:200, 1:200])
 | 
			
		||||
 | 
			
		||||
# ... and, according to our normal writing convention, we would like the
 | 
			
		||||
# diagonal to run from top-left to bottom-right since we write from left to
 | 
			
		||||
# right and from top to bottom...
 | 
			
		||||
image(dotMat[1:200, 1:200], ylim = 1.0:0.0)
 | 
			
		||||
 | 
			
		||||
# ... and we would like the range of the x- and y- axis to correspond to the
 | 
			
		||||
# sequence position ...
 | 
			
		||||
image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1))
 | 
			
		||||
 | 
			
		||||
# ... and labels! Axis labels would be nice ...
 | 
			
		||||
# image() draws the rows of its matrix along the x-axis and the columns along
# the y-axis. Rows of dotMat are MBP1_SACCE residues, columns are MBP1_MYSPE
# residues, so the x-axis is labelled SACCE and the y-axis MYSPE.
image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1),
      xlab = "MBP1_SACCE", ylab = "MBP1_MYSPE" )
 | 
			
		||||
 | 
			
		||||
# ... and why don't we have axis-numbers on all four sides? Go, make that right
 | 
			
		||||
# too ...
 | 
			
		||||
len <- 200
# image() draws the rows of its matrix along the x-axis and the columns along
# the y-axis. Rows of dotMat are MBP1_SACCE residues, columns are MBP1_MYSPE
# residues, so the x-axis is labelled SACCE and the y-axis MYSPE.
# axes = FALSE suppresses the default axes so we can draw our own on all sides.
image(x = 1:len, y = 1:len,  dotMat[1:len, 1:len], ylim=c(len,1),
      xlab = "MBP1_SACCE", ylab = "MBP1_MYSPE", axes = FALSE)
box()
axis(1, at = c(1, seq(10, len, by=10)))   # bottom
axis(2, at = c(1, seq(10, len, by=10)))   # left
axis(3, at = c(1, seq(10, len, by=10)))   # top
axis(4, at = c(1, seq(10, len, by=10)))   # right
 | 
			
		||||
 | 
			
		||||
# ... you get the idea, we can infinitely customize our plot. However a good way
 | 
			
		||||
# to do this is to develop a particular view for, say, a report or publication
 | 
			
		||||
# in a script and then put it into a function. I have put a function into the
 | 
			
		||||
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
 | 
			
		||||
# there already is a dotplot function in the seqinr package:
 | 
			
		||||
 | 
			
		||||
seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE)                           # seqinr
 | 
			
		||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Our's
 | 
			
		||||
 | 
			
		||||
# Which one do you prefer? You can probably see the block patterns that arise
 | 
			
		||||
# from segments of repetitive, low complexity sequence. But you probably have to
 | 
			
		||||
# look very closely to discern the faint diagonals that correspond to similar
 | 
			
		||||
# sequence.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's see if we can enhance the contrast between distributed noise and the
 | 
			
		||||
# actual alignment of conserved residues. We can filter the dot matrix with a
 | 
			
		||||
# pattern that enhances diagonally repeated values. Every value in the matrix
 | 
			
		||||
# will be replaced by a weighted average of its neighborhood. Here is  a
 | 
			
		||||
# diagonal-filter:
 | 
			
		||||
 | 
			
		||||
# A 5 x 5 filter with ones on the main diagonal and zeros elsewhere, written
# row by row so the pattern is visible at a glance.
myFilter <- rbind(c( 1, 0, 0, 0, 0),
                  c( 0, 1, 0, 0, 0),
                  c( 0, 0, 1, 0, 0),
                  c( 0, 0, 0, 1, 0),
                  c( 0, 0, 0, 0, 1))
 | 
			
		||||
 | 
			
		||||
# I have added the option to pass such filters (or others that you could define
# on your own) as a parameter of the function.
 | 
			
		||||
 | 
			
		||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
 | 
			
		||||
 | 
			
		||||
# I think the result shows quite nicely how the two sequences are globally
 | 
			
		||||
# related and where the regions of sequence similarity are. Play with this a bit
 | 
			
		||||
# ...  Can you come up with a better filter? If so, eMail us.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Tasks  ===============================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-ALI-Dotplot.R"
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-ALI-Dotplot unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  0.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2019  01  07
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           0.2    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                  Line
 | 
			
		||||
#TOC> --------------------------------------
 | 
			
		||||
#TOC>   1        ___Section___            42
 | 
			
		||||
#TOC>   2        Tasks                   190
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  ___Section___  =======================================================
 | 
			
		||||
 | 
			
		||||
if (!requireNamespace("BiocManager", quietly=TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (!requireNamespace("Biostrings", quietly=TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
if (!requireNamespace("seqinr", quietly=TRUE)) {
 | 
			
		||||
  install.packages("seqinr")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's load BLOSUM62
 | 
			
		||||
data(BLOSUM62, package = "Biostrings")
 | 
			
		||||
 | 
			
		||||
# Now let's craft code for a dotplot. That's surprisingly simple. We build a
 | 
			
		||||
# matrix that has as many rows as one sequence, as many columns as another. Then
 | 
			
		||||
# we go through every cell of the matrix and enter the pairscore we encounter
 | 
			
		||||
# for the amino acid pair whose position corresponds to the row and column
 | 
			
		||||
# index. Finally we visualize the matrix in a plot.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# First we fetch our sequences and split them into single characters.
 | 
			
		||||
sel <- myDB$protein$name == "MBP1_SACCE"
 | 
			
		||||
MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
 | 
			
		||||
 | 
			
		||||
# Logical selector for the protein whose name is "MBP1_" followed by the
# biCode of MYSPE.
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
 | 
			
		||||
MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
 | 
			
		||||
 | 
			
		||||
# Check that we have two character vectors of the expected length.
 | 
			
		||||
str(MBP1_SACCE)
 | 
			
		||||
str(MBP1_MYSPE)
 | 
			
		||||
 | 
			
		||||
# How do we get the pairscore values? Consider: a single pair of amino acids can
 | 
			
		||||
# be obtained from sequence SACCE and MYSPE e.g. from position 13 and 21 ...
 | 
			
		||||
MBP1_SACCE[13]
 | 
			
		||||
MBP1_MYSPE[21]
 | 
			
		||||
 | 
			
		||||
# ... using these as subsetting expressions, we can pull the pairscore
 | 
			
		||||
# from the MDM
 | 
			
		||||
BLOSUM62[MBP1_SACCE[13], MBP1_MYSPE[21]]
 | 
			
		||||
 | 
			
		||||
# First we build an empty matrix that will hold all pairscores ...
 | 
			
		||||
dotMat <- matrix(numeric(length(MBP1_SACCE) * length(MBP1_MYSPE)),
 | 
			
		||||
                 nrow = length(MBP1_SACCE), ncol = length(MBP1_MYSPE))
 | 
			
		||||
 | 
			
		||||
# ... then we loop over the sequences and store the scores in the matrix.
 | 
			
		||||
#
 | 
			
		||||
# Fill every cell [i, j] with the BLOSUM62 pairscore of residue i of MBP1_SACCE
# and residue j of MBP1_MYSPE. seq_along() is used instead of 1:length() so
# that an empty sequence gives zero iterations rather than the indices c(1, 0).
for (i in seq_along(MBP1_SACCE)) {
  for (j in seq_along(MBP1_MYSPE)) {
    dotMat[i, j] <- BLOSUM62[MBP1_SACCE[i], MBP1_MYSPE[j]]
  }
}
 | 
			
		||||
 | 
			
		||||
# Even though this is a large matrix, this does not take much time ...
 | 
			
		||||
# Let's have a look at a small block of the values:
 | 
			
		||||
 | 
			
		||||
dotMat[1:10, 1:10]
 | 
			
		||||
 | 
			
		||||
# Rows in this matrix correspond to an amino acid from MBP1_SACCE, columns in
 | 
			
		||||
# the matrix correspond to an amino acid from MBP1_MYSPE.
 | 
			
		||||
 | 
			
		||||
# To plot this, we use the image() function. Here, with default parameters.
 | 
			
		||||
 | 
			
		||||
image(dotMat)
 | 
			
		||||
 | 
			
		||||
# Be patient, this takes a few moments to render: more than 500,000 values.
 | 
			
		||||
# Nice.
 | 
			
		||||
# What do you expect?
 | 
			
		||||
# What would similar sequences look like?
 | 
			
		||||
# What do you see?
 | 
			
		||||
 | 
			
		||||
# You might notice a thin line of yellow along the diagonal, moving approximately
 | 
			
		||||
# from bottom left to top right, fading in and out of existence. This is the
 | 
			
		||||
# signature of extended sequence similarity.
 | 
			
		||||
 | 
			
		||||
# Let's magnify this a bit by looking at only the first 200 amino acids ...
 | 
			
		||||
image(dotMat[1:200, 1:200])
 | 
			
		||||
 | 
			
		||||
# ... and, according to our normal writing convention, we would like the
 | 
			
		||||
# diagonal to run from top-left to bottom-right since we write from left to
 | 
			
		||||
# right and from top to bottom...
 | 
			
		||||
image(dotMat[1:200, 1:200], ylim = 1.0:0.0)
 | 
			
		||||
 | 
			
		||||
# ... and we would like the range of the x- and y- axis to correspond to the
 | 
			
		||||
# sequence position ...
 | 
			
		||||
image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1))
 | 
			
		||||
 | 
			
		||||
# ... and labels! Axis labels would be nice ...
 | 
			
		||||
# image() draws the rows of its matrix along the x-axis and the columns along
# the y-axis. Rows of dotMat are MBP1_SACCE residues, columns are MBP1_MYSPE
# residues, so the x-axis is labelled SACCE and the y-axis MYSPE.
image(x = 1:200, y = 1:200,  dotMat[1:200, 1:200], ylim=c(200,1),
      xlab = "MBP1_SACCE", ylab = "MBP1_MYSPE" )
 | 
			
		||||
 | 
			
		||||
# ... and why don't we have axis-numbers on all four sides? Go, make that right
 | 
			
		||||
# too ...
 | 
			
		||||
len <- 200
# image() draws the rows of its matrix along the x-axis and the columns along
# the y-axis. Rows of dotMat are MBP1_SACCE residues, columns are MBP1_MYSPE
# residues, so the x-axis is labelled SACCE and the y-axis MYSPE.
# axes = FALSE suppresses the default axes so we can draw our own on all sides.
image(x = 1:len, y = 1:len,  dotMat[1:len, 1:len], ylim=c(len,1),
      xlab = "MBP1_SACCE", ylab = "MBP1_MYSPE", axes = FALSE)
box()
axis(1, at = c(1, seq(10, len, by=10)))   # bottom
axis(2, at = c(1, seq(10, len, by=10)))   # left
axis(3, at = c(1, seq(10, len, by=10)))   # top
axis(4, at = c(1, seq(10, len, by=10)))   # right
 | 
			
		||||
 | 
			
		||||
# ... you get the idea, we can infinitely customize our plot. However a good way
 | 
			
		||||
# to do this is to develop a particular view for, say, a report or publication
 | 
			
		||||
# in a script and then put it into a function. I have put a function into the
 | 
			
		||||
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
 | 
			
		||||
# there already is a dotplot function in the seqinr package:
 | 
			
		||||
 | 
			
		||||
seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE)                           # seqinr
 | 
			
		||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Our's
 | 
			
		||||
 | 
			
		||||
# Which one do you prefer? You can probably see the block patterns that arise
 | 
			
		||||
# from segments of repetitive, low complexity sequence. But you probably have to
 | 
			
		||||
# look very closely to discern the faint diagonals that correspond to similar
 | 
			
		||||
# sequence.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's see if we can enhance the contrast between distributed noise and the
 | 
			
		||||
# actual alignment of conserved residues. We can filter the dot matrix with a
 | 
			
		||||
# pattern that enhances diagonally repeated values. Every value in the matrix
 | 
			
		||||
# will be replaced by a weighted average of its neighborhood. Here is  a
 | 
			
		||||
# diagonal-filter:
 | 
			
		||||
 | 
			
		||||
# A 5 x 5 filter with ones on the main diagonal and zeros elsewhere, written
# row by row so the pattern is visible at a glance.
myFilter <- rbind(c( 1, 0, 0, 0, 0),
                  c( 0, 1, 0, 0, 0),
                  c( 0, 0, 1, 0, 0),
                  c( 0, 0, 0, 1, 0),
                  c( 0, 0, 0, 0, 1))
 | 
			
		||||
 | 
			
		||||
# I have added the option to pass such filters (or others that you could define
# on your own) as a parameter of the function.
 | 
			
		||||
 | 
			
		||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
 | 
			
		||||
 | 
			
		||||
# I think the result shows quite nicely how the two sequences are globally
 | 
			
		||||
# related and where the regions of sequence similarity are. Play with this a bit
 | 
			
		||||
# ...  Can you come up with a better filter? If so, eMail us.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Tasks  ===============================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										1256
									
								
								BIN-ALI-MSA.R
									
									
									
									
									
								
							
							
						
						
									
										1256
									
								
								BIN-ALI-MSA.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,365 +1,365 @@
 | 
			
		||||
# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
# Version:  1.7.1
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-09   -   2020-10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.7.1  add jsonlite:: to fromJSON() in code sample and ./myScripts/
 | 
			
		||||
#           1.7    2020 updates
 | 
			
		||||
#           1.6    Maintenance
 | 
			
		||||
#           1.5    Change from require() to requireNamespace(),
 | 
			
		||||
#                    use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.4    Pull s2c() from seqinr package, rather than loading the
 | 
			
		||||
#                    entire library.
 | 
			
		||||
#           1.3    Updated confirmation task with correct logic
 | 
			
		||||
#           1.2    Added missing load of seqinr package
 | 
			
		||||
#           1.1    Update annotation file logic - it could already have been
 | 
			
		||||
#                    prepared in the BIN-FUNC-Annotation unit.
 | 
			
		||||
#           1.0.1  bugfix
 | 
			
		||||
#           1.0    First 2017 live version.
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                                      Line
 | 
			
		||||
#TOC> --------------------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Prepare                                                      58
 | 
			
		||||
#TOC>   2        Biostrings Pairwise Alignment                                75
 | 
			
		||||
#TOC>   2.1        Optimal global alignment                                   93
 | 
			
		||||
#TOC>   2.2        Optimal local alignment                                   156
 | 
			
		||||
#TOC>   3        APSES Domain annotation by alignment                        180
 | 
			
		||||
#TOC>   4        Update your database script                                 261
 | 
			
		||||
#TOC>   4.1        Preparing an annotation file ...                          267
 | 
			
		||||
#TOC>   4.1.1          If you HAVE NOT done the BIN-FUNC-Annotation unit     269
 | 
			
		||||
#TOC>   4.1.2          If you HAVE done the BIN-FUNC-Annotation unit         314
 | 
			
		||||
#TOC>   4.2        Execute and Validate                                      338
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Prepare  =============================================================
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
 | 
			
		||||
  install.packages("seqinr")
 | 
			
		||||
}
 | 
			
		||||
# You can get package information with the following commands:
 | 
			
		||||
# library(help = seqinr)       # basic information
 | 
			
		||||
# browseVignettes("seqinr")    # available vignettes
 | 
			
		||||
# data(package = "seqinr")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# You need to recreate the protein database that you have constructed in the
 | 
			
		||||
# BIN-Storing_data unit.
 | 
			
		||||
 | 
			
		||||
source("./myScripts/makeProteinDB.R")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Biostrings Pairwise Alignment  =======================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (!requireNamespace("BiocManager", quietly=TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (!requireNamespace("Biostrings", quietly=TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Biostrings stores sequences in "XString" objects. Once we have converted our
 | 
			
		||||
# target sequences to AAString objects, the alignment itself is straightforward.
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Optimal global alignment  ==========================================
 | 
			
		||||
 | 
			
		||||
# The pairwiseAlignment() function was written to behave
 | 
			
		||||
# exactly like the functions you encountered on the EMBOSS server.
 | 
			
		||||
 | 
			
		||||
# First: make AAString objects ...
 | 
			
		||||
# Yeast Mbp1: select the row of the protein table by its name ...
sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])

# ... and the MYSPE orthologue: its name is "MBP1_" followed by the biCode
# of MYSPE.
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])
 | 
			
		||||
 | 
			
		||||
?pairwiseAlignment
 | 
			
		||||
# ... and align.
 | 
			
		||||
# Global optimal alignment with end-gap penalties is default.
 | 
			
		||||
ali1 <-  Biostrings::pairwiseAlignment(
 | 
			
		||||
  aaMBP1_SACCE,
 | 
			
		||||
  aaMBP1_MYSPE,
 | 
			
		||||
  substitutionMatrix = "BLOSUM62",
 | 
			
		||||
  gapOpening = 10,
 | 
			
		||||
  gapExtension = 0.5)
 | 
			
		||||
 | 
			
		||||
str(ali1)  # ... it's complicated
 | 
			
		||||
 | 
			
		||||
# This is a Biostrings alignment object. But we can use Biostrings functions to
 | 
			
		||||
# tame it:
 | 
			
		||||
ali1
 | 
			
		||||
Biostrings::writePairwiseAlignments(ali1)   # That should look familiar
 | 
			
		||||
 | 
			
		||||
# And we can make the internal structure work for us  (@ is for classes as
 | 
			
		||||
# $ is for lists ...)
 | 
			
		||||
str(ali1@pattern)
 | 
			
		||||
ali1@pattern
 | 
			
		||||
ali1@pattern@range
 | 
			
		||||
ali1@pattern@indel
 | 
			
		||||
ali1@pattern@mismatch
 | 
			
		||||
 | 
			
		||||
# or work with "normal" R functions
 | 
			
		||||
# the alignment length
 | 
			
		||||
nchar(as.character(ali1@pattern))
 | 
			
		||||
 | 
			
		||||
# the number of identities
 | 
			
		||||
sum(seqinr::s2c(as.character(ali1@pattern)) ==
 | 
			
		||||
    seqinr::s2c(as.character(ali1@subject)))
 | 
			
		||||
 | 
			
		||||
# ... e.g. to calculate the percentage of identities
 | 
			
		||||
100 *
 | 
			
		||||
  sum(seqinr::s2c(as.character(ali1@pattern)) ==
 | 
			
		||||
      seqinr::s2c(as.character(ali1@subject))) /
 | 
			
		||||
  nchar(as.character(ali1@pattern))
 | 
			
		||||
# ... which should be the same as reported in the writePairwiseAlignments()
 | 
			
		||||
# output. Awkward to type? Then it calls for a function:
 | 
			
		||||
#
 | 
			
		||||
percentID <- function(al) {
  # Percent identity of a Biostrings alignment object "al": the number of
  # positions at which the aligned pattern and subject strings carry the same
  # character, divided by the alignment length, times 100.
  patChars   <- seqinr::s2c(as.character(al@pattern))
  subChars   <- seqinr::s2c(as.character(al@subject))
  nIdentical <- sum(patChars == subChars)
  aliLength  <- nchar(as.character(al@pattern))
  100 * nIdentical / aliLength
}
 | 
			
		||||
 | 
			
		||||
percentID(ali1)
 | 
			
		||||
 | 
			
		||||
# ==   2.2  Optimal local alignment  ===========================================
 | 
			
		||||
 | 
			
		||||
# Compare with local optimal alignment (like EMBOSS Water)
 | 
			
		||||
ali2 <-  Biostrings::pairwiseAlignment(
 | 
			
		||||
  aaMBP1_SACCE,
 | 
			
		||||
  aaMBP1_MYSPE,
 | 
			
		||||
  type = "local",
 | 
			
		||||
  substitutionMatrix = "BLOSUM62",
 | 
			
		||||
  gapOpening = 50,
 | 
			
		||||
  gapExtension = 10)
 | 
			
		||||
 | 
			
		||||
Biostrings::writePairwiseAlignments(ali2)
 | 
			
		||||
# This has probably only aligned the N-terminal DNA binding domain - but that
 | 
			
		||||
# one has quite high sequence identity:
 | 
			
		||||
percentID(ali2)
 | 
			
		||||
 | 
			
		||||
# == TASK: ==
 | 
			
		||||
 | 
			
		||||
# Compare the two alignments. I have weighted the local alignment heavily
 | 
			
		||||
# towards an ungapped alignment by setting very high gap penalties. Try changing
 | 
			
		||||
# the gap penalties and see what happens: how does the number of indels change,
 | 
			
		||||
# how does the length of indels change...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  APSES Domain annotation by alignment  ================================
 | 
			
		||||
 | 
			
		||||
# In this section we define the MYSPE APSES sequence by performing a global,
 | 
			
		||||
# optimal sequence alignment of the yeast APSES domain with the full length
 | 
			
		||||
# protein sequence of the protein that was the most similar to the yeast APSES
 | 
			
		||||
# domain.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# I have annotated the yeast APSES domain as a feature in the
 | 
			
		||||
# database. To view the annotation, we can retrieve it via the proteinID and
 | 
			
		||||
# featureID. Here is the yeast protein ID:
 | 
			
		||||
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ... and if you look at the feature table, you can identify the feature ID
 | 
			
		||||
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
 | 
			
		||||
 | 
			
		||||
# ... and with the two annotations we can get the corresponding ID from the
 | 
			
		||||
# annotation table
 | 
			
		||||
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
 | 
			
		||||
                             myDB$annotation$featureID == ftrID])
 | 
			
		||||
 | 
			
		||||
# Show the complete annotation record that joins this protein with this
# feature. The row is selected via the proteinID and featureID columns - the
# same condition that was used to find fanID above - not via the annotation's
# own ID column.
myDB$annotation[myDB$annotation$proteinID == proID &
                myDB$annotation$featureID == ftrID, ]
 | 
			
		||||
 | 
			
		||||
# The annotation record contains the start and end coordinates which we can use
 | 
			
		||||
# to define the APSES domain sequence with a substr() expression.
 | 
			
		||||
 | 
			
		||||
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
 | 
			
		||||
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
 | 
			
		||||
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
 | 
			
		||||
                 start,
 | 
			
		||||
                 end))
 | 
			
		||||
 | 
			
		||||
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
 | 
			
		||||
# have selected from the sequence column of the protein table the sequence whose
 | 
			
		||||
# name is "MBP1_SACCE", and selected from the annotation table the start
 | 
			
		||||
# and end coordinates of the annotation that joins an "APSES fold" feature with
 | 
			
		||||
# the sequence, and used the start and end coordinates to extract a substring.
 | 
			
		||||
 | 
			
		||||
# Let's convert this to an AAString and assign it:
 | 
			
		||||
aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
 | 
			
		||||
 | 
			
		||||
# Now let's align these two sequences of very different length without end-gap
 | 
			
		||||
# penalties using the "overlap" type. "overlap" turns the
 | 
			
		||||
# end-gap penalties off and that is crucially important since
 | 
			
		||||
# the sequences have very different length.
 | 
			
		||||
 | 
			
		||||
aliApses <-  Biostrings::pairwiseAlignment(
 | 
			
		||||
  aaMB1_SACCE_APSES,
 | 
			
		||||
  aaMBP1_MYSPE,
 | 
			
		||||
  type = "overlap",
 | 
			
		||||
  substitutionMatrix = "BLOSUM62",
 | 
			
		||||
  gapOpening = 10,
 | 
			
		||||
  gapExtension = 0.5)
 | 
			
		||||
 | 
			
		||||
# Inspect the result. The aligned sequences should be clearly
 | 
			
		||||
# homologous, and have (almost) no indels. The entire "pattern"
 | 
			
		||||
# sequence from QIYSAR ... to ... KPLFDF  should be matched
 | 
			
		||||
# with the "subject". Is this correct?
 | 
			
		||||
Biostrings::writePairwiseAlignments(aliApses)
 | 
			
		||||
 | 
			
		||||
# If this is correct, you can extract the matched sequence from
 | 
			
		||||
# the alignment object. The syntax is a bit different from what
 | 
			
		||||
# you have seen before: this is an "S4 object", not a list. No
 | 
			
		||||
# worries: as.character() returns a normal string.
 | 
			
		||||
as.character(aliApses@subject)
 | 
			
		||||
 | 
			
		||||
# Now, what are the aligned start and end coordinates? You can read them from
 | 
			
		||||
# the output of writePairwiseAlignments(), or you can get them from the range of
 | 
			
		||||
# the match.
 | 
			
		||||
 | 
			
		||||
str(aliApses@subject@range)
 | 
			
		||||
 | 
			
		||||
# start is:
 | 
			
		||||
aliApses@subject@range@start
 | 
			
		||||
 | 
			
		||||
# ... and end is:
 | 
			
		||||
aliApses@subject@range@start + aliApses@subject@range@width - 1
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Update your database script  =========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Since we have this feature defined now, we can create a feature annotation
 | 
			
		||||
# right away and store it in myDB.
 | 
			
		||||
 | 
			
		||||
# ==   4.1  Preparing an annotation file ...  ==================================
 | 
			
		||||
#
 | 
			
		||||
# ===   4.1.1  If you HAVE NOT done the BIN-FUNC-Annotation unit
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
 | 
			
		||||
#   ./myScripts/ directory:
 | 
			
		||||
#
 | 
			
		||||
#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
 | 
			
		||||
#     myScripts/ directory.
 | 
			
		||||
#
 | 
			
		||||
#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
 | 
			
		||||
#     if MYSPE is called "Cryptococcus neoformans", your file should be called
 | 
			
		||||
#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
 | 
			
		||||
#     "MBP1_CRYNE").
 | 
			
		||||
#
 | 
			
		||||
#   - Open the file in the RStudio editor and delete all blocks for
 | 
			
		||||
#     the Mbp1 protein annotations except the first one.
 | 
			
		||||
#
 | 
			
		||||
#   - From that block, delete all lines except for the line that says:
 | 
			
		||||
#
 | 
			
		||||
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
 | 
			
		||||
#
 | 
			
		||||
#   - Then delete the comma at the end of the line (your file will just have
 | 
			
		||||
#     this one annotation).
 | 
			
		||||
#
 | 
			
		||||
#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
 | 
			
		||||
#     "start" and "end" features to the coordinates you just discovered for the
 | 
			
		||||
#     APSES domain in your sequence.
 | 
			
		||||
#
 | 
			
		||||
#   - Save the file in your myScripts/ directory
 | 
			
		||||
#
 | 
			
		||||
#   - Validate your file online at https://jsonlint.com/
 | 
			
		||||
#
 | 
			
		||||
#   - Update your "./myScripts/makeProteinDB.R" script to load your new
 | 
			
		||||
#     annotation when you recreate the database. Open the script in the
 | 
			
		||||
#     RStudio editor, and add the following command at the end:
 | 
			
		||||
#
 | 
			
		||||
#     myDB <- dbAddAnnotation(myDB,
 | 
			
		||||
#                 jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
 | 
			
		||||
#                                                 ^^^^^^^
 | 
			
		||||
#                                                edit this!
 | 
			
		||||
#   - save and close the file.
 | 
			
		||||
#
 | 
			
		||||
# Then SKIP the next section.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ===   4.1.2  If you HAVE done the BIN-FUNC-Annotation unit    
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
#   You DO already have a file called "<MYSPE>-Annotations.json" in the
 | 
			
		||||
#   ./myScripts/ directory:
 | 
			
		||||
#
 | 
			
		||||
#   - Open the file in the RStudio editor.
 | 
			
		||||
#
 | 
			
		||||
#   - Below the last feature lines (but before the closing "]") add the
 | 
			
		||||
#     following feature line (without the "#")
 | 
			
		||||
#
 | 
			
		||||
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
 | 
			
		||||
#
 | 
			
		||||
#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
 | 
			
		||||
#     "start" and "end" features to the coordinates you just discovered for the
 | 
			
		||||
#     APSES domain in your sequence.
 | 
			
		||||
#
 | 
			
		||||
#   - Add a comma after the preceding feature line.
 | 
			
		||||
#
 | 
			
		||||
#   - Save your file.
 | 
			
		||||
#
 | 
			
		||||
#   - Validate your file online at https://jsonlint.com/
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ==   4.2  Execute and Validate  ==============================================
 | 
			
		||||
#
 | 
			
		||||
#   - source() your database creation script:
 | 
			
		||||
#
 | 
			
		||||
#  source("./myScripts/makeProteinDB.R")
 | 
			
		||||
#
 | 
			
		||||
#     This should run without errors or warnings. If it doesn't work and you
 | 
			
		||||
#     can't figure out quickly what's happening, ask on the mailing list for
 | 
			
		||||
#     help.
 | 
			
		||||
#
 | 
			
		||||
#   - Confirm
 | 
			
		||||
#     The following commands should retrieve the correct start and end
 | 
			
		||||
#     coordinates and sequence of the MBP1_MYSPE APSES domain:
 | 
			
		||||
 | 
			
		||||
# Row of the MYSPE Mbp1 orthologue: its name is "MBP1_" followed by the biCode
# of MYSPE.
sel <- which(myDB$protein$name == paste0("MBP1_", biCode(MYSPE)))

# Look up the protein ID, the ID of the "APSES fold" feature, and the ID of
# the annotation that joins the two ...
(proID <- myDB$protein$ID[sel])
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                             myDB$annotation$featureID == ftrID])

# ... then read start and end from that annotation and extract the
# corresponding substring of the protein sequence.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
# Version:  1.7.1
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-09   -   2020-10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.7.1  add jsonlite:: to fromJSON() in code sample and ./myScripts/
 | 
			
		||||
#           1.7    2020 updates
 | 
			
		||||
#           1.6    Maintenance
 | 
			
		||||
#           1.5    Change from require() to requireNamespace(),
 | 
			
		||||
#                    use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.4    Pull s2c() from seqinr package, rather than loading the
 | 
			
		||||
#                    entire library.
 | 
			
		||||
#           1.3    Updated confirmation task with correct logic
 | 
			
		||||
#           1.2    Added missing load of seqinr package
 | 
			
		||||
#           1.1    Update annotation file logic - it could already have been
 | 
			
		||||
#                    prepared in the BIN-FUNC-Annotation unit.
 | 
			
		||||
#           1.0.1  bugfix
 | 
			
		||||
#           1.0    First 2017 live version.
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                                      Line
 | 
			
		||||
#TOC> --------------------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Prepare                                                      58
 | 
			
		||||
#TOC>   2        Biostrings Pairwise Alignment                                75
 | 
			
		||||
#TOC>   2.1        Optimal global alignment                                   93
 | 
			
		||||
#TOC>   2.2        Optimal local alignment                                   156
 | 
			
		||||
#TOC>   3        APSES Domain annotation by alignment                        180
 | 
			
		||||
#TOC>   4        Update your database script                                 261
 | 
			
		||||
#TOC>   4.1        Preparing an annotation file ...                          267
 | 
			
		||||
#TOC>   4.1.1          If you HAVE NOT done the BIN-FUNC-Annotation unit     269
 | 
			
		||||
#TOC>   4.1.2          If you HAVE done the BIN-FUNC-Annotation unit         314
 | 
			
		||||
#TOC>   4.2        Execute and Validate                                      338
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Prepare  =============================================================
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
 | 
			
		||||
  install.packages("seqinr")
 | 
			
		||||
}
 | 
			
		||||
# You can get package information with the following commands:
 | 
			
		||||
# library(help = seqinr)       # basic information
 | 
			
		||||
# browseVignettes("seqinr")    # available vignettes
 | 
			
		||||
# data(package = "seqinr")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# You need to recreate the protein database that you have constructed in the
 | 
			
		||||
# BIN-Storing_data unit.
 | 
			
		||||
 | 
			
		||||
source("./myScripts/makeProteinDB.R")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Biostrings Pairwise Alignment  =======================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (!requireNamespace("BiocManager", quietly=TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (!requireNamespace("Biostrings", quietly=TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Biostrings stores sequences in "XString" objects. Once we have converted our
 | 
			
		||||
# target sequences to AAString objects, the alignment itself is straightforward.
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Optimal global alignment  ==========================================
 | 
			
		||||
 | 
			
		||||
# The pairwiseAlignment() function was written to behave
 | 
			
		||||
# exactly like the functions you encountered on the EMBOSS server.
 | 
			
		||||
 | 
			
		||||
# First: make AAString objects ...
 | 
			
		||||
# Yeast Mbp1: select the row of the protein table by its name ...
sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])

# ... and the MYSPE orthologue: its name is "MBP1_" followed by the biCode
# of MYSPE.
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])
 | 
			
		||||
 | 
			
		||||
?pairwiseAlignment
 | 
			
		||||
# ... and align.
 | 
			
		||||
# Global optimal alignment with end-gap penalties is default.
 | 
			
		||||
ali1 <-  Biostrings::pairwiseAlignment(
 | 
			
		||||
  aaMBP1_SACCE,
 | 
			
		||||
  aaMBP1_MYSPE,
 | 
			
		||||
  substitutionMatrix = "BLOSUM62",
 | 
			
		||||
  gapOpening = 10,
 | 
			
		||||
  gapExtension = 0.5)
 | 
			
		||||
 | 
			
		||||
str(ali1)  # ... it's complicated
 | 
			
		||||
 | 
			
		||||
# This is a Biostrings alignment object. But we can use Biostrings functions to
 | 
			
		||||
# tame it:
 | 
			
		||||
ali1
 | 
			
		||||
Biostrings::writePairwiseAlignments(ali1)   # That should look familiar
 | 
			
		||||
 | 
			
		||||
# And we can make the internal structure work for us  (@ is for classes as
 | 
			
		||||
# $ is for lists ...)
 | 
			
		||||
str(ali1@pattern)
 | 
			
		||||
ali1@pattern
 | 
			
		||||
ali1@pattern@range
 | 
			
		||||
ali1@pattern@indel
 | 
			
		||||
ali1@pattern@mismatch
 | 
			
		||||
 | 
			
		||||
# or work with "normal" R functions
 | 
			
		||||
# the alignment length
 | 
			
		||||
nchar(as.character(ali1@pattern))
 | 
			
		||||
 | 
			
		||||
# the number of identities
 | 
			
		||||
sum(seqinr::s2c(as.character(ali1@pattern)) ==
 | 
			
		||||
    seqinr::s2c(as.character(ali1@subject)))
 | 
			
		||||
 | 
			
		||||
# ... e.g. to calculate the percentage of identities
 | 
			
		||||
100 *
 | 
			
		||||
  sum(seqinr::s2c(as.character(ali1@pattern)) ==
 | 
			
		||||
      seqinr::s2c(as.character(ali1@subject))) /
 | 
			
		||||
  nchar(as.character(ali1@pattern))
 | 
			
		||||
# ... which should be the same as reported in the writePairwiseAlignments()
 | 
			
		||||
# output. Awkward to type? Then it calls for a function:
 | 
			
		||||
#
 | 
			
		||||
percentID <- function(al) {
  # Percent identity of a Biostrings alignment object "al": the number of
  # positions at which the aligned pattern and subject strings carry the same
  # character, divided by the alignment length, times 100.
  patChars   <- seqinr::s2c(as.character(al@pattern))
  subChars   <- seqinr::s2c(as.character(al@subject))
  nIdentical <- sum(patChars == subChars)
  aliLength  <- nchar(as.character(al@pattern))
  100 * nIdentical / aliLength
}
 | 
			
		||||
 | 
			
		||||
percentID(ali1)
 | 
			
		||||
 | 
			
		||||
# ==   2.2  Optimal local alignment  ===========================================
 | 
			
		||||
 | 
			
		||||
# Compare with local optimal alignment (like EMBOSS Water)
 | 
			
		||||
ali2 <-  Biostrings::pairwiseAlignment(
 | 
			
		||||
  aaMBP1_SACCE,
 | 
			
		||||
  aaMBP1_MYSPE,
 | 
			
		||||
  type = "local",
 | 
			
		||||
  substitutionMatrix = "BLOSUM62",
 | 
			
		||||
  gapOpening = 50,
 | 
			
		||||
  gapExtension = 10)
 | 
			
		||||
 | 
			
		||||
Biostrings::writePairwiseAlignments(ali2)
 | 
			
		||||
# This has probably only aligned the N-terminal DNA binding domain - but that
 | 
			
		||||
# one has quite high sequence identity:
 | 
			
		||||
percentID(ali2)
 | 
			
		||||
 | 
			
		||||
# == TASK: ==
 | 
			
		||||
 | 
			
		||||
# Compare the two alignments. I have weighted the local alignment heavily
 | 
			
		||||
# towards an ungapped alignment by setting very high gap penalties. Try changing
 | 
			
		||||
# the gap penalties and see what happens: how does the number of indels change,
 | 
			
		||||
# how does the length of indels change...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  APSES Domain annotation by alignment  ================================
 | 
			
		||||
 | 
			
		||||
# In this section we define the MYSPE APSES sequence by performing a global,
 | 
			
		||||
# optimal sequence alignment of the yeast APSES domain with the full length
 | 
			
		||||
# protein sequence of the protein that was the most similar to the yeast APSES
 | 
			
		||||
# domain.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# I have annotated the yeast APSES domain as a feature in the
 | 
			
		||||
# database. To view the annotation, we can retrieve it via the proteinID and
 | 
			
		||||
# featureID. Here is the yeast protein ID:
 | 
			
		||||
(proID <- myDB$protein$ID[myDB$protein$name == "MBP1_SACCE"])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ... and if you look at the feature table, you can identify the feature ID
 | 
			
		||||
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
 | 
			
		||||
 | 
			
		||||
# ... and with the two annotations we can get the corresponding ID from the
 | 
			
		||||
# annotation table
 | 
			
		||||
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
 | 
			
		||||
                             myDB$annotation$featureID == ftrID])
 | 
			
		||||
 | 
			
		||||
# Show the complete annotation record that joins this protein with this
# feature. The row is selected via the proteinID and featureID columns - the
# same condition that was used to find fanID above - not via the annotation's
# own ID column.
myDB$annotation[myDB$annotation$proteinID == proID &
                myDB$annotation$featureID == ftrID, ]
 | 
			
		||||
 | 
			
		||||
# The annotation record contains the start and end coordinates which we can use
 | 
			
		||||
# to define the APSES domain sequence with a substr() expression.
 | 
			
		||||
 | 
			
		||||
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
 | 
			
		||||
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
 | 
			
		||||
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
 | 
			
		||||
                 start,
 | 
			
		||||
                 end))
 | 
			
		||||
 | 
			
		||||
# Lots of code. But don't get lost. Let's recapitulate what we have done: we
 | 
			
		||||
# have selected from the sequence column of the protein table the sequence whose
 | 
			
		||||
# name is "MBP1_SACCE", and selected from the annotation table the start
 | 
			
		||||
# and end coordinates of the annotation that joins an "APSES fold" feature with
 | 
			
		||||
# the sequence, and used the start and end coordinates to extract a substring.
 | 
			
		||||
 | 
			
		||||
# Let's convert this to an AAString and assign it:
 | 
			
		||||
aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
 | 
			
		||||
 | 
			
		||||
# Now let's align these two sequences of very different length without end-gap
 | 
			
		||||
# penalties using the "overlap" type. "overlap" turns the
 | 
			
		||||
# end-gap penalties off and that is crucially important since
 | 
			
		||||
# the sequences have very different length.
 | 
			
		||||
 | 
			
		||||
aliApses <-  Biostrings::pairwiseAlignment(
 | 
			
		||||
  aaMB1_SACCE_APSES,
 | 
			
		||||
  aaMBP1_MYSPE,
 | 
			
		||||
  type = "overlap",
 | 
			
		||||
  substitutionMatrix = "BLOSUM62",
 | 
			
		||||
  gapOpening = 10,
 | 
			
		||||
  gapExtension = 0.5)
 | 
			
		||||
 | 
			
		||||
# Inspect the result. The aligned sequences should be clearly
 | 
			
		||||
# homologous, and have (almost) no indels. The entire "pattern"
 | 
			
		||||
# sequence from QIYSAR ... to ... KPLFDF  should be matched
 | 
			
		||||
# with the "subject". Is this correct?
 | 
			
		||||
Biostrings::writePairwiseAlignments(aliApses)
 | 
			
		||||
 | 
			
		||||
# If this is correct, you can extract the matched sequence from
 | 
			
		||||
# the alignment object. The syntax is a bit different from what
 | 
			
		||||
# you have seen before: this is an "S4 object", not a list. No
 | 
			
		||||
# worries: as.character() returns a normal string.
 | 
			
		||||
as.character(aliApses@subject)
 | 
			
		||||
 | 
			
		||||
# Now, what are the aligned start and end coordinates? You can read them from
 | 
			
		||||
# the output of writePairwiseAlignments(), or you can get them from the range of
 | 
			
		||||
# the match.
 | 
			
		||||
 | 
			
		||||
str(aliApses@subject@range)
 | 
			
		||||
 | 
			
		||||
# start is:
 | 
			
		||||
aliApses@subject@range@start
 | 
			
		||||
 | 
			
		||||
# ... and end is:
 | 
			
		||||
aliApses@subject@range@start + aliApses@subject@range@width - 1
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Update your database script  =========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Since we have this feature defined now, we can create a feature annotation
 | 
			
		||||
# right away and store it in myDB.
 | 
			
		||||
 | 
			
		||||
# ==   4.1  Preparing an annotation file ...  ==================================
 | 
			
		||||
#
 | 
			
		||||
# ===   4.1.1  If you HAVE NOT done the BIN-FUNC-Annotation unit
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
 | 
			
		||||
#   ./myScripts/ directory:
 | 
			
		||||
#
 | 
			
		||||
#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
 | 
			
		||||
#     myScripts/ directory.
 | 
			
		||||
#
 | 
			
		||||
#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
 | 
			
		||||
#     if MYSPE is called "Cryptococcus neoformans", your file should be called
 | 
			
		||||
#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
 | 
			
		||||
#     "MBP1_CRYNE").
 | 
			
		||||
#
 | 
			
		||||
#   - Open the file in the RStudio editor and delete all blocks for
 | 
			
		||||
#     the Mbp1 protein annotations except the first one.
 | 
			
		||||
#
 | 
			
		||||
#   - From that block, delete all lines except for the line that says:
 | 
			
		||||
#
 | 
			
		||||
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
 | 
			
		||||
#
 | 
			
		||||
#   - Then delete the comma at the end of the line (your file will just have
 | 
			
		||||
#     this one annotation).
 | 
			
		||||
#
 | 
			
		||||
#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
 | 
			
		||||
#     "start" and "end" features to the coordinates you just discovered for the
 | 
			
		||||
#     APSES domain in your sequence.
 | 
			
		||||
#
 | 
			
		||||
#   - Save the file in your myScripts/ directory
 | 
			
		||||
#
 | 
			
		||||
#   - Validate your file online at https://jsonlint.com/
 | 
			
		||||
#
 | 
			
		||||
#   - Update your "./myScripts/makeProteinDB.R" script to load your new
 | 
			
		||||
#     annotation when you recreate the database. Open the script in the
 | 
			
		||||
#     RStudio editor, and add the following command at the end:
 | 
			
		||||
#
 | 
			
		||||
#     myDB <- dbAddAnnotation(myDB,
 | 
			
		||||
#                 jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
 | 
			
		||||
#                                                 ^^^^^^^
 | 
			
		||||
#                                                edit this!
 | 
			
		||||
#   - save and close the file.
 | 
			
		||||
#
 | 
			
		||||
# Then SKIP the next section.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ===   4.1.2  If you HAVE done the BIN-FUNC-Annotation unit    
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
#   You DO already have a file called "<MYSPE>-Annotations.json" in the
 | 
			
		||||
#   ./myScripts/ directory:
 | 
			
		||||
#
 | 
			
		||||
#   - Open the file in the RStudio editor.
 | 
			
		||||
#
 | 
			
		||||
#   - Below the last feature lines (but before the closing "]") add the
 | 
			
		||||
#     following feature line (without the "#")
 | 
			
		||||
#
 | 
			
		||||
# {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"}
 | 
			
		||||
#
 | 
			
		||||
#   - Edit that annotation: change MBP1_SACCE  to MBP1_<MYSPE> and change the
 | 
			
		||||
#     "start" and "end" features to the coordinates you just discovered for the
 | 
			
		||||
#     APSES domain in your sequence.
 | 
			
		||||
#
 | 
			
		||||
#   - Add a comma after the preceding feature line.
 | 
			
		||||
#
 | 
			
		||||
#   - Save your file.
 | 
			
		||||
#
 | 
			
		||||
#   - Validate your file online at https://jsonlint.com/
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ==   4.2  Execute and Validate  ==============================================
 | 
			
		||||
#
 | 
			
		||||
#   - source() your database creation script:
 | 
			
		||||
#
 | 
			
		||||
#  source("./myScripts/makeProteinDB.R")
 | 
			
		||||
#
 | 
			
		||||
#     This should run without errors or warnings. If it doesn't work and you
 | 
			
		||||
#     can't figure out quickly what's happening, ask on the mailing list for
 | 
			
		||||
#     help.
 | 
			
		||||
#
 | 
			
		||||
#   - Confirm
 | 
			
		||||
#     The following commands should retrieve the correct start and end
 | 
			
		||||
#     coordinates and sequence of the MBP1_MYSPE APSES domain:
 | 
			
		||||
 | 
			
		||||
# Row of the MYSPE Mbp1 orthologue: its name is "MBP1_" followed by the biCode
# of MYSPE.
sel <- which(myDB$protein$name == paste0("MBP1_", biCode(MYSPE)))

# Look up the protein ID, the ID of the "APSES fold" feature, and the ID of
# the annotation that joins the two ...
(proID <- myDB$protein$ID[sel])
(ftrID <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
(fanID <- myDB$annotation$ID[myDB$annotation$proteinID == proID &
                             myDB$annotation$featureID == ftrID])

# ... then read start and end from that annotation and extract the
# corresponding substring of the protein sequence.
(start <- myDB$annotation$start[myDB$annotation$ID == fanID])
(end   <- myDB$annotation$end[myDB$annotation$ID == fanID])
(apses <- substr(myDB$protein$sequence[myDB$protein$ID == proID],
                 start,
                 end))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,313 +1,313 @@
 | 
			
		||||
# tocID <- "BIN-ALI-Similarity.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-ALI-Similarity unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Updates
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.0    Refactored for 2017; add aaindex, ternary plot.
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#   Update ggtern:: ternary plot to use aacol dots under text
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                          Line
 | 
			
		||||
#TOC> ----------------------------------------------
 | 
			
		||||
#TOC>   1        Amino Acid Properties            43
 | 
			
		||||
#TOC>   2        Mutation Data matrix            189
 | 
			
		||||
#TOC>   3        Background score                230
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Amino Acid Properties  ===============================================
 | 
			
		||||
 | 
			
		||||
# A large collection of amino acid property tables is available via the seqinr
 | 
			
		||||
# package:
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
 | 
			
		||||
  install.packages("seqinr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = seqinr)       # basic information
 | 
			
		||||
#  browseVignettes("seqinr")    # available vignettes
 | 
			
		||||
#  data(package = "seqinr")     # available datasets
 | 
			
		||||
 | 
			
		||||
# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
 | 
			
		||||
#  data:
 | 
			
		||||
 | 
			
		||||
?aaindex
 | 
			
		||||
data(aaindex, package = "seqinr")  # load the aaindex list from the package
 | 
			
		||||
 | 
			
		||||
length(aaindex)
 | 
			
		||||
 | 
			
		||||
# Here are all the index descriptions
 | 
			
		||||
# Print each description prefixed with its position in the aaindex list.
# seq_along() yields no iterations for an empty list, whereas 1:length()
# would iterate over c(1, 0).
for (i in seq_along(aaindex)) {
  cat(paste0(i, ": ", aaindex[[i]]$D, "\n"))
}
 | 
			
		||||
 | 
			
		||||
# It's a bit cumbersome to search through the descriptions ... here is a
 | 
			
		||||
# function to make this easier:
 | 
			
		||||
 | 
			
		||||
searchAAindex <- function(patt) {
  # Searches the descriptions ($D) of the global "aaindex" list for the
  # regular expression "patt" and prints index number and description of
  # every hit.
  # Parameters:
  #    patt   chr   regular expression, passed to grep()
  # Value:    NULL (invisible). Called for its side effect of printing
  #           one tab-separated line per hit.

  # vapply() guarantees a logical vector even if aaindex is empty; sapply()
  # would return an empty list in that case and which() would fail.
  isHit <- vapply(aaindex,
                  function(x) length(grep(patt, x$D)) > 0,
                  logical(1))
  hits <- which(isHit)
  for (i in seq_along(hits)) {
    cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D))
  }
  return(invisible(NULL))
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
searchAAindex("free energy")          # Search for "free energy"
 | 
			
		||||
searchAAindex("(size)|(volume)")      # Search for "size" or "volume":
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's examine ...
 | 
			
		||||
# ... a hydrophobicity index
 | 
			
		||||
(Y <- aaindex[[528]][c("D", "I")])
 | 
			
		||||
 | 
			
		||||
# ... a volume index
 | 
			
		||||
(V <- aaindex[[150]][c("D", "I")])
 | 
			
		||||
 | 
			
		||||
# ... and one of our own: side-chain pK values as reported by
 | 
			
		||||
# Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set
 | 
			
		||||
# to 7.4 (physiological pH)
 | 
			
		||||
K <- list(I = c( 7.4,   # Ala
 | 
			
		||||
                12.3,   # Arg
 | 
			
		||||
                 7.4,   # Asn
 | 
			
		||||
                 3.9,   # Asp
 | 
			
		||||
                 8.6,   # Cys
 | 
			
		||||
                 7.4,   # Gln
 | 
			
		||||
                 4.3,   # Glu
 | 
			
		||||
                 7.4,   # Gly
 | 
			
		||||
                 6.5,   # His
 | 
			
		||||
                 7.4,   # Ile
 | 
			
		||||
                 7.4,   # Leu
 | 
			
		||||
                10.4,   # Lys
 | 
			
		||||
                 7.4,   # Met
 | 
			
		||||
                 7.4,   # Phe
 | 
			
		||||
                 7.4,   # Pro
 | 
			
		||||
                 7.4,   # Ser
 | 
			
		||||
                 7.4,   # Thr
 | 
			
		||||
                 7.4,   # Trp
 | 
			
		||||
                 9.8,   # Tyr
 | 
			
		||||
                 7.4))  # Val
 | 
			
		||||
names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile",
 | 
			
		||||
                "Leu","Lys","Met","Phe","Pro","Ser","Thr","Trp","Tyr","Val")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Given these biophysical indices, how similar are the amino acids? We have three dimensions of measures here. Scatterplots can only display two dimensions ...
 | 
			
		||||
 | 
			
		||||
# pull the names from Y$I, convert them to single letter code, and reorder the
 | 
			
		||||
# AACOLS palette accordingly ...
 | 
			
		||||
aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
 | 
			
		||||
 | 
			
		||||
plot(Y$I, V$I,
 | 
			
		||||
     xlab = "hydrophobicity", ylab = "volume",
 | 
			
		||||
     pch = 21,
 | 
			
		||||
     cex = 6,
 | 
			
		||||
     col = aac,
 | 
			
		||||
     bg  = aac)
 | 
			
		||||
text(Y$I, V$I, names(Y$I), cex = 0.8)
 | 
			
		||||
 | 
			
		||||
plot(Y$I, K$I,
 | 
			
		||||
     xlab = "hydrophobicity", ylab = "pK",
 | 
			
		||||
     pch = 21,
 | 
			
		||||
     cex = 6,
 | 
			
		||||
     col = aac,
 | 
			
		||||
     bg  = aac)
 | 
			
		||||
text(Y$I, K$I, names(Y$I), cex = 0.8)
 | 
			
		||||
 | 
			
		||||
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
 | 
			
		||||
# plots are in general unintuitive and hard to interpret. One alternative is a
 | 
			
		||||
# so-called "ternary plot":
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("ggtern", quietly=TRUE)) {
 | 
			
		||||
  install.packages("ggtern")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = ggtern)       # basic information
 | 
			
		||||
#  browseVignettes("ggtern")    # available vignettes
 | 
			
		||||
#  data(package = "ggtern")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# collect into data frame, normalize to (0.05, 0.95)
 | 
			
		||||
myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
 | 
			
		||||
                    "vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
 | 
			
		||||
                    "pK"  = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
 | 
			
		||||
                    stringsAsFactors = FALSE)
 | 
			
		||||
rownames(myDat) <- names(Y$I)
 | 
			
		||||
 | 
			
		||||
ggtern::ggtern(data = myDat,
 | 
			
		||||
               ggplot2::aes(x = vol,
 | 
			
		||||
                   y = phi,
 | 
			
		||||
                   z = pK,
 | 
			
		||||
                   label = rownames(myDat))) + ggplot2::geom_text()
 | 
			
		||||
 | 
			
		||||
# This results in a mapping of amino acids relative to each other that is
 | 
			
		||||
# similar to the Venn diagram you have seen in the notes.
 | 
			
		||||
 | 
			
		||||
# ... or we could use principal components analysis, to pull out the
 | 
			
		||||
# best projection of the three feature dimensions into two. (Done here without delving
 | 
			
		||||
# into the theory ...)
 | 
			
		||||
prc <- prcomp(myDat)
 | 
			
		||||
# Plot the first two principal components. Points are coloured with "aac",
# the palette that was reordered above to match names(Y$I), which is also
# the row order of myDat. (The original referenced an undefined "aad".)
plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
     pch=19, cex=6, col=aac, cex.main=0.7,
     main="Principal Component Analysis of Amino Acid Features")
 | 
			
		||||
text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
 | 
			
		||||
 | 
			
		||||
# This matches the intuition rather well in that "similar" amino acids are close
 | 
			
		||||
# on the plot. But we can't interpret the distances in terms of just one of the
 | 
			
		||||
# parameters. Whatever - nature has a different way to define similarity:
 | 
			
		||||
# mutations to similar amino acids are less likely to break the protein.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Mutation Data matrix  ================================================
 | 
			
		||||
 | 
			
		||||
# A mutation data matrix encodes all amino acid pairscores in a matrix.
 | 
			
		||||
 | 
			
		||||
# The Biostrings package contains the most common mutation data matrices.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly=TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly=TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help=Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")  # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")   # available datasets
 | 
			
		||||
 | 
			
		||||
# Let's attach the BLOSUM62 mutation data matrix from the package
 | 
			
		||||
data(BLOSUM62, package = "Biostrings")
 | 
			
		||||
 | 
			
		||||
# ... and see what it contains. (You've seen this matrix before.)
 | 
			
		||||
BLOSUM62
 | 
			
		||||
 | 
			
		||||
# We can simply access values via the row/column names.
 | 
			
		||||
# Identical amino acids have high scores ...
 | 
			
		||||
BLOSUM62["H", "H"]   # Score for a pair of two histidines
 | 
			
		||||
BLOSUM62["S", "S"]   # Score for a pair of two serines
 | 
			
		||||
 | 
			
		||||
# Similar amino acids have low positive scores ...
 | 
			
		||||
BLOSUM62["L", "I"]   # Score for a leucine / isoleucine pair
 | 
			
		||||
BLOSUM62["F", "Y"]   # etc.
 | 
			
		||||
 | 
			
		||||
# Dissimilar amino acids have negative scores ...
 | 
			
		||||
BLOSUM62["L", "K"]   # Score for a leucine / lysine pair
 | 
			
		||||
BLOSUM62["Q", "P"]   # etc.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
BLOSUM62["R", "W"]   # the matrix is symmetric!
 | 
			
		||||
BLOSUM62["W", "R"]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Background score  ====================================================
 | 
			
		||||
 | 
			
		||||
# The mutation data matrix is designed to give high scores to homologous
 | 
			
		||||
# sequences, low scores to non-homologous sequences. What score on average
 | 
			
		||||
# should we expect for a random sequence?
 | 
			
		||||
 | 
			
		||||
# If we sample amino acid pairs at random, we will get a score that is the
 | 
			
		||||
# average of the individual pairscores in the matrix. Omitting the ambiguity
 | 
			
		||||
# codes and the gap character:
 | 
			
		||||
 | 
			
		||||
sum(BLOSUM62[1:20, 1:20])/400
 | 
			
		||||
 | 
			
		||||
# But that score could be higher for real sequences, for which the amino acid
 | 
			
		||||
# distribution is not random. For example membrane proteins have a large number
 | 
			
		||||
# of hydrophobic residues - an alignment of unrelated proteins might produce
 | 
			
		||||
# positive scores. And there are other proteins with biased amino acid
 | 
			
		||||
# compositions, in particular proteins that interact with multiple other
 | 
			
		||||
# proteins. Let's test how this impacts the background score by comparing a
 | 
			
		||||
# sequence with shuffled sequences. These have the same composition, but are
 | 
			
		||||
# obviously not homologous. The data directory contains the FASTA file for the
 | 
			
		||||
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
 | 
			
		||||
# low-complexity amino acid sequence ...
 | 
			
		||||
 | 
			
		||||
aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
 | 
			
		||||
 | 
			
		||||
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
 | 
			
		||||
# with an exceptionally high percentage of hydrophobic residues.
 | 
			
		||||
 | 
			
		||||
aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
 | 
			
		||||
 | 
			
		||||
# Here is a function that takes two sequences and
 | 
			
		||||
# returns their average pairscore.
 | 
			
		||||
 | 
			
		||||
averagePairScore <- function(a, b, MDM = BLOSUM62) {
  # Returns average pairscore of two sequences.
  # Parameters:
  #    a, b   chr   amino acid sequence strings of equal length
  #    MDM          mutation data matrix with amino acid one-letter codes as
  #                 row- and column names. Default is BLOSUM62
  # Value:    num   average pairscore (NaN for empty sequences).
  a <- unlist(strsplit(a, ""))
  b <- unlist(strsplit(b, ""))
  # Residues are scored position by position: sequences of unequal length
  # have no defined pairing. (Previously a longer b was silently truncated.)
  if (length(a) != length(b)) {
    stop("Sequences a and b must have the same length.")
  }
  # Indexing with a two-column character matrix looks up all (row, column)
  # pairs at once, which replaces the explicit loop over positions.
  return(sum(MDM[cbind(a, b)]) / length(a))
}
 | 
			
		||||
 | 
			
		||||
orig3FG7 <- toString(aa3FG7)
 | 
			
		||||
orig2F1C <- toString(aa2F1C)
 | 
			
		||||
N <- 1000
 | 
			
		||||
scores3FG7 <- numeric(N)
 | 
			
		||||
scores2F1C <- numeric(N)
 | 
			
		||||
for (i in 1:N) {
 | 
			
		||||
  scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
 | 
			
		||||
  scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Plot the distributions
 | 
			
		||||
hist(scores3FG7,
 | 
			
		||||
     col="#5599EE33",
 | 
			
		||||
     breaks = seq(-1.5, 0, by=0.1),
 | 
			
		||||
     main = "Pairscores for randomly shuffled sequences",
 | 
			
		||||
     xlab = "Average pairscore from BLOSUM 62")
 | 
			
		||||
hist(scores2F1C,
 | 
			
		||||
     col="#55EE9933",
 | 
			
		||||
     breaks = seq(-1.5, 0, by=0.1),
 | 
			
		||||
     add = TRUE)
 | 
			
		||||
abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
 | 
			
		||||
legend('topright',
 | 
			
		||||
       c("3FG7 (villin)", "2F1C (OmpG)"),
 | 
			
		||||
       fill = c("#5599EE33", "#55EE9933"), bty = 'n',
 | 
			
		||||
       inset = 0.1)
 | 
			
		||||
 | 
			
		||||
# This is an important result: even though we have shuffled significantly biased
 | 
			
		||||
# sequences, and the average scores trend above the average of the mutation data
 | 
			
		||||
# matrix, the average scores still remain comfortably below zero. This means
 | 
			
		||||
# that we can't (in general) improve a high-scoring alignment by simply
 | 
			
		||||
# extending it with randomly matched residues. We will only improve the score if
 | 
			
		||||
# the similarity of newly added residues is larger than what we expect to get by
 | 
			
		||||
# random chance!
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-ALI-Similarity.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-ALI-Similarity unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Updates
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.0    Refactored for 2017; add aaindex, ternary plot.
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#   Update ggtern:: ternary plot to use aacol dots under text
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                          Line
 | 
			
		||||
#TOC> ----------------------------------------------
 | 
			
		||||
#TOC>   1        Amino Acid Properties            43
 | 
			
		||||
#TOC>   2        Mutation Data matrix            189
 | 
			
		||||
#TOC>   3        Background score                230
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Amino Acid Properties  ===============================================
 | 
			
		||||
 | 
			
		||||
# A large collection of amino acid property tables is available via the seqinr
 | 
			
		||||
# package:
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
 | 
			
		||||
  install.packages("seqinr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = seqinr)       # basic information
 | 
			
		||||
#  browseVignettes("seqinr")    # available vignettes
 | 
			
		||||
#  data(package = "seqinr")     # available datasets
 | 
			
		||||
 | 
			
		||||
# A true Labor of Love has gone into the compilation of the seqinr "aaindex"
 | 
			
		||||
#  data:
 | 
			
		||||
 | 
			
		||||
?aaindex
 | 
			
		||||
data(aaindex, package = "seqinr")  # load the aaindex list from the package
 | 
			
		||||
 | 
			
		||||
length(aaindex)
 | 
			
		||||
 | 
			
		||||
# Here are all the index descriptions
 | 
			
		||||
# Print each description prefixed with its position in the aaindex list.
# seq_along() yields no iterations for an empty list, whereas 1:length()
# would iterate over c(1, 0).
for (i in seq_along(aaindex)) {
  cat(paste0(i, ": ", aaindex[[i]]$D, "\n"))
}
 | 
			
		||||
 | 
			
		||||
# It's a bit cumbersome to search through the descriptions ... here is a
 | 
			
		||||
# function to make this easier:
 | 
			
		||||
 | 
			
		||||
searchAAindex <- function(patt) {
  # Searches the descriptions ($D) of the global "aaindex" list for the
  # regular expression "patt" and prints index number and description of
  # every hit.
  # Parameters:
  #    patt   chr   regular expression, passed to grep()
  # Value:    NULL (invisible). Called for its side effect of printing
  #           one tab-separated line per hit.

  # vapply() guarantees a logical vector even if aaindex is empty; sapply()
  # would return an empty list in that case and which() would fail.
  isHit <- vapply(aaindex,
                  function(x) length(grep(patt, x$D)) > 0,
                  logical(1))
  hits <- which(isHit)
  for (i in seq_along(hits)) {
    cat(sprintf("%3d\t%s\n", hits[i], aaindex[[ hits[i] ]]$D))
  }
  return(invisible(NULL))
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
searchAAindex("free energy")          # Search for "free energy"
 | 
			
		||||
searchAAindex("(size)|(volume)")      # Search for "size" or "volume":
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's examine ...
 | 
			
		||||
# ... a hydrophobicity index
 | 
			
		||||
(Y <- aaindex[[528]][c("D", "I")])
 | 
			
		||||
 | 
			
		||||
# ... a volume index
 | 
			
		||||
(V <- aaindex[[150]][c("D", "I")])
 | 
			
		||||
 | 
			
		||||
# ... and one of our own: side-chain pK values as reported by
 | 
			
		||||
# Pace et al. (2009) JBC 284:13285-13289, with non-ionizable pKs set
 | 
			
		||||
# to 7.4 (physiological pH)
 | 
			
		||||
K <- list(I = c( 7.4,   # Ala
 | 
			
		||||
                12.3,   # Arg
 | 
			
		||||
                 7.4,   # Asn
 | 
			
		||||
                 3.9,   # Asp
 | 
			
		||||
                 8.6,   # Cys
 | 
			
		||||
                 7.4,   # Gln
 | 
			
		||||
                 4.3,   # Glu
 | 
			
		||||
                 7.4,   # Gly
 | 
			
		||||
                 6.5,   # His
 | 
			
		||||
                 7.4,   # Ile
 | 
			
		||||
                 7.4,   # Leu
 | 
			
		||||
                10.4,   # Lys
 | 
			
		||||
                 7.4,   # Met
 | 
			
		||||
                 7.4,   # Phe
 | 
			
		||||
                 7.4,   # Pro
 | 
			
		||||
                 7.4,   # Ser
 | 
			
		||||
                 7.4,   # Thr
 | 
			
		||||
                 7.4,   # Trp
 | 
			
		||||
                 9.8,   # Tyr
 | 
			
		||||
                 7.4))  # Val
 | 
			
		||||
names(K$I) <- c("Ala","Arg","Asn","Asp","Cys","Gln","Glu","Gly","His","Ile",
 | 
			
		||||
                "Leu","Lys","Met","Phe","Pro","Ser","Thr","Trp","Tyr","Val")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Given these biophysical indices, how similar are the amino acids? We have three dimensions of measures here. Scatterplots can only display two dimensions ...
 | 
			
		||||
 | 
			
		||||
# pull the names from Y$I, convert them to single letter code, and reorder the
 | 
			
		||||
# AACOLS palette accordingly ...
 | 
			
		||||
aac <- AACOLS[toupper(seqinr::a(names(Y$I)))]
 | 
			
		||||
 | 
			
		||||
plot(Y$I, V$I,
 | 
			
		||||
     xlab = "hydrophobicity", ylab = "volume",
 | 
			
		||||
     pch = 21,
 | 
			
		||||
     cex = 6,
 | 
			
		||||
     col = aac,
 | 
			
		||||
     bg  = aac)
 | 
			
		||||
text(Y$I, V$I, names(Y$I), cex = 0.8)
 | 
			
		||||
 | 
			
		||||
plot(Y$I, K$I,
 | 
			
		||||
     xlab = "hydrophobicity", ylab = "pK",
 | 
			
		||||
     pch = 21,
 | 
			
		||||
     cex = 6,
 | 
			
		||||
     col = aac,
 | 
			
		||||
     bg  = aac)
 | 
			
		||||
text(Y$I, K$I, names(Y$I), cex = 0.8)
 | 
			
		||||
 | 
			
		||||
# ... but how do we plot 3D data? Plotting into a 3D cube is possible, but such
 | 
			
		||||
# plots are in general unintuitive and hard to interpret. One alternative is a
 | 
			
		||||
# so-called "ternary plot":
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("ggtern", quietly=TRUE)) {
 | 
			
		||||
  install.packages("ggtern")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = ggtern)       # basic information
 | 
			
		||||
#  browseVignettes("ggtern")    # available vignettes
 | 
			
		||||
#  data(package = "ggtern")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# collect into data frame, normalize to (0.05, 0.95)
 | 
			
		||||
myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
 | 
			
		||||
                    "vol" = 0.9*(((V$I-min(V$I))/(max(V$I)-min(V$I))))+0.05,
 | 
			
		||||
                    "pK"  = 0.9*(((K$I-min(K$I))/(max(K$I)-min(K$I))))+0.05,
 | 
			
		||||
                    stringsAsFactors = FALSE)
 | 
			
		||||
rownames(myDat) <- names(Y$I)
 | 
			
		||||
 | 
			
		||||
ggtern::ggtern(data = myDat,
 | 
			
		||||
               ggplot2::aes(x = vol,
 | 
			
		||||
                   y = phi,
 | 
			
		||||
                   z = pK,
 | 
			
		||||
                   label = rownames(myDat))) + ggplot2::geom_text()
 | 
			
		||||
 | 
			
		||||
# This results in a mapping of amino acids relative to each other that is
 | 
			
		||||
# similar to the Venn diagram you have seen in the notes.
 | 
			
		||||
 | 
			
		||||
# ... or we could use principal components analysis, to pull out the
 | 
			
		||||
# best projection of the three feature dimensions into two. (Done here without delving
 | 
			
		||||
# into the theory ...)
 | 
			
		||||
prc <- prcomp(myDat)
 | 
			
		||||
# Plot the first two principal components. Points are coloured with "aac",
# the palette that was reordered above to match names(Y$I), which is also
# the row order of myDat. (The original referenced an undefined "aad".)
plot(prc$x[,1], prc$x[,2], xlab="", ylab="", xaxt="n", yaxt="n",
     pch=19, cex=6, col=aac, cex.main=0.7,
     main="Principal Component Analysis of Amino Acid Features")
 | 
			
		||||
text(prc$x[,1], prc$x[,2], names(Y$I), cex = 0.8, col="#00000088")
 | 
			
		||||
 | 
			
		||||
# This matches the intuition rather well in that "similar" amino acids are close
 | 
			
		||||
# on the plot. But we can't interpret the distances in terms of just one of the
 | 
			
		||||
# parameters. Whatever - nature has a different way to define similarity:
 | 
			
		||||
# mutations to similar amino acids are less likely to break the protein.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Mutation Data matrix  ================================================
 | 
			
		||||
 | 
			
		||||
# A mutation data matrix encodes all amino acid pairscores in a matrix.
 | 
			
		||||
 | 
			
		||||
# The Biostrings package contains the most common mutation data matrices.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly=TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly=TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help=Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")  # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")   # available datasets
 | 
			
		||||
 | 
			
		||||
# Let's attach the BLOSUM62 mutation data matrix from the package
 | 
			
		||||
data(BLOSUM62, package = "Biostrings")
 | 
			
		||||
 | 
			
		||||
# ... and see what it contains. (You've seen this matrix before.)
 | 
			
		||||
BLOSUM62
 | 
			
		||||
 | 
			
		||||
# We can simply access values via the row/column names.
 | 
			
		||||
# Identical amino acids have high scores ...
 | 
			
		||||
BLOSUM62["H", "H"]   # Score for a pair of two histidines
 | 
			
		||||
BLOSUM62["S", "S"]   # Score for a pair of two serines
 | 
			
		||||
 | 
			
		||||
# Similar amino acids have low positive scores ...
 | 
			
		||||
BLOSUM62["L", "I"]   # Score for a leucine / isoleucine pair
 | 
			
		||||
BLOSUM62["F", "Y"]   # etc.
 | 
			
		||||
 | 
			
		||||
# Dissimilar amino acids have negative scores ...
 | 
			
		||||
BLOSUM62["L", "K"]   # Score for a leucine / lysine pair
 | 
			
		||||
BLOSUM62["Q", "P"]   # etc.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
BLOSUM62["R", "W"]   # the matrix is symmetric!
 | 
			
		||||
BLOSUM62["W", "R"]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Background score  ====================================================
 | 
			
		||||
 | 
			
		||||
# The mutation data matrix is designed to give high scores to homologous
 | 
			
		||||
# sequences, low scores to non-homologous sequences. What score on average
 | 
			
		||||
# should we expect for a random sequence?
 | 
			
		||||
 | 
			
		||||
# If we sample amino acid pairs at random, we will get a score that is the
 | 
			
		||||
# average of the individual pairscores in the matrix. Omitting the ambiguity
 | 
			
		||||
# codes and the gap character:
 | 
			
		||||
 | 
			
		||||
sum(BLOSUM62[1:20, 1:20])/400
 | 
			
		||||
 | 
			
		||||
# But that score could be higher for real sequences, for which the amino acid
 | 
			
		||||
# distribution is not random. For example membrane proteins have a large number
 | 
			
		||||
# of hydrophobic residues - an alignment of unrelated proteins might produce
 | 
			
		||||
# positive scores. And there are other proteins with biased amino acid
 | 
			
		||||
# compositions, in particular proteins that interact with multiple other
 | 
			
		||||
# proteins. Let's test how this impacts the background score by comparing a
 | 
			
		||||
# sequence with shuffled sequences. These have the same composition, but are
 | 
			
		||||
# obviously not homologous. The data directory contains the FASTA file for the
 | 
			
		||||
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
 | 
			
		||||
# low-complexity amino acid sequence ...
 | 
			
		||||
 | 
			
		||||
aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
 | 
			
		||||
 | 
			
		||||
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
 | 
			
		||||
# with an exceptionally high percentage of hydrophobic residues.
 | 
			
		||||
 | 
			
		||||
aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
 | 
			
		||||
 | 
			
		||||
# Here is a function that takes two sequences and
 | 
			
		||||
# returns their average pairscore.
 | 
			
		||||
 | 
			
		||||
averagePairScore <- function(a, b, MDM = BLOSUM62) {
  # Returns average pairscore of two sequences.
  # Parameters:
  #    a, b   chr   amino acid sequence strings of equal length
  #    MDM          mutation data matrix with amino acid one-letter codes as
  #                 row- and column names. Default is BLOSUM62
  # Value:    num   average pairscore (NaN for empty sequences).
  a <- unlist(strsplit(a, ""))
  b <- unlist(strsplit(b, ""))
  # Residues are scored position by position: sequences of unequal length
  # have no defined pairing. (Previously a longer b was silently truncated.)
  if (length(a) != length(b)) {
    stop("Sequences a and b must have the same length.")
  }
  # Indexing with a two-column character matrix looks up all (row, column)
  # pairs at once, which replaces the explicit loop over positions.
  return(sum(MDM[cbind(a, b)]) / length(a))
}
 | 
			
		||||
 | 
			
		||||
orig3FG7 <- toString(aa3FG7)
 | 
			
		||||
orig2F1C <- toString(aa2F1C)
 | 
			
		||||
N <- 1000
 | 
			
		||||
scores3FG7 <- numeric(N)
 | 
			
		||||
scores2F1C <- numeric(N)
 | 
			
		||||
for (i in 1:N) {
 | 
			
		||||
  scores3FG7[i] <- averagePairScore(orig3FG7, toString(sample(aa3FG7)))
 | 
			
		||||
  scores2F1C[i] <- averagePairScore(orig2F1C, toString(sample(aa2F1C)))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Plot the distributions
 | 
			
		||||
hist(scores3FG7,
 | 
			
		||||
     col="#5599EE33",
 | 
			
		||||
     breaks = seq(-1.5, 0, by=0.1),
 | 
			
		||||
     main = "Pairscores for randomly shuffled sequences",
 | 
			
		||||
     xlab = "Average pairscore from BLOSUM 62")
 | 
			
		||||
hist(scores2F1C,
 | 
			
		||||
     col="#55EE9933",
 | 
			
		||||
     breaks = seq(-1.5, 0, by=0.1),
 | 
			
		||||
     add = TRUE)
 | 
			
		||||
abline(v = sum(BLOSUM62[1:20, 1:20])/400, col = "firebrick", lwd = 2)
 | 
			
		||||
legend('topright',
 | 
			
		||||
       c("3FG7 (villin)", "2F1C (OmpG)"),
 | 
			
		||||
       fill = c("#5599EE33", "#55EE9933"), bty = 'n',
 | 
			
		||||
       inset = 0.1)
 | 
			
		||||
 | 
			
		||||
# This is an important result: even though we have shuffled significantly biased
 | 
			
		||||
# sequences, and the average scores trend above the average of the mutation data
 | 
			
		||||
# matrix, the average scores still remain comfortably below zero. This means
 | 
			
		||||
# that we can't (in general) improve a high-scoring alignment by simply
 | 
			
		||||
# extending it with randomly matched residues. We will only improve the score if
 | 
			
		||||
# the similarity of newly added residues is larger than what we expect to get by
 | 
			
		||||
# random chance!
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,216 +1,216 @@
 | 
			
		||||
# tocID <- "BIN-Data_integration.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-Data_integration unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2018-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance and updates
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.0.1  Bugfix: UniProt ID Mapping service API change
 | 
			
		||||
#           1.0    First live version
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#           Develop a fungi-specific BioMart example.
 | 
			
		||||
#           (cf.
 | 
			
		||||
# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                             Line
 | 
			
		||||
#TOC> -------------------------------------------------
 | 
			
		||||
#TOC>   1        Identifier mapping                  42
 | 
			
		||||
#TOC>   2        Cross-referencing tables           165
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Identifier mapping  ==================================================
 | 
			
		||||
 | 
			
		||||
# UniProt provides a well-designed ID mapping tool that can be accessed
 | 
			
		||||
# online at     http://www.uniprot.org/mapping/
 | 
			
		||||
#
 | 
			
		||||
# Here we will use the UniProt Web API for this tool to map identifiers. The
 | 
			
		||||
# UniProt ID mapping service supports a "RESTful API": responses can be obtained
 | 
			
		||||
# simply via a Web browser's request. Such requests are commonly sent via the
 | 
			
		||||
# GET or POST verbs that a Webserver responds to, when a client asks for data.
 | 
			
		||||
# GET requests are visible in the URL of the request; POST requests are not
 | 
			
		||||
# directly visible, they are commonly used to send the contents of forms, or
 | 
			
		||||
# when transmitting larger, complex data items. The UniProt ID mapping service
 | 
			
		||||
# can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
 | 
			
		||||
# and  POST() functions are part of the httr package.
 | 
			
		||||
 | 
			
		||||
# To begin, we load  httr, which supports sending and receiving data via the
 | 
			
		||||
# http protocol, just like a Web browser.
 | 
			
		||||
if (! requireNamespace("httr", quietly=TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = httr)       # basic information
 | 
			
		||||
#  browseVignettes("httr")    # available vignettes
 | 
			
		||||
#  data(package = "httr")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We will walk through the process with the refSeqID
 | 
			
		||||
# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
 | 
			
		||||
# happens if the ID can't be mapped:
 | 
			
		||||
myQueryIDs <- "NP_010227 NP_00000 NP_011036"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The UniProt ID mapping service API is very straightforward to use: just define
 | 
			
		||||
# the URL of the server and send a list of items labelled as "query" in the body
 | 
			
		||||
# of the request. GET() and POST() are functions from httr.
 | 
			
		||||
 | 
			
		||||
# Note. A recent bug in the interaction between the server expectations and the
 | 
			
		||||
# curl client libraries requires the following initialization
 | 
			
		||||
httr::set_config(httr::config(http_version = 0))
 | 
			
		||||
# cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Send the query IDs to the UniProt ID mapping service as a POST request.
# The endpoint is the same one that the myIDmap() function further below uses,
# so that the walkthrough and the function behave consistently.
URL <- "https://www.uniprot.org/uploadlists/"
response <- httr::POST(URL,
                       body = list(from = "P_REFSEQ_AC",   # Refseq Protein
                                   to = "ACC",             # UniProt ID
                                   format = "tab",
                                   query = myQueryIDs))
 | 
			
		||||
 | 
			
		||||
cat(httr::content(response))
 | 
			
		||||
 | 
			
		||||
# We need to check the status code - if it is not 200, an error occurred and we
 | 
			
		||||
# can't process the result:
 | 
			
		||||
httr::status_code(response)
 | 
			
		||||
 | 
			
		||||
# If the query is successful, tabbed text is returned. We can assign that to a
 | 
			
		||||
# data frame. Note that we use textConnection() to read data directly from a
# character object, which can go in the spot where read.delim() expects a
# file-name argument.
 | 
			
		||||
 | 
			
		||||
myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
 | 
			
		||||
                          sep = "\t",
 | 
			
		||||
                          stringsAsFactors = FALSE)
 | 
			
		||||
myMappedIDs
 | 
			
		||||
 | 
			
		||||
# If this works as expected, you should see:
 | 
			
		||||
#        From     To
 | 
			
		||||
# 1 NP_010227 P39678
 | 
			
		||||
# 2 NP_011036 P25302
 | 
			
		||||
#
 | 
			
		||||
# ... and note that there are only two entries, because nothing was returned
 | 
			
		||||
# for the dummy "RefSeq ID" NP_00000
 | 
			
		||||
 | 
			
		||||
# If the query can't be fulfilled because of a problem with the server, a
 | 
			
		||||
# WebPage is returned. But the server status is also returned and we can check
 | 
			
		||||
# the status code. I have lately gotten many "503" status codes: Server Not
 | 
			
		||||
# Available...
 | 
			
		||||
 | 
			
		||||
# We wrap this into a function:
 | 
			
		||||
 | 
			
		||||
myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
  #    s  char  A string of separated IDs
  #    mapFrom  char  the database in which the IDs in s are valid. Default
  #                     is RefSeq protein
  #    mapTo    char  the database in which the target IDs are valid. Default
  #                     is UniProtKB
  # Value
  #    a data frame of mapped IDs, with column names From and To, or an
  #    empty data frame if the mapping was unsuccessful. No rows are returned
  #    for IDs that are not mapped.

  # Initialize curl. Note: this changes the httr configuration for the
  # whole session, not only for this request.
  httr::set_config(httr::config(http_version = 0))

  URL <- "https://www.uniprot.org/uploadlists/"
  response <- httr::POST(URL,
                         body = list(from = mapFrom,
                                     to = mapTo,
                                     format = "tab",
                                     query = s))

  status <- httr::status_code(response)

  if (status == 200) { # 200: OK
    # Ask explicitly for the body as text, so the result does not depend on
    # how httr would auto-parse the content type the server declares.
    txt <- httr::content(response, as = "text", encoding = "UTF-8")

    if (is.na(txt) || ! nzchar(trimws(txt))) {
      # An empty body has no header line; read.delim() would stop with
      # "no lines available in input". Return zero rows instead.
      myMap <- data.frame(From = character(),
                          To = character(),
                          stringsAsFactors = FALSE)
    } else {
      # Passing the text via "text =" lets read.delim() open and close its
      # own connection; a bare textConnection() would be left open.
      myMap <- read.delim(text = txt,
                          sep = "\t",
                          stringsAsFactors = FALSE)
      colnames(myMap) <- c("From", "To")
    }
  } else {
    myMap <- data.frame()
    warning(paste("No uniProt ID mapping returned:",
                  "server sent status",
                  status))
  }

  return(myMap)
}
 | 
			
		||||
 | 
			
		||||
# Try it out ...
 | 
			
		||||
myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
 | 
			
		||||
 | 
			
		||||
# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
 | 
			
		||||
# into your workspace on startup.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Cross-referencing tables  ============================================
 | 
			
		||||
 | 
			
		||||
# Sometimes we get the IDs we need to map in a large table, e.g. from a list of
 | 
			
		||||
# genes in a model organism database such as SGD, or from the Human Gene
 | 
			
		||||
# Nomenclature commission. How do we map one set of identifiers to another one?
 | 
			
		||||
 | 
			
		||||
# The function to use is match().
 | 
			
		||||
# Here is a tiny set of identifiers taken from a much larger table to
 | 
			
		||||
# illustrate the principle:
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747",
 | 
			
		||||
                              "Q08641", "P47129", "P52910", "P00330", "P81450"),
 | 
			
		||||
                    name =  c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
 | 
			
		||||
                              "AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
 | 
			
		||||
                    refID = c("NP_014657", "NP_009386",
 | 
			
		||||
                              "NP_012683", "NP_012559",
 | 
			
		||||
                              "NP_010038", "NP_014882",
 | 
			
		||||
                              "NP_012616", "NP_013254",
 | 
			
		||||
                              "NP_014555", "NP_013629"))
 | 
			
		||||
 | 
			
		||||
myIDs
 | 
			
		||||
 | 
			
		||||
# Say we want to map "NP_010038", "NP_999999" (which is not in our table), and
# "NP_013629", in that order to
 | 
			
		||||
# their gene names.
 | 
			
		||||
myQuery <- c("NP_010038", "NP_999999", "NP_013629")
 | 
			
		||||
 | 
			
		||||
# %in% will only tell us if these IDs are present in the table:
 | 
			
		||||
myQuery %in% myIDs$refID
 | 
			
		||||
 | 
			
		||||
# ... but not where they are located. But match() does what we need here:
 | 
			
		||||
match(myQuery, myIDs$refID)
 | 
			
		||||
 | 
			
		||||
# ... and we can use the result to subset the column that we want to map to:
 | 
			
		||||
myIDs$name[match(myQuery, myIDs$refID)]
 | 
			
		||||
 | 
			
		||||
# Note that the output preserves the NA - i.e. the length of the mapped
 | 
			
		||||
# values is exactly the same as the length of the query.
 | 
			
		||||
 | 
			
		||||
# task: map the three genes to their UniProt Identifier.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Note: if you want to do very many queries in very large tables, use the
 | 
			
		||||
# fmatch() function in the "fastmatch" package for a considerable
 | 
			
		||||
# speedup.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-Data_integration.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-Data_integration unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2018-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance and updates
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.0.1  Bugfix: UniProt ID Mapping service API change
 | 
			
		||||
#           1.0    First live version
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#           Develop a fungi-specific BioMart example.
 | 
			
		||||
#           (cf.
 | 
			
		||||
# https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html )
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                             Line
 | 
			
		||||
#TOC> -------------------------------------------------
 | 
			
		||||
#TOC>   1        Identifier mapping                  42
 | 
			
		||||
#TOC>   2        Cross-referencing tables           165
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Identifier mapping  ==================================================
 | 
			
		||||
 | 
			
		||||
# UniProt provides a well-designed ID mapping tool that can be accessed
 | 
			
		||||
# online at     http://www.uniprot.org/mapping/
 | 
			
		||||
#
 | 
			
		||||
# Here we will use the UniProt Web API for this tool to map identifiers. The
 | 
			
		||||
# UniProt ID mapping service supports a "RESTful API": responses can be obtained
 | 
			
		||||
# simply via a Web browser's request. Such requests are commonly sent via the
 | 
			
		||||
# GET or POST verbs that a Webserver responds to, when a client asks for data.
 | 
			
		||||
# GET requests are visible in the URL of the request; POST requests are not
 | 
			
		||||
# directly visible, they are commonly used to send the contents of forms, or
 | 
			
		||||
# when transmitting larger, complex data items. The UniProt ID mapping service
 | 
			
		||||
# can accept long lists of IDs, thus using the POST mechanism makes sense. GET()
 | 
			
		||||
# and  POST() functions are part of the httr package.
 | 
			
		||||
 | 
			
		||||
# To begin, we load  httr, which supports sending and receiving data via the
 | 
			
		||||
# http protocol, just like a Web browser.
 | 
			
		||||
if (! requireNamespace("httr", quietly=TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = httr)       # basic information
 | 
			
		||||
#  browseVignettes("httr")    # available vignettes
 | 
			
		||||
#  data(package = "httr")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We will walk through the process with the refSeqID
 | 
			
		||||
# of yeast Mbp1 and Swi4, and we will also enter a dummy ID to check what
 | 
			
		||||
# happens if the ID can't be mapped:
 | 
			
		||||
myQueryIDs <- "NP_010227 NP_00000 NP_011036"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The UniProt ID mapping service API is very straightforward to use: just define
 | 
			
		||||
# the URL of the server and send a list of items labelled as "query" in the body
 | 
			
		||||
# of the request. GET() and POST() are functions from httr.
 | 
			
		||||
 | 
			
		||||
# Note. A recent bug in the interaction between the server expectations and the
 | 
			
		||||
# curl client libraries requires the following initialization
 | 
			
		||||
httr::set_config(httr::config(http_version = 0))
 | 
			
		||||
# cf. https://stackoverflow.com/questions/44610845/stream-error-in-the-http-2-framing-layer-bigrquery-commands-error-in-r-studio-b
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Send the query IDs to the UniProt ID mapping service as a POST request.
# The endpoint is the same one that the myIDmap() function further below uses,
# so that the walkthrough and the function behave consistently.
URL <- "https://www.uniprot.org/uploadlists/"
response <- httr::POST(URL,
                       body = list(from = "P_REFSEQ_AC",   # Refseq Protein
                                   to = "ACC",             # UniProt ID
                                   format = "tab",
                                   query = myQueryIDs))
 | 
			
		||||
 | 
			
		||||
cat(httr::content(response))
 | 
			
		||||
 | 
			
		||||
# We need to check the status code - if it is not 200, an error occurred and we
 | 
			
		||||
# can't process the result:
 | 
			
		||||
httr::status_code(response)
 | 
			
		||||
 | 
			
		||||
# If the query is successful, tabbed text is returned. We can assign that to a
 | 
			
		||||
# data frame. Note that we use textConnection() to read data directly from a
# character object, which can go in the spot where read.delim() expects a
# file-name argument.
 | 
			
		||||
 | 
			
		||||
myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
 | 
			
		||||
                          sep = "\t",
 | 
			
		||||
                          stringsAsFactors = FALSE)
 | 
			
		||||
myMappedIDs
 | 
			
		||||
 | 
			
		||||
# If this works as expected, you should see:
 | 
			
		||||
#        From     To
 | 
			
		||||
# 1 NP_010227 P39678
 | 
			
		||||
# 2 NP_011036 P25302
 | 
			
		||||
#
 | 
			
		||||
# ... and note that there are only two entries, because nothing was returned
 | 
			
		||||
# for the dummy "RefSeq ID" NP_00000
 | 
			
		||||
 | 
			
		||||
# If the query can't be fulfilled because of a problem with the server, a
 | 
			
		||||
# WebPage is returned. But the server status is also returned and we can check
 | 
			
		||||
# the status code. I have lately gotten many "503" status codes: Server Not
 | 
			
		||||
# Available...
 | 
			
		||||
 | 
			
		||||
# We wrap this into a function:
 | 
			
		||||
 | 
			
		||||
myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # Use UniProt ID mapping service to map one or more IDs
  # Parameters:
  #    s  char  A string of separated IDs
  #    mapFrom  char  the database in which the IDs in s are valid. Default
  #                     is RefSeq protein
  #    mapTo    char  the database in which the target IDs are valid. Default
  #                     is UniProtKB
  # Value
  #    a data frame of mapped IDs, with column names From and To, or an
  #    empty data frame if the mapping was unsuccessful. No rows are returned
  #    for IDs that are not mapped.

  # Initialize curl. Note: this changes the httr configuration for the
  # whole session, not only for this request.
  httr::set_config(httr::config(http_version = 0))

  URL <- "https://www.uniprot.org/uploadlists/"
  response <- httr::POST(URL,
                         body = list(from = mapFrom,
                                     to = mapTo,
                                     format = "tab",
                                     query = s))

  status <- httr::status_code(response)

  if (status == 200) { # 200: OK
    # Ask explicitly for the body as text, so the result does not depend on
    # how httr would auto-parse the content type the server declares.
    txt <- httr::content(response, as = "text", encoding = "UTF-8")

    if (is.na(txt) || ! nzchar(trimws(txt))) {
      # An empty body has no header line; read.delim() would stop with
      # "no lines available in input". Return zero rows instead.
      myMap <- data.frame(From = character(),
                          To = character(),
                          stringsAsFactors = FALSE)
    } else {
      # Passing the text via "text =" lets read.delim() open and close its
      # own connection; a bare textConnection() would be left open.
      myMap <- read.delim(text = txt,
                          sep = "\t",
                          stringsAsFactors = FALSE)
      colnames(myMap) <- c("From", "To")
    }
  } else {
    myMap <- data.frame()
    warning(paste("No uniProt ID mapping returned:",
                  "server sent status",
                  status))
  }

  return(myMap)
}
 | 
			
		||||
 | 
			
		||||
# Try it out ...
 | 
			
		||||
myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
 | 
			
		||||
 | 
			
		||||
# A function UniProtIDmap() is in the ABC-dbUtilities.R script and it is loaded
 | 
			
		||||
# into your workspace on startup.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Cross-referencing tables  ============================================
 | 
			
		||||
 | 
			
		||||
# Sometimes we get the IDs we need to map in a large table, e.g. from a list of
 | 
			
		||||
# genes in a model organism database such as SGD, or from the Human Gene
 | 
			
		||||
# Nomenclature commission. How do we map one set of identifiers to another one?
 | 
			
		||||
 | 
			
		||||
# The function to use is match().
 | 
			
		||||
# Here is a tiny set of identifiers taken from a much larger table to
 | 
			
		||||
# illustrate the principle:
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747",
 | 
			
		||||
                              "Q08641", "P47129", "P52910", "P00330", "P81450"),
 | 
			
		||||
                    name =  c("2A5D", "2AAA", "2NDP", "3HAO", "AAD4",
 | 
			
		||||
                              "AB140", "ACF4", "ACS2", "ADH1", "ATP18"),
 | 
			
		||||
                    refID = c("NP_014657", "NP_009386",
 | 
			
		||||
                              "NP_012683", "NP_012559",
 | 
			
		||||
                              "NP_010038", "NP_014882",
 | 
			
		||||
                              "NP_012616", "NP_013254",
 | 
			
		||||
                              "NP_014555", "NP_013629"))
 | 
			
		||||
 | 
			
		||||
myIDs
 | 
			
		||||
 | 
			
		||||
# Say we want to map "NP_010038", "NP_999999" (which is not in our table), and
# "NP_013629", in that order to
 | 
			
		||||
# their gene names.
 | 
			
		||||
myQuery <- c("NP_010038", "NP_999999", "NP_013629")
 | 
			
		||||
 | 
			
		||||
# %in% will only tell us if these IDs are present in the table:
 | 
			
		||||
myQuery %in% myIDs$refID
 | 
			
		||||
 | 
			
		||||
# ... but not where they are located. But match() does what we need here:
 | 
			
		||||
match(myQuery, myIDs$refID)
 | 
			
		||||
 | 
			
		||||
# ... and we can use the result to subset the column that we want to map to:
 | 
			
		||||
myIDs$name[match(myQuery, myIDs$refID)]
 | 
			
		||||
 | 
			
		||||
# Note that the output preserves the NA - i.e. the length of the mapped
 | 
			
		||||
# values is exactly the same as the length of the query.
 | 
			
		||||
 | 
			
		||||
# task: map the three genes to their UniProt Identifier.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Note: if you want to do very many queries in very large tables, use the
 | 
			
		||||
# fmatch() function in the "fastmatch" package for a considerable
 | 
			
		||||
# speedup.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,435 +1,435 @@
 | 
			
		||||
# tocID <- "BIN-FUNC-Domain_annotation.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-FUNC-Domain_annotation unit.
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
# Version:  1.4
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-11  -  2020-10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.4    Add code for shared data import from the Wiki
 | 
			
		||||
#           1.3    Add code for database export to JSON and instructions
 | 
			
		||||
#                  for uploading annotations to the Public Student Wiki page
 | 
			
		||||
#           1.2    Consistently: data in ./myScripts/ ;
 | 
			
		||||
#                    begin SHARING DATA section
 | 
			
		||||
#           1.1    2020 Updates
 | 
			
		||||
#           1.0    Live version 2017
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#           Put the domain plot into a function
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                                 Line
 | 
			
		||||
#TOC> ---------------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Update your database script                             51
 | 
			
		||||
#TOC>   1.1        Preparing an annotation file ...                      58
 | 
			
		||||
#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      61
 | 
			
		||||
#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       109
 | 
			
		||||
#TOC>   1.2        Execute and Validate                                 136
 | 
			
		||||
#TOC>   2        Plot Annotations                                       161
 | 
			
		||||
#TOC>   3        SHARING DATA                                           287
 | 
			
		||||
#TOC>   3.1        Post MBP1_MYSPE as JSON data                         303
 | 
			
		||||
#TOC>   3.2        Import shared MBP1_MYSPE from the Wiki               326
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Update your database script  =========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Since you have recorded domain features at the SMART database, we can store
 | 
			
		||||
# the feature annotations in myDB ...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Preparing an annotation file ...  ==================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   1.1.1  BEFORE  "BIN-ALI-Optimal_sequence_alignment"
 | 
			
		||||
#
 | 
			
		||||
#   IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
 | 
			
		||||
#
 | 
			
		||||
#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
 | 
			
		||||
#   ./myScripts/ directory:
 | 
			
		||||
#
 | 
			
		||||
#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
 | 
			
		||||
#     myScripts/ directory.
 | 
			
		||||
#
 | 
			
		||||
#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
 | 
			
		||||
#     if MYSPE is called "Cryptococcus neoformans", your file should be called
 | 
			
		||||
#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
 | 
			
		||||
#     "MBP1_CRYNE").
 | 
			
		||||
#
 | 
			
		||||
#   - Open the file in the RStudio editor and delete all blocks for
 | 
			
		||||
#     the Mbp1 protein annotations except the first one.
 | 
			
		||||
#
 | 
			
		||||
#   - From that block, delete all lines that have annotations you did not
 | 
			
		||||
#     find in SMART for MBP1_MYSPE.
 | 
			
		||||
#
 | 
			
		||||
#   - Make enough copies of the "Ankyrin fold" and "low complexity" region
 | 
			
		||||
#     lines to have a line for each feature you found.
 | 
			
		||||
#
 | 
			
		||||
#   - Then delete the comma at the end of the last line.
 | 
			
		||||
#
 | 
			
		||||
#   - Edit the annotations: change MBP1_SACCE  to MBP1_<MYSPE> everywhere
 | 
			
		||||
#     and change the "start" and "end" features to the coordinates you
 | 
			
		||||
#     recorded in the SMART database.
 | 
			
		||||
#
 | 
			
		||||
#   - Save your file in the ./myScripts/ folder.
 | 
			
		||||
#
 | 
			
		||||
#   - Validate your file online at https://jsonlint.com/
 | 
			
		||||
#
 | 
			
		||||
#   - Update your "./myScripts/makeProteinDB.R" script to load your new
 | 
			
		||||
#     annotation when you recreate the database. Open the script in the
 | 
			
		||||
#     RStudio editor, and add the following command at the end:
 | 
			
		||||
#
 | 
			
		||||
#     myDB <- dbAddAnnotation(myDB,
 | 
			
		||||
#         jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
 | 
			
		||||
#                                         ^^^^^^^
 | 
			
		||||
#                                        edit this!
 | 
			
		||||
#
 | 
			
		||||
#   - save and close the file.
 | 
			
		||||
#
 | 
			
		||||
# Then SKIP the next section.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"  
 | 
			
		||||
#
 | 
			
		||||
#   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
 | 
			
		||||
#
 | 
			
		||||
#   You SHOULD have a file called "<MYSPE>-Annotations.json" in the
 | 
			
		||||
#  ./myScripts/ directory:
 | 
			
		||||
#
 | 
			
		||||
#   - Open the file in the RStudio editor.
 | 
			
		||||
#
 | 
			
		||||
#   - Make as many copies of the "APSES fold" line as you have found
 | 
			
		||||
#     features in SMART.
 | 
			
		||||
#
 | 
			
		||||
#   - Add a comma after every line except for the last one
 | 
			
		||||
#
 | 
			
		||||
#   - Edit the annotations but include only features that are in the
 | 
			
		||||
#     myDB$feature table. Check which features are in the database by executing
 | 
			
		||||
#
 | 
			
		||||
#        myDB$feature$name
 | 
			
		||||
#
 | 
			
		||||
#   - Update the "start" and "end" coordinates for each feature to the
 | 
			
		||||
#     values you found.
 | 
			
		||||
#
 | 
			
		||||
#   - Save your file.
 | 
			
		||||
#
 | 
			
		||||
#   - Validate your file online at https://jsonlint.com/
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ==   1.2  Execute and Validate  ==============================================
 | 
			
		||||
#
 | 
			
		||||
#   - source() your database creation script:
 | 
			
		||||
#
 | 
			
		||||
#     source("./myScripts/makeProteinDB.R")
 | 
			
		||||
#
 | 
			
		||||
#     This should run without errors or warnings. If it doesn't work and you
 | 
			
		||||
#     can't figure out quickly what's happening, ask for help on the
 | 
			
		||||
#     Discussion Board.
 | 
			
		||||
#
 | 
			
		||||
#   - Confirm
 | 
			
		||||
#     The following commands should retrieve all of the features that have been
 | 
			
		||||
#     annotated for MBP1_MYSPE
 | 
			
		||||
 | 
			
		||||
# Select the row of MBP1_<MYSPE> in the protein table ...
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))

# ... and get its protein ID.
(proID  <- myDB$protein$ID[sel])

# IDs of all annotations that were made for this protein.
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID])

# Feature IDs of these annotations. We subset the annotation table with the
# same logical selection (not with the annotation IDs), so the result does not
# depend on IDs being identical to row numbers.
(ftrIDs <- unique(myDB$annotation$featureID[myDB$annotation$proteinID == proID]))

# Look up the feature names by ID, not by position.
# This should list ALL of your annotated features (once). If not, consider
# what could have gone wrong and ask on the list if you have difficulties
# fixing it.
myDB$feature$name[match(ftrIDs, myDB$feature$ID)]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Plot Annotations  ====================================================
 | 
			
		||||
 | 
			
		||||
# In this section we will plot domain annotations as colored rectangles on a
 | 
			
		||||
# sequence, as an example of using the R plotting system for generic, data
 | 
			
		||||
# driven images.
 | 
			
		||||
 | 
			
		||||
# We need a small utility function that draws the annotation boxes on a
 | 
			
		||||
# representation of sequence. It should accept the start and end coordinates,
 | 
			
		||||
# the y value where it should be plotted and the color of the box, and plot a
 | 
			
		||||
# rectangle using R's rect() function.
 | 
			
		||||
 | 
			
		||||
drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
  # Purpose: draw one annotation box on the current plot with rect().
  # Parameters:
  #   xStart, xEnd: left and right x-coordinate of the box
  #   y:            vertical centre of the box
  #   myCol:        fill colour of the box
  #   DELTA:        half of the box height: the box spans y - DELTA to y + DELTA
  # Value: the return value of rect(); called for its side effect of drawing.
  yBottom <- y - DELTA
  yTop    <- y + DELTA
  rect(xleft = xStart, ybottom = yBottom, xright = xEnd, ytop = yTop,
       col = myCol, border = "black")
}
 | 
			
		||||
 | 
			
		||||
# test this:
 | 
			
		||||
# Visual check: a horizontal line from -1.5 to 1.5 with one box, centred on
# y = 0, drawn over it.
plot(c(-1.5, 1.5), c(0, 0), type = "l")
drawBox(xStart = -1, xEnd = 1, y = 0.0, myCol = "peachpuff")
 | 
			
		||||
 | 
			
		||||
# Next, we define a function to plot annotations for one protein: the name of
 | 
			
		||||
# the protein, a horizontal grey line for its length, and all of its features.
 | 
			
		||||
 | 
			
		||||
plotProtein <- function(DB, name, y) {
  # Purpose: plot the annotations of one protein on the current plot: its
  #          name, a horizontal grey line for its length, and one
  #          semi-transparent box for each of its annotated features.
  # Parameters:
  #   DB:   protein database. Reads DB$protein (name, ID, sequence),
  #         DB$feature (ID) and DB$annotation (proteinID, featureID,
  #         start, end).
  #   name: the name of the protein in the database.
  #   y:    height where to draw the plot
  # Value: NULL, invisibly. Stops if name does not match exactly one protein.

  # find the row-index of the protein in the protein table of DB
  iProtein <- which(DB$protein$name == name)
  if (length(iProtein) != 1) {
    stop(sprintf("Expected exactly one protein named \"%s\" in DB, found %d.",
                 name, length(iProtein)),
         call. = FALSE)
  }

  # Define colors: we create a vector of color values, one for
  # each feature, and we give it names of the feature ID. Then we
  # can easily get the color value from the feature ID.
  # A: make a vector of color values. The syntax may appear unusual -
  #    colorRampPalette() returns a function, and we simply append
  #    the parameter (number-of-features) without assigning the function
  #    to its own variable name.
  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
                               "#62C923", "#0A9A9B", "#1958C3",
                               "#8000D3", "#D0007F"),
                             space = "Lab",
                             interpolate = "linear")(nrow(DB$feature))
  # B: Features may overlap, so we make the colors transparent by setting
  #    their "alpha channel" to 1/3  (hex: 55)
  ftrCol <- paste0(ftrCol, "55")
  # C: we assign names
  names(ftrCol) <- DB$feature$ID
  # E.g. color for the third feature:
  #   ftrCol[ as.character(DB$feature$ID[3]) ]

  # write the name of the protein
  text(-30, y, adj = 1, labels = name, cex = 0.75)

  # draw a line from 0 to nchar(sequence-of-the-protein)
  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
        lwd = 3, col = "#999999")

  # get the rows of feature annotations for the protein
  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])

  # draw a colored box for each feature
  for (i in iFtr) {
    # Index by NAME: a numeric feature ID would otherwise be taken as a
    # position in ftrCol, which is only correct if IDs equal row numbers.
    drawBox(DB$annotation$start[i],
            DB$annotation$end[i],
            y,
            ftrCol[ as.character(DB$annotation$featureID[i]) ])
  }

  invisible(NULL)
}
 | 
			
		||||
 | 
			
		||||
# Plot each annotated protein:
 | 
			
		||||
# Get the rows of all unique annotated Mbp1 proteins in myDB
 | 
			
		||||
 | 
			
		||||
iRows <- grep("^MBP1_", myDB$protein$name)

# define the size of the plot-frame to accommodate all proteins
yMax <- length(iRows) * 1.1
xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence

# plot an empty frame
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))  # save the current plot parameters and
                                        # decrease margins
plot(1, 1,
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     type = "n",
     axes = FALSE,
     bty = "n",
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     cex.axis = 0.8,
     ylab = "")
axis(1, at = seq(0, xMax, by = 100))

# Legend colours: the same palette and transparency that plotProtein() uses,
# one colour per row of the feature table.
myCol <- colorRampPalette(c("#f2003c", "#F0A200",
                            "#f0ea00", "#62C923",
                            "#0A9A9B", "#1958C3",
                            "#8000D3", "#D0007F"),
                          space = "Lab",
                          interpolate = "linear")(nrow(myDB$feature))
myCol <- paste0(myCol, "55")

# The top of the legend is placed at y = 7, but never above the plot frame:
# with only a few proteins yMax is less than 7 and the legend would be
# drawn outside the plot region.
legend(xMax - 150, min(7, yMax),
       legend = myDB$feature$name,
       cex = 0.7,
       fill = myCol,
       bty = "n")

# Finally, iterate over all proteins and call plotProtein()
for (i in seq_along(iRows)) {
  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
}
par(oPar)  # reset the plot parameters
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The plot shows what is variable and what is constant about the annotations in
 | 
			
		||||
# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
 | 
			
		||||
# top.
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
#    Put a copy of the plot into your journal and interpret it with respect
 | 
			
		||||
#    to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
#    It would be better to align the motif borders, at least approximately (not
 | 
			
		||||
#    all proteins have all motifs). How would you go about doing that?
 | 
			
		||||
 | 
			
		||||
# =    3  SHARING DATA  ========================================================
 | 
			
		||||
 | 
			
		||||
# It's particularly interesting to compare such annotations across many
 | 
			
		||||
# homologous proteins. I have created a page on the Student Wiki that you can
 | 
			
		||||
# edit, and then download the data from the entire class directly to your
 | 
			
		||||
# RStudio project.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# I have provided a function that extracts all information that refers to a
 | 
			
		||||
# single protein from the database, and prints it out as well-formatted JSON,
 | 
			
		||||
# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
 | 
			
		||||
# bookkeeping involved, but the code is not otherwise very enlightening so I
 | 
			
		||||
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
 | 
			
		||||
# would want to have a look.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Post MBP1_MYSPE as JSON data  ======================================
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
# =====
 | 
			
		||||
# 1: Run the following code:
 | 
			
		||||
 | 
			
		||||
# Print the JSON for MBP1_<MYSPE> to the console, wrapped in the Wiki markup
# and the <pre class="protein-data"> tags that the import code looks for.
# Each element is printed on its own line.
cat(sep = "\n",
    "{{Vspace}}",
    "<!-- ==== BEGIN  PROTEIN ==== -->",
    "<pre class=\"protein-data\">",
    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
    "</pre>",
    "<!-- ===== END PROTEIN ====== -->",
    "")
 | 
			
		||||
 | 
			
		||||
# 2: Copy the entire output from the console.
 | 
			
		||||
# 3: Navigate to
 | 
			
		||||
#      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
 | 
			
		||||
#    ... edit the page, and paste your output at the top.
 | 
			
		||||
# 4: Save your edits.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.2  Import shared MBP1_MYSPE from the Wiki  ============================
 | 
			
		||||
 | 
			
		||||
# Once we have collected a number of protein annotations, we can access the
 | 
			
		||||
# Wiki-page and import the data into our database. The Wiki page is  an html
 | 
			
		||||
# document with lots of MediaWiki specific stuff - but the contents we are
 | 
			
		||||
# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
 | 
			
		||||
# work like normal HTML <pre> tags, but we have defined a special class for them
 | 
			
		||||
# to make it easy to parse out the contents we want. The rvest:: package in
 | 
			
		||||
# combination with xml2:: provides us with all the tools we need for such
 | 
			
		||||
# "Webscraping" of data....
 | 
			
		||||
 | 
			
		||||
# Install rvest and xml2 if they are not yet available. The packages are not
# attached: their functions are called with the package:: prefix below.
invisible(lapply(c("rvest", "xml2"), function(pkg) {
  if (! requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}))

# Here's the process:
# The URL is an "open" page on the student Wiki. Users that are not logged in
# can view the contents, but you can only edit if you are logged in.
myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"

# First thing is to retrieve the HTML from the url...
x <- xml2::read_html(x = myURL)
 | 
			
		||||
 | 
			
		||||
# This retrieves the page source, but that still needs to be parsed into its
 | 
			
		||||
# logical elements. HTML is a subset of XML and such documents are structured as
 | 
			
		||||
# trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes()
 | 
			
		||||
# parses out the document structure and then uses a so-called "xpath" expression
 | 
			
		||||
# to select nodes we are interested in. Now, xpath is one of those specialized
 | 
			
		||||
# languages of which there are a few more to learn than one would care for. You
 | 
			
		||||
# MUST know how to format sprintf() expressions, and you SHOULD be competent
 | 
			
		||||
# with regular expressions. But if you want to be really competent in your work,
 | 
			
		||||
# basic HTML and CSS is required ... and enough knowledge about xpath to be able
 | 
			
		||||
# to search on Stackoverflow for what you need for parsing data out of Web
 | 
			
		||||
# documents...
 | 
			
		||||
 | 
			
		||||
# The expression we use below is:
 | 
			
		||||
#   - get any node anywhere in the tree ("//*") ...
 | 
			
		||||
#   - that has a particular attribute("[@ ... ]").
 | 
			
		||||
#   - The attribute we want is that the class of the node is "protein-data";
 | 
			
		||||
#      that is the class we have defined for our <pre> tags.
 | 
			
		||||
# As a result of this selection, we get a list of pointers to the document tree.
 | 
			
		||||
y <- rvest::html_nodes(x = x, xpath = '//*[@class="protein-data"]')

# Next we fetch the actual payload - the text - from the tree:
# rvest::html_text() gets the text from the selected nodes. The result holds
# the contents as ordinary character strings.
z <- rvest::html_text(x = y)
 | 
			
		||||
 | 
			
		||||
# Finally we can iterate over the list, and add all proteins we don't already
 | 
			
		||||
# have to our database. There may well be items that are rejected because they
 | 
			
		||||
# are already present in the database - for example, unless somebody has
 | 
			
		||||
# annotated new features, all of the features are already there. Don't worry -
 | 
			
		||||
# that is intended; we don't want duplicate entries.
 | 
			
		||||
 | 
			
		||||
for (thisJSON in z) {
  # The Wiki page can be edited by anyone in the class: a single entry that is
  # not valid JSON, or that does not describe exactly one protein, must not
  # abort the import of all the others. We report such entries and skip them.
  thisData <- tryCatch(jsonlite::fromJSON(thisJSON),
                       error = function(e) NULL)
  if (! is.list(thisData) ||
      ! is.list(thisData$protein) ||
      length(thisData$protein$name) != 1) {
    warning("Skipping a Wiki entry that is not valid protein JSON.",
            call. = FALSE)
    next
  }

  # Add only proteins that we don't already have in the database.
  if (! thisData$protein$name %in% myDB$protein$name) {
    myDB <- dbAddProtein(myDB, thisData$protein)
    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
    myDB <- dbAddFeature(myDB, thisData$feature)
    myDB <- dbAddAnnotation(myDB, thisData$annotation)
  }
}
 | 
			
		||||
 | 
			
		||||
# Finally, we can repeat our domain plot with the results - which now include
# the shared proteins:
 | 
			
		||||
 | 
			
		||||
iRows <- grep("^MBP1_", myDB$protein$name)
yMax <- length(iRows) * 1.1
xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence

# plot an empty frame
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))  # save the current plot parameters and
                                        # decrease margins
plot(1, 1,
     xlim = c(-200, xMax + 100),
     ylim = c(0, yMax),
     type = "n",
     axes = FALSE,
     bty = "n",
     main = "Mbp1 orthologue domain annotations",
     xlab = "sequence position",
     cex.axis = 0.8,
     ylab = "")
axis(1, at = seq(0, xMax, by = 100))

# Legend colours: the same palette and transparency that plotProtein() uses,
# one colour per row of the feature table.
myCol <- colorRampPalette(c("#f2003c", "#F0A200",
                            "#f0ea00", "#62C923",
                            "#0A9A9B", "#1958C3",
                            "#8000D3", "#D0007F"),
                          space = "Lab",
                          interpolate = "linear")(nrow(myDB$feature))
myCol <- paste0(myCol, "55")

# The top of the legend is placed at y = 7, but never above the plot frame:
# with only a few proteins yMax is less than 7 and the legend would be
# drawn outside the plot region.
legend(xMax - 150, min(7, yMax),
       legend = myDB$feature$name,
       cex = 0.7,
       fill = myCol,
       bty = "n")

# iterate over all proteins and call plotProtein()
for (i in seq_along(iRows)) {
  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
}
par(oPar)  # reset the plot parameters
 | 
			
		||||
 | 
			
		||||
# ... the more proteins we can compare, the more we learn about the
 | 
			
		||||
# architectural principles of this family's domains.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-FUNC-Domain_annotation.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-FUNC-Domain_annotation unit.
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
# Version:  1.4
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-11  -  2020-10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.4    Add code for shared data import from the Wiki
 | 
			
		||||
#           1.3    Add code for database export to JSON and instructions
 | 
			
		||||
#                  for uploading annotations to the Public Student Wiki page
 | 
			
		||||
#           1.2    Consistently: data in ./myScripts/ ;
 | 
			
		||||
#                    begin SHARING DATA section
 | 
			
		||||
#           1.1    2020 Updates
 | 
			
		||||
#           1.0    Live version 2017
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#           Put the domain plot into a function
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                                 Line
 | 
			
		||||
#TOC> ---------------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Update your database script                             51
 | 
			
		||||
#TOC>   1.1        Preparing an annotation file ...                      58
 | 
			
		||||
#TOC>   1.1.1          BEFORE  "BIN-ALI-Optimal_sequence_alignment"      61
 | 
			
		||||
#TOC>   1.1.2          AFTER "BIN-ALI-Optimal_sequence_alignment"       109
 | 
			
		||||
#TOC>   1.2        Execute and Validate                                 136
 | 
			
		||||
#TOC>   2        Plot Annotations                                       161
 | 
			
		||||
#TOC>   3        SHARING DATA                                           287
 | 
			
		||||
#TOC>   3.1        Post MBP1_MYSPE as JSON data                         303
 | 
			
		||||
#TOC>   3.2        Import shared MBP1_MYSPE from the Wiki               326
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Update your database script  =========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Since you have recorded domain features at the SMART database, we can store
 | 
			
		||||
# the feature annotations in myDB ...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Preparing an annotation file ...  ==================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   1.1.1  BEFORE  "BIN-ALI-Optimal_sequence_alignment"
 | 
			
		||||
#
 | 
			
		||||
#   IF YOU HAVE NOT YET COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
 | 
			
		||||
#
 | 
			
		||||
#   You DON'T already have a file called "<MYSPE>-Annotations.json" in the
 | 
			
		||||
#   ./myScripts/ directory:
 | 
			
		||||
#
 | 
			
		||||
#   - Make a copy of the file "./data/refAnnotations.json" and put it in your
 | 
			
		||||
#     myScripts/ directory.
 | 
			
		||||
#
 | 
			
		||||
#   - Give it a name that is structured like "<MYSPE>-Annotations.json" - e.g.
 | 
			
		||||
#     if MYSPE is called "Cryptococcus neoformans", your file should be called
 | 
			
		||||
#     "CRYNE-Annotations.json" (and the "name" of your Mbp1 orthologue is
 | 
			
		||||
#     "MBP1_CRYNE").
 | 
			
		||||
#
 | 
			
		||||
#   - Open the file in the RStudio editor and delete all blocks for
 | 
			
		||||
#     the Mbp1 protein annotations except the first one.
 | 
			
		||||
#
 | 
			
		||||
#   - From that block, delete all lines that have annotations you did not
 | 
			
		||||
#     find in SMART for MBP1_MYSPE.
 | 
			
		||||
#
 | 
			
		||||
#   - Make enough copies of the "Ankyrin fold" and "low complexity" region
 | 
			
		||||
#     lines to have a line for each feature you found.
 | 
			
		||||
#
 | 
			
		||||
#   - Then delete the comma at the end of the last line.
 | 
			
		||||
#
 | 
			
		||||
#   - Edit the annotations: change MBP1_SACCE  to MBP1_<MYSPE> everywhere
 | 
			
		||||
#     and change the "start" and "end" features to the coordinates you
 | 
			
		||||
#     recorded in the SMART database.
 | 
			
		||||
#
 | 
			
		||||
#   - Save your file in the ./myScripts/ folder.
 | 
			
		||||
#
 | 
			
		||||
#   - Validate your file online at https://jsonlint.com/
 | 
			
		||||
#
 | 
			
		||||
#   - Update your "./myScripts/makeProteinDB.R" script to load your new
 | 
			
		||||
#     annotation when you recreate the database. Open the script in the
 | 
			
		||||
#     RStudio editor, and add the following command at the end:
 | 
			
		||||
#
 | 
			
		||||
#     myDB <- dbAddAnnotation(myDB,
 | 
			
		||||
#         jsonlite::fromJSON("./myScripts/<MYSPE>-Annotations.json"))
 | 
			
		||||
#                                         ^^^^^^^
 | 
			
		||||
#                                        edit this!
 | 
			
		||||
#
 | 
			
		||||
#   - save and close the file.
 | 
			
		||||
#
 | 
			
		||||
# Then SKIP the next section.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ===   1.1.2  AFTER "BIN-ALI-Optimal_sequence_alignment"  
 | 
			
		||||
#
 | 
			
		||||
#   IF YOU HAVE ALREADY COMPLETED THE BIN-ALI-OPTIMAL_SEQUENCE_ALIGNMENT UNIT:
 | 
			
		||||
#
 | 
			
		||||
#   You SHOULD have a file called "<MYSPE>-Annotations.json" in the
 | 
			
		||||
#  ./myScripts/ directory:
 | 
			
		||||
#
 | 
			
		||||
#   - Open the file in the RStudio editor.
 | 
			
		||||
#
 | 
			
		||||
#   - Make as many copies of the "APSES fold" line as you have found
 | 
			
		||||
#     features in SMART.
 | 
			
		||||
#
 | 
			
		||||
#   - Add a comma after every line except for the last one
 | 
			
		||||
#
 | 
			
		||||
#   - Edit the annotations but include only features that are in the
 | 
			
		||||
#     myDB$feature table. Check which features are in the database by executing
 | 
			
		||||
#
 | 
			
		||||
#        myDB$feature$name
 | 
			
		||||
#
 | 
			
		||||
#   - Update the "start" and "end" coordinates for each feature to the
 | 
			
		||||
#     values you found.
 | 
			
		||||
#
 | 
			
		||||
#   - Save your file.
 | 
			
		||||
#
 | 
			
		||||
#   - Validate your file online at https://jsonlint.com/
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ==   1.2  Execute and Validate  ==============================================
 | 
			
		||||
#
 | 
			
		||||
#   - source() your database creation script:
 | 
			
		||||
#
 | 
			
		||||
#     source("./myScripts/makeProteinDB.R")
 | 
			
		||||
#
 | 
			
		||||
#     This should run without errors or warnings. If it doesn't work and you
 | 
			
		||||
#     can't figure out quickly what's happening, ask for help on the
 | 
			
		||||
#     Discussion Board.
 | 
			
		||||
#
 | 
			
		||||
#   - Confirm
 | 
			
		||||
#     The following commands should retrieve all of the features that have been
 | 
			
		||||
#     annotated for MBP1_MYSPE
 | 
			
		||||
 | 
			
		||||
# Select the row of MBP1_<MYSPE> in the protein table ...
sel <- myDB$protein$name == paste0("MBP1_", biCode(MYSPE))

# ... and get its protein ID.
(proID  <- myDB$protein$ID[sel])

# IDs of all annotations that were made for this protein.
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID == proID])

# Feature IDs of these annotations. We subset the annotation table with the
# same logical selection (not with the annotation IDs), so the result does not
# depend on IDs being identical to row numbers.
(ftrIDs <- unique(myDB$annotation$featureID[myDB$annotation$proteinID == proID]))

# Look up the feature names by ID, not by position.
# This should list ALL of your annotated features (once). If not, consider
# what could have gone wrong and ask on the list if you have difficulties
# fixing it.
myDB$feature$name[match(ftrIDs, myDB$feature$ID)]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Plot Annotations  ====================================================
 | 
			
		||||
 | 
			
		||||
# In this section we will plot domain annotations as colored rectangles on a
 | 
			
		||||
# sequence, as an example of using the R plotting system for generic, data
 | 
			
		||||
# driven images.
 | 
			
		||||
 | 
			
		||||
# We need a small utility function that draws the annotation boxes on a
 | 
			
		||||
# representation of sequence. It should accept the start and end coordinates,
 | 
			
		||||
# the y value where it should be plotted and the color of the box, and plot a
 | 
			
		||||
# rectangle using R's rect() function.
 | 
			
		||||
 | 
			
		||||
drawBox <- function(xStart, xEnd, y, myCol, DELTA = 0.2) {
  # Purpose: draw one annotation box on the current plot with rect().
  # Parameters:
  #   xStart, xEnd: left and right x-coordinate of the box
  #   y:            vertical centre of the box
  #   myCol:        fill colour of the box
  #   DELTA:        half of the box height: the box spans y - DELTA to y + DELTA
  # Value: the return value of rect(); called for its side effect of drawing.
  yBottom <- y - DELTA
  yTop    <- y + DELTA
  rect(xleft = xStart, ybottom = yBottom, xright = xEnd, ytop = yTop,
       col = myCol, border = "black")
}
 | 
			
		||||
 | 
			
		||||
# test this:
 | 
			
		||||
# Visual check: a horizontal line from -1.5 to 1.5 with one box, centred on
# y = 0, drawn over it.
plot(c(-1.5, 1.5), c(0, 0), type = "l")
drawBox(xStart = -1, xEnd = 1, y = 0.0, myCol = "peachpuff")
 | 
			
		||||
 | 
			
		||||
# Next, we define a function to plot annotations for one protein: the name of
 | 
			
		||||
# the protein, a horizontal grey line for its length, and all of its features.
 | 
			
		||||
 | 
			
		||||
plotProtein <- function(DB, name, y) {
 | 
			
		||||
  # DB: protein database
 | 
			
		||||
  # name: the name of the protein in the database.
 | 
			
		||||
  # y: height where to draw the plot
 | 
			
		||||
  #
 | 
			
		||||
  # Define colors: we create a vector of color values, one for
 | 
			
		||||
  # each feature, and we give it names of the feature ID. Then we
 | 
			
		||||
  # can easily get the color value from the feature name.
 | 
			
		||||
  # A: make a vector of color values. The syntax may appear unusual -
 | 
			
		||||
  #    colorRampPalette() returns a function, and we simply append
 | 
			
		||||
  #    the parameter (number-of-features) without assigning the function
 | 
			
		||||
  #    to its own variable name.
 | 
			
		||||
  ftrCol <- colorRampPalette(c("#f2003c", "#F0A200", "#f0ea00",
 | 
			
		||||
                               "#62C923", "#0A9A9B", "#1958C3",
 | 
			
		||||
                               "#8000D3", "#D0007F"),
 | 
			
		||||
                             space="Lab",
 | 
			
		||||
                             interpolate="linear")(nrow(DB$feature))
 | 
			
		||||
  # B: Features may overlap, so we make the colors transparent by setting
 | 
			
		||||
  #    their "alpha channel" to 1/3  (hex: 55)
 | 
			
		||||
  ftrCol <- paste0(ftrCol, "55")
 | 
			
		||||
  # C: we asssign names
 | 
			
		||||
  names(ftrCol) <- DB$feature$ID
 | 
			
		||||
  # E.g. color for the third feature: ftrCol[ DB$feature$ID[3] ]
 | 
			
		||||
 | 
			
		||||
  # find the row-index of the protein ID in the protein table of DB
 | 
			
		||||
  iProtein <- which(DB$protein$name == name)
 | 
			
		||||
 | 
			
		||||
  # write the name of the protein
 | 
			
		||||
  text(-30, y, adj=1, labels=name, cex=0.75 )
 | 
			
		||||
 | 
			
		||||
  # draw a line from 0 to nchar(sequence-of-the-protein)
 | 
			
		||||
  lines(c(0, nchar(DB$protein$sequence[iProtein])), c(y, y),
 | 
			
		||||
        lwd=3, col="#999999")
 | 
			
		||||
 | 
			
		||||
  # get the rows of feature annotations for the protein
 | 
			
		||||
  iFtr <- which(DB$annotation$proteinID == DB$protein$ID[iProtein])
 | 
			
		||||
 | 
			
		||||
  # draw a colored box for each feature
 | 
			
		||||
  for (i in iFtr) {
 | 
			
		||||
    drawBox(DB$annotation$start[i],
 | 
			
		||||
            DB$annotation$end[i],
 | 
			
		||||
            y,
 | 
			
		||||
            ftrCol[ DB$annotation$featureID[i] ])
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Plot each annotated protein:
 | 
			
		||||
# Get the rows of all unique annotated Mbp1 proteins in myDB
 | 
			
		||||
 | 
			
		||||
iRows <- grep("^MBP1_", myDB$protein$name)
 | 
			
		||||
 | 
			
		||||
# define the size of the plot-frame to accommodate all proteins
 | 
			
		||||
yMax <- length(iRows) * 1.1
 | 
			
		||||
xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence
 | 
			
		||||
 | 
			
		||||
# plot an empty frame
 | 
			
		||||
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))  # save the current plot parameters and
 | 
			
		||||
                                        # decrease margins
 | 
			
		||||
plot(1, 1,
 | 
			
		||||
     xlim = c(-200, xMax + 100),
 | 
			
		||||
     ylim = c(0, yMax),
 | 
			
		||||
     type = "n",
 | 
			
		||||
     axes = FALSE,
 | 
			
		||||
     bty = "n",
 | 
			
		||||
     main = "Mbp1 orthologue domain annotations",
 | 
			
		||||
     xlab = "sequence position",
 | 
			
		||||
     cex.axis = 0.8,
 | 
			
		||||
     ylab="")
 | 
			
		||||
axis(1, at = seq(0, xMax, by = 100))
 | 
			
		||||
myCol <- colorRampPalette(c("#f2003c", "#F0A200",
 | 
			
		||||
                            "#f0ea00", "#62C923",
 | 
			
		||||
                            "#0A9A9B", "#1958C3",
 | 
			
		||||
                            "#8000D3", "#D0007F"),
 | 
			
		||||
                          space="Lab",
 | 
			
		||||
                          interpolate="linear")(nrow(myDB$feature))
 | 
			
		||||
myCol <- paste0(myCol, "55")
 | 
			
		||||
legend(xMax - 150, 7,
 | 
			
		||||
       legend = myDB$feature$name,
 | 
			
		||||
       cex = 0.7,
 | 
			
		||||
       fill = myCol,
 | 
			
		||||
       bty = "n")
 | 
			
		||||
 | 
			
		||||
# Finally, iterate over all proteins and call plotProtein()
 | 
			
		||||
for (i in seq_along(iRows)) {
 | 
			
		||||
  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
 | 
			
		||||
}
 | 
			
		||||
par(oPar)  # reset the plot parameters
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The plot shows what is variable and what is constant about the annotations in
 | 
			
		||||
# a group of related proteins. Your MBP1_MYSPE annotations should appear at the
 | 
			
		||||
# top.
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
#    Put a copy of the plot into your journal and interpret it with respect
 | 
			
		||||
#    to MBP1_MYSPE, i.e. note what you learn about MBP1_MYSPE from the plot.
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
#    It would be better to align the motif borders, at least approximately (not
 | 
			
		||||
#    all proteins have all motifs). How would you go about doing that?
 | 
			
		||||
 | 
			
		||||
# =    3  SHARING DATA  ========================================================
 | 
			
		||||
 | 
			
		||||
# It's particularly interesting to compare such annotations across many
 | 
			
		||||
# homologous proteins. I have created a page on the Student Wiki that you can
 | 
			
		||||
# edit, and then download the data from the entire class directly to your
 | 
			
		||||
# RStudio project.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# I have provided a function that extracts all information that refers to a
 | 
			
		||||
# single protein from the database, and prints it out as well-formatted JSON,
 | 
			
		||||
# suitable to be pasted into our shareable Wiki-page. There is a fair amount of
 | 
			
		||||
# bookkeeping involved, but the code is not otherwise very enlightening so I
 | 
			
		||||
# will spare you the details - it's in "./scripts/ABC-dbUtilities.R" if you
 | 
			
		||||
# would want to have a look.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Post MBP1_MYSPE as JSON data  ======================================
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
# =====
 | 
			
		||||
# 1: Run the following code:
 | 
			
		||||
 | 
			
		||||
cat("{{Vspace}}",
 | 
			
		||||
    "<!-- ==== BEGIN  PROTEIN ==== -->",
 | 
			
		||||
    "<pre class=\"protein-data\">",
 | 
			
		||||
    dbProt2JSON(sprintf("MBP1_%s", biCode(MYSPE))),
 | 
			
		||||
    "</pre>",
 | 
			
		||||
    "<!-- ===== END PROTEIN ====== -->",
 | 
			
		||||
    "", sep = "\n"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
# 2: Copy the entire output from the console.
 | 
			
		||||
# 3: Navigate to
 | 
			
		||||
#      http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public
 | 
			
		||||
#    ... edit the page, and paste your output at the top.
 | 
			
		||||
# 4: Save your edits.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.2  Import shared MBP1_MYSPE from the Wiki  ============================
 | 
			
		||||
 | 
			
		||||
# Once we have collected a number of protein annotations, we can access the
 | 
			
		||||
# Wiki-page and import the data into our database. The Wiki page is an HTML
 | 
			
		||||
# document with lots of MediaWiki-specific stuff - but the content we are
 | 
			
		||||
# interested in is enclosed in <pre class="protein-data"> ... </pre> tags. These
 | 
			
		||||
# work like normal HTML <pre> tags, but we have defined a special class for them
 | 
			
		||||
# to make it easy to parse out the contents we want. The rvest:: package in
 | 
			
		||||
# combination with xml2:: provides us with all the tools we need for such
 | 
			
		||||
# "Webscraping" of data....
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("rvest", quietly=TRUE)) {
 | 
			
		||||
  install.packages("rvest")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("xml2", quietly=TRUE)) {
 | 
			
		||||
  install.packages("xml2")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Here's the process:
 | 
			
		||||
# The URL is an "open" page on the student Wiki. Users that are not logged in
 | 
			
		||||
# can view the contents, but you can only edit if you are logged in.
 | 
			
		||||
myURL <- "http://steipe.biochemistry.utoronto.ca/abc/students/index.php/Public"
 | 
			
		||||
 | 
			
		||||
# First thing is to retrieve the HTML from the url...
 | 
			
		||||
x <- xml2::read_html(myURL)
 | 
			
		||||
 | 
			
		||||
# This retrieves the page source, but that still needs to be parsed into its
 | 
			
		||||
# logical elements. HTML is a subset of XML and such documents are structured as
 | 
			
		||||
# trees, that have "nodes" which are demarcated with "tags". rvest::html_nodes()
 | 
			
		||||
# parses out the document structure and then uses a so-called "xpath" expression
 | 
			
		||||
# to select nodes we are interested in. Now, xpath is one of those specialized
 | 
			
		||||
# languages of which there are a few more to learn than one would care for. You
 | 
			
		||||
# MUST know how to format sprintf() expressions, and you SHOULD be competent
 | 
			
		||||
# with regular expressions. But if you want to be really competent in your work,
 | 
			
		||||
# basic HTML and CSS is required ... and enough knowledge about xpath to be able
 | 
			
		||||
# to search on Stackoverflow for what you need for parsing data out of Web
 | 
			
		||||
# documents...
 | 
			
		||||
 | 
			
		||||
# The expression we use below is:
 | 
			
		||||
#   - get any node anywhere in the tree ("//*") ...
 | 
			
		||||
#   - that has a particular attribute ("[@ ... ]").
 | 
			
		||||
#   - The attribute we want is that the class of the node is "protein-data";
 | 
			
		||||
#      that is the class we have defined for our <pre> tags.
 | 
			
		||||
# As a result of this selection, we get a list of pointers to the document tree.
 | 
			
		||||
y <- rvest::html_nodes(x, xpath ='//*[@class="protein-data"]')
 | 
			
		||||
 | 
			
		||||
# Next we fetch the actual payload - the text - from the tree:
 | 
			
		||||
# rvest::html_text() gets the text from the list of pointers. The result is a
 | 
			
		||||
# character vector with one string per selected node.
 | 
			
		||||
z <- rvest::html_text(y)
 | 
			
		||||
 | 
			
		||||
# Finally we can iterate over the list, and add all proteins we don't already
 | 
			
		||||
# have to our database. There may well be items that are rejected because they
 | 
			
		||||
# are already present in the database - for example, unless somebody has
 | 
			
		||||
# annotated new features, all of the features are already there. Don't worry -
 | 
			
		||||
# that is intended; we don't want duplicate entries.
 | 
			
		||||
 | 
			
		||||
for (thisJSON in z) {
 | 
			
		||||
  thisData <- jsonlite::fromJSON(thisJSON)
 | 
			
		||||
  if (! thisData$protein$name %in% myDB$protein$name) {
 | 
			
		||||
    myDB <- dbAddProtein(myDB, thisData$protein)
 | 
			
		||||
    myDB <- dbAddTaxonomy(myDB, thisData$taxonomy)
 | 
			
		||||
    myDB <- dbAddFeature(myDB, thisData$feature)
 | 
			
		||||
    myDB <- dbAddAnnotation(myDB, thisData$annotation)
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Finally, we can repeat our domain plot with the results - which now includes the shared proteins:
 | 
			
		||||
 | 
			
		||||
iRows <- grep("^MBP1_", myDB$protein$name)
 | 
			
		||||
yMax <- length(iRows) * 1.1
 | 
			
		||||
xMax <- max(nchar(myDB$protein$sequence[iRows])) * 1.1  # longest sequence
 | 
			
		||||
 | 
			
		||||
# plot an empty frame
 | 
			
		||||
oPar <- par(mar = c(4.2, 0.1, 3, 0.1))
 | 
			
		||||
plot(1, 1,
 | 
			
		||||
     xlim = c(-200, xMax + 100),
 | 
			
		||||
     ylim = c(0, yMax),
 | 
			
		||||
     type = "n",
 | 
			
		||||
     axes = FALSE,
 | 
			
		||||
     bty = "n",
 | 
			
		||||
     main = "Mbp1 orthologue domain annotations",
 | 
			
		||||
     xlab = "sequence position",
 | 
			
		||||
     cex.axis = 0.8,
 | 
			
		||||
     ylab="")
 | 
			
		||||
axis(1, at = seq(0, xMax, by = 100))
 | 
			
		||||
myCol <- colorRampPalette(c("#f2003c", "#F0A200",
 | 
			
		||||
                            "#f0ea00", "#62C923",
 | 
			
		||||
                            "#0A9A9B", "#1958C3",
 | 
			
		||||
                            "#8000D3", "#D0007F"),
 | 
			
		||||
                          space="Lab",
 | 
			
		||||
                          interpolate="linear")(nrow(myDB$feature))
 | 
			
		||||
myCol <- paste0(myCol, "55")
 | 
			
		||||
legend(xMax - 150, 7,
 | 
			
		||||
       legend = myDB$feature$name,
 | 
			
		||||
       cex = 0.7,
 | 
			
		||||
       fill = myCol,
 | 
			
		||||
       bty = "n")
 | 
			
		||||
 | 
			
		||||
for (i in seq_along(iRows)) {
 | 
			
		||||
  plotProtein(myDB, myDB$protein$name[iRows[i]], i)
 | 
			
		||||
}
 | 
			
		||||
par(oPar)  # reset the plot parameters
 | 
			
		||||
 | 
			
		||||
# ... the more proteins we can compare, the more we learn about the
 | 
			
		||||
# architectural principles of this family's domains.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,169 +1,169 @@
 | 
			
		||||
# tocID <- "BIN-FUNC-Semantic_similarity.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-FUNC_Semantic_similarity unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-11  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.0    New code.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                                Line
 | 
			
		||||
#TOC> --------------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Preparations: Packages, AnnotationDB, Setup            43
 | 
			
		||||
#TOC>   2        Fetch GO Annotations                                  100
 | 
			
		||||
#TOC>   3        Semantic Similarities                                 109
 | 
			
		||||
#TOC>   4        GO Term Enrichment in Gene Sets                       127
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Preparations: Packages, AnnotationDB, Setup  =========================
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# GOSim is an R-package in the Bioconductor project.
 | 
			
		||||
if (! requireNamespace("GOSim", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("GOSim")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = GOSim)       # basic information
 | 
			
		||||
#  browseVignettes("GOSim")    # available vignettes
 | 
			
		||||
#  data(package = "GOSim")     # available datasets
 | 
			
		||||
 | 
			
		||||
# GOSim makes extensive assumptions about loaded packages, and many base
 | 
			
		||||
# methods are masked. We will thus use library(GOSim) to load it
 | 
			
		||||
# in its entirety and with all packages it depends on. We will still use
 | 
			
		||||
# the <package>::<function>() syntax in the code below, but this now serves
 | 
			
		||||
# more of a didactic purpose, rather than actual syntax requirements.
 | 
			
		||||
 | 
			
		||||
library(GOSim)
 | 
			
		||||
 | 
			
		||||
# GOSim loads human annotations in  org.Hs.eg.db  by default. We load yeast
 | 
			
		||||
# annotations instead...
 | 
			
		||||
if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("org.Sc.sgd.db")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Bioconductor annotation packages won't work stably unless we actually load
 | 
			
		||||
# them:
 | 
			
		||||
library(org.Sc.sgd.db)
 | 
			
		||||
 | 
			
		||||
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
 | 
			
		||||
# databases exist for all model organisms. It's a kind of a fancy data frame
 | 
			
		||||
# from which we can get annotations by rows (genes) with the keys() function ...
 | 
			
		||||
AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]
 | 
			
		||||
 | 
			
		||||
# ... and the types of available annotations with the columns() function
 | 
			
		||||
AnnotationDbi::columns(org.Sc.sgd.db)
 | 
			
		||||
 | 
			
		||||
# Note that one of the columns is "GO" ... and we load that into the
 | 
			
		||||
# datastructures used by GOSim:
 | 
			
		||||
 | 
			
		||||
# Choose GOterms to use
 | 
			
		||||
GOSim::setEvidenceLevel(evidences = "all",
 | 
			
		||||
                        organism = org.Sc.sgdORGANISM,
 | 
			
		||||
                        gomap = org.Sc.sgdGO)
 | 
			
		||||
 | 
			
		||||
# Use Biological Process ontology
 | 
			
		||||
GOSim::setOntology("BP", loadIC = FALSE)
 | 
			
		||||
 | 
			
		||||
# confirm that we loaded the correct ontology
 | 
			
		||||
head(get("gomap", envir = GOSimEnv))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Fetch GO Annotations  ================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# All keys being used here are yeast systematic names.
 | 
			
		||||
 | 
			
		||||
# Get one set of annotations
 | 
			
		||||
GOSim::getGOInfo(c("YDL056W"))  # Mbp1
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Semantic Similarities  ===============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Get semantic similarities between genes
 | 
			
		||||
?getGeneSim
 | 
			
		||||
 | 
			
		||||
# There are _many_ different metrics of term similarity implemented
 | 
			
		||||
# in this package.
 | 
			
		||||
 | 
			
		||||
                                                         # Mbp1 and...
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  GO Term Enrichment in Gene Sets  =====================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Calculating GO term enrichment in gene sets is done with the Bioconductor
 | 
			
		||||
# topGO package.
 | 
			
		||||
if (! requireNamespace("topGO", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("topGO")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = topGO)       # basic information
 | 
			
		||||
#  browseVignettes("topGO")    # available vignettes
 | 
			
		||||
#  data(package = "topGO")     # available datasets
 | 
			
		||||
 | 
			
		||||
# Once again - assumptions are made by GOSim that require us to load the
 | 
			
		||||
# topGO package wholesale:
 | 
			
		||||
library(topGO)
 | 
			
		||||
 | 
			
		||||
# Let's define a gene set: GOterm enrichment for G1/S switch activators:
 | 
			
		||||
mySet <- c("YFR028C", # Cdc14
 | 
			
		||||
           "YDL056W", # Mbp1
 | 
			
		||||
           "YLR182W", # Swi6
 | 
			
		||||
           "YER111C", # Swi4
 | 
			
		||||
           "YOR083W", # Whi5
 | 
			
		||||
           "YBR160W", # Cdc28
 | 
			
		||||
           "YMR199W", # Cln1
 | 
			
		||||
           "YPL256C", # Cln2
 | 
			
		||||
           "YAL040C") # Cln3
 | 
			
		||||
 | 
			
		||||
allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
 | 
			
		||||
allGenes <- allGenes[grep("^Y", allGenes)]  # This is the context against which
 | 
			
		||||
                                            # we define enrichment
 | 
			
		||||
 | 
			
		||||
myEnr <- GOenrichment(mySet, allGenes)
 | 
			
		||||
 | 
			
		||||
sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...
 | 
			
		||||
 | 
			
		||||
# Most significantly enriched is GO:0071931. What is this?
 | 
			
		||||
annotate::getGOTerm("GO:0071931")  # ... makes sense.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-FUNC-Semantic_similarity.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-FUNC_Semantic_similarity unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-11  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.0    New code.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                                Line
 | 
			
		||||
#TOC> --------------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Preparations: Packages, AnnotationDB, Setup            43
 | 
			
		||||
#TOC>   2        Fetch GO Annotations                                  100
 | 
			
		||||
#TOC>   3        Semantic Similarities                                 109
 | 
			
		||||
#TOC>   4        GO Term Enrichment in Gene Sets                       127
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Preparations: Packages, AnnotationDB, Setup  =========================
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# GOSim is an R-package in the Bioconductor project.
 | 
			
		||||
if (! requireNamespace("GOSim", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("GOSim")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = GOSim)       # basic information
 | 
			
		||||
#  browseVignettes("GOSim")    # available vignettes
 | 
			
		||||
#  data(package = "GOSim")     # available datasets
 | 
			
		||||
 | 
			
		||||
# GOSim makes extensive assumptions about loaded packages, and many base
 | 
			
		||||
# methods are masked. We will thus use library(GOSim) to load it
 | 
			
		||||
# in its entirety and with all packages it depends on. We will still use
 | 
			
		||||
# the <package>::<function>() syntax in the code below, but this now serves
 | 
			
		||||
# more of a didactic purpose, rather than actual syntax requirements.
 | 
			
		||||
 | 
			
		||||
library(GOSim)
 | 
			
		||||
 | 
			
		||||
# GOSim loads human annotations in  org.Hs.eg.db  by default. We load yeast
 | 
			
		||||
# annotations instead...
 | 
			
		||||
if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("org.Sc.sgd.db")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Bioconductor annotation packages won't work stably unless we actually load
 | 
			
		||||
# them:
 | 
			
		||||
library(org.Sc.sgd.db)
 | 
			
		||||
 | 
			
		||||
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
 | 
			
		||||
# databases exist for all model organisms. It's a kind of a fancy data frame
 | 
			
		||||
# from which we can get annotations by rows (genes) with the keys() function ...
 | 
			
		||||
AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]
 | 
			
		||||
 | 
			
		||||
# ... and the types of available annotations with the columns() function
 | 
			
		||||
AnnotationDbi::columns(org.Sc.sgd.db)
 | 
			
		||||
 | 
			
		||||
# Note that one of the columns is "GO" ... and we load that into the
 | 
			
		||||
# datastructures used by GOSim:
 | 
			
		||||
 | 
			
		||||
# Choose GOterms to use
 | 
			
		||||
GOSim::setEvidenceLevel(evidences = "all",
 | 
			
		||||
                        organism = org.Sc.sgdORGANISM,
 | 
			
		||||
                        gomap = org.Sc.sgdGO)
 | 
			
		||||
 | 
			
		||||
# Use Biological Process ontology
 | 
			
		||||
GOSim::setOntology("BP", loadIC = FALSE)
 | 
			
		||||
 | 
			
		||||
# confirm that we loaded the correct ontology
 | 
			
		||||
head(get("gomap", envir = GOSimEnv))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Fetch GO Annotations  ================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# All keys being used here are yeast systematic names.
 | 
			
		||||
 | 
			
		||||
# Get one set of annotations
 | 
			
		||||
GOSim::getGOInfo(c("YDL056W"))  # Mbp1
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Semantic Similarities  ===============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Get semantic similarities between genes
 | 
			
		||||
?getGeneSim
 | 
			
		||||
 | 
			
		||||
# There are _many_ different metrics of term similarity implemented
 | 
			
		||||
# in this package.
 | 
			
		||||
 | 
			
		||||
                                                         # Mbp1 and...
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist
 | 
			
		||||
GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  GO Term Enrichment in Gene Sets  =====================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Calculating GO term enrichment in gene sets is done with the Bioconductor
 | 
			
		||||
# topGO package.
 | 
			
		||||
if (! requireNamespace("topGO", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("topGO")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = topGO)       # basic information
 | 
			
		||||
#  browseVignettes("topGO")    # available vignettes
 | 
			
		||||
#  data(package = "topGO")     # available datasets
 | 
			
		||||
 | 
			
		||||
# Once again - assumptions are made by GOSim that require us to load the
 | 
			
		||||
# topGO package wholesale:
 | 
			
		||||
library(topGO)
 | 
			
		||||
 | 
			
		||||
# Let's define a gene set: GOterm enrichment for G1/S switch activators:
 | 
			
		||||
mySet <- c("YFR028C", # Cdc14
 | 
			
		||||
           "YDL056W", # Mbp1
 | 
			
		||||
           "YLR182W", # Swi6
 | 
			
		||||
           "YER111C", # Swi4
 | 
			
		||||
           "YOR083W", # Whi5
 | 
			
		||||
           "YBR160W", # Cdc28
 | 
			
		||||
           "YMR199W", # Cln1
 | 
			
		||||
           "YPL256C", # Cln2
 | 
			
		||||
           "YAL040C") # Cln3
 | 
			
		||||
 | 
			
		||||
allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
 | 
			
		||||
allGenes <- allGenes[grep("^Y", allGenes)]  # This is the context against which
 | 
			
		||||
                                            # we define enrichment
 | 
			
		||||
 | 
			
		||||
myEnr <- GOenrichment(mySet, allGenes)
 | 
			
		||||
 | 
			
		||||
sort(myEnr$p.values)  # Any significantly enriched terms? All of these are ...
 | 
			
		||||
 | 
			
		||||
# Most significantly enriched is GO:0071931. What is this?
 | 
			
		||||
annotate::getGOTerm("GO:0071931")  # ... makes sense.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										702
									
								
								BIN-MYSPE.R
									
									
									
									
									
								
							
							
						
						
									
										702
									
								
								BIN-MYSPE.R
									
									
									
									
									
								
							@@ -1,351 +1,351 @@
 | 
			
		||||
# tocID <- "BIN-MYSPE.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose: A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-MYSPE unit
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Version: 1.4
 | 
			
		||||
#
 | 
			
		||||
# Date:    2017-09 - 2021-10
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# V 1.4    Add troubleshooting hints via errText[[...]]
 | 
			
		||||
# V 1.3    2021 update of MYSPE mechanics; fix a bug no one had complained about
 | 
			
		||||
# V 1.2    Reorganized proportional plot section into a "further reading"
 | 
			
		||||
#          section, added nested-box, and sankey plot visualization of
 | 
			
		||||
#          proportions. Introduced plotly.
 | 
			
		||||
# V 1.1    2020 Workflow changes
 | 
			
		||||
# V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory
 | 
			
		||||
# V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist
 | 
			
		||||
# V 0.1    First code copied from BCH441_A03_makeMYSPElist.R
 | 
			
		||||
#
 | 
			
		||||
# TODO:    Sample solution for sankey plot function.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 | 
			
		||||
#
 | 
			
		||||
# DO NOT SIMPLY  source()  THESE FILES!
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
#  going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                             Line
 | 
			
		||||
#TOC> -----------------------------------------------------------------
 | 
			
		||||
#TOC>   1        PREPARATIONS                                        52
 | 
			
		||||
#TOC>   2        SUITABLE MYSPE SPECIES                              65
 | 
			
		||||
#TOC>   3        ADOPT "MYSPE"                                       89
 | 
			
		||||
#TOC>   4        FURTHER READING: PLOTTING PROPORTIONS              128
 | 
			
		||||
#TOC>   4.1        Percentages                                      146
 | 
			
		||||
#TOC>   4.2        Visualizing proportions: Pie chart               165
 | 
			
		||||
#TOC>   4.3        Visualizing proportions: Nested squares          243
 | 
			
		||||
#TOC>   4.4        Visualizing proportions: Sankey diagrams         280
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  PREPARATIONS  ========================================================
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# Execute the two conditionals below:
 | 
			
		||||
if (! file.exists("./myScripts/.myProfile.R")) {
 | 
			
		||||
  stop(errText[["noProfileFile"]])     # message defined in .Rprofile
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
if (! exists("myStudentNumber")) {
 | 
			
		||||
  stop(errText[["noStudentNumber"]])   # message defined in .Rprofile
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  SUITABLE MYSPE SPECIES  ==============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# In this unit we will select one species from a list of genome sequenced fungi
 | 
			
		||||
# and write it into your personalized profile file. This species will be called
 | 
			
		||||
# "MYSPE" (My Species) for other learning units and exercises.
 | 
			
		||||
 | 
			
		||||
# A detailed description of the process of compiling the list of genome
 | 
			
		||||
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
 | 
			
		||||
# ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi
 | 
			
		||||
# was retrieved from https://fungi.ensembl.org; a search for homologues to
 | 
			
		||||
# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
 | 
			
		||||
# A representative organism at each genus-level was chosen from those hits
 | 
			
		||||
# that actually have a homologue. Finally, a mapping table was constructed to
 | 
			
		||||
# asymmetrically retrieve unique species: a student number will retrieve
 | 
			
		||||
# a species, but (public) knowledge of the species cannot reconstruct the
 | 
			
		||||
# student number.
 | 
			
		||||
 | 
			
		||||
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
 | 
			
		||||
#       of selecting and combining data from various data resources. Studying
 | 
			
		||||
#       it will give you a better sense of how such workflows can be
 | 
			
		||||
#       implemented in practice.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  ADOPT "MYSPE"  =======================================================
 | 
			
		||||
 | 
			
		||||
# Execute:
 | 
			
		||||
( MYSPE <- getMYSPE(myStudentNumber) )
 | 
			
		||||
 | 
			
		||||
# If this produced an error, this session has not been properly set up. You
 | 
			
		||||
# may not yet have run  init()  and edited  .myProfile.R , or that file is not
 | 
			
		||||
# in your  myScripts/  folder. Fix this, and execute:
 | 
			
		||||
#
 | 
			
		||||
#    source(".Rprofile") .
 | 
			
		||||
 | 
			
		||||
# If this produced NA, your Student Number may not be correct, or you are not in
 | 
			
		||||
# my class-list. Contact me. Otherwise, this should have printed a species name,
 | 
			
		||||
# and the taxonomy ID of its genome-sequenced strain. This is your unique
 | 
			
		||||
# species for this course. Note it in your journal ...
 | 
			
		||||
 | 
			
		||||
biCode(MYSPE) # and also note its "BiCode" ...
 | 
			
		||||
( myTaxID <- names(MYSPE) )  # and its taxID
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
# =====
 | 
			
		||||
#   Note down the species name and its five letter BiCode on your Student
 | 
			
		||||
#   Wiki user page. Use this species whenever this or future assignments refer
 | 
			
		||||
#   to MYSPE. Whenever you start a session, it will automatically be loaded
 | 
			
		||||
#   from  myScripts/.myProfile.R  and is available as  MYSPE .
 | 
			
		||||
 | 
			
		||||
# Here is some more information about MYSPE, taken from the table of genome-
 | 
			
		||||
# sequenced fungi that is in your ./data folder.
 | 
			
		||||
fungiDat <- read.csv("data/Species.csv")
 | 
			
		||||
iMs <- which(fungiDat$Taxon.ID == myTaxID)
 | 
			
		||||
 | 
			
		||||
( myOr <- fungiDat$Classification[iMs] )  # Taxonomic order
 | 
			
		||||
( myGn <- gsub("\\s.*", "", MYSPE))       # Taxonomic genus
 | 
			
		||||
( mySt <- fungiDat$Name[iMs] )            # Taxonomic strain
 | 
			
		||||
 | 
			
		||||
# That's all.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  FURTHER READING: PLOTTING PROPORTIONS  ===============================
 | 
			
		||||
 | 
			
		||||
# The material below is an exploration of data-preparation and plotting
 | 
			
		||||
# techniques; you can treat this as additional practice and further reading and
 | 
			
		||||
# I expect that some of the code and plotting examples may be useful in a
 | 
			
		||||
# different context.
 | 
			
		||||
 | 
			
		||||
# A frequent task is to visualize the proportion of elements with given
 | 
			
		||||
# categories in a sample. For example, we might ask what proportion of the
 | 
			
		||||
# genome-sequenced fungi belongs to the order of MYSPE? Let's first collect the
 | 
			
		||||
# numbers.
 | 
			
		||||
 | 
			
		||||
# Count the nested categories. The search terms (order, genus, species) are
# literal strings taken from the data, not regular expressions: they are
# matched with  fixed = TRUE  so that names which contain regex metacharacters
# (brackets, dots, parentheses ...) are still counted correctly.

# all genome-sequenced fungi
( nFungi <- nrow(fungiDat) )
# same order as MYSPE
( nOrder <- sum(grepl(myOr, fungiDat$Classification, fixed = TRUE)) )
# same genus as MYSPE
( nGenus <- sum(grepl(myGn, fungiDat$Name, fixed = TRUE)) )
# same species as MYSPE
( nSpecies <- sum(grepl(MYSPE, fungiDat$Name, fixed = TRUE)) )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   4.1  Percentages  =======================================================
 | 
			
		||||
 | 
			
		||||
# The zeroth-order approach to visualization is simply to print percentages:
 | 
			
		||||
 | 
			
		||||
cat(sprintf("\n%s comprise %5.2f%% of fungi.",
 | 
			
		||||
        myOr,
 | 
			
		||||
        (nOrder * 100) / nFungi))
 | 
			
		||||
 | 
			
		||||
# ... or, adding the actual numbers:
 | 
			
		||||
 | 
			
		||||
cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
 | 
			
		||||
            myOr,
 | 
			
		||||
            (nOrder * 100) / nFungi,
 | 
			
		||||
            nOrder,
 | 
			
		||||
            nFungi))
 | 
			
		||||
 | 
			
		||||
# But that's hard to visualize for most of us, and anyway, we don't know how
 | 
			
		||||
# that relates to other orders.
 | 
			
		||||
 | 
			
		||||
# ==   4.2  Visualizing proportions: Pie chart  ================================
 | 
			
		||||
 | 
			
		||||
# Often, we will use a pie chart instead. Pie charts are rather informal types
 | 
			
		||||
# of plots, not well suited for analysis. But easy to do:
 | 
			
		||||
 | 
			
		||||
# Define four colors to identify the four categories
 | 
			
		||||
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
 | 
			
		||||
 | 
			
		||||
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
 | 
			
		||||
                                           # and remember the
 | 
			
		||||
                                           # previous setting
 | 
			
		||||
 | 
			
		||||
# Draw the nested categories as a pie chart. The categories are contained in
# each other (species in genus, genus in order, order in fungi), thus each
# wedge is the count of one category minus the count of the category that is
# nested directly inside it. The four wedges then add up to nFungi, and no
# wedge can become negative (pie() fails on negative values).
pie(c(nSpecies,                            # MYSPE itself
      nGenus - nSpecies,                   # remainder of the genus
      nOrder - nGenus,                     # remainder of the order
      nFungi - nOrder),                    # all other fungi
      labels = "",
      radius = 0.9,
      main = "MYSPE in genome-sequenced fungi",
      lty = 0,                             # turn borders for wedges off
      col = pCol,
      clockwise = TRUE,
      init.angle = 90)
 | 
			
		||||
 | 
			
		||||
title(main=MYSPE, line=0, cex.main=0.7)    # add a title to the plot
 | 
			
		||||
 | 
			
		||||
legend(x = 0.95, y = 0.8,    # place at legend here
 | 
			
		||||
       legend = c("Species", "Genus", "Order", "Fungi"),
 | 
			
		||||
       y.intersp = 2,                      # line spacing for labels
 | 
			
		||||
       cex = 0.8,                          # character size for labels
 | 
			
		||||
       bty = "n",                          # "no" box around the legend
 | 
			
		||||
       pt.cex = 2,                         # size of colour boxes
 | 
			
		||||
       pch = 15,                           # a filled square
 | 
			
		||||
       col = pCol)
 | 
			
		||||
 | 
			
		||||
par(oPar)                                  # reset graphics state
 | 
			
		||||
 | 
			
		||||
# Unless MYSPE is one of the frequently sequenced species, there will only be a
 | 
			
		||||
# very thin wedge visible. Pie charts are not well suited to visualize small
 | 
			
		||||
# proportions.
 | 
			
		||||
 | 
			
		||||
# It is a little more useful if we have non-nested proportions - like the
 | 
			
		||||
# number of species in the same order overall:
 | 
			
		||||
 | 
			
		||||
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
 | 
			
		||||
head(myTbl)
 | 
			
		||||
 | 
			
		||||
# pie() does a reasonable job out of the box to interpret table() data:
 | 
			
		||||
pie(myTbl)
 | 
			
		||||
 | 
			
		||||
# ... we can improve this quickly with a bit of tweaking:
 | 
			
		||||
 | 
			
		||||
N <- length(myTbl)
 | 
			
		||||
sel <- myOr == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere
 | 
			
		||||
 | 
			
		||||
myCol <- rep(pCol[4], N)       # N elements of pCol[4]
 | 
			
		||||
myCol[sel] <- pCol[1]          # replace this one color
 | 
			
		||||
 | 
			
		||||
myLbl <- rep("", N)            # N labels of ""
 | 
			
		||||
myLbl[sel] <- myOr             # replace this one label with the MYSPE order
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
 | 
			
		||||
 | 
			
		||||
pie(myTbl,
 | 
			
		||||
    labels = myLbl,
 | 
			
		||||
    radius = 0.9,
 | 
			
		||||
    main = "MYSPE order",
 | 
			
		||||
    border = "#DDDDDD",
 | 
			
		||||
    col = myCol,
 | 
			
		||||
    clockwise = TRUE,
 | 
			
		||||
    init.angle = 90)
 | 
			
		||||
 | 
			
		||||
par(oPar)                                  # reset graphics state
 | 
			
		||||
 | 
			
		||||
# But the overall problem remains.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   4.3  Visualizing proportions: Nested squares  ===========================
 | 
			
		||||
 | 
			
		||||
# A simple alternative is to draw such proportions as nested squares:
 | 
			
		||||
 | 
			
		||||
x <- sqrt(nFungi)
 | 
			
		||||
 | 
			
		||||
# set margins to ~ 0 and type to square
 | 
			
		||||
oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
 | 
			
		||||
 | 
			
		||||
# empty, square plot
 | 
			
		||||
plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
 | 
			
		||||
     type="n", axes=FALSE, xlab="", ylab="")
 | 
			
		||||
 | 
			
		||||
# basic square for all genomes
 | 
			
		||||
rect(0, 0, x,              x,              col = pCol[4])
 | 
			
		||||
 | 
			
		||||
# grid
 | 
			
		||||
u <- 0:floor(x)
 | 
			
		||||
N <- length(u)
 | 
			
		||||
segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")
 | 
			
		||||
segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")
 | 
			
		||||
# each square on this grid is one genome
 | 
			
		||||
 | 
			
		||||
# colored squares
 | 
			
		||||
rect(0, 0, sqrt(nOrder),   sqrt(nOrder),   col = pCol[3])
 | 
			
		||||
rect(0, 0, sqrt(nGenus),   sqrt(nGenus),   col = pCol[2])
 | 
			
		||||
rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
 | 
			
		||||
 | 
			
		||||
# labels
 | 
			
		||||
text(x/2, x/2,      "Fungi")
 | 
			
		||||
text(x * 0.08, x * 0.11, myOr,   pos = 4, cex = 0.9)
 | 
			
		||||
text(x * 0.08, x * 0.06, myGn,   pos = 4, cex = 0.8)
 | 
			
		||||
text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
 | 
			
		||||
 | 
			
		||||
par(oPar)                                  # reset graphics state
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   4.4  Visualizing proportions: Sankey diagrams  ==========================
 | 
			
		||||
 | 
			
		||||
# Sankey diagrams are an excellent way to visualize complicated nested
 | 
			
		||||
# proportions and their changes (see here for example:
 | 
			
		||||
# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
 | 
			
		||||
# example with the MYSPE proportions, as an illustration of the plotting
 | 
			
		||||
# principle.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("plotly")) {
 | 
			
		||||
  install.packages("plotly")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help   = plotly)     # basic information
 | 
			
		||||
#  browseVignettes("plotly")    # available vignettes
 | 
			
		||||
#  data(package  = "plotly")    # available datasets
 | 
			
		||||
 | 
			
		||||
# Here, we use the plotly package that wraps a very well developed javascript
 | 
			
		||||
# library with many options for interactive plots. I am producing this plot
 | 
			
		||||
# hard-coded for the sample organism "Sporothrix schenckii"; you would need
 | 
			
		||||
# to change the code to adapt it to your own MYSPE - or even build a function
 | 
			
		||||
# for this. Do try this if you have a bit of coding experience, sankey diagrams
 | 
			
		||||
# are a good way to show hierarchical data relations - and if you get this
 | 
			
		||||
# working for your own organism you can be proud that you have understood
 | 
			
		||||
# how preparing the data works.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
myNodes <- list(label = c("Fungi (1014)",              # 0 <- node ID
 | 
			
		||||
                          "Ophiostomatales (6)",       # 1
 | 
			
		||||
                          "Other...",                  # 2
 | 
			
		||||
                          "Sporothrix (4)",            # 3
 | 
			
		||||
                          "Other...",                  # 4
 | 
			
		||||
                          "Sporothrix schenckii (2)",  # 5
 | 
			
		||||
                          "Other..."                   # 6
 | 
			
		||||
                          ),
 | 
			
		||||
                x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
 | 
			
		||||
                y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
 | 
			
		||||
                color = c("#f2f2f0", #
 | 
			
		||||
                          "#ffd5c4",
 | 
			
		||||
                          "#CCCCCC",
 | 
			
		||||
                          "#ff9582",
 | 
			
		||||
                          "#CCCCCC",
 | 
			
		||||
                          "#ed394e",
 | 
			
		||||
                          "#CCCCCC"
 | 
			
		||||
                          ),
 | 
			
		||||
                pad = 15,
 | 
			
		||||
                thickness = 20,
 | 
			
		||||
                line = list(color = "black",
 | 
			
		||||
                            width = 0.5))
 | 
			
		||||
 | 
			
		||||
myLinks <- list(source = c(0, 0, 1, 1, 3, 3),   # i.e. there is a link of
 | 
			
		||||
                target = c(1, 2, 3, 4, 5, 6),   # weight 6 between node 0
 | 
			
		||||
                value =  c(6, 18, 4, 2, 2, 2))  # and node 1
 | 
			
		||||
 | 
			
		||||
# Setting up the actual plot ...
 | 
			
		||||
fig  <-  plotly::plot_ly(type = "sankey",
 | 
			
		||||
                         arrangement = "snap",
 | 
			
		||||
                         orientation = "h",
 | 
			
		||||
                         node = myNodes,
 | 
			
		||||
                         link = myLinks)
 | 
			
		||||
 | 
			
		||||
# Adding and adjusting a few layout parameters
 | 
			
		||||
fig <- plotly::layout(fig,
 | 
			
		||||
              title = "Fungi Genomes - Classification",
 | 
			
		||||
              font = list(size = 10))
 | 
			
		||||
 | 
			
		||||
fig     # plot the diagram
 | 
			
		||||
 | 
			
		||||
# Note that the plot appears in the Viewer window, not the Plot window, and that
 | 
			
		||||
# it is interactive: you can hover over nodes and links, and drag the nodes
 | 
			
		||||
# around.
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-MYSPE.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose: A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-MYSPE unit
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Version: 1.4
 | 
			
		||||
#
 | 
			
		||||
# Date:    2017-09 - 2021-10
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# V 1.4    Add troubleshooting hints via errText[[...]]
 | 
			
		||||
# V 1.3    2021 update of MYSPE mechanics; fix a bug no one had complained about
 | 
			
		||||
# V 1.2    Reorganized proportional plot section into a "further reading"
 | 
			
		||||
#          section, added nested-box, and sankey plot visualization of
 | 
			
		||||
#          proportions. Introduced plotly.
 | 
			
		||||
# V 1.1    2020 Workflow changes
 | 
			
		||||
# V 1.0.1  Move ABC-makeMYSPElist.R to ./scripts directory
 | 
			
		||||
# V 1.0    Final code, after rewriting BLAST parser and updating MYSPElist
 | 
			
		||||
# V 0.1    First code copied from BCH441_A03_makeMYSPElist.R
 | 
			
		||||
#
 | 
			
		||||
# TODO:    Sample solution for sankey plot function.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 | 
			
		||||
#
 | 
			
		||||
# DO NOT SIMPLY  source()  THESE FILES!
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
#  going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                             Line
 | 
			
		||||
#TOC> -----------------------------------------------------------------
 | 
			
		||||
#TOC>   1        PREPARATIONS                                        52
 | 
			
		||||
#TOC>   2        SUITABLE MYSPE SPECIES                              65
 | 
			
		||||
#TOC>   3        ADOPT "MYSPE"                                       89
 | 
			
		||||
#TOC>   4        FURTHER READING: PLOTTING PROPORTIONS              128
 | 
			
		||||
#TOC>   4.1        Percentages                                      146
 | 
			
		||||
#TOC>   4.2        Visualizing proportions: Pie chart               165
 | 
			
		||||
#TOC>   4.3        Visualizing proportions: Nested squares          243
 | 
			
		||||
#TOC>   4.4        Visualizing proportions: Sankey diagrams         280
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  PREPARATIONS  ========================================================
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# Execute the two conditionals below:
 | 
			
		||||
if (! file.exists("./myScripts/.myProfile.R")) {
 | 
			
		||||
  stop(errText[["noProfileFile"]])     # message defined in .Rprofile
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
if (! exists("myStudentNumber")) {
 | 
			
		||||
  stop(errText[["noStudentNumber"]])   # message defined in .Rprofile
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  SUITABLE MYSPE SPECIES  ==============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# In this unit we will select one species from a list of genome sequenced fungi
 | 
			
		||||
# and write it into your personalized profile file. This species will be called
 | 
			
		||||
# "MYSPE" (My Species) for other learning units and exercises.
 | 
			
		||||
 | 
			
		||||
# A detailed description of the process of compiling the list of genome
 | 
			
		||||
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
 | 
			
		||||
# ./scripts/ABC-makeMYSPElist.R  In brief, data for genome-sequenced fungi
 | 
			
		||||
# was retrieved from https://fungi.ensembl.org; a search for homologues to
 | 
			
		||||
# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
 | 
			
		||||
# A representative organism at each genus-level was chosen from those hits
 | 
			
		||||
# that actually have a homologue. Finally, a mapping table was constructed to
 | 
			
		||||
# asymmetrically retrieve unique species: a student number will retrieve
 | 
			
		||||
# a species, but (public) knowledge of the species cannot reconstruct the
 | 
			
		||||
# student number.
 | 
			
		||||
 | 
			
		||||
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
 | 
			
		||||
#       of selecting and combining data from various data resources. Studying
 | 
			
		||||
#       it will give you a better sense of how such workflows can be
 | 
			
		||||
#       implemented in practice.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  ADOPT "MYSPE"  =======================================================
 | 
			
		||||
 | 
			
		||||
# Execute:
 | 
			
		||||
( MYSPE <- getMYSPE(myStudentNumber) )
 | 
			
		||||
 | 
			
		||||
# If this produced an error, this session has not been properly set up. You
 | 
			
		||||
# may not yet have run  init()  and edited  .myProfile.R , or that file is not
 | 
			
		||||
# in your  myScripts/  folder. Fix this, and execute:
 | 
			
		||||
#
 | 
			
		||||
#    source(".Rprofile") .
 | 
			
		||||
 | 
			
		||||
# If this produced NA, your Student Number may not be correct, or you are not in
 | 
			
		||||
# my class-list. Contact me. Otherwise, this should have printed a species name,
 | 
			
		||||
# and the taxonomy ID of its genome-sequenced strain. This is your unique
 | 
			
		||||
# species for this course. Note it in your journal ...
 | 
			
		||||
 | 
			
		||||
biCode(MYSPE) # and also note its "BiCode" ...
 | 
			
		||||
( myTaxID <- names(MYSPE) )  # and its taxID
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
# =====
 | 
			
		||||
#   Note down the species name and its five letter BiCode on your Student
 | 
			
		||||
#   Wiki user page. Use this species whenever this or future assignments refer
 | 
			
		||||
#   to MYSPE. Whenever you start a session, it will automatically be loaded
 | 
			
		||||
#   from  myScripts/.myProfile.R  and is available as  MYSPE .
 | 
			
		||||
 | 
			
		||||
# Here is some more information about MYSPE, taken from the table of genome-
 | 
			
		||||
# sequenced fungi that is in your ./data folder.
 | 
			
		||||
fungiDat <- read.csv("data/Species.csv")
 | 
			
		||||
iMs <- which(fungiDat$Taxon.ID == myTaxID)
 | 
			
		||||
 | 
			
		||||
( myOr <- fungiDat$Classification[iMs] )  # Taxonomic order
 | 
			
		||||
( myGn <- gsub("\\s.*", "", MYSPE))       # Taxonomic genus
 | 
			
		||||
( mySt <- fungiDat$Name[iMs] )            # Taxonomic strain
 | 
			
		||||
 | 
			
		||||
# That's all.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  FURTHER READING: PLOTTING PROPORTIONS  ===============================
 | 
			
		||||
 | 
			
		||||
# The material below is an exploration of data-preparation and plotting
 | 
			
		||||
# techniques; you can treat this as additional practice and further reading and
 | 
			
		||||
# I expect that some of the code and plotting examples may be useful in a
 | 
			
		||||
# different context.
 | 
			
		||||
 | 
			
		||||
# A frequent task is to visualize the proportion of elements with given
 | 
			
		||||
# categories in a sample. For example, we might ask what proportion of the
 | 
			
		||||
# genome-sequenced fungi belongs to the order of MYSPE? Let's first collect the
 | 
			
		||||
# numbers.
 | 
			
		||||
 | 
			
		||||
# Count the nested categories. The search terms (order, genus, species) are
# literal strings taken from the data, not regular expressions: they are
# matched with  fixed = TRUE  so that names which contain regex metacharacters
# (brackets, dots, parentheses ...) are still counted correctly.

# all genome-sequenced fungi
( nFungi <- nrow(fungiDat) )
# same order as MYSPE
( nOrder <- sum(grepl(myOr, fungiDat$Classification, fixed = TRUE)) )
# same genus as MYSPE
( nGenus <- sum(grepl(myGn, fungiDat$Name, fixed = TRUE)) )
# same species as MYSPE
( nSpecies <- sum(grepl(MYSPE, fungiDat$Name, fixed = TRUE)) )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   4.1  Percentages  =======================================================
 | 
			
		||||
 | 
			
		||||
# The zeroth-order approach to visualization is simply to print percentages:
 | 
			
		||||
 | 
			
		||||
cat(sprintf("\n%s comprise %5.2f%% of fungi.",
 | 
			
		||||
        myOr,
 | 
			
		||||
        (nOrder * 100) / nFungi))
 | 
			
		||||
 | 
			
		||||
# ... or, adding the actual numbers:
 | 
			
		||||
 | 
			
		||||
cat(sprintf("\n%s comprise %5.2f%% of fungi (%d of %d).",
 | 
			
		||||
            myOr,
 | 
			
		||||
            (nOrder * 100) / nFungi,
 | 
			
		||||
            nOrder,
 | 
			
		||||
            nFungi))
 | 
			
		||||
 | 
			
		||||
# But that's hard to visualize for most of us, and anyway, we don't know how
 | 
			
		||||
# that relates to other orders.
 | 
			
		||||
 | 
			
		||||
# ==   4.2  Visualizing proportions: Pie chart  ================================
 | 
			
		||||
 | 
			
		||||
# Often, we will use a pie chart instead. Pie charts are rather informal types
 | 
			
		||||
# of plots, not well suited for analysis. But easy to do:
 | 
			
		||||
 | 
			
		||||
# Define four colors to identify the four categories
 | 
			
		||||
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
 | 
			
		||||
 | 
			
		||||
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
 | 
			
		||||
                                           # and remember the
 | 
			
		||||
                                           # previous setting
 | 
			
		||||
 | 
			
		||||
# Draw the nested categories as a pie chart. The categories are contained in
# each other (species in genus, genus in order, order in fungi), thus each
# wedge is the count of one category minus the count of the category that is
# nested directly inside it. The four wedges then add up to nFungi, and no
# wedge can become negative (pie() fails on negative values).
pie(c(nSpecies,                            # MYSPE itself
      nGenus - nSpecies,                   # remainder of the genus
      nOrder - nGenus,                     # remainder of the order
      nFungi - nOrder),                    # all other fungi
      labels = "",
      radius = 0.9,
      main = "MYSPE in genome-sequenced fungi",
      lty = 0,                             # turn borders for wedges off
      col = pCol,
      clockwise = TRUE,
      init.angle = 90)
 | 
			
		||||
 | 
			
		||||
title(main=MYSPE, line=0, cex.main=0.7)    # add a title to the plot
 | 
			
		||||
 | 
			
		||||
legend(x = 0.95, y = 0.8,    # place at legend here
 | 
			
		||||
       legend = c("Species", "Genus", "Order", "Fungi"),
 | 
			
		||||
       y.intersp = 2,                      # line spacing for labels
 | 
			
		||||
       cex = 0.8,                          # character size for labels
 | 
			
		||||
       bty = "n",                          # "no" box around the legend
 | 
			
		||||
       pt.cex = 2,                         # size of colour boxes
 | 
			
		||||
       pch = 15,                           # a filled square
 | 
			
		||||
       col = pCol)
 | 
			
		||||
 | 
			
		||||
par(oPar)                                  # reset graphics state
 | 
			
		||||
 | 
			
		||||
# Unless MYSPE is one of the frequently sequenced species, there will only be a
 | 
			
		||||
# very thin wedge visible. Pie charts are not well suited to visualize small
 | 
			
		||||
# proportions.
 | 
			
		||||
 | 
			
		||||
# It is a little more useful if we have non-nested proportions - like the
 | 
			
		||||
# number of species in the same order overall:
 | 
			
		||||
 | 
			
		||||
myTbl <- sort(table(fungiDat$Classification), decreasing = TRUE)
 | 
			
		||||
head(myTbl)
 | 
			
		||||
 | 
			
		||||
# pie() does a reasonable job out of the box to interpret table() data:
 | 
			
		||||
pie(myTbl)
 | 
			
		||||
 | 
			
		||||
# ... we can improve this quickly with a bit of tweaking:
 | 
			
		||||
 | 
			
		||||
N <- length(myTbl)
 | 
			
		||||
sel <- myOr == names(myTbl) # TRUE for the MYSPE order, FALSE elsewhere
 | 
			
		||||
 | 
			
		||||
myCol <- rep(pCol[4], N)       # N elements of pCol[4]
 | 
			
		||||
myCol[sel] <- pCol[1]          # replace this one color
 | 
			
		||||
 | 
			
		||||
myLbl <- rep("", N)            # N labels of ""
 | 
			
		||||
myLbl[sel] <- myOr             # replace this one label with the MYSPE order
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
oPar <- par(mar = c(0.1, 0.1, 2.5, 0.1))   # set margins to ~ 0
 | 
			
		||||
 | 
			
		||||
pie(myTbl,
 | 
			
		||||
    labels = myLbl,
 | 
			
		||||
    radius = 0.9,
 | 
			
		||||
    main = "MYSPE order",
 | 
			
		||||
    border = "#DDDDDD",
 | 
			
		||||
    col = myCol,
 | 
			
		||||
    clockwise = TRUE,
 | 
			
		||||
    init.angle = 90)
 | 
			
		||||
 | 
			
		||||
par(oPar)                                  # reset graphics state
 | 
			
		||||
 | 
			
		||||
# But the overall problem remains.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   4.3  Visualizing proportions: Nested squares  ===========================
 | 
			
		||||
 | 
			
		||||
# A simple alternative is to draw such proportions as nested squares:
 | 
			
		||||
 | 
			
		||||
x <- sqrt(nFungi)
 | 
			
		||||
 | 
			
		||||
# set margins to ~ 0 and type to square
 | 
			
		||||
oPar <- par(mar = c(0.1, 0.1, 0.1, 0.1), pty = "s")
 | 
			
		||||
 | 
			
		||||
# empty, square plot
 | 
			
		||||
plot(c(0, x), c(0, x), xlim = c(0, x), ylim = c(0, x),
 | 
			
		||||
     type="n", axes=FALSE, xlab="", ylab="")
 | 
			
		||||
 | 
			
		||||
# basic square for all genomes
 | 
			
		||||
rect(0, 0, x,              x,              col = pCol[4])
 | 
			
		||||
 | 
			
		||||
# grid
 | 
			
		||||
u <- 0:floor(x)
 | 
			
		||||
N <- length(u)
 | 
			
		||||
segments(rep(0, N), u, rep(x, N), u, col = "#0000FF18")
 | 
			
		||||
segments(u, rep(0, N), u, rep(x, N), col = "#0000FF18")
 | 
			
		||||
# each square on this grid is one genome
 | 
			
		||||
 | 
			
		||||
# colored squares
 | 
			
		||||
rect(0, 0, sqrt(nOrder),   sqrt(nOrder),   col = pCol[3])
 | 
			
		||||
rect(0, 0, sqrt(nGenus),   sqrt(nGenus),   col = pCol[2])
 | 
			
		||||
rect(0, 0, sqrt(nSpecies), sqrt(nSpecies), col = pCol[1])
 | 
			
		||||
 | 
			
		||||
# labels
 | 
			
		||||
text(x/2, x/2,      "Fungi")
 | 
			
		||||
text(x * 0.08, x * 0.11, myOr,   pos = 4, cex = 0.9)
 | 
			
		||||
text(x * 0.08, x * 0.06, myGn,   pos = 4, cex = 0.8)
 | 
			
		||||
text(x * 0.08, x * 0.02, MYSPE, pos = 4, cex = 0.7)
 | 
			
		||||
 | 
			
		||||
par(oPar)                                  # reset graphics state
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   4.4  Visualizing proportions: Sankey diagrams  ==========================
 | 
			
		||||
 | 
			
		||||
# Sankey diagrams are an excellent way to visualize complicated nested
 | 
			
		||||
# proportions and their changes (see here for example:
 | 
			
		||||
# https://www.r-graph-gallery.com/sankey-diagram.html). Here is a very simple
 | 
			
		||||
# example with the MYSPE proportions, as an illustration of the plotting
 | 
			
		||||
# principle.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("plotly")) {
 | 
			
		||||
  install.packages("plotly")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help   = plotly)     # basic information
 | 
			
		||||
#  browseVignettes("plotly")    # available vignettes
 | 
			
		||||
#  data(package  = "plotly")    # available datasets
 | 
			
		||||
 | 
			
		||||
# Here, we use the plotly package that wraps a very well developed javascript
 | 
			
		||||
# library with many options for interactive plots. I am producing this plot
 | 
			
		||||
# hard-coded for the sample organism "Sporothrix schenckii"; you would need
 | 
			
		||||
# to change the code to adapt it to your own MYSPE - or even build a function
 | 
			
		||||
# for this. Do try this if you have a bit of coding experience, sankey diagrams
 | 
			
		||||
# are a good way to show hierarchical data relations - and if you get this
 | 
			
		||||
# working for your own organism you can be proud that you have understood
 | 
			
		||||
# how preparing the data works.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
myNodes <- list(label = c("Fungi (1014)",              # 0 <- node ID
 | 
			
		||||
                          "Ophiostomatales (6)",       # 1
 | 
			
		||||
                          "Other...",                  # 2
 | 
			
		||||
                          "Sporothrix (4)",            # 3
 | 
			
		||||
                          "Other...",                  # 4
 | 
			
		||||
                          "Sporothrix schenckii (2)",  # 5
 | 
			
		||||
                          "Other..."                   # 6
 | 
			
		||||
                          ),
 | 
			
		||||
                x = c(0.1, 0.4, 0.4, 0.7, 0.7, 1.0, 1.0),
 | 
			
		||||
                y = c(0.3, 0.1, 0.7, 0.2, 0.7, 0.3, 0.7),
 | 
			
		||||
                color = c("#f2f2f0", #
 | 
			
		||||
                          "#ffd5c4",
 | 
			
		||||
                          "#CCCCCC",
 | 
			
		||||
                          "#ff9582",
 | 
			
		||||
                          "#CCCCCC",
 | 
			
		||||
                          "#ed394e",
 | 
			
		||||
                          "#CCCCCC"
 | 
			
		||||
                          ),
 | 
			
		||||
                pad = 15,
 | 
			
		||||
                thickness = 20,
 | 
			
		||||
                line = list(color = "black",
 | 
			
		||||
                            width = 0.5))
 | 
			
		||||
 | 
			
		||||
myLinks <- list(source = c(0, 0, 1, 1, 3, 3),   # i.e. there is a link of
 | 
			
		||||
                target = c(1, 2, 3, 4, 5, 6),   # weight 6 between node 0
 | 
			
		||||
                value =  c(6, 18, 4, 2, 2, 2))  # and node 1
 | 
			
		||||
 | 
			
		||||
# Setting up the actual plot ...
 | 
			
		||||
fig  <-  plotly::plot_ly(type = "sankey",
 | 
			
		||||
                         arrangement = "snap",
 | 
			
		||||
                         orientation = "h",
 | 
			
		||||
                         node = myNodes,
 | 
			
		||||
                         link = myLinks)
 | 
			
		||||
 | 
			
		||||
# Adding and adjusting a few layout parameters
 | 
			
		||||
fig <- plotly::layout(fig,
 | 
			
		||||
              title = "Fungi Genomes - Classification",
 | 
			
		||||
              font = list(size = 10))
 | 
			
		||||
 | 
			
		||||
fig     # plot the diagram
 | 
			
		||||
 | 
			
		||||
# Note that the plot appears in the Viewer window, not the Plot window, and that
 | 
			
		||||
# it is interactive: you can hover over nodes and links, and drag the nodes
 | 
			
		||||
# around.
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,234 +1,234 @@
 | 
			
		||||
# tocID <- "BIN-PHYLO-Data_preparation.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-PHYLO-Data_preparation unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use Biocmanager:: not biocLite()
 | 
			
		||||
#           1.0    First 2017 version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                     Line
 | 
			
		||||
#TOC> ---------------------------------------------------------
 | 
			
		||||
#TOC>   1        Preparations                                45
 | 
			
		||||
#TOC>   2        Fetching sequences                          77
 | 
			
		||||
#TOC>   3        Multiple Sequence Alignment                118
 | 
			
		||||
#TOC>   4        Reviewing and Editing Alignments           137
 | 
			
		||||
#TOC>   4.1        Masking workflow                         153
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Preparations  ========================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# You need to reload your protein database, including changes that might have
 | 
			
		||||
# been made to the reference files. If you have worked with the prerequisite
 | 
			
		||||
# units, you should have a script named "makeProteinDB.R" that will create the
 | 
			
		||||
# myDB object with a protein and feature database. Ask for advice if not.
 | 
			
		||||
source("myScripts/makeProteinDB.R")
 | 
			
		||||
 | 
			
		||||
# Load packages we need
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("msa", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("msa")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = msa)       # basic information
 | 
			
		||||
#  browseVignettes("msa")  # available vignettes
 | 
			
		||||
#  data(package = "msa")   # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Fetching sequences  ==================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
 | 
			
		||||
# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
 | 
			
		||||
# domains. You have annotated their ranges as a feature. The following code
 | 
			
		||||
# retrieves the sequences from myDB. You have seen similar code in other units.
 | 
			
		||||
 | 
			
		||||
sel <- grep("^MBP1_", myDB$protein$name)
 | 
			
		||||
(proNames <- myDB$protein$name[sel])
 | 
			
		||||
(proIDs <- myDB$protein$ID[sel])
 | 
			
		||||
 | 
			
		||||
(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
 | 
			
		||||
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
 | 
			
		||||
                              myDB$annotation$featureID == sel])      #  ==  !
 | 
			
		||||
                                                                      # Why?
 | 
			
		||||
APSI <- character(length(fanIDs))
 | 
			
		||||
 | 
			
		||||
for (i in seq_along(fanIDs)) {
 | 
			
		||||
  sel   <- myDB$annotation$ID == fanIDs[i]  # get the feature row index
 | 
			
		||||
  proID <- myDB$annotation$proteinID[sel]   # get its protein ID
 | 
			
		||||
  start <- myDB$annotation$start[sel]       # get start ...
 | 
			
		||||
  end   <- myDB$annotation$end[sel]         # ... and end
 | 
			
		||||
 | 
			
		||||
  sel <- myDB$protein$ID == proID           # get the protein row index ...
 | 
			
		||||
                                            # ... and the sequence
 | 
			
		||||
  APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
 | 
			
		||||
  names(APSI)[i] <- (myDB$protein$name[sel])
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
head(APSI)
 | 
			
		||||
 | 
			
		||||
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
 | 
			
		||||
# phylogenetic tree (see the unit's Wiki page for details on the sequence).
 | 
			
		||||
 | 
			
		||||
APSI <- c(APSI,
 | 
			
		||||
"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")
 | 
			
		||||
names(APSI)[length(APSI)] <- "KILA_ESCCO"
 | 
			
		||||
tail(APSI)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Multiple Sequence Alignment  =========================================
 | 
			
		||||
 | 
			
		||||
# This vector of sequences with named elements fulfills the requirements to be
 | 
			
		||||
# imported as a Biostrings object - an AAStringSet - which we need as input for
 | 
			
		||||
# the MSA algorithms in the msa package.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
APSESSet <- Biostrings::AAStringSet(APSI)
 | 
			
		||||
APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
 | 
			
		||||
 | 
			
		||||
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
 | 
			
		||||
# that happens in your case, just use msaClustalOmega() instead.
 | 
			
		||||
 | 
			
		||||
# inspect the alignment.
 | 
			
		||||
writeALN(APSESMsa)
 | 
			
		||||
 | 
			
		||||
# What do you think? Is this a good alignment for phylogenetic inference?
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Reviewing and Editing Alignments  ====================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Head back to the Wiki page for this unit and read up on the background
 | 
			
		||||
# first.
 | 
			
		||||
 | 
			
		||||
# Let's mask out all columns that have observations for
 | 
			
		||||
# less than 1/3 of the sequences in the dataset. This
 | 
			
		||||
# means they have more than round(nrow(APSESMsa) * (2/3))
 | 
			
		||||
# hyphens in a column.
 | 
			
		||||
#
 | 
			
		||||
# We take all sequences, split them into single
 | 
			
		||||
# characters, and put them into a matrix. Then we
 | 
			
		||||
# go through the matrix, column by column and decide
 | 
			
		||||
# whether we want to include that column.
 | 
			
		||||
 | 
			
		||||
# ==   4.1  Masking workflow  ==================================================
 | 
			
		||||
 | 
			
		||||
# get the length of the alignment
 | 
			
		||||
(lenAli <- APSESMsa@unmasked@ranges@width[1])
 | 
			
		||||
 | 
			
		||||
# initialize a matrix that can hold all characters
 | 
			
		||||
# individually
 | 
			
		||||
# One row per aligned sequence, one column per alignment position; the cells
# start out as empty strings and are filled in below.
msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
                    ncol = lenAli)

# assign the correct rownames
rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES

# Split each aligned sequence into single characters and store them in the
# corresponding matrix row. seq_len() (rather than 1:nrow()) makes sure the
# loop body is skipped entirely if the alignment has no rows.
for (i in seq_len(nrow(APSESMsa))) {
  msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
}
 | 
			
		||||
 | 
			
		||||
# inspect the result
 | 
			
		||||
msaMatrix[1:7, 30:40]
 | 
			
		||||
 | 
			
		||||
# Now let's make a logical vector with an element for each column that selects
 | 
			
		||||
# which columns should be masked out.
 | 
			
		||||
 | 
			
		||||
# The number of hyphens in a column is easy to count. Consider:
 | 
			
		||||
 | 
			
		||||
    msaMatrix[ , 20]             # column 20
 | 
			
		||||
    msaMatrix[ , 20] == "-"      # TRUE for all gap characters
 | 
			
		||||
sum(msaMatrix[ , 20] == "-")     # adds 1 for each TRUE
 | 
			
		||||
 | 
			
		||||
# Thus filling our logical vector is simple:
 | 
			
		||||
 | 
			
		||||
# initialize a mask
 | 
			
		||||
# One element per alignment column; all FALSE to begin with.
colMask <- logical(ncol(msaMatrix))

# define the threshold for rejecting a column
limit <- round(nrow(APSESMsa) * (2/3))

# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
# hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis
# and FALSE columns will be rejected.
# seq_len() (rather than 1:ncol()) makes sure the loop body is skipped if the
# matrix has no columns.
for (i in seq_len(ncol(msaMatrix))) {
  count <- sum(msaMatrix[ , i] == "-")
  colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not
}
 | 
			
		||||
 | 
			
		||||
# Inspect the mask
 | 
			
		||||
colMask
 | 
			
		||||
 | 
			
		||||
# How many positions are being kept?
 | 
			
		||||
sum(colMask)
 | 
			
		||||
 | 
			
		||||
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
 | 
			
		||||
            100 * (1 - (sum(colMask) / length(colMask)))))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Next, we use colMask to remove the masked columns from the matrix
 | 
			
		||||
# in one step:
 | 
			
		||||
# Keep only the columns for which colMask is TRUE. drop = FALSE guarantees
# that the result is still a matrix (with its rownames) even if only a single
# column is kept.
maskedMatrix <- msaMatrix[ , colMask, drop = FALSE]

# check:
ncol(maskedMatrix)

# ... then collapse each row of single characters back into a string ...
# The result vector is preallocated with one element per sequence, and
# seq_len() makes sure the loop body is skipped if there are no rows.
APSESphyloSet <- character(nrow(maskedMatrix))
for (i in seq_len(nrow(maskedMatrix))) {
  APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse = "")
}
names(APSESphyloSet) <- rownames(maskedMatrix)
 | 
			
		||||
 | 
			
		||||
# inspect ...
 | 
			
		||||
writeALN(APSESphyloSet)
 | 
			
		||||
 | 
			
		||||
# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
 | 
			
		||||
# several indels from the KILA_ESCCO outgroup sequence.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We save the aligned, masked domains to a file in the data/ directory,
 | 
			
		||||
# in multi-FASTA format.
 | 
			
		||||
writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-PHYLO-Data_preparation.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-PHYLO-Data_preparation unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use Biocmanager:: not biocLite()
 | 
			
		||||
#           1.0    First 2017 version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                     Line
 | 
			
		||||
#TOC> ---------------------------------------------------------
 | 
			
		||||
#TOC>   1        Preparations                                45
 | 
			
		||||
#TOC>   2        Fetching sequences                          77
 | 
			
		||||
#TOC>   3        Multiple Sequence Alignment                118
 | 
			
		||||
#TOC>   4        Reviewing and Editing Alignments           137
 | 
			
		||||
#TOC>   4.1        Masking workflow                         153
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Preparations  ========================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# You need to reload your protein database, including changes that might have
 | 
			
		||||
# been made to the reference files. If you have worked with the prerequisite
 | 
			
		||||
# units, you should have a script named "makeProteinDB.R" that will create the
 | 
			
		||||
# myDB object with a protein and feature database. Ask for advice if not.
 | 
			
		||||
source("myScripts/makeProteinDB.R")
 | 
			
		||||
 | 
			
		||||
# Load packages we need
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("msa", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("msa")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = msa)       # basic information
 | 
			
		||||
#  browseVignettes("msa")  # available vignettes
 | 
			
		||||
#  data(package = "msa")   # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Fetching sequences  ==================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# myDB contains the ten Mbp1 orthologues from the reference species and the Mbp1
 | 
			
		||||
# RBM for MYSPE. We will construct a phylogenetic tree from the proteins' APSES
 | 
			
		||||
# domains. You have annotated their ranges as a feature. The following code
 | 
			
		||||
# retrieves the sequences from myDB. You have seen similar code in other units.
 | 
			
		||||
 | 
			
		||||
sel <- grep("^MBP1_", myDB$protein$name)
 | 
			
		||||
(proNames <- myDB$protein$name[sel])
 | 
			
		||||
(proIDs <- myDB$protein$ID[sel])
 | 
			
		||||
 | 
			
		||||
(sel <- myDB$feature$ID[myDB$feature$name == "APSES fold"])
 | 
			
		||||
(fanIDs <- myDB$annotation$ID[myDB$annotation$proteinID %in% proIDs & # %in% !
 | 
			
		||||
                              myDB$annotation$featureID == sel])      #  ==  !
 | 
			
		||||
                                                                      # Why?
 | 
			
		||||
APSI <- character(length(fanIDs))
 | 
			
		||||
 | 
			
		||||
for (i in seq_along(fanIDs)) {
 | 
			
		||||
  sel   <- myDB$annotation$ID == fanIDs[i]  # get the feature row index
 | 
			
		||||
  proID <- myDB$annotation$proteinID[sel]   # get its protein ID
 | 
			
		||||
  start <- myDB$annotation$start[sel]       # get start ...
 | 
			
		||||
  end   <- myDB$annotation$end[sel]         # ... and end
 | 
			
		||||
 | 
			
		||||
  sel <- myDB$protein$ID == proID           # get the protein row index ...
 | 
			
		||||
                                            # ... and the sequence
 | 
			
		||||
  APSI[i] <- substring(myDB$protein$sequence[sel], start, end)
 | 
			
		||||
  names(APSI)[i] <- (myDB$protein$name[sel])
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
head(APSI)
 | 
			
		||||
 | 
			
		||||
# Let's add the E.coli Kila-N domain sequence as an outgroup, for rooting our
 | 
			
		||||
# phylogenetic tree (see the unit's Wiki page for details on the sequence).
 | 
			
		||||
 | 
			
		||||
APSI <- c(APSI,
 | 
			
		||||
"IDGEIIHLRAKDGYINATSMCRTAGKLLSDYTRLKTTQEFFDELSRDMGIPISELIQSFKGGRPENQGTWVHPDIAINLAQ")
 | 
			
		||||
names(APSI)[length(APSI)] <- "KILA_ESCCO"
 | 
			
		||||
tail(APSI)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Multiple Sequence Alignment  =========================================
 | 
			
		||||
 | 
			
		||||
# This vector of sequences with named elements fulfills the requirements to be
 | 
			
		||||
# imported as a Biostrings object - an AAStringSet - which we need as input for
 | 
			
		||||
# the MSA algorithms in the msa package.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
APSESSet <- Biostrings::AAStringSet(APSI)
 | 
			
		||||
APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
 | 
			
		||||
 | 
			
		||||
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
 | 
			
		||||
# that happens in your case, just use msaClustalOmega() instead.
 | 
			
		||||
 | 
			
		||||
# inspect the alignment.
 | 
			
		||||
writeALN(APSESMsa)
 | 
			
		||||
 | 
			
		||||
# What do you think? Is this a good alignment for phylogenetic inference?
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Reviewing and Editing Alignments  ====================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Head back to the Wiki page for this unit and read up on the background
 | 
			
		||||
# first.
 | 
			
		||||
 | 
			
		||||
# Let's mask out all columns that have observations for
 | 
			
		||||
# less than 1/3 of the sequences in the dataset. This
 | 
			
		||||
# means they have more than round(nrow(APSESMsa) * (2/3))
 | 
			
		||||
# hyphens in a column.
 | 
			
		||||
#
 | 
			
		||||
# We take all sequences, split them into single
 | 
			
		||||
# characters, and put them into a matrix. Then we
 | 
			
		||||
# go through the matrix, column by column and decide
 | 
			
		||||
# whether we want to include that column.
 | 
			
		||||
 | 
			
		||||
# ==   4.1  Masking workflow  ==================================================
 | 
			
		||||
 | 
			
		||||
# get the length of the alignment
 | 
			
		||||
(lenAli <- APSESMsa@unmasked@ranges@width[1])
 | 
			
		||||
 | 
			
		||||
# initialize a matrix that can hold all characters
 | 
			
		||||
# individually
 | 
			
		||||
# One row per aligned sequence, one column per alignment position; the cells
# start out as empty strings and are filled in below.
msaMatrix <- matrix(character(nrow(APSESMsa) * lenAli),
                    ncol = lenAli)

# assign the correct rownames
rownames(msaMatrix) <- APSESMsa@unmasked@ranges@NAMES

# Split each aligned sequence into single characters and store them in the
# corresponding matrix row. seq_len() (rather than 1:nrow()) makes sure the
# loop body is skipped entirely if the alignment has no rows.
for (i in seq_len(nrow(APSESMsa))) {
  msaMatrix[i, ] <- unlist(strsplit(as.character(APSESMsa@unmasked[i]), ""))
}
 | 
			
		||||
 | 
			
		||||
# inspect the result
 | 
			
		||||
msaMatrix[1:7, 30:40]
 | 
			
		||||
 | 
			
		||||
# Now let's make a logical vector with an element for each column that selects
 | 
			
		||||
# which columns should be masked out.
 | 
			
		||||
 | 
			
		||||
# The number of hyphens in a column is easy to count. Consider:
 | 
			
		||||
 | 
			
		||||
    msaMatrix[ , 20]             # column 20
 | 
			
		||||
    msaMatrix[ , 20] == "-"      # TRUE for all gap characters
 | 
			
		||||
sum(msaMatrix[ , 20] == "-")     # adds 1 for each TRUE
 | 
			
		||||
 | 
			
		||||
# Thus filling our logical vector is simple:
 | 
			
		||||
 | 
			
		||||
# initialize a mask
 | 
			
		||||
# One element per alignment column; all FALSE to begin with.
colMask <- logical(ncol(msaMatrix))

# define the threshold for rejecting a column
limit <- round(nrow(APSESMsa) * (2/3))

# iterate over all columns, and write TRUE if there are less-or-equal to "limit"
# hyphens, FALSE if there are more - i.e. TRUE columns will be used for analysis
# and FALSE columns will be rejected.
# seq_len() (rather than 1:ncol()) makes sure the loop body is skipped if the
# matrix has no columns.
for (i in seq_len(ncol(msaMatrix))) {
  count <- sum(msaMatrix[ , i] == "-")
  colMask[i] <- count <= limit # TRUE if less-or-equal to limit, FALSE if not
}
 | 
			
		||||
 | 
			
		||||
# Inspect the mask
 | 
			
		||||
colMask
 | 
			
		||||
 | 
			
		||||
# How many positions are being kept?
 | 
			
		||||
sum(colMask)
 | 
			
		||||
 | 
			
		||||
cat(sprintf("We are masking %4.2f %% of alignment columns.\n",
 | 
			
		||||
            100 * (1 - (sum(colMask) / length(colMask)))))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Next, we use colMask to remove the masked columns from the matrix
 | 
			
		||||
# in one step:
 | 
			
		||||
# Keep only the columns for which colMask is TRUE. drop = FALSE guarantees
# that the result is still a matrix (with its rownames) even if only a single
# column is kept.
maskedMatrix <- msaMatrix[ , colMask, drop = FALSE]

# check:
ncol(maskedMatrix)

# ... then collapse each row of single characters back into a string ...
# The result vector is preallocated with one element per sequence, and
# seq_len() makes sure the loop body is skipped if there are no rows.
APSESphyloSet <- character(nrow(maskedMatrix))
for (i in seq_len(nrow(maskedMatrix))) {
  APSESphyloSet[i] <- paste(maskedMatrix[i, ], collapse = "")
}
names(APSESphyloSet) <- rownames(maskedMatrix)
 | 
			
		||||
 | 
			
		||||
# inspect ...
 | 
			
		||||
writeALN(APSESphyloSet)
 | 
			
		||||
 | 
			
		||||
# As you see, we have removed a three residue insertion from MBP1_NEUCR, and
 | 
			
		||||
# several indels from the KILA_ESCCO outgroup sequence.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We save the aligned, masked domains to a file in the data/ directory,
 | 
			
		||||
# in multi-FASTA format.
 | 
			
		||||
writeMFA(APSESphyloSet, myCon = "data/APSESphyloSet.mfa")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,406 +1,406 @@
 | 
			
		||||
# tocID <- "BIN-PHYLO-Tree_analysis.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-PHYLO-Tree_analysis unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 updates. Deprecate iTol and use taxize:: instead.
 | 
			
		||||
#                  Rewrite of tip re-ordering. Better handling of
 | 
			
		||||
#                  messages. pBar() for randomization.
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use Biocmanager:: not biocLite()
 | 
			
		||||
#           1.0.2  Typo in variable name, style changes
 | 
			
		||||
#           1.0.1  Wrong section heading
 | 
			
		||||
#           1.0    First 2017 version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                              Line
 | 
			
		||||
#TOC> --------------------------------------------------
 | 
			
		||||
#TOC>   1        Preparation and Tree Plot            50
 | 
			
		||||
#TOC>   2        SPECIES REFERENCE TREE               66
 | 
			
		||||
#TOC>   3        Tree Analysis                       117
 | 
			
		||||
#TOC>   3.1        Rooting Trees                     177
 | 
			
		||||
#TOC>   3.2        Rotating Clades                   222
 | 
			
		||||
#TOC>   3.3        Computing tree distances          309
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Preparation and Tree Plot  ===========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("ape", quietly = TRUE)) {
 | 
			
		||||
  install.packages("ape")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = ape)       # basic information
 | 
			
		||||
#  browseVignettes("ape")    # available vignettes
 | 
			
		||||
#  data(package = "ape")     # available datasets
 | 
			
		||||
 | 
			
		||||
# We change the graphics parameters from time to time, let's define the
 | 
			
		||||
# default so we can recreate a sane state:
 | 
			
		||||
# Close the current graphics device so that par() reports a fresh state - but
# only if a device is actually open: dev.off() throws an error when called on
# the null device (device 1).
if (dev.cur() > 1) {
  dev.off()
}
# Store only the parameters that can be set again. Without no.readonly = TRUE
# the list also contains read-only parameters, and a later par(PAR) would emit
# a warning for each of them.
PAR <- par(no.readonly = TRUE)
 | 
			
		||||
 | 
			
		||||
# =    2  SPECIES REFERENCE TREE  ==============================================
 | 
			
		||||
 | 
			
		||||
# Before we do any kind of phylogenetic analysis of genes from several species,
 | 
			
		||||
# we MUST have a reference tree of the taxonomic relationships in hand. This
 | 
			
		||||
# context is absolutely required for the interpretation of our tree.
 | 
			
		||||
 | 
			
		||||
# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("taxize", quietly = TRUE)) {
 | 
			
		||||
  install.packages("taxize")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help   = taxize)       # basic information
 | 
			
		||||
#  browseVignettes("taxize")    # available vignettes
 | 
			
		||||
#  data(package  = "taxize")     # available datasets
 | 
			
		||||
 | 
			
		||||
( mySOI <- c(myDB$taxonomy$ID, "83333") )
 | 
			
		||||
myClass <- taxize::classification(mySOI, db = "ncbi")
 | 
			
		||||
str(myClass)
 | 
			
		||||
 | 
			
		||||
myClass[[1]]
 | 
			
		||||
 | 
			
		||||
fungiTree <- taxize::class2tree(myClass, check = TRUE)
 | 
			
		||||
plot(fungiTree)
 | 
			
		||||
 | 
			
		||||
# The tree produced by taxize:: contains full length species names,
 | 
			
		||||
# but it would be more convenient if it had bicodes instead. Also, the actual
 | 
			
		||||
# tree is only part of the list(), which will cause problems later:
 | 
			
		||||
str(fungiTree)
 | 
			
		||||
 | 
			
		||||
# we therefore simplify
 | 
			
		||||
fungiTree <- fungiTree$phylo
 | 
			
		||||
str(fungiTree)
 | 
			
		||||
 | 
			
		||||
# The species names are in the vector $tip.label of this simplified list.
 | 
			
		||||
# We can use biCode() to shorten them.
 | 
			
		||||
fungiTree$tip.label <- biCode(fungiTree$tip.label)
 | 
			
		||||
 | 
			
		||||
# Plot the tree
 | 
			
		||||
nSP <- length(fungiTree$tip.label)
 | 
			
		||||
plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
 | 
			
		||||
text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
 | 
			
		||||
ape::nodelabels(text = fungiTree$node.label,
 | 
			
		||||
                cex = 0.6,
 | 
			
		||||
                adj = 0.2,
 | 
			
		||||
                bg = "#D4F2DA")
 | 
			
		||||
# Note that you can use the arrow buttons in the menu above the plot pane to
 | 
			
		||||
# scroll back to plots you have created earlier - so you can reference back to
 | 
			
		||||
# this species tree in your later analysis.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Tree Analysis  =======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# 1.1  Visualizing your tree
 | 
			
		||||
# The trees that are produced by Rphylip are stored as an object of class
 | 
			
		||||
# "phylo". This is a class for phylogenetic trees that is widely used in the
 | 
			
		||||
# community, practically all R phylogenetics packages have options to read and
 | 
			
		||||
# manipulate such trees. Outside of R, a popular interchange format is the
 | 
			
		||||
# Newick_format that you have seen above. It's easy to output your calculated
 | 
			
		||||
# trees in Newick format and visualize them elsewhere.
 | 
			
		||||
 | 
			
		||||
# The "phylo" class object is one of R's "S3" objects and methods to plot and
 | 
			
		||||
# print it have been defined with the Rphylip package, and in ape. You can
 | 
			
		||||
# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
 | 
			
		||||
# to plot it. The underlying function is plot.phylo(), and documentation for its
 | 
			
		||||
# many options can be found by typing:
 | 
			
		||||
 | 
			
		||||
?plot.phylo
 | 
			
		||||
 | 
			
		||||
# We load the APSES sequence tree that you produced in the
 | 
			
		||||
# BIN-PHYLO-Tree_building unit:
 | 
			
		||||
apsTree <- readRDS(file = "data/APSEStreeRproml.rds")
 | 
			
		||||
 | 
			
		||||
plot(apsTree) # default type is "phylogram"
 | 
			
		||||
plot(apsTree, type = "unrooted")
 | 
			
		||||
plot(apsTree, type = "fan", no.margin = TRUE)
 | 
			
		||||
 | 
			
		||||
# rescale to show all of the labels:
 | 
			
		||||
# record the current plot parameters by assigning them to a variable ...
 | 
			
		||||
(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
 | 
			
		||||
# ... and adjust the plot limits for a new plot:
 | 
			
		||||
plot(apsTree,
 | 
			
		||||
     type = "fan",
 | 
			
		||||
     x.lim = tmp$x.lim * 1.8,
 | 
			
		||||
     y.lim = tmp$y.lim * 1.8,
 | 
			
		||||
     cex = 0.8,
 | 
			
		||||
     no.margin = TRUE)
 | 
			
		||||
 | 
			
		||||
# Inspect the tree object
 | 
			
		||||
str(apsTree)
 | 
			
		||||
apsTree$tip.label
 | 
			
		||||
apsTree$edge
 | 
			
		||||
apsTree$edge.length
 | 
			
		||||
 | 
			
		||||
# show the node / edge and tip labels on a plot
 | 
			
		||||
plot(apsTree)
 | 
			
		||||
ape::nodelabels()
 | 
			
		||||
ape::edgelabels()
 | 
			
		||||
ape::tiplabels()
 | 
			
		||||
 | 
			
		||||
# show the number of nodes, edges and tips
 | 
			
		||||
ape::Nnode(apsTree)
 | 
			
		||||
ape::Nedge(apsTree)
 | 
			
		||||
ape::Ntip(apsTree)
 | 
			
		||||
 | 
			
		||||
par(PAR)   # reset graphics state
 | 
			
		||||
 | 
			
		||||
# Finally, write the tree to console in Newick format
 | 
			
		||||
ape::write.tree(apsTree)
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Rooting Trees  =====================================================
 | 
			
		||||
 | 
			
		||||
# In order to analyse the tree, it is helpful to root it first and reorder its
 | 
			
		||||
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
 | 
			
		||||
 | 
			
		||||
ape::is.rooted(apsTree)
 | 
			
		||||
 | 
			
		||||
# You can root the tree with the command root() from the "ape" package.
 | 
			
		||||
 | 
			
		||||
plot(apsTree)
 | 
			
		||||
 | 
			
		||||
# add labels for internal nodes and tips
 | 
			
		||||
ape::nodelabels(cex = 0.5, frame = "circle")
 | 
			
		||||
ape::tiplabels(cex = 0.5, frame = "rect")
 | 
			
		||||
 | 
			
		||||
# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
 | 
			
		||||
# number in yours. Substitute the correct node number below for "outgroup".
 | 
			
		||||
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
 | 
			
		||||
plot(apsTree)
 | 
			
		||||
ape::is.rooted(apsTree)
 | 
			
		||||
 | 
			
		||||
# This tree _looks_ unchanged, because when the root trifurcation was resolved,
 | 
			
		||||
# an edge of length zero was added to connect the MRCA (Most Recent Common
 | 
			
		||||
# Ancestor) of the ingroup.
 | 
			
		||||
 | 
			
		||||
# The edge lengths are stored in the phylo object:
 | 
			
		||||
apsTree$edge.length
 | 
			
		||||
 | 
			
		||||
# ... and you can assign a small arbitrary value to the edge
 | 
			
		||||
# to show how it connects to the tree without having an
 | 
			
		||||
# overlap.
 | 
			
		||||
apsTree$edge.length[1] <- 0.1
 | 
			
		||||
plot(apsTree, cex = 0.7)
 | 
			
		||||
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# This procedure does however not assign an actual length to a root edge, and
 | 
			
		||||
# therefore no root edge is visible on the plot. Why? , you might ask. I ask
 | 
			
		||||
# myself that too. We'll just add a length by hand.
 | 
			
		||||
 | 
			
		||||
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
 | 
			
		||||
plot(apsTree, cex = 0.7, root.edge = TRUE)
 | 
			
		||||
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.2  Rotating Clades  ===================================================
 | 
			
		||||
 | 
			
		||||
# To interpret the tree, it is useful to rotate the clades so that they appear
 | 
			
		||||
# in the order expected from the cladogram of species.
 | 
			
		||||
 | 
			
		||||
# We can either rotate around individual internal nodes ...
 | 
			
		||||
layout(matrix(1:2, 1, 2))
 | 
			
		||||
plot(apsTree, no.margin = TRUE, root.edge = TRUE)
 | 
			
		||||
ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
 | 
			
		||||
plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
 | 
			
		||||
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
 | 
			
		||||
# Note that the species at the bottom of the clade descending from node
 | 
			
		||||
# 13 is now plotted at the top.
 | 
			
		||||
 | 
			
		||||
par(PAR)   # reset graphics state
 | 
			
		||||
 | 
			
		||||
# ... or we can rearrange the tree so it corresponds as well as possible to a
 | 
			
		||||
# predefined tip ordering. Here we use the ordering that taxize:: has inferred
 | 
			
		||||
# from the NCBI taxonomic classification.
 | 
			
		||||
 | 
			
		||||
nOrg <- length(apsTree$tip.label)
 | 
			
		||||
 | 
			
		||||
plot(fungiTree,
 | 
			
		||||
     no.margin = FALSE, root.edge = TRUE)
 | 
			
		||||
ape::nodelabels(text = fungiTree$node.label,
 | 
			
		||||
                cex = 0.5,
 | 
			
		||||
                adj = 0.2,
 | 
			
		||||
                bg = "#D4F2DA")
 | 
			
		||||
 | 
			
		||||
# These are the fungi tree tips ...
 | 
			
		||||
fungiTree$tip.label
 | 
			
		||||
# ... and their order is determined by the edge-list that is stored in
 | 
			
		||||
fungiTree$edge
 | 
			
		||||
# which edges join the tips?
 | 
			
		||||
ape::tiplabels(cex = 0.5, frame = "rect")
 | 
			
		||||
# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
 | 
			
		||||
# ordered from bottom to top.
 | 
			
		||||
# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
 | 
			
		||||
 | 
			
		||||
sel <- fungiTree$edge[ , 2 ] <= nOrg
 | 
			
		||||
( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
 | 
			
		||||
 | 
			
		||||
# Now, here are the genes of the apsTree tips ...
 | 
			
		||||
apsTree$tip.label
 | 
			
		||||
 | 
			
		||||
# ... and the "constraint"  we need for reordering, according to the help page
 | 
			
		||||
# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
 | 
			
		||||
# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
 | 
			
		||||
# Prefix every species code with "MBP1_" so that the constraint vector uses
# the same labels as the tips of apsTree ...
oSp <- gsub("^", "MBP1_", oSp)
# ... except for the outgroup, whose tip is labelled "KILA_ESCCO". The pattern
# must read "MBP1_ESCCO": the misspelled "MBP1_ESSCO" never matched, which
# left the outgroup with the wrong prefix in the constraint.
( oSp <- gsub("MBP1_ESCCO", "KILA_ESCCO", oSp) )
 | 
			
		||||
 | 
			
		||||
# Then we can plot the two trees to compare: the fungi- tree
 | 
			
		||||
par(PAR)   # reset graphics state
 | 
			
		||||
layout(matrix(1:2, 1, 2))
 | 
			
		||||
plot(fungiTree,
 | 
			
		||||
    no.margin = TRUE,
 | 
			
		||||
     root.edge = TRUE)
 | 
			
		||||
ape::nodelabels(text = fungiTree$node.label,
 | 
			
		||||
                cex = 0.5,
 | 
			
		||||
                adj = 0.2,
 | 
			
		||||
                bg = "#D4F2DA")
 | 
			
		||||
 | 
			
		||||
# and the re-organized apsesTree ...
 | 
			
		||||
plot(ape::rotateConstr(apsTree, constraint = oSp[]),
 | 
			
		||||
     no.margin = TRUE,
 | 
			
		||||
     root.edge = TRUE)
 | 
			
		||||
 | 
			
		||||
par(PAR)   # reset graphics state
 | 
			
		||||
 | 
			
		||||
# As you can see, the reordering is not perfect, since the topologies are
 | 
			
		||||
# different, mostly due to the unresolved nodes in the reference tree. One
 | 
			
		||||
# could play with that ...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: Study the two trees and consider their similarities and differences.
 | 
			
		||||
#         What do you expect? What do you find? Note that this is not a "mixed"
 | 
			
		||||
#         gene tree yet, since it contains only a single gene for the species
 | 
			
		||||
#         we considered. All of the branch points in this tree are speciation
 | 
			
		||||
#         events. Thus the gene tree should have the same topology as the
 | 
			
		||||
#         species tree. Does it? Are the differences important? How many
 | 
			
		||||
#         branches would you need to remove and reinsert elsewhere to get the
 | 
			
		||||
#         same topology as the species tree?
 | 
			
		||||
 | 
			
		||||
# In order to quantify how different these two trees are, we need to compute
 | 
			
		||||
# tree distances.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.3  Computing tree distances  ==========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Many superb phylogeny tools are contributed by the phangorn package.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("phangorn", quietly = TRUE)) {
 | 
			
		||||
  install.packages("phangorn")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = phangorn)       # basic information
 | 
			
		||||
#  browseVignettes("phangorn")    # available vignettes
 | 
			
		||||
#  data(package = "phangorn")     # available datasets
 | 
			
		||||
 | 
			
		||||
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
 | 
			
		||||
# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
 | 
			
		||||
apsTree2 <- apsTree
 | 
			
		||||
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# phangorn provides several functions to compute tree-differences (and there
 | 
			
		||||
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
 | 
			
		||||
# "symmetric difference"
 | 
			
		||||
phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
 | 
			
		||||
 | 
			
		||||
# Numbers. What do they mean? How much more similar is our apsTree to the
 | 
			
		||||
# (presumably) ground truth of fungiTree than a random tree would be?
 | 
			
		||||
# The ape package provides the function rtree()
 | 
			
		||||
# to compute random trees.
 | 
			
		||||
 | 
			
		||||
ape::rtree(n = length(apsTree2$tip.label), # number of tips
 | 
			
		||||
          rooted = TRUE,                   # we rooted the tree above,
 | 
			
		||||
                                           #  and fungiTree is rooted anyway
 | 
			
		||||
          tip.label = apsTree2$tip.label,  # use the apsTree2 labels
 | 
			
		||||
          br = NULL)                       # don't generate branch lengths since
 | 
			
		||||
                                           #   fungiTree has none, so we can't
 | 
			
		||||
                                           #   compare them anyway.
 | 
			
		||||
 | 
			
		||||
# (Note the warning message about non-binary trees; we'll suppress that later
 | 
			
		||||
#  by wrapping the function call in suppressMessages(); we don't want to
 | 
			
		||||
#  print it 10,000 times :-)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's compute some random trees this way, calculate the distances to
 | 
			
		||||
# fungiTree, and then compare the values we get for apsTree2. The random
 | 
			
		||||
# trees are provided by ape::rtree().
 | 
			
		||||
 | 
			
		||||
N <- 10000  # takes about 15 seconds, and we'll use the pBar function,
 | 
			
		||||
            # defined in .utilities.R  to keep track of where we are at:
 | 
			
		||||
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
 | 
			
		||||
colnames(myTreeDistances) <- c("symm", "path")
 | 
			
		||||
 | 
			
		||||
set.seed(112358)
 | 
			
		||||
# Generate N random trees on the tip labels of apsTree2 and store, for each
# of them, the distances to fungiTree that phangorn::treedist() returns.
for (i in seq_len(N)) {   # seq_len() does not iterate at all if N is 0
  pBar(i, N)              # progress bar (defined in .utilities.R)
  xTree <- ape::rtree(n = length(apsTree2$tip.label),
                      rooted = TRUE,
                      tip.label = apsTree2$tip.label,
                      br = NULL)
  # suppressMessages() keeps treedist() from printing its message in every
  # single iteration.
  myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
}
 | 
			
		||||
set.seed(NULL)                      # reset the random number generator
 | 
			
		||||
 | 
			
		||||
table(myTreeDistances[, "symm"])
 | 
			
		||||
 | 
			
		||||
( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
 | 
			
		||||
 | 
			
		||||
# Random events less-or-equal to observation, divided by total number of
 | 
			
		||||
# events gives us the empirical p-value.
 | 
			
		||||
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
 | 
			
		||||
            (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
 | 
			
		||||
 | 
			
		||||
par(PAR)   # reset graphics state
 | 
			
		||||
hist(myTreeDistances[, "path"],
 | 
			
		||||
     col = "aliceblue",
 | 
			
		||||
     main = "Distances of random Trees to fungiTree")
 | 
			
		||||
(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
 | 
			
		||||
abline(v = pathObs, col = "chartreuse")
 | 
			
		||||
 | 
			
		||||
# Random events less-or-equal to observation, divided by total number of
 | 
			
		||||
# events gives us the empirical p-value.
 | 
			
		||||
# Compare the random path differences against the observed path difference
# (pathObs). symmObs is the observed symmetric difference and belongs to the
# "symm" column only.
cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
            (sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1)))
 | 
			
		||||
 | 
			
		||||
# Indeed, our apsTree is _very_ much more similar to the species tree than
 | 
			
		||||
# we would expect by random chance.
 | 
			
		||||
 | 
			
		||||
# What do we gain from that analysis? Analyzing the tree we get from a single
 | 
			
		||||
# gene of orthologous sequences is a positive control in our computational
 | 
			
		||||
# experiment. If these genes are indeed orthologues, a correct tree-building
 | 
			
		||||
# program ought to give us a tree that exactly matches the species tree.
 | 
			
		||||
# Evaluating how far off we are from the known correct result gives us a way to
 | 
			
		||||
# validate our workflow and our algorithm. If we can't get that right, we can't
 | 
			
		||||
# expect to get "real" data right either. Employing such positive controls in
 | 
			
		||||
# every computational experiment is essential for research. Not doing so is
 | 
			
		||||
# Cargo Cult Bioinformatics.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-PHYLO-Tree_analysis.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-PHYLO-Tree_analysis unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 updates. Deprecate iTol and use taxize:: instead.
 | 
			
		||||
#                  Rewrite of tip re-ordering. Better handling of
 | 
			
		||||
#                  messages. pBar() for randomization.
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use Biocmanager:: not biocLite()
 | 
			
		||||
#           1.0.2  Typo in variable name, style changes
 | 
			
		||||
#           1.0.1  Wrong section heading
 | 
			
		||||
#           1.0    First 2017 version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                              Line
 | 
			
		||||
#TOC> --------------------------------------------------
 | 
			
		||||
#TOC>   1        Preparation and Tree Plot            50
 | 
			
		||||
#TOC>   2        SPECIES REFERENCE TREE               66
 | 
			
		||||
#TOC>   3        Tree Analysis                       117
 | 
			
		||||
#TOC>   3.1        Rooting Trees                     177
 | 
			
		||||
#TOC>   3.2        Rotating Clades                   222
 | 
			
		||||
#TOC>   3.3        Computing tree distances          309
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Preparation and Tree Plot  ===========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("ape", quietly = TRUE)) {
 | 
			
		||||
  install.packages("ape")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = ape)       # basic information
 | 
			
		||||
#  browseVignettes("ape")    # available vignettes
 | 
			
		||||
#  data(package = "ape")     # available datasets
 | 
			
		||||
 | 
			
		||||
# We change the graphics parameters from time to time, let's define the
 | 
			
		||||
# default so we can recreate a sane state:
 | 
			
		||||
# Close the current graphics device, if there is one, so that par() reports
# the defaults of a fresh device. dev.off() throws an error when no device is
# open, thus we check dev.list() first.
if (! is.null(dev.list())) {
  dev.off()
}
# Store only the parameters that can actually be set: restoring the full
# par() list with par(PAR) warns about every read-only parameter.
PAR <- par(no.readonly = TRUE)
 | 
			
		||||
 | 
			
		||||
# =    2  SPECIES REFERENCE TREE  ==============================================
 | 
			
		||||
 | 
			
		||||
# Before we do any kind of phylogenetic analysis of genes from several species,
 | 
			
		||||
# we MUST have a reference tree of the taxonomic relationships in hand. This
 | 
			
		||||
# context is absolutely required for the interpretation of our tree.
 | 
			
		||||
 | 
			
		||||
# We have the tax-ids in our database, and the NCBI has the species tree - we just need some way to extract the subtree that corresponds to our taxons of interest. Here's how to use the taxize:: package.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("taxize", quietly = TRUE)) {
 | 
			
		||||
  install.packages("taxize")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help   = taxize)       # basic information
 | 
			
		||||
#  browseVignettes("taxize")    # available vignettes
 | 
			
		||||
#  data(package  = "taxize")     # available datasets
 | 
			
		||||
 | 
			
		||||
( mySOI <- c(myDB$taxonomy$ID, "83333") )
 | 
			
		||||
myClass <- taxize::classification(mySOI, db = "ncbi")
 | 
			
		||||
str(myClass)
 | 
			
		||||
 | 
			
		||||
myClass[[1]]
 | 
			
		||||
 | 
			
		||||
fungiTree <- taxize::class2tree(myClass, check = TRUE)
 | 
			
		||||
plot(fungiTree)
 | 
			
		||||
 | 
			
		||||
# The tree produced by taxize:: contains full length species names,
 | 
			
		||||
# but it would be more convenient if it had bicodes instead. Also, the actual
 | 
			
		||||
# tree is only part of the list(), which will cause problems later:
 | 
			
		||||
str(fungiTree)
 | 
			
		||||
 | 
			
		||||
# we therefore simplify
 | 
			
		||||
fungiTree <- fungiTree$phylo
 | 
			
		||||
str(fungiTree)
 | 
			
		||||
 | 
			
		||||
# The species names are in a vector $phylo$tip.label of this list.
 | 
			
		||||
# We can use biCode() to shorten them.
 | 
			
		||||
fungiTree$tip.label <- biCode(fungiTree$tip.label)
 | 
			
		||||
 | 
			
		||||
# Plot the tree
 | 
			
		||||
nSP <- length(fungiTree$tip.label)
 | 
			
		||||
plot(fungiTree, cex = 0.8, root.edge = TRUE, no.margin = TRUE)
 | 
			
		||||
text(-1, nSP - 0.5, "Species Tree:\nFungi", pos = 4)
 | 
			
		||||
ape::nodelabels(text = fungiTree$node.label,
 | 
			
		||||
                cex = 0.6,
 | 
			
		||||
                adj = 0.2,
 | 
			
		||||
                bg = "#D4F2DA")
 | 
			
		||||
# Note that you can use the arrow buttons in the menu above the plot pane to
 | 
			
		||||
# scroll back to plots you have created earlier - so you can reference back to
 | 
			
		||||
# this species tree in your later analysis.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Tree Analysis  =======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# 1.1  Visualizing your tree
 | 
			
		||||
# The trees that are produced by Rphylip are stored as an object of class
 | 
			
		||||
# "phylo". This is a class for phylogenetic trees that is widely used in the
 | 
			
		||||
# community, practically all R phylogenetics packages have options to read and
 | 
			
		||||
# manipulate such trees. Outside of R, a popular interchange format is the
 | 
			
		||||
# Newick_format that you have seen above. It's easy to output your calculated
 | 
			
		||||
# trees in Newick format and visualize them elsewhere.
 | 
			
		||||
 | 
			
		||||
# The "phylo" class object is one of R's "S3" objects and methods to plot and
 | 
			
		||||
# print it have been defined with the Rphylip package, and in ape. You can
 | 
			
		||||
# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
 | 
			
		||||
# to plot it. The underlying function is plot.phylo(), and documentation for its
 | 
			
		||||
# many options can be found by typing:
 | 
			
		||||
 | 
			
		||||
?plot.phylo
 | 
			
		||||
 | 
			
		||||
# We load the APSES sequence tree that you produced in the
 | 
			
		||||
# BIN-PHYLO-Tree_building unit:
 | 
			
		||||
apsTree <- readRDS(file = "data/APSEStreeRproml.rds")
 | 
			
		||||
 | 
			
		||||
plot(apsTree) # default type is "phylogram"
 | 
			
		||||
plot(apsTree, type = "unrooted")
 | 
			
		||||
plot(apsTree, type = "fan", no.margin = TRUE)
 | 
			
		||||
 | 
			
		||||
# rescale to show all of the labels:
 | 
			
		||||
# record the current plot parameters by assigning them to a variable ...
 | 
			
		||||
(tmp <- plot(apsTree, type="fan", no.margin = TRUE, plot=FALSE))
 | 
			
		||||
# ... and adjust the plot limits for a new plot:
 | 
			
		||||
plot(apsTree,
 | 
			
		||||
     type = "fan",
 | 
			
		||||
     x.lim = tmp$x.lim * 1.8,
 | 
			
		||||
     y.lim = tmp$y.lim * 1.8,
 | 
			
		||||
     cex = 0.8,
 | 
			
		||||
     no.margin = TRUE)
 | 
			
		||||
 | 
			
		||||
# Inspect the tree object
 | 
			
		||||
str(apsTree)
 | 
			
		||||
apsTree$tip.label
 | 
			
		||||
apsTree$edge
 | 
			
		||||
apsTree$edge.length
 | 
			
		||||
 | 
			
		||||
# show the node / edge and tip labels on a plot
 | 
			
		||||
plot(apsTree)
 | 
			
		||||
ape::nodelabels()
 | 
			
		||||
ape::edgelabels()
 | 
			
		||||
ape::tiplabels()
 | 
			
		||||
 | 
			
		||||
# show the number of nodes, edges and tips
 | 
			
		||||
ape::Nnode(apsTree)
 | 
			
		||||
ape::Nedge(apsTree)
 | 
			
		||||
ape::Ntip(apsTree)
 | 
			
		||||
 | 
			
		||||
par(PAR)   # reset graphics state
 | 
			
		||||
 | 
			
		||||
# Finally, write the tree to console in Newick format
 | 
			
		||||
ape::write.tree(apsTree)
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Rooting Trees  =====================================================
 | 
			
		||||
 | 
			
		||||
# In order to analyse the tree, it is helpful to root it first and reorder its
 | 
			
		||||
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
 | 
			
		||||
 | 
			
		||||
ape::is.rooted(apsTree)
 | 
			
		||||
 | 
			
		||||
# You can root the tree with the command root() from the "ape" package.
 | 
			
		||||
 | 
			
		||||
plot(apsTree)
 | 
			
		||||
 | 
			
		||||
# add labels for internal nodes and tips
 | 
			
		||||
ape::nodelabels(cex = 0.5, frame = "circle")
 | 
			
		||||
ape::tiplabels(cex = 0.5, frame = "rect")
 | 
			
		||||
 | 
			
		||||
# The outgroup of the tree (KILA ESCCO) is tip "11" in my sample tree, it may be a different
 | 
			
		||||
# number in yours. Substitute the correct node number below for "outgroup".
 | 
			
		||||
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
 | 
			
		||||
plot(apsTree)
 | 
			
		||||
ape::is.rooted(apsTree)
 | 
			
		||||
 | 
			
		||||
# This tree _looks_ unchanged, because when the root trifurcation was resolved,
 | 
			
		||||
# an edge of length zero was added to connect the MRCA (Most Recent Common
 | 
			
		||||
# Ancestor) of the ingroup.
 | 
			
		||||
 | 
			
		||||
# The edge lengths are stored in the phylo object:
 | 
			
		||||
apsTree$edge.length
 | 
			
		||||
 | 
			
		||||
# ... and you can assign a small arbitrary value to the edge
 | 
			
		||||
# to show how it connects to the tree without having an
 | 
			
		||||
# overlap.
 | 
			
		||||
apsTree$edge.length[1] <- 0.1
 | 
			
		||||
plot(apsTree, cex = 0.7)
 | 
			
		||||
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# This procedure does however not assign an actual length to a root edge, and
 | 
			
		||||
# therefore no root edge is visible on the plot. Why? , you might ask. I ask
 | 
			
		||||
# myself that too. We'll just add a length by hand.
 | 
			
		||||
 | 
			
		||||
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
 | 
			
		||||
plot(apsTree, cex = 0.7, root.edge = TRUE)
 | 
			
		||||
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.2  Rotating Clades  ===================================================
 | 
			
		||||
 | 
			
		||||
# To interpret the tree, it is useful to rotate the clades so that they appear
 | 
			
		||||
# in the order expected from the cladogram of species.
 | 
			
		||||
 | 
			
		||||
# We can either rotate around individual internal nodes ...
 | 
			
		||||
layout(matrix(1:2, 1, 2))
 | 
			
		||||
plot(apsTree, no.margin = TRUE, root.edge = TRUE)
 | 
			
		||||
ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
 | 
			
		||||
plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
 | 
			
		||||
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
 | 
			
		||||
# Note that the species at the bottom of the clade descending from node
 | 
			
		||||
# 13 is now plotted at the top.
 | 
			
		||||
 | 
			
		||||
par(PAR)   # reset graphics state
 | 
			
		||||
 | 
			
		||||
# ... or we can rearrange the tree so it corresponds as well as possible to a
 | 
			
		||||
# predefined tip ordering. Here we use the ordering that taxize:: has inferred
 | 
			
		||||
# from the NCBI taxonomic classification.
 | 
			
		||||
 | 
			
		||||
nOrg <- length(apsTree$tip.label)
 | 
			
		||||
 | 
			
		||||
plot(fungiTree,
 | 
			
		||||
     no.margin = FALSE, root.edge = TRUE)
 | 
			
		||||
ape::nodelabels(text = fungiTree$node.label,
 | 
			
		||||
                cex = 0.5,
 | 
			
		||||
                adj = 0.2,
 | 
			
		||||
                bg = "#D4F2DA")
 | 
			
		||||
 | 
			
		||||
# These are the fungi tree tips ...
 | 
			
		||||
fungiTree$tip.label
 | 
			
		||||
# ... and their order is determined by the edge-list that is stored in
 | 
			
		||||
fungiTree$edge
 | 
			
		||||
# which edges join the tips?
 | 
			
		||||
ape::tiplabels(cex = 0.5, frame = "rect")
 | 
			
		||||
# as you can see, the tips (range [1:nOrg] ) are in column 2 and they are
 | 
			
		||||
# ordered from bottom to top.
 | 
			
		||||
# And each tip number is the index of the species in the tip.label vector. So we can take column 2, subset it, and use it to get a list of species in the order of the tree ...
 | 
			
		||||
 | 
			
		||||
sel <- fungiTree$edge[ , 2 ] <= nOrg
 | 
			
		||||
( oSp <- fungiTree$tip.label[fungiTree$edge[sel , 2 ]] )
 | 
			
		||||
 | 
			
		||||
# Now, here are the genes of the apsTree tips ...
 | 
			
		||||
apsTree$tip.label
 | 
			
		||||
 | 
			
		||||
# ... and the "constraint"  we need for reordering, according to the help page
 | 
			
		||||
# of ape::rotateConstr(), is "a vector specifying the order of the tips as they
 | 
			
		||||
# should appear (from bottom to top)". Thus we need to add the "MBP1_" prefix to our vector
 | 
			
		||||
# Prefix every species code with "MBP1_" so that the constraint vector uses
# the same labels as the tips of apsTree ...
oSp <- gsub("^", "MBP1_", oSp)
# ... except for the outgroup, whose tip is labelled "KILA_ESCCO". The pattern
# must read "MBP1_ESCCO": the misspelled "MBP1_ESSCO" never matched, which
# left the outgroup with the wrong prefix in the constraint.
( oSp <- gsub("MBP1_ESCCO", "KILA_ESCCO", oSp) )
 | 
			
		||||
 | 
			
		||||
# Then we can plot the two trees to compare: the fungi- tree
 | 
			
		||||
par(PAR)   # reset graphics state
 | 
			
		||||
layout(matrix(1:2, 1, 2))
 | 
			
		||||
plot(fungiTree,
 | 
			
		||||
    no.margin = TRUE,
 | 
			
		||||
     root.edge = TRUE)
 | 
			
		||||
ape::nodelabels(text = fungiTree$node.label,
 | 
			
		||||
                cex = 0.5,
 | 
			
		||||
                adj = 0.2,
 | 
			
		||||
                bg = "#D4F2DA")
 | 
			
		||||
 | 
			
		||||
# and the re-organized apsesTree ...
 | 
			
		||||
plot(ape::rotateConstr(apsTree, constraint = oSp[]),
 | 
			
		||||
     no.margin = TRUE,
 | 
			
		||||
     root.edge = TRUE)
 | 
			
		||||
 | 
			
		||||
par(PAR)   # reset graphics state
 | 
			
		||||
 | 
			
		||||
# As you can see, the reordering is not perfect, since the topologies are
 | 
			
		||||
# different, mostly due to the unresolved nodes in the reference tree. One
 | 
			
		||||
# could play with that ...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: Study the two trees and consider their similarities and differences.
 | 
			
		||||
#         What do you expect? What do you find? Note that this is not a "mixed"
 | 
			
		||||
#         gene tree yet, since it contains only a single gene for the species
 | 
			
		||||
#         we considered. All of the branch points in this tree are speciation
 | 
			
		||||
#         events. Thus the gene tree should have the same topology as the
 | 
			
		||||
#         species tree. Does it? Are the differences important? How many
 | 
			
		||||
#         branches would you need to remove and reinsert elsewhere to get the
 | 
			
		||||
#         same topology as the species tree?
 | 
			
		||||
 | 
			
		||||
# In order to quantify how different these two trees are, we need to compute
 | 
			
		||||
# tree distances.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.3  Computing tree distances  ==========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Many superb phylogeny tools are contributed by the phangorn package.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("phangorn", quietly = TRUE)) {
 | 
			
		||||
  install.packages("phangorn")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = phangorn)       # basic information
 | 
			
		||||
#  browseVignettes("phangorn")    # available vignettes
 | 
			
		||||
#  data(package = "phangorn")     # available datasets
 | 
			
		||||
 | 
			
		||||
# To compare two trees, they must have the same tip labels. We delete "MBP1_" or
 | 
			
		||||
# "KILA_" from the existing tip labels in a copy of our APSES domain tree.
 | 
			
		||||
apsTree2 <- apsTree
 | 
			
		||||
apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# phangorn provides several functions to compute tree-differences (and there
 | 
			
		||||
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
 | 
			
		||||
# "symmetric difference"
 | 
			
		||||
phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
 | 
			
		||||
 | 
			
		||||
# Numbers. What do they mean? How much more similar is our apsTree to the
 | 
			
		||||
# (presumably) ground truth of fungiTree than a random tree would be?
 | 
			
		||||
# The ape package provides the function rtree()
 | 
			
		||||
# to compute random trees.
 | 
			
		||||
 | 
			
		||||
ape::rtree(n = length(apsTree2$tip.label), # number of tips
 | 
			
		||||
          rooted = TRUE,                   # we rooted the tree above,
 | 
			
		||||
                                           #  and fungiTree is rooted anyway
 | 
			
		||||
          tip.label = apsTree2$tip.label,  # use the apsTree2 labels
 | 
			
		||||
          br = NULL)                       # don't generate branch lengths since
 | 
			
		||||
                                           #   fungiTree has none, so we can't
 | 
			
		||||
                                           #   compare them anyway.
 | 
			
		||||
 | 
			
		||||
# (Note the warning message about non-binary trees; we'll suppress that later
 | 
			
		||||
#  by wrapping the function call in suppressMessages(); we don't want to
 | 
			
		||||
#  print it 10,000 times :-)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's compute some random trees this way, calculate the distances to
 | 
			
		||||
# fungiTree, and then compare the values we get for apsTree2. The random
 | 
			
		||||
# trees are provided by ape::rtree().
 | 
			
		||||
 | 
			
		||||
N <- 10000  # takes about 15 seconds, and we'll use the pBar function,
 | 
			
		||||
            # defined in .utilities.R  to keep track of where we are at:
 | 
			
		||||
myTreeDistances <- matrix(numeric(N * 2), ncol = 2)
 | 
			
		||||
colnames(myTreeDistances) <- c("symm", "path")
 | 
			
		||||
 | 
			
		||||
set.seed(112358)
 | 
			
		||||
# Generate N random trees on the tip labels of apsTree2 and store, for each
# of them, the distances to fungiTree that phangorn::treedist() returns.
for (i in seq_len(N)) {   # seq_len() does not iterate at all if N is 0
  pBar(i, N)              # progress bar (defined in .utilities.R)
  xTree <- ape::rtree(n = length(apsTree2$tip.label),
                      rooted = TRUE,
                      tip.label = apsTree2$tip.label,
                      br = NULL)
  # suppressMessages() keeps treedist() from printing its message in every
  # single iteration.
  myTreeDistances[i, ] <- suppressMessages(phangorn::treedist(fungiTree, xTree))
}
 | 
			
		||||
set.seed(NULL)                      # reset the random number generator
 | 
			
		||||
 | 
			
		||||
table(myTreeDistances[, "symm"])
 | 
			
		||||
 | 
			
		||||
( symmObs <- phangorn::treedist(fungiTree, apsTree2)[1] )
 | 
			
		||||
 | 
			
		||||
# Random events less-or-equal to observation, divided by total number of
 | 
			
		||||
# events gives us the empirical p-value.
 | 
			
		||||
cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n",
 | 
			
		||||
            (sum(myTreeDistances[ , "symm"] <= symmObs) + 1) / (N + 1)))
 | 
			
		||||
 | 
			
		||||
par(PAR)   # reset graphics state
 | 
			
		||||
hist(myTreeDistances[, "path"],
 | 
			
		||||
     col = "aliceblue",
 | 
			
		||||
     main = "Distances of random Trees to fungiTree")
 | 
			
		||||
(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
 | 
			
		||||
abline(v = pathObs, col = "chartreuse")
 | 
			
		||||
 | 
			
		||||
# Random events less-or-equal to observation, divided by total number of
 | 
			
		||||
# events gives us the empirical p-value.
 | 
			
		||||
# Compare the random path differences against the observed path difference
# (pathObs). symmObs is the observed symmetric difference and belongs to the
# "symm" column only.
cat(sprintf("\nEmpirical p-value for path diff. of observed tree is %1.4f\n",
            (sum(myTreeDistances[ , "path"] <= pathObs) + 1) / (N + 1)))
 | 
			
		||||
 | 
			
		||||
# Indeed, our apsTree is _very_ much more similar to the species tree than
 | 
			
		||||
# we would expect by random chance.
 | 
			
		||||
 | 
			
		||||
# What do we gain from that analysis? Analyzing the tree we get from a single
 | 
			
		||||
# gene of orthologous sequences is a positive control in our computational
 | 
			
		||||
# experiment. If these genes are indeed orthologues, a correct tree-building
 | 
			
		||||
# program ought to give us a tree that exactly matches the species tree.
 | 
			
		||||
# Evaluating how far off we are from the known correct result gives us a way to
 | 
			
		||||
# validate our workflow and our algorithm. If we can't get that right, we can't
 | 
			
		||||
# expect to get "real" data right either. Employing such positive controls in
 | 
			
		||||
# every computational experiment is essential for research. Not doing so is
 | 
			
		||||
# Cargo Cult Bioinformatics.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,168 +1,168 @@
 | 
			
		||||
# tocID <- "BIN-PHYLO-Tree_building.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-PHYLO-Tree_building unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10   2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    deprecate save()/load() for saveRDS()/readRDS(); Mac:
 | 
			
		||||
#                  instructions to authorize proml.app
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#           1.0    First 2017 version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#           Add MrBayes
 | 
			
		||||
# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                       Line
 | 
			
		||||
#TOC> -----------------------------------------------------------
 | 
			
		||||
#TOC>   1        Calculating Trees                             48
 | 
			
		||||
#TOC>   1.1        PROMLPATH ...                               68
 | 
			
		||||
#TOC>   1.1.1          ... on the Mac                          73
 | 
			
		||||
#TOC>   1.1.2          ... on Windows                         101
 | 
			
		||||
#TOC>   1.1.3          ... on Linux                           115
 | 
			
		||||
#TOC>   1.1.4          Confirming PROMLPATH                   120
 | 
			
		||||
#TOC>   1.2        Building a maximum likelihood tree         134
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Calculating Trees  ===================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Follow the instructions found at phylip's home on the Web to install. If you
 | 
			
		||||
# are on a Windows computer, take note of the installation directory.
 | 
			
		||||
 | 
			
		||||
# After you have installed Phylip on your computer, install the R package that
 | 
			
		||||
# provides an interface to the Phylip functions.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("Rphylip", quietly = TRUE)) {
 | 
			
		||||
  install.packages("Rphylip")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Rphylip)       # basic information
 | 
			
		||||
#  browseVignettes("Rphylip")    # available vignettes
 | 
			
		||||
#  data(package = "Rphylip")     # available datasets
 | 
			
		||||
 | 
			
		||||
# This will install RPhylip, as well as its dependency, the package "ape".
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  PROMLPATH ...  =====================================================
 | 
			
		||||
# The next part may be tricky. You will need to figure out where
 | 
			
		||||
# on your computer Phylip has been installed and define the path
 | 
			
		||||
# to the proml program that calculates a maximum-likelihood tree.
 | 
			
		||||
 | 
			
		||||
# ===   1.1.1  ... on the Mac                    
 | 
			
		||||
# On the Mac, the standard installation places a phylip folder
 | 
			
		||||
# in the /Applications directory. That folder contains all the
 | 
			
		||||
# individual phylip programs as <name>.app files. These are not
 | 
			
		||||
# the actual executables, but "app" files are actually directories
 | 
			
		||||
# that contain the required resources for a program to run.
 | 
			
		||||
 | 
			
		||||
# The executable is in a subdirectory and you can point Rphylip
 | 
			
		||||
# directly to that subdirectory to find the program it needs:
 | 
			
		||||
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
 | 
			
		||||
 | 
			
		||||
# However, RPHYLIP will not be able to run PHYLIP applications immediately,
 | 
			
		||||
# because they have not been "signed" by the PHYLIP developers. The process
 | 
			
		||||
# will be terminated by your system, with a warning.
 | 
			
		||||
 | 
			
		||||
#   -  Navigate to the phylip folder in your ~/Applications directory
 | 
			
		||||
#   -  Descend into the "exe" folder and find  proml.app
 | 
			
		||||
#   -  Ctrl-click  proml.app  and choose "Open". A dialogue will show that
 | 
			
		||||
#      says: "macOS cannot verify the developer of “proml.app”.
 | 
			
		||||
#             Are you sure you want to open it?"
 | 
			
		||||
#   -  Click open to continue. You may need to allow access to the terminal
 | 
			
		||||
#      as well. When the proml terminal session opens, you can type
 | 
			
		||||
#      Ctrl-c to abort the program and close the window.
 | 
			
		||||
#
 | 
			
		||||
#   This adds proml.app to the list of known-good programs and you will not
 | 
			
		||||
#   need to repeat this process.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# ===   1.1.2  ... on Windows                    
 | 
			
		||||
# On Windows you need to know where the programs have been installed, and you
 | 
			
		||||
# need to specify a path that is correct for the Windows OS. Find the folder
 | 
			
		||||
# that is named "exe", and right-click to inspect its properties. The path
 | 
			
		||||
# should be listed among them.
 | 
			
		||||
 | 
			
		||||
# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
 | 
			
		||||
# assignment has to be
 | 
			
		||||
# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
 | 
			
		||||
# (Note: "/", not "\")
 | 
			
		||||
 | 
			
		||||
# I have heard that your path must not contain spaces, and it is prudent to
 | 
			
		||||
# avoid other special characters as well.
 | 
			
		||||
 | 
			
		||||
# ===   1.1.3  ... on Linux                      
 | 
			
		||||
# If you are running Linux I trust you know what to do. It's probably
 | 
			
		||||
# something like
 | 
			
		||||
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
 | 
			
		||||
 | 
			
		||||
# ===   1.1.4  Confirming PROMLPATH              
 | 
			
		||||
# Confirm that the settings are right.
 | 
			
		||||
PROMLPATH                # returns the path
 | 
			
		||||
list.dirs(PROMLPATH)     # returns the directories in that path
 | 
			
		||||
list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command"
 | 
			
		||||
 | 
			
		||||
# If "proml" is NOT among the files that the last command returns, you
 | 
			
		||||
# can't continue. Ask on the mailing list for advice.
 | 
			
		||||
 | 
			
		||||
# If everything is good, you can add the line that defines PROMLPATH to
 | 
			
		||||
# myScripts/.myProfile.R - the path will then be automatically set when
 | 
			
		||||
# you quit RStudio and return.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.2  Building a maximum likelihood tree  ================================
 | 
			
		||||
# Now read the mfa file you have saved in the BIN-PHYLO-Data_preparation unit,
 | 
			
		||||
# as a "proseq" object with the read.protein() function of the RPhylip package:
 | 
			
		||||
 | 
			
		||||
apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa")
 | 
			
		||||
str(apsIn)
 | 
			
		||||
 | 
			
		||||
# ... and you are ready to build a tree.
 | 
			
		||||
 | 
			
		||||
# There are many fast options in PHYLIP - we will use the most _accurate_ one
 | 
			
		||||
# that it has: proml, a maximum-likelihood tree building program for protein
 | 
			
		||||
# data.
 | 
			
		||||
 | 
			
		||||
# Building maximum-likelihood trees can eat as much computer time
 | 
			
		||||
# as you can throw at it. Calculating a tree of 48 APSES domains
 | 
			
		||||
# with default parameters of Rproml() runs for more than half a day
 | 
			
		||||
# on my computer. But we have only twelve sequences here, so the
 | 
			
		||||
# process will take us about 5 to 15 minutes. Run this, and enjoy a good cup
 | 
			
		||||
# of coffee while you are waiting.
 | 
			
		||||
 | 
			
		||||
apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
 | 
			
		||||
 | 
			
		||||
# A quick first look:
 | 
			
		||||
 | 
			
		||||
plot(apsTree)
 | 
			
		||||
 | 
			
		||||
# save your tree:
 | 
			
		||||
saveRDS(apsTree, file = "data/APSEStreeRproml.rds")
 | 
			
		||||
 | 
			
		||||
# If this did not work, ask for advice.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-PHYLO-Tree_building.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-PHYLO-Tree_building unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10   2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    deprecate save()/load() for saveRDS()/readRDS(); Mac:
 | 
			
		||||
#                  instructions to authorize proml.app
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#           1.0    First 2017 version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#           Add MrBayes
 | 
			
		||||
# https://cran.r-project.org/web/packages/phangorn/vignettes/IntertwiningTreesAndNetworks.html
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                       Line
 | 
			
		||||
#TOC> -----------------------------------------------------------
 | 
			
		||||
#TOC>   1        Calculating Trees                             48
 | 
			
		||||
#TOC>   1.1        PROMLPATH ...                               68
 | 
			
		||||
#TOC>   1.1.1          ... on the Mac                          73
 | 
			
		||||
#TOC>   1.1.2          ... on Windows                         101
 | 
			
		||||
#TOC>   1.1.3          ... on Linux                           115
 | 
			
		||||
#TOC>   1.1.4          Confirming PROMLPATH                   120
 | 
			
		||||
#TOC>   1.2        Building a maximum likelihood tree         134
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Calculating Trees  ===================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Follow the instructions found at phylip's home on the Web to install. If you
 | 
			
		||||
# are on a Windows computer, take note of the installation directory.
 | 
			
		||||
 | 
			
		||||
# After you have installed Phylip on your computer, install the R package that
 | 
			
		||||
# provides an interface to the Phylip functions.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("Rphylip", quietly = TRUE)) {
 | 
			
		||||
  install.packages("Rphylip")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Rphylip)       # basic information
 | 
			
		||||
#  browseVignettes("Rphylip")    # available vignettes
 | 
			
		||||
#  data(package = "Rphylip")     # available datasets
 | 
			
		||||
 | 
			
		||||
# This will install RPhylip, as well as its dependency, the package "ape".
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  PROMLPATH ...  =====================================================
 | 
			
		||||
# The next part may be tricky. You will need to figure out where
 | 
			
		||||
# on your computer Phylip has been installed and define the path
 | 
			
		||||
# to the proml program that calculates a maximum-likelihood tree.
 | 
			
		||||
 | 
			
		||||
# ===   1.1.1  ... on the Mac                    
 | 
			
		||||
# On the Mac, the standard installation places a phylip folder
 | 
			
		||||
# in the /Applications directory. That folder contains all the
 | 
			
		||||
# individual phylip programs as <name>.app files. These are not
 | 
			
		||||
# the actual executables, but "app" files are actually directories
 | 
			
		||||
# that contain the required resources for a program to run.
 | 
			
		||||
 | 
			
		||||
# The executable is in a subdirectory and you can point Rphylip
 | 
			
		||||
# directly to that subdirectory to find the program it needs:
 | 
			
		||||
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
 | 
			
		||||
 | 
			
		||||
# However, RPHYLIP will not be able to run PHYLIP applications immediately,
 | 
			
		||||
# because they have not been "signed" by the PHYLIP developers. The process
 | 
			
		||||
# will be terminated by your system, with a warning.
 | 
			
		||||
 | 
			
		||||
#   -  Navigate to the phylip folder in your ~/Applications directory
 | 
			
		||||
#   -  Descend into the "exe" folder and find  proml.app
 | 
			
		||||
#   -  Ctrl-click  proml.app  and choose "Open". A dialogue will show that
 | 
			
		||||
#      says: "macOS cannot verify the developer of “proml.app”.
 | 
			
		||||
#             Are you sure you want to open it?"
 | 
			
		||||
#   -  Click open to continue. You may need to allow access to the terminal
 | 
			
		||||
#      as well. When the proml terminal session opens, you can type
 | 
			
		||||
#      Ctrl-c to abort the program and close the window.
 | 
			
		||||
#
 | 
			
		||||
#   This adds proml.app to the list of known-good programs and you will not
 | 
			
		||||
#   need to repeat this process.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# ===   1.1.2  ... on Windows                    
 | 
			
		||||
# On Windows you need to know where the programs have been installed, and you
 | 
			
		||||
# need to specify a path that is correct for the Windows OS. Find the folder
 | 
			
		||||
# that is named "exe", and right-click to inspect its properties. The path
 | 
			
		||||
# should be listed among them.
 | 
			
		||||
 | 
			
		||||
# If the path looks like "C:\Users\Meng\Programs\phylip-3.695\exe", then your
 | 
			
		||||
# assignment has to be
 | 
			
		||||
# PROMLPATH <- "C:/Users/Meng/Programs/phylip-3.695/exe"
 | 
			
		||||
# (Note: "/", not "\")
 | 
			
		||||
 | 
			
		||||
# I have heard that your path must not contain spaces, and it is prudent to
 | 
			
		||||
# avoid other special characters as well.
 | 
			
		||||
 | 
			
		||||
# ===   1.1.3  ... on Linux                      
 | 
			
		||||
# If you are running Linux I trust you know what to do. It's probably
 | 
			
		||||
# something like
 | 
			
		||||
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
 | 
			
		||||
 | 
			
		||||
# ===   1.1.4  Confirming PROMLPATH              
 | 
			
		||||
# Confirm that the settings are right.
 | 
			
		||||
PROMLPATH                # returns the path
 | 
			
		||||
list.dirs(PROMLPATH)     # returns the directories in that path
 | 
			
		||||
list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command"
 | 
			
		||||
 | 
			
		||||
# If "proml" is NOT among the files that the last command returns, you
 | 
			
		||||
# can't continue. Ask on the mailing list for advice.
 | 
			
		||||
 | 
			
		||||
# If everything is good, you can add the line that defines PROMLPATH to
 | 
			
		||||
# myScripts/.myProfile.R - the path will then be automatically set when
 | 
			
		||||
# you quit RStudio and return.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.2  Building a maximum likelihood tree  ================================
 | 
			
		||||
# Now read the mfa file you have saved in the BIN-PHYLO-Data_preparation unit,
 | 
			
		||||
# as a "proseq" object with the read.protein() function of the RPhylip package:
 | 
			
		||||
 | 
			
		||||
apsIn <- Rphylip::read.protein("data/APSESphyloSet.mfa")
 | 
			
		||||
str(apsIn)
 | 
			
		||||
 | 
			
		||||
# ... and you are ready to build a tree.
 | 
			
		||||
 | 
			
		||||
# There are many fast options in PHYLIP - we will use the most _accurate_ one
 | 
			
		||||
# that it has: proml, a maximum-likelihood tree building program for protein
 | 
			
		||||
# data.
 | 
			
		||||
 | 
			
		||||
# Building maximum-likelihood trees can eat as much computer time
 | 
			
		||||
# as you can throw at it. Calculating a tree of 48 APSES domains
 | 
			
		||||
# with default parameters of Rproml() runs for more than half a day
 | 
			
		||||
# on my computer. But we have only twelve sequences here, so the
 | 
			
		||||
# process will take us about 5 to 15 minutes. Run this, and enjoy a good cup
 | 
			
		||||
# of coffee while you are waiting.
 | 
			
		||||
 | 
			
		||||
apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
 | 
			
		||||
 | 
			
		||||
# A quick first look:
 | 
			
		||||
 | 
			
		||||
plot(apsTree)
 | 
			
		||||
 | 
			
		||||
# save your tree:
 | 
			
		||||
saveRDS(apsTree, file = "data/APSEStreeRproml.rds")
 | 
			
		||||
 | 
			
		||||
# If this did not work, ask for advice.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,323 +1,323 @@
 | 
			
		||||
# tocID <- "BIN-PPI-Analysis.R"
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-PPI-Analysis unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:   1.4
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-08  -  2020-10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.4    Update vector ID's for betweenness centrality.
 | 
			
		||||
#           1.3    Bugfix: called the wrong function on ENSPsel in l. 220
 | 
			
		||||
#           1.2    2020 Updates; Rewrite for new STRING V11;
 | 
			
		||||
#                  Deprecate save()/load() for saveRDS()/readRDS()
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.0    First live version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                                           Line
 | 
			
		||||
#TOC> ---------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Setup and data                                    50
 | 
			
		||||
#TOC>   2        Functional Edges in the Human Proteome            86
 | 
			
		||||
#TOC>   2.1        Cliques                                        129
 | 
			
		||||
#TOC>   2.2        Communities                                    170
 | 
			
		||||
#TOC>   2.3        Betweenness Centrality                         184
 | 
			
		||||
#TOC>   3        biomaRt                                          231
 | 
			
		||||
#TOC>   4        Task for submission                              302
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Setup and data  ======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Not surprisingly, the analysis of PPI networks needs iGraph:
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("igraph", quietly = TRUE)) {
 | 
			
		||||
  install.packages("igraph")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = igraph)       # basic information
 | 
			
		||||
#  browseVignettes("igraph")    # available vignettes
 | 
			
		||||
#  data(package = "igraph")     # available datasets
 | 
			
		||||
 | 
			
		||||
# In order for you to explore some real, biological networks, I give you a
 | 
			
		||||
# dataframe of functional relationships of human proteins that I have downloaded
 | 
			
		||||
# from the STRING database. The full table has 8.5 million records, here is a
 | 
			
		||||
# subset of records with combined confidence scores > 980
 | 
			
		||||
 | 
			
		||||
# The selected set of edges with a confidence of > 964 is a dataframe with about
 | 
			
		||||
# 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of
 | 
			
		||||
# a fungal proteome. You can load the saved dataframe here (To read more about
 | 
			
		||||
# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
 | 
			
		||||
 | 
			
		||||
STRINGedges <- readRDS("./data/STRINGedges.rds")
 | 
			
		||||
 | 
			
		||||
head(STRINGedges)
 | 
			
		||||
 | 
			
		||||
# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
 | 
			
		||||
# Ensembl protein identifiers that start with ENSP. We'll remove them:
 | 
			
		||||
 | 
			
		||||
STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
 | 
			
		||||
STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
 | 
			
		||||
 | 
			
		||||
head(STRINGedges)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Functional Edges in the Human Proteome  ==============================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# There are many possibilities to explore interesting aspects of biological
 | 
			
		||||
# networks, we will keep with some very simple procedures here but you have
 | 
			
		||||
# to be aware that this is barely scratching the surface of possibilities.
 | 
			
		||||
# However, once the network exists in your computer, it is comparatively
 | 
			
		||||
# easy to find information online about the many, many options to analyze.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Make a graph from this dataframe
 | 
			
		||||
?igraph::graph_from_data_frame
 | 
			
		||||
 | 
			
		||||
gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
 | 
			
		||||
 | 
			
		||||
# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
 | 
			
		||||
# layout of such large graphs is possible, but requires specialized code. Google
 | 
			
		||||
# for <layout large graphs> if you are curious. Also, consider what one can
 | 
			
		||||
# really learn from plotting such a graph ...
 | 
			
		||||
 | 
			
		||||
# Of course simple computations on this graph are reasonably fast:
 | 
			
		||||
 | 
			
		||||
compSTR <- igraph::components(gSTR)
 | 
			
		||||
summary(compSTR) # our graph is fully connected!
 | 
			
		||||
 | 
			
		||||
hist(log(igraph::degree(gSTR)), col="#FEE0AF")
 | 
			
		||||
# this actually does look rather scale-free
 | 
			
		||||
 | 
			
		||||
(freqRank <- table(igraph::degree(gSTR)))
 | 
			
		||||
plot(log10(as.numeric(names(freqRank)) + 1),
 | 
			
		||||
     log10(as.numeric(freqRank)), type = "b",
 | 
			
		||||
     pch = 21, bg = "#FEE0AF",
 | 
			
		||||
     xlab = "log(Rank)", ylab = "log(frequency)",
 | 
			
		||||
     main = "8,400 nodes from the human functional interaction network")
 | 
			
		||||
 | 
			
		||||
# This looks very scale-free indeed.
 | 
			
		||||
 | 
			
		||||
(regressionLine <- lm(log10(as.numeric(freqRank)) ~
 | 
			
		||||
                      log10(as.numeric(names(freqRank)) + 1)))
 | 
			
		||||
abline(regressionLine, col = "firebrick")
 | 
			
		||||
 | 
			
		||||
# Now explore some more:
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Cliques  ===========================================================
 | 
			
		||||
 | 
			
		||||
# Let's find the largest cliques. Remember: a clique is a fully connected
 | 
			
		||||
# subgraph, i.e. a subgraph in which every node is connected to every other.
 | 
			
		||||
# Biological complexes often appear as cliques in interaction graphs.
 | 
			
		||||
 | 
			
		||||
igraph::clique_num(gSTR)
 | 
			
		||||
# The largest clique has 81 members.
 | 
			
		||||
 | 
			
		||||
(C <- igraph::largest_cliques(gSTR)[[1]])
 | 
			
		||||
 | 
			
		||||
# Pick one of the proteins and find out what this fully connected cluster of 81
 | 
			
		||||
# proteins is (you can simply Google for any of the IDs). Is this expected?
 | 
			
		||||
 | 
			
		||||
# Plot this ...
 | 
			
		||||
R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
 | 
			
		||||
 | 
			
		||||
# color the vertices along a color spectrum
 | 
			
		||||
vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
 | 
			
		||||
 | 
			
		||||
# color the edges to have the same color as the originating node
 | 
			
		||||
# Repeat each vertex color gorder(R) times, in vertex order. This is the
# vectorized equivalent of appending rep(vCol[i], gorder(R)) for every i in a
# loop, and avoids re-copying the growing vector on each iteration.
eCol <- rep(vCol, each = igraph::gorder(R))
 | 
			
		||||
 | 
			
		||||
oPar <- par(mar= rep(0,4)) # Turn margins off
 | 
			
		||||
plot(R,
 | 
			
		||||
     layout = igraph::layout_in_circle(R),
 | 
			
		||||
     vertex.size = 3,
 | 
			
		||||
     vertex.color = vCol,
 | 
			
		||||
     edge.color = eCol,
 | 
			
		||||
     edge.width = 0.1,
 | 
			
		||||
     vertex.label = NA)
 | 
			
		||||
par(oPar)
 | 
			
		||||
 | 
			
		||||
# ... well: remember: a clique means every node is connected to every other
 | 
			
		||||
# node. We have 81 * 81 = 6,561 edges. This is what a matrix model of PPI
 | 
			
		||||
# networks looks like for large complexes.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.2  Communities  =======================================================
 | 
			
		||||
 | 
			
		||||
set.seed(112358)                       # set RNG seed for repeatable randomness
 | 
			
		||||
gSTRclusters <- igraph::cluster_infomap(gSTR)
 | 
			
		||||
set.seed(NULL)                         # reset the RNG
 | 
			
		||||
 | 
			
		||||
igraph::modularity(gSTRclusters) # ... measures how separated the different
 | 
			
		||||
                                 # membership types are from each other
 | 
			
		||||
tMem <- table(igraph::membership(gSTRclusters))
 | 
			
		||||
length(tMem)  # About 700 communities identified
 | 
			
		||||
hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ...
 | 
			
		||||
range(tMem) # ... but one has > 200 members
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.3  Betweenness Centrality  ============================================
 | 
			
		||||
 | 
			
		||||
# Let's find the nodes with the 10 - highest betweenness centralities.
 | 
			
		||||
#
 | 
			
		||||
BC <- igraph::centr_betw(gSTR)
 | 
			
		||||
 | 
			
		||||
# remember: BC$res contains the results
 | 
			
		||||
head(BC$res)
 | 
			
		||||
 | 
			
		||||
BC$res[1]   # betweenness centrality of node 1 in the graph ...
 | 
			
		||||
# ... which one is node 1?
 | 
			
		||||
igraph::V(gSTR)[1]
 | 
			
		||||
 | 
			
		||||
# to get the ten-highest nodes, we simply label the elements of BC with their
 | 
			
		||||
# index ...
 | 
			
		||||
# Label every element of BC$res with its own index (as a character string).
# seq_along() is used rather than 1:length(), which would yield c(1, 0) for a
# zero-length vector.
names(BC$res) <- as.character(seq_along(BC$res))
 | 
			
		||||
 | 
			
		||||
# ... and then we sort:
 | 
			
		||||
sBC <- sort(BC$res, decreasing = TRUE)
 | 
			
		||||
head(sBC)
 | 
			
		||||
 | 
			
		||||
# This ordered vector means: node 3 has the highest betweenness centrality,
 | 
			
		||||
# node 721 has the second highest, etc.
 | 
			
		||||
 | 
			
		||||
(BCsel <- as.numeric(names(sBC)[1:10]))
 | 
			
		||||
 | 
			
		||||
# We can use the first ten labels to subset the nodes in gSTR and fetch the
 | 
			
		||||
# IDs...
 | 
			
		||||
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
# =====
 | 
			
		||||
# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
 | 
			
		||||
# We are going to use these IDs to produce some output for a submitted task:
 | 
			
		||||
# therefore I need you to execute the following line, note the "seal" that this
 | 
			
		||||
# returns, and not change myENSPsel later:
 | 
			
		||||
 | 
			
		||||
myENSPsel <- selectENSP(ENSPsel)
 | 
			
		||||
 | 
			
		||||
#  Next, to find what these proteins are...
 | 
			
		||||
 | 
			
		||||
# We could now Google for all of these IDs to learn more about them. But really,
 | 
			
		||||
# googling for IDs one after the other, that would be lame. Let's instead use
 | 
			
		||||
# the very, very useful biomaRt package to translate these Ensembl IDs into
 | 
			
		||||
# gene symbols.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  biomaRt  =============================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# IDs are just labels, but for _bio_informatics we need to learn more about the
 | 
			
		||||
# biological function of the genes or proteins that we retrieve via graph data
 | 
			
		||||
# mining. biomaRt is the tool of choice. It's a package distributed by the
 | 
			
		||||
# bioconductor project. This here is not a biomaRt tutorial (that's for another
 | 
			
		||||
# day), simply a few lines of sample code to get you started on the specific use
 | 
			
		||||
# case of retrieving descriptions for ensembl protein IDs.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("biomaRt", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("biomaRt")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = biomaRt)       # basic information
 | 
			
		||||
#  browseVignettes("biomaRt")    # available vignettes
 | 
			
		||||
#  data(package = "biomaRt")     # available datasets
 | 
			
		||||
 | 
			
		||||
# define which dataset to use ... this takes a while for download
 | 
			
		||||
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
 | 
			
		||||
 | 
			
		||||
# what filters are defined?
 | 
			
		||||
( filters <- biomaRt::listFilters(myMart) )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# and what attributes can we filter for?
 | 
			
		||||
( attributes <- biomaRt::listAttributes(myMart) )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Soooo many options - let's look for the correct name of filters that are
 | 
			
		||||
# useful for ENSP IDs ...
 | 
			
		||||
filters[grep("ENSP", filters$description), ]
 | 
			
		||||
 | 
			
		||||
# ... and the correct attribute names for gene symbols and descriptions ...
 | 
			
		||||
attributes[grep("symbol", attributes$description, ignore.case = TRUE), ]
 | 
			
		||||
attributes[grep("description", attributes$description, ignore.case = TRUE), ]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ... so we can put this together: here is a syntax example:
 | 
			
		||||
biomaRt::getBM(filters = "ensembl_peptide_id",
 | 
			
		||||
               attributes = c("hgnc_symbol",
 | 
			
		||||
                              "wikigene_description",
 | 
			
		||||
                              "interpro_description",
 | 
			
		||||
                              "phenotype_description"),
 | 
			
		||||
               values = "ENSP00000000442",
 | 
			
		||||
               mart = myMart)
 | 
			
		||||
 | 
			
		||||
# A simple loop will now get us the information for our 10 most central genes
 | 
			
		||||
# from the human subset of STRING.
 | 
			
		||||
 | 
			
		||||
CPdefs <- list()  # Since we don't know how many matches one of our queries
 | 
			
		||||
# will return, we'll put the result dataframes into a list.
 | 
			
		||||
 | 
			
		||||
for (ID in myENSPsel) {
 | 
			
		||||
  CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
 | 
			
		||||
                                 attributes = c("hgnc_symbol",
 | 
			
		||||
                                                "wikigene_description",
 | 
			
		||||
                                                "interpro_description",
 | 
			
		||||
                                                "phenotype_description"),
 | 
			
		||||
                                 values = ID,
 | 
			
		||||
                                 mart = myMart)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# So what are the proteins with the ten highest betweenness centralities?
 | 
			
		||||
#  ... are you surprised? (I am! Really.)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Task for submission  =================================================
 | 
			
		||||
 | 
			
		||||
# Write a loop that will go through your personalized list of Ensembl IDs and
 | 
			
		||||
#    for each ID:
 | 
			
		||||
#    --  print the ID,
 | 
			
		||||
#    --  print the first row's HGNC symbol,
 | 
			
		||||
#    --  print the first row's wikigene description.
 | 
			
		||||
#    --  print the first row's phenotype.
 | 
			
		||||
#
 | 
			
		||||
# Write your thoughts about this group of genes.
 | 
			
		||||
#
 | 
			
		||||
# (Hint, you can structure your loop in the same way as the loop that
 | 
			
		||||
# created CPdefs. )
 | 
			
		||||
 | 
			
		||||
# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
 | 
			
		||||
# for this loop and its output into your report if you are submitting
 | 
			
		||||
# anything for credit for this unit. Please read the requirements carefully.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-PPI-Analysis.R"
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-PPI-Analysis unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:   1.4
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-08  -  2020-10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.4    Update vector ID's for betweenness centrality.
 | 
			
		||||
#           1.3    Bugfix: called the wrong function on ENSPsel in l. 220
 | 
			
		||||
#           1.2    2020 Updates; Rewrite for new STRING V11;
 | 
			
		||||
#                  Deprecate save()/load() for saveRDS()/readRDS()
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.0    First live version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                                           Line
 | 
			
		||||
#TOC> ---------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Setup and data                                    50
 | 
			
		||||
#TOC>   2        Functional Edges in the Human Proteome            86
 | 
			
		||||
#TOC>   2.1        Cliques                                        129
 | 
			
		||||
#TOC>   2.2        Communities                                    170
 | 
			
		||||
#TOC>   2.3        Betweenness Centrality                         184
 | 
			
		||||
#TOC>   3        biomaRt                                          231
 | 
			
		||||
#TOC>   4        Task for submission                              302
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Setup and data  ======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Not surprisingly, the analysis of PPI networks needs iGraph:
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("igraph", quietly = TRUE)) {
 | 
			
		||||
  install.packages("igraph")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = igraph)       # basic information
 | 
			
		||||
#  browseVignettes("igraph")    # available vignettes
 | 
			
		||||
#  data(package = "igraph")     # available datasets
 | 
			
		||||
 | 
			
		||||
# In order for you to explore some real, biological networks, I give you a
 | 
			
		||||
# dataframe of functional relationships of human proteins that I have downloaded
 | 
			
		||||
# from the STRING database. The full table has 8.5 million records, here is a
 | 
			
		||||
# subset of records with combined confidence scores > 980
 | 
			
		||||
 | 
			
		||||
# The selected set of edges with a confidence of > 964 is a dataframe with about
 | 
			
		||||
# 50,000 edges and 8,400 unique proteins. Incidentally, that's about the size of
 | 
			
		||||
# a fungal proteome. You can load the saved dataframe here (To read more about
 | 
			
		||||
# what the scores mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
 | 
			
		||||
 | 
			
		||||
STRINGedges <- readRDS("./data/STRINGedges.rds")
 | 
			
		||||
 | 
			
		||||
head(STRINGedges)
 | 
			
		||||
 | 
			
		||||
# Note that STRING has appended the tax-ID for Homo sapiens - 9606 - to the
 | 
			
		||||
# Ensembl protein identifiers that start with ENSP. We'll remove them:
 | 
			
		||||
 | 
			
		||||
STRINGedges$a <- gsub("^9606\\.", "", STRINGedges$a)
 | 
			
		||||
STRINGedges$b <- gsub("^9606\\.", "", STRINGedges$b)
 | 
			
		||||
 | 
			
		||||
head(STRINGedges)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Functional Edges in the Human Proteome  ==============================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# There are many possibilities to explore interesting aspects of biological
 | 
			
		||||
# networks, we will keep with some very simple procedures here but you have
 | 
			
		||||
# to be aware that this is barely scratching the surface of possibilities.
 | 
			
		||||
# However, once the network exists in your computer, it is comparatively
 | 
			
		||||
# easy to find information online about the many, many options to analyze.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Make a graph from this dataframe
 | 
			
		||||
?igraph::graph_from_data_frame
 | 
			
		||||
 | 
			
		||||
gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
 | 
			
		||||
 | 
			
		||||
# CAUTION you DON'T want to plot a graph with 8,000 nodes and 50,000 edges -
 | 
			
		||||
# layout of such large graphs is possible, but requires specialized code. Google
 | 
			
		||||
# for <layout large graphs> if you are curious. Also, consider what one can
 | 
			
		||||
# really learn from plotting such a graph ...
 | 
			
		||||
 | 
			
		||||
# Of course simple computations on this graph are reasonably fast:
 | 
			
		||||
 | 
			
		||||
compSTR <- igraph::components(gSTR)
 | 
			
		||||
summary(compSTR) # our graph is fully connected!
 | 
			
		||||
 | 
			
		||||
hist(log(igraph::degree(gSTR)), col="#FEE0AF")
 | 
			
		||||
# this actually does look rather scale-free
 | 
			
		||||
 | 
			
		||||
(freqRank <- table(igraph::degree(gSTR)))
 | 
			
		||||
plot(log10(as.numeric(names(freqRank)) + 1),
 | 
			
		||||
     log10(as.numeric(freqRank)), type = "b",
 | 
			
		||||
     pch = 21, bg = "#FEE0AF",
 | 
			
		||||
     xlab = "log(Rank)", ylab = "log(frequency)",
 | 
			
		||||
     main = "8,400 nodes from the human functional interaction network")
 | 
			
		||||
 | 
			
		||||
# This looks very scale-free indeed.
 | 
			
		||||
 | 
			
		||||
(regressionLine <- lm(log10(as.numeric(freqRank)) ~
 | 
			
		||||
                      log10(as.numeric(names(freqRank)) + 1)))
 | 
			
		||||
abline(regressionLine, col = "firebrick")
 | 
			
		||||
 | 
			
		||||
# Now explore some more:
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Cliques  ===========================================================
 | 
			
		||||
 | 
			
		||||
# Let's find the largest cliques. Remember: a clique is a fully connected
 | 
			
		||||
# subgraph, i.e. a subgraph in which every node is connected to every other.
 | 
			
		||||
# Biological complexes often appear as cliques in interaction graphs.
 | 
			
		||||
 | 
			
		||||
igraph::clique_num(gSTR)
 | 
			
		||||
# The largest clique has 81 members.
 | 
			
		||||
 | 
			
		||||
(C <- igraph::largest_cliques(gSTR)[[1]])
 | 
			
		||||
 | 
			
		||||
# Pick one of the proteins and find out what this fully connected cluster of 81
 | 
			
		||||
# proteins is (you can simply Google for any of the IDs). Is this expected?
 | 
			
		||||
 | 
			
		||||
# Plot this ...
 | 
			
		||||
R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
 | 
			
		||||
 | 
			
		||||
# color the vertices along a color spectrum
 | 
			
		||||
vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
 | 
			
		||||
 | 
			
		||||
# color the edges to have the same color as the originating node
 | 
			
		||||
# Build the edge colour vector in one vectorized call: every vertex colour is
# repeated gorder(R) times, in vertex order, giving gorder(R)^2 entries. This
# is the same vector the former append-in-a-loop produced, but without
# re-copying eCol on every iteration.
eCol <- rep(vCol, each = igraph::gorder(R))
 | 
			
		||||
 | 
			
		||||
oPar <- par(mar= rep(0,4)) # Turn margins off
 | 
			
		||||
plot(R,
 | 
			
		||||
     layout = igraph::layout_in_circle(R),
 | 
			
		||||
     vertex.size = 3,
 | 
			
		||||
     vertex.color = vCol,
 | 
			
		||||
     edge.color = eCol,
 | 
			
		||||
     edge.width = 0.1,
 | 
			
		||||
     vertex.label = NA)
 | 
			
		||||
par(oPar)
 | 
			
		||||
 | 
			
		||||
# ... well: remember: a clique means every node is connected to every other
 | 
			
		||||
# node. We have 81 * 80 / 2 = 3,240 edges. This is what a matrix model of PPI
 | 
			
		||||
# networks looks like for large complexes.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.2  Communities  =======================================================
 | 
			
		||||
 | 
			
		||||
set.seed(112358)                       # set RNG seed for repeatable randomness
 | 
			
		||||
gSTRclusters <- igraph::cluster_infomap(gSTR)
 | 
			
		||||
set.seed(NULL)                         # reset the RNG
 | 
			
		||||
 | 
			
		||||
igraph::modularity(gSTRclusters) # ... measures how separated the different
 | 
			
		||||
                                 # membership types are from each other
 | 
			
		||||
tMem <- table(igraph::membership(gSTRclusters))
 | 
			
		||||
length(tMem)  # About 700 communities identified
 | 
			
		||||
hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ...
 | 
			
		||||
range(tMem) # ... but one has > 200 members
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.3  Betweenness Centrality  ============================================
 | 
			
		||||
 | 
			
		||||
# Let's find the nodes with the 10 - highest betweenness centralities.
 | 
			
		||||
#
 | 
			
		||||
BC <- igraph::centr_betw(gSTR)
 | 
			
		||||
 | 
			
		||||
# remember: BC$res contains the results
 | 
			
		||||
head(BC$res)
 | 
			
		||||
 | 
			
		||||
BC$res[1]   # betweenness centrality of node 1 in the graph ...
 | 
			
		||||
# ... which one is node 1?
 | 
			
		||||
igraph::V(gSTR)[1]
 | 
			
		||||
 | 
			
		||||
# to get the ten-highest nodes, we simply label the elements of BC with their
 | 
			
		||||
# index ...
 | 
			
		||||
# Label every centrality value with its vertex index so that the index survives
# the sort below. seq_along() stays correct for a zero-length vector, where
# 1:length(x) would yield c(1, 0).
names(BC$res) <- as.character(seq_along(BC$res))
 | 
			
		||||
 | 
			
		||||
# ... and then we sort:
 | 
			
		||||
sBC <- sort(BC$res, decreasing = TRUE)
 | 
			
		||||
head(sBC)
 | 
			
		||||
 | 
			
		||||
# This ordered vector means: node 3 has the highest betweenness centrality,
 | 
			
		||||
# node 721 has the second highest, etc.
 | 
			
		||||
 | 
			
		||||
(BCsel <- as.numeric(names(sBC)[1:10]))
 | 
			
		||||
 | 
			
		||||
# We can use the first ten labels to subset the nodes in gSTR and fetch the
 | 
			
		||||
# IDs...
 | 
			
		||||
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
# =====
 | 
			
		||||
# IMPORTANT, IF YOU INTEND TO SUBMIT YOUR ANALYSIS FOR CREDIT
 | 
			
		||||
# We are going to use these IDs to produce some output for a submitted task:
 | 
			
		||||
# therefore I need you to execute the following line, note the "seal" that this
 | 
			
		||||
# returns, and not change myENSPsel later:
 | 
			
		||||
 | 
			
		||||
myENSPsel <- selectENSP(ENSPsel)
 | 
			
		||||
 | 
			
		||||
#  Next, to find what these proteins are...
 | 
			
		||||
 | 
			
		||||
# We could now Google for all of these IDs to learn more about them. But really,
 | 
			
		||||
# googling for IDs one after the other, that would be lame. Let's instead use
 | 
			
		||||
# the very, very useful biomaRt package to translate these Ensembl IDs into
 | 
			
		||||
# gene symbols.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  biomaRt  =============================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# IDs are just labels, but for _bio_informatics we need to learn more about the
 | 
			
		||||
# biological function of the genes or proteins that we retrieve via graph data
 | 
			
		||||
# mining. biomaRt is the tool of choice. It's a package distributed by the
 | 
			
		||||
# bioconductor project. This here is not a biomaRt tutorial (that's for another
 | 
			
		||||
# day), simply a few lines of sample code to get you started on the specific use
 | 
			
		||||
# case of retrieving descriptions for ensembl protein IDs.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("biomaRt", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("biomaRt")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = biomaRt)       # basic information
 | 
			
		||||
#  browseVignettes("biomaRt")    # available vignettes
 | 
			
		||||
#  data(package = "biomaRt")     # available datasets
 | 
			
		||||
 | 
			
		||||
# define which dataset to use ... this takes a while for download
 | 
			
		||||
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
 | 
			
		||||
 | 
			
		||||
# what filters are defined?
 | 
			
		||||
( filters <- biomaRt::listFilters(myMart) )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# and what attributes can we filter for?
 | 
			
		||||
( attributes <- biomaRt::listAttributes(myMart) )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Soooo many options - let's look for the correct name of filters that are
 | 
			
		||||
# useful for ENSP IDs ...
 | 
			
		||||
filters[grep("ENSP", filters$description), ]
 | 
			
		||||
 | 
			
		||||
# ... and the correct attribute names for gene symbols and descriptions ...
 | 
			
		||||
attributes[grep("symbol", attributes$description, ignore.case = TRUE), ]
 | 
			
		||||
attributes[grep("description", attributes$description, ignore.case = TRUE), ]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ... so we can put this together: here is a syntax example:
 | 
			
		||||
biomaRt::getBM(filters = "ensembl_peptide_id",
 | 
			
		||||
               attributes = c("hgnc_symbol",
 | 
			
		||||
                              "wikigene_description",
 | 
			
		||||
                              "interpro_description",
 | 
			
		||||
                              "phenotype_description"),
 | 
			
		||||
               values = "ENSP00000000442",
 | 
			
		||||
               mart = myMart)
 | 
			
		||||
 | 
			
		||||
# A simple loop will now get us the information for our 10 most central genes
 | 
			
		||||
# from the human subset of STRING.
 | 
			
		||||
 | 
			
		||||
CPdefs <- list()  # Since we don't know how many matches one of our queries
 | 
			
		||||
# will return, we'll put the result dataframes into a list.
 | 
			
		||||
 | 
			
		||||
for (ID in myENSPsel) {
 | 
			
		||||
  CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
 | 
			
		||||
                                 attributes = c("hgnc_symbol",
 | 
			
		||||
                                                "wikigene_description",
 | 
			
		||||
                                                "interpro_description",
 | 
			
		||||
                                                "phenotype_description"),
 | 
			
		||||
                                 values = ID,
 | 
			
		||||
                                 mart = myMart)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# So what are the proteins with the ten highest betweenness centralities?
 | 
			
		||||
#  ... are you surprised? (I am! Really.)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Task for submission  =================================================
 | 
			
		||||
 | 
			
		||||
# Write a loop that will go through your personalized list of Ensembl IDs and
 | 
			
		||||
#    for each ID:
 | 
			
		||||
#    --  print the ID,
 | 
			
		||||
#    --  print the first row's HGNC symbol,
 | 
			
		||||
#    --  print the first row's wikigene description.
 | 
			
		||||
#    --  print the first row's phenotype.
 | 
			
		||||
#
 | 
			
		||||
# Write your thoughts about this group of genes.
 | 
			
		||||
#
 | 
			
		||||
# (Hint, you can structure your loop in the same way as the loop that
 | 
			
		||||
# created CPdefs. )
 | 
			
		||||
 | 
			
		||||
# Submit the "seal" for your ENSP vector, the ENSP vector itself, the R code
 | 
			
		||||
# for this loop and its output into your report if you are submitting
 | 
			
		||||
# anything for credit for this unit. Please read the requirements carefully.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,252 +1,252 @@
 | 
			
		||||
# tocID <- "BIN-SEQA-Composition.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose: A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-SEQA-Composition unit
 | 
			
		||||
#
 | 
			
		||||
# Version: 1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:    2017-11  -  2020-09
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
#           1.2    2020 Maintenance
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.0    First live version 2017
 | 
			
		||||
#           0.1    First code copied from BCH441_A03_makeYFOlist.R
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 | 
			
		||||
#
 | 
			
		||||
# DO NOT SIMPLY  source()  THESE FILES!
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
#  going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                      Line
 | 
			
		||||
#TOC> ----------------------------------------------------------
 | 
			
		||||
#TOC>   1        Preparation                                  48
 | 
			
		||||
#TOC>   2        Aggregate properties                         69
 | 
			
		||||
#TOC>   3        Sequence Composition Enrichment             113
 | 
			
		||||
#TOC>   3.1        Barplot, and side-by-side barplot         136
 | 
			
		||||
#TOC>   3.2        Plotting ratios                           171
 | 
			
		||||
#TOC>   3.3        Plotting log ratios                       188
 | 
			
		||||
#TOC>   3.4        Sort by frequency                         204
 | 
			
		||||
#TOC>   3.5        Color by amino acid type                  221
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Preparation  =========================================================
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("seqinr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("seqinr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = seqinr)       # basic information
 | 
			
		||||
#  browseVignettes("seqinr")    # available vignettes
 | 
			
		||||
#  data(package = "seqinr")     # available datasets
 | 
			
		||||
 | 
			
		||||
# Load a reference sequence to work with:
 | 
			
		||||
 | 
			
		||||
# If you have done the BIN-Storing_data unit:
 | 
			
		||||
   source("makeProteinDB.R")
 | 
			
		||||
   sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
 | 
			
		||||
   mySeq <- myDB$protein$sequence[sel]
 | 
			
		||||
 | 
			
		||||
# If not, use the yeast Mbp1 sequence:
 | 
			
		||||
   # Read the stored yeast Mbp1 record and sanitize its sequence. fromJSON() is
   # called as jsonlite::fromJSON() to follow the <package>::<function>() idiom
   # used throughout this script, so it works without jsonlite being attached.
   mySeq <- dbSanitizeSequence(
     jsonlite::fromJSON("./data/MBP1_SACCE.json")$sequence)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Aggregate properties  ================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's try a simple function from seqinr: computing the pI of the sequence
 | 
			
		||||
?seqinr::computePI
 | 
			
		||||
 | 
			
		||||
# This takes as input a vector of upper-case AA codes
 | 
			
		||||
 | 
			
		||||
# We can use the function strsplit() to split the string
 | 
			
		||||
# into single characters
 | 
			
		||||
 | 
			
		||||
(s <- strsplit(mySeq, "")) # splitting on the empty string
 | 
			
		||||
                           # splits into single characters
 | 
			
		||||
s <- unlist(s)             # strsplit() returns a list! Why?
 | 
			
		||||
                           # (But we don't need a list now...)
 | 
			
		||||
 | 
			
		||||
# Alternatively, seqinr provides
 | 
			
		||||
# the function s2c() to convert strings into
 | 
			
		||||
# character vectors (and c2s to convert them back).
 | 
			
		||||
 | 
			
		||||
seqinr::s2c(mySeq)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
seqinr::computePI(seqinr::s2c(mySeq))  # isoelectric point
 | 
			
		||||
seqinr::pmw(seqinr::s2c(mySeq))        # molecular weight
 | 
			
		||||
seqinr::AAstat(seqinr::s2c(mySeq))     # This also plots the distribution of
 | 
			
		||||
                                       # values along the sequence
 | 
			
		||||
 | 
			
		||||
# A true Labor of Love has gone into the
 | 
			
		||||
# compilation of the "aaindex" data:
 | 
			
		||||
 | 
			
		||||
?seqinr::aaindex
 | 
			
		||||
data(aaindex, package = "seqinr")  # "attach" the dataset - i.e. make it
 | 
			
		||||
                                   # accessible as an R object
 | 
			
		||||
 | 
			
		||||
length(aaindex)  # no seqinr:: needed for the dataset since we just
 | 
			
		||||
                 # "attached" it with data()
 | 
			
		||||
 | 
			
		||||
# Here are all the index descriptions
 | 
			
		||||
# Print one line per aaindex entry in the form "<index>: <description>".
# seq_along() replaces 1:length(), and paste0() replaces paste(..., sep = "");
# the printed output is unchanged.
for (i in seq_along(aaindex)) {
  cat(paste0(i, ": ", aaindex[[i]]$D, "\n"))
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Sequence Composition Enrichment  =====================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's use one of the indices to calculate and plot amino-acid
 | 
			
		||||
# composition enrichment:
 | 
			
		||||
aaindex[[459]]$D
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Let's construct an enrichment plot to compare average frequencies
 | 
			
		||||
# with the amino acid counts in our sequence.
 | 
			
		||||
 | 
			
		||||
(refData <- aaindex[[459]]$I)                # reference frequencies in %
 | 
			
		||||
names(refData) <- seqinr::a(names(refData))  # change names to single-letter
 | 
			
		||||
                                             # code using seqinr's "a()" function
 | 
			
		||||
sum(refData)
 | 
			
		||||
refData        # ... in %
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# tabulate the amino acid counts in mySeq
 | 
			
		||||
(obsData <- table(seqinr::s2c(mySeq)))        # counts
 | 
			
		||||
(obsData <- 100 * (obsData / sum(obsData)))   # frequencies
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Barplot, and side-by-side barplot  =================================
 | 
			
		||||
 | 
			
		||||
barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
 | 
			
		||||
abline(h = 100/20, col="#BB0000")
 | 
			
		||||
 | 
			
		||||
barplot(refData, col = "#BB0000", cex.names = 0.7)
 | 
			
		||||
abline(h = 100/20, col="#555555")
 | 
			
		||||
 | 
			
		||||
# Ok: first problem - the values in obsData are in alphabetical order. But the
 | 
			
		||||
# values in refData are in alphabetical order of amino acid name: alanine,
 | 
			
		||||
# arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this
 | 
			
		||||
# order a lot - one of the old biochemistry tropes in the field. So we need to
 | 
			
		||||
# re-order one of the vectors to match the other. That's easy though:
 | 
			
		||||
refData
 | 
			
		||||
(refData <- refData[names(obsData)])
 | 
			
		||||
 | 
			
		||||
barplot(refData, col = "#BB0000", cex.names = 0.7)
 | 
			
		||||
abline(h = 100/20, col="#555555")
 | 
			
		||||
 | 
			
		||||
# To compare the values, we want to see them in a barplot, side-by-side ...
 | 
			
		||||
barplot(rbind(obsData, refData),
 | 
			
		||||
        ylim = c(0, 12),
 | 
			
		||||
        beside = TRUE,
 | 
			
		||||
        col = c("#CCCCCC", "#BB0000"),
 | 
			
		||||
        cex.names = 0.7)
 | 
			
		||||
abline(h = 100/20, col="#00000044")
 | 
			
		||||
 | 
			
		||||
# ... and add a legend
 | 
			
		||||
legend (x = 1, y = 12,
 | 
			
		||||
        legend = c("mySeq", "Average composition"),
 | 
			
		||||
        fill = c("#CCCCCC", "#BB0000"),
 | 
			
		||||
        cex = 0.7,
 | 
			
		||||
        bty = "n")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.2  Plotting ratios  ===================================================
 | 
			
		||||
 | 
			
		||||
# To better compare the values, we'll calculate ratios between
 | 
			
		||||
# obsData and refData
 | 
			
		||||
 | 
			
		||||
barplot(obsData / refData,
 | 
			
		||||
        col = "#CCCCCC",
 | 
			
		||||
        ylab = "Sequence / Average",
 | 
			
		||||
        ylim = c(0, 2.5),
 | 
			
		||||
        cex.names = 0.7)
 | 
			
		||||
abline(h = 1, col="#BB0000")
 | 
			
		||||
abline(h = c(1/2, 2), lty = 2, col="#BB000055")
 | 
			
		||||
 | 
			
		||||
# ... but  ratios are not very good here, since the difference in height on the
 | 
			
		||||
# plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted
 | 
			
		||||
# lines) are exactly the same fold-difference !
 | 
			
		||||
 | 
			
		||||
# ==   3.3  Plotting log ratios  ===============================================
 | 
			
		||||
 | 
			
		||||
# A better way to display this
 | 
			
		||||
# is to plot log(ratios).
 | 
			
		||||
 | 
			
		||||
barplot(log(obsData / refData),
 | 
			
		||||
        col = "#CCCCCC",
 | 
			
		||||
        ylab = "log(Sequence / Average)",
 | 
			
		||||
        ylim = log(c(1/3, 3)),
 | 
			
		||||
        cex.names = 0.7)
 | 
			
		||||
abline(h = log(1), col="#BB0000")
 | 
			
		||||
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
 | 
			
		||||
 | 
			
		||||
# Note how the two-fold difference lines are now the same distance from the
 | 
			
		||||
# line of equal ratio.
 | 
			
		||||
 | 
			
		||||
# ==   3.4  Sort by frequency  =================================================
 | 
			
		||||
 | 
			
		||||
barplot(sort(log(obsData / refData), decreasing = TRUE),
 | 
			
		||||
        ylim = log(c(1/3, 3)),
 | 
			
		||||
        col = "#CCCCCC",
 | 
			
		||||
        ylab = "log(Sequence / Average)",
 | 
			
		||||
        cex.names = 0.7)
 | 
			
		||||
abline(h = log(1), col="#BB0000")
 | 
			
		||||
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
 | 
			
		||||
 | 
			
		||||
yTxt <- log(0.9)
 | 
			
		||||
arrows(4, yTxt, 0, yTxt, length = 0.07)
 | 
			
		||||
text(5.5, yTxt, "Enriched", cex = 0.7)
 | 
			
		||||
yTxt <- log(1.1)
 | 
			
		||||
arrows(20, yTxt, 24, yTxt, length = 0.07)
 | 
			
		||||
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
 | 
			
		||||
 | 
			
		||||
# ==   3.5  Color by amino acid type  ==========================================
 | 
			
		||||
 | 
			
		||||
# Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R
 | 
			
		||||
# script, or define your own.
 | 
			
		||||
 | 
			
		||||
barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5)
 | 
			
		||||
 | 
			
		||||
lR <- sort(log(obsData / refData), decreasing = TRUE)
 | 
			
		||||
barplot(lR,
 | 
			
		||||
        ylim = log(c(1/3, 3)),
 | 
			
		||||
        col = AACOLS[names(lR)],
 | 
			
		||||
        ylab = "log(Sequence / Average)",
 | 
			
		||||
        cex.names = 0.7)
 | 
			
		||||
abline(h = log(1), col="#00000055")
 | 
			
		||||
abline(h = log(c(1/2, 2)), lty = 2, col="#00000033")
 | 
			
		||||
 | 
			
		||||
yTxt <- log(0.9)
 | 
			
		||||
arrows(4, yTxt, 0, yTxt, length = 0.07)
 | 
			
		||||
text(5.5, yTxt, "Enriched", cex = 0.7)
 | 
			
		||||
yTxt <- log(1.1)
 | 
			
		||||
arrows(20, yTxt, 24, yTxt, length = 0.07)
 | 
			
		||||
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
#   Interpret this plot. (Can you?) Which types of amino acids are enriched?
 | 
			
		||||
#   Depleted?
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-SEQA-Composition.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose: A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-SEQA-Composition unit
 | 
			
		||||
#
 | 
			
		||||
# Version: 1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:    2017-11  -  2020-09
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
#           1.2    2020 Maintenance
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.0    First live version 2017
 | 
			
		||||
#           0.1    First code copied from BCH441_A03_makeYFOlist.R
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 | 
			
		||||
#
 | 
			
		||||
# DO NOT SIMPLY  source()  THESE FILES!
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
#  going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                      Line
 | 
			
		||||
#TOC> ----------------------------------------------------------
 | 
			
		||||
#TOC>   1        Preparation                                  48
 | 
			
		||||
#TOC>   2        Aggregate properties                         69
 | 
			
		||||
#TOC>   3        Sequence Composition Enrichment             113
 | 
			
		||||
#TOC>   3.1        Barplot, and side-by-side barplot         136
 | 
			
		||||
#TOC>   3.2        Plotting ratios                           171
 | 
			
		||||
#TOC>   3.3        Plotting log ratios                       188
 | 
			
		||||
#TOC>   3.4        Sort by frequency                         204
 | 
			
		||||
#TOC>   3.5        Color by amino acid type                  221
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Preparation  =========================================================
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("seqinr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("seqinr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = seqinr)       # basic information
 | 
			
		||||
#  browseVignettes("seqinr")    # available vignettes
 | 
			
		||||
#  data(package = "seqinr")     # available datasets
 | 
			
		||||
 | 
			
		||||
# Load a reference sequence to work with:
 | 
			
		||||
 | 
			
		||||
# If you have done the BIN-Storing_data unit:
 | 
			
		||||
   source("makeProteinDB.R")
 | 
			
		||||
   sel <- which(myDB$protein$name == sprintf("MBP1_%s", biCode(MYSPE)))
 | 
			
		||||
   mySeq <- myDB$protein$sequence[sel]
 | 
			
		||||
 | 
			
		||||
# If not, use the yeast Mbp1 sequence:
 | 
			
		||||
   mySeq <- dbSanitizeSequence(fromJSON("./data/MBP1_SACCE.json")$sequence)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Aggregate properties  ================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's try a simple function from seqinr: computing the pI of the sequence
 | 
			
		||||
?seqinr::computePI
 | 
			
		||||
 | 
			
		||||
# This takes as input a vector of upper-case AA codes
 | 
			
		||||
 | 
			
		||||
# We can use the function strsplit() to split the string
 | 
			
		||||
# into single characters
 | 
			
		||||
 | 
			
		||||
(s <- strsplit(mySeq, "")) # splitting on the empty string
 | 
			
		||||
                           # splits into single characters
 | 
			
		||||
s <- unlist(s)             # strsplit() returns a list! Why?
 | 
			
		||||
                           # (But we don't need a list now...)
 | 
			
		||||
 | 
			
		||||
# Alternatively, seqinr provides
 | 
			
		||||
# the function s2c() to convert strings into
 | 
			
		||||
# character vectors (and c2s to convert them back).
 | 
			
		||||
 | 
			
		||||
seqinr::s2c(mySeq)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
seqinr::computePI(seqinr::s2c(mySeq))  # isoelectric point
 | 
			
		||||
seqinr::pmw(seqinr::s2c(mySeq))        # molecular weight
 | 
			
		||||
seqinr::AAstat(seqinr::s2c(mySeq))     # This also plots the distribution of
 | 
			
		||||
                                       # values along the sequence
 | 
			
		||||
 | 
			
		||||
# A true Labor of Love has gone into the
 | 
			
		||||
# compilation of the "aaindex" data:
 | 
			
		||||
 | 
			
		||||
?seqinr::aaindex
 | 
			
		||||
data(aaindex, package = "seqinr")  # "attach" the dataset - i.e. make it
 | 
			
		||||
                                   # accessible as an R object
 | 
			
		||||
 | 
			
		||||
length(aaindex)  # no seqinr:: needed for the dataset since we just
 | 
			
		||||
                 # "attached" it with data()
 | 
			
		||||
 | 
			
		||||
# Here are all the index descriptions
 | 
			
		||||
for (i in 1:length(aaindex)) {
 | 
			
		||||
  cat(paste(i, ": ", aaindex[[i]]$D, "\n", sep=""))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Sequence Composition Enrichment  =====================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Let's use one of the indices to calculate and plot amino-acid
 | 
			
		||||
# composition enrichment:
 | 
			
		||||
aaindex[[459]]$D
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Let's construct an enrichment plot to compare average frequencies
 | 
			
		||||
# with the amino acid counts in our sequence.
 | 
			
		||||
 | 
			
		||||
(refData <- aaindex[[459]]$I)                # reference frequencies in %
 | 
			
		||||
names(refData) <- seqinr::a(names(refData))  # change names to single-letter
 | 
			
		||||
                                             # code using seqinr's "a()" function
 | 
			
		||||
sum(refData)
 | 
			
		||||
refData        # ... in %
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# tabulate the amino acid counts in mySeq
 | 
			
		||||
(obsData <- table(seqinr::s2c(mySeq)))        # counts
 | 
			
		||||
(obsData <- 100 * (obsData / sum(obsData)))   # frequencies
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Barplot, and side-by-side barplot  =================================
 | 
			
		||||
 | 
			
		||||
barplot(obsData, col = "#CCCCCC", cex.names = 0.7)
 | 
			
		||||
abline(h = 100/20, col="#BB0000")
 | 
			
		||||
 | 
			
		||||
barplot(refData, col = "#BB0000", cex.names = 0.7)
 | 
			
		||||
abline(h = 100/20, col="#555555")
 | 
			
		||||
 | 
			
		||||
# Ok: first problem - the values in obsData are in alphabetical order. But the
 | 
			
		||||
# values in refData are in alphabetical order of amino acid name: alanine,
 | 
			
		||||
# arginine, asparagine, aspartic acid ... A, R, N, D, E ... you will see this
 | 
			
		||||
# order a lot - one of the old biochemistry tropes in the field. So we need to
 | 
			
		||||
# re-order one of the vectors to match the other. That's easy though:
 | 
			
		||||
refData
 | 
			
		||||
(refData <- refData[names(obsData)])
 | 
			
		||||
 | 
			
		||||
barplot(refData, col = "#BB0000", cex.names = 0.7)
 | 
			
		||||
abline(h = 100/20, col="#555555")
 | 
			
		||||
 | 
			
		||||
# To compare the values, we want to see them in a barplot, side-by-side ...
 | 
			
		||||
barplot(rbind(obsData, refData),
 | 
			
		||||
        ylim = c(0, 12),
 | 
			
		||||
        beside = TRUE,
 | 
			
		||||
        col = c("#CCCCCC", "#BB0000"),
 | 
			
		||||
        cex.names = 0.7)
 | 
			
		||||
abline(h = 100/20, col="#00000044")
 | 
			
		||||
 | 
			
		||||
# ... and add a legend
 | 
			
		||||
legend (x = 1, y = 12,
 | 
			
		||||
        legend = c("mySeq", "Average composition"),
 | 
			
		||||
        fill = c("#CCCCCC", "#BB0000"),
 | 
			
		||||
        cex = 0.7,
 | 
			
		||||
        bty = "n")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.2  Plotting ratios  ===================================================
 | 
			
		||||
 | 
			
		||||
# To better compare the values, we'll calculate ratios between
 | 
			
		||||
# obsData and refData
 | 
			
		||||
 | 
			
		||||
barplot(obsData / refData,
 | 
			
		||||
        col = "#CCCCCC",
 | 
			
		||||
        ylab = "Sequence / Average",
 | 
			
		||||
        ylim = c(0, 2.5),
 | 
			
		||||
        cex.names = 0.7)
 | 
			
		||||
abline(h = 1, col="#BB0000")
 | 
			
		||||
abline(h = c(1/2, 2), lty = 2, col="#BB000055")
 | 
			
		||||
 | 
			
		||||
# ... but  ratios are not very good here, since the difference in height on the
 | 
			
		||||
# plot now depends on the order we compare in: ratios of 1/2 and 2 (dotted
 | 
			
		||||
# lines) are exactly the same fold-difference !
 | 
			
		||||
 | 
			
		||||
# ==   3.3  Plotting log ratios  ===============================================
 | 
			
		||||
 | 
			
		||||
# A better way to display this
 | 
			
		||||
# is to plot log(ratios).
 | 
			
		||||
 | 
			
		||||
barplot(log(obsData / refData),
 | 
			
		||||
        col = "#CCCCCC",
 | 
			
		||||
        ylab = "log(Sequence / Average)",
 | 
			
		||||
        ylim = log(c(1/3, 3)),
 | 
			
		||||
        cex.names = 0.7)
 | 
			
		||||
abline(h = log(1), col="#BB0000")
 | 
			
		||||
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
 | 
			
		||||
 | 
			
		||||
# Note how the two-fold difference lines are now the same distance from the
 | 
			
		||||
# line of equal ratio.
 | 
			
		||||
 | 
			
		||||
# ==   3.4  Sort by frequency  =================================================
 | 
			
		||||
 | 
			
		||||
barplot(sort(log(obsData / refData), decreasing = TRUE),
 | 
			
		||||
        ylim = log(c(1/3, 3)),
 | 
			
		||||
        col = "#CCCCCC",
 | 
			
		||||
        ylab = "log(Sequence / Average)",
 | 
			
		||||
        cex.names = 0.7)
 | 
			
		||||
abline(h = log(1), col="#BB0000")
 | 
			
		||||
abline(h = log(c(1/2, 2)), lty = 2, col="#BB000055")
 | 
			
		||||
 | 
			
		||||
yTxt <- log(0.9)
 | 
			
		||||
arrows(4, yTxt, 0, yTxt, length = 0.07)
 | 
			
		||||
text(5.5, yTxt, "Enriched", cex = 0.7)
 | 
			
		||||
yTxt <- log(1.1)
 | 
			
		||||
arrows(20, yTxt, 24, yTxt, length = 0.07)
 | 
			
		||||
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
 | 
			
		||||
 | 
			
		||||
# ==   3.5  Color by amino acid type  ==========================================
 | 
			
		||||
 | 
			
		||||
# Color the bars by amino acid type. Use AACOLS , defined in the .utilities.R
 | 
			
		||||
# script, or define your own.
 | 
			
		||||
 | 
			
		||||
barplot(rep(1, 20), names.arg = names(AACOLS), col = AACOLS, cex.names = 0.5)
 | 
			
		||||
 | 
			
		||||
lR <- sort(log(obsData / refData), decreasing = TRUE)
 | 
			
		||||
barplot(lR,
 | 
			
		||||
        ylim = log(c(1/3, 3)),
 | 
			
		||||
        col = AACOLS[names(lR)],
 | 
			
		||||
        ylab = "log(Sequence / Average)",
 | 
			
		||||
        cex.names = 0.7)
 | 
			
		||||
abline(h = log(1), col="#00000055")
 | 
			
		||||
abline(h = log(c(1/2, 2)), lty = 2, col="#00000033")
 | 
			
		||||
 | 
			
		||||
yTxt <- log(0.9)
 | 
			
		||||
arrows(4, yTxt, 0, yTxt, length = 0.07)
 | 
			
		||||
text(5.5, yTxt, "Enriched", cex = 0.7)
 | 
			
		||||
yTxt <- log(1.1)
 | 
			
		||||
arrows(20, yTxt, 24, yTxt, length = 0.07)
 | 
			
		||||
text(19.5, yTxt, "Depleted", pos = 2, cex = 0.7)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
#   Interpret this plot. (Can you?) Which types of amino acids are enriched?
 | 
			
		||||
#   Depleted?
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										788
									
								
								BIN-Sequence.R
									
									
									
									
									
								
							
							
						
						
									
										788
									
								
								BIN-Sequence.R
									
									
									
									
									
								
							@@ -1,394 +1,394 @@
 | 
			
		||||
# tocID <- "BIN-Sequence.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-Sequence unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.5
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-09  - 2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.5    2020 Updates
 | 
			
		||||
#           1.4    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.3    Update set.seed() usage
 | 
			
		||||
#           1.2    Removed irrelevant task. How did that even get in there? smh
 | 
			
		||||
#           1.1    Add chartr()
 | 
			
		||||
#           1.0    First live version 2017.
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                                Line
 | 
			
		||||
#TOC> ----------------------------------------------------
 | 
			
		||||
#TOC>   1        Prepare                                63
 | 
			
		||||
#TOC>   2        Storing Sequence                       80
 | 
			
		||||
#TOC>   3        String properties                     109
 | 
			
		||||
#TOC>   4        Substrings                            116
 | 
			
		||||
#TOC>   5        Creating strings: sprintf()           137
 | 
			
		||||
#TOC>   6        Changing strings                      172
 | 
			
		||||
#TOC>   6.1.1          Changing case                   174
 | 
			
		||||
#TOC>   6.1.2          Reverse                         179
 | 
			
		||||
#TOC>   6.1.3          Change characters               183
 | 
			
		||||
#TOC>   6.1.4          Substitute characters           211
 | 
			
		||||
#TOC>   6.2        stringi and stringr                 231
 | 
			
		||||
#TOC>   6.3        dbSanitizeSequence()                241
 | 
			
		||||
#TOC>   7        Permuting and sampling                253
 | 
			
		||||
#TOC>   7.1        Permutations                        260
 | 
			
		||||
#TOC>   7.2        Sampling                            306
 | 
			
		||||
#TOC>   7.2.1          Equiprobable characters         308
 | 
			
		||||
#TOC>   7.2.2          Defined probability vector      350
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Prepare  =============================================================
 | 
			
		||||
 | 
			
		||||
# Much basic sequence handling is supported by the Bioconductor package
 | 
			
		||||
# Biostrings.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Storing Sequence  ====================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Sequences can be represented and stored as vectors of single characters ...
 | 
			
		||||
(v <- c("D", "I", "V", "M", "T", "Q"))
 | 
			
		||||
 | 
			
		||||
# ... as strings ...
 | 
			
		||||
(s <- "DIVMTQ")
 | 
			
		||||
 | 
			
		||||
# ... or as more complex objects with rich metadata e.g. as a Biostrings
 | 
			
		||||
# DNAString, RNAString, AAString, etc.
 | 
			
		||||
(a <- Biostrings::AAString("DIVMTQ"))
 | 
			
		||||
 | 
			
		||||
# ... and all of these representations can be interconverted:
 | 
			
		||||
 | 
			
		||||
# string to vector ...
 | 
			
		||||
unlist(strsplit(s, ""))
 | 
			
		||||
 | 
			
		||||
# vector to string ...
 | 
			
		||||
paste(v, sep = "", collapse = "")
 | 
			
		||||
 | 
			
		||||
# ... and AAString to plain string.
 | 
			
		||||
as.character(a)
 | 
			
		||||
 | 
			
		||||
# Since operations with character vectors trivially follow all other vector
 | 
			
		||||
# conventions and syntax, and we will look at Biostrings methods in more
 | 
			
		||||
# detail in a later unit, we will focus on basic strings in the following.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  String properties  ===================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
length(s) # why ???
 | 
			
		||||
nchar(s)  # Aha!
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Substrings  ==========================================================
 | 
			
		||||
 | 
			
		||||
# Use the substr() function
 | 
			
		||||
substr(s, 2, 4)
 | 
			
		||||
 | 
			
		||||
# or the similar substring()
 | 
			
		||||
substring(s, 2, 4)
 | 
			
		||||
 | 
			
		||||
# Note: both functions are vectorized (i.e. they operate on vectors
 | 
			
		||||
# of arguments, you don't need to loop over input)...
 | 
			
		||||
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
 | 
			
		||||
substr(   myBiCodes, 1, 3)
 | 
			
		||||
substring(myBiCodes, 1, 3)
 | 
			
		||||
 | 
			
		||||
# ... however only substring() will also use vectors for start and stop
 | 
			
		||||
s <- "gatattgtgatgacccagtaa"       # a DNA sequence
 | 
			
		||||
(vI <- seq(1, nchar(s), by = 3))   # an index vector
 | 
			
		||||
substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet
 | 
			
		||||
substring(s, vI, vI+2)             # ... returns all triplets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  Creating strings: sprintf()  =========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Sprintf is a very smart, very powerful function and has cognates in all
 | 
			
		||||
# other programming languages. It has a bit of a  learning curve, but this is
 | 
			
		||||
# totally worth it:
 | 
			
		||||
# the function takes a format string, and a list of other arguments. It returns
 | 
			
		||||
# a formatted string. Here are some examples - watch carefully for sprintf()
 | 
			
		||||
# calls elsewhere in the code.
 | 
			
		||||
 | 
			
		||||
sprintf("Just a string.")
 | 
			
		||||
sprintf("A string and the number %d.", 5)
 | 
			
		||||
sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
 | 
			
		||||
sprintf("Pi is ~ %1.2f ...", pi)
 | 
			
		||||
sprintf("or more accurately ~ %1.11f.", pi)
 | 
			
		||||
x <- "bottles of beer"
 | 
			
		||||
N <- 99
 | 
			
		||||
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
 | 
			
		||||
        N, x, N, x, "one down, and pass it around", N - 1, x)
 | 
			
		||||
 | 
			
		||||
# Note that in the last example, the value of the string was displayed with
 | 
			
		||||
# R's usual print-formatting function and therefore the line-break "\n" did
 | 
			
		||||
# not actually break the line. To have line breaks, tabs etc, you need to use
 | 
			
		||||
# cat() to display the string:
 | 
			
		||||
 | 
			
		||||
for (i in N:(N-4)) {
 | 
			
		||||
  cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
 | 
			
		||||
              i, x, i, x, "one down, and pass it around", i - 1, x))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# sprintf() is vectorized: if one of its parameters is a vector, it
 | 
			
		||||
# will generate one output string for each of the vector's elements:
 | 
			
		||||
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    6  Changing strings  ====================================================
 | 
			
		||||
 | 
			
		||||
# ===   6.1.1  Changing case
 | 
			
		||||
tolower(s)
 | 
			
		||||
toupper(tolower(s))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   6.1.2  Reverse
 | 
			
		||||
# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
 | 
			
		||||
# Biostrings::str_rev(s)
 | 
			
		||||
# The following works, of course, but awkward:
 | 
			
		||||
s
 | 
			
		||||
paste0(rev(unlist(strsplit(s, ""))), collapse = "")
 | 
			
		||||
 | 
			
		||||
# reverse complement
 | 
			
		||||
COMP <- c("t", "g", "c", "a")
 | 
			
		||||
names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names
 | 
			
		||||
s
 | 
			
		||||
paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   6.1.3  Change characters
 | 
			
		||||
# chartr(old, new, x) maps all characters in x that appear in "old" to the
 | 
			
		||||
# corresponding character in "new." Kind of like the COMP vector above ...
 | 
			
		||||
 | 
			
		||||
chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
 | 
			
		||||
 | 
			
		||||
# One could implement toupper() and tolower() with this - remember that R has
 | 
			
		||||
# character vectors of uppercase and lowercase letters as language constants.
 | 
			
		||||
chartr(paste0(letters, collapse = ""),
 | 
			
		||||
       paste0(LETTERS, collapse = ""),
 | 
			
		||||
       "Twinkle, twinkle little star, how I wonder what you are.")
 | 
			
		||||
 | 
			
		||||
# One amusing way to use the function  is for a reversible substitution
 | 
			
		||||
# cypher.
 | 
			
		||||
alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
 | 
			
		||||
set.seed(112358)                       # set RNG seed for repeatable randomness
 | 
			
		||||
( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
 | 
			
		||||
set.seed(NULL)                         # reset the RNG
 | 
			
		||||
 | 
			
		||||
# encode ...
 | 
			
		||||
(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
 | 
			
		||||
 | 
			
		||||
# decode ...
 | 
			
		||||
chartr(myCypher, alBet, x)
 | 
			
		||||
# (Nb. substitution cyphers are easy to crack!)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   6.1.4  Substitute characters
 | 
			
		||||
# gsub can change lengths.
 | 
			
		||||
#   Example: implementing the binary Fibonacci sequence:
 | 
			
		||||
#   0 -> 1; 1 -> 10 , in three nested gsub() statements
 | 
			
		||||
( s <- 1 )
 | 
			
		||||
( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
 | 
			
		||||
 | 
			
		||||
# Iterate this line a few times ...
 | 
			
		||||
#
 | 
			
		||||
# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
 | 
			
		||||
# for the features of the sequence.
 | 
			
		||||
 | 
			
		||||
# I use gsub() often to delete unwanted characters ...
 | 
			
		||||
# ... select something, and substitute the empty string for it.
 | 
			
		||||
(s <- gsub("-", "", s))
 | 
			
		||||
 | 
			
		||||
# For example: clean up a sequence
 | 
			
		||||
# copy/paste from UniProt
 | 
			
		||||
(s <- "        10         20         30         40         50
 | 
			
		||||
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# remove numbers
 | 
			
		||||
(s <- gsub("[0-9]", "", s))
 | 
			
		||||
 | 
			
		||||
# remove "whitespace" (spaces, tabs, line breaks)...
 | 
			
		||||
(s <- gsub("\\s", "", s))
 | 
			
		||||
 | 
			
		||||
# ==   6.2  stringi and stringr  ===============================================
 | 
			
		||||
 | 
			
		||||
# But there are also specialized functions e.g. to remove leading/trailing
 | 
			
		||||
# whitespace which may be important to sanitize user input etc. Have a look at
 | 
			
		||||
# the function descriptions for the stringr and the stringi package. stringr is
 | 
			
		||||
# part of the tidyverse, and for the most part a wrapper for stringi functions.
 | 
			
		||||
# https://github.com/tidyverse/stringr
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   6.3  dbSanitizeSequence()  ==============================================
 | 
			
		||||
 | 
			
		||||
# In our learning units, we use a function dbSanitizeSequence() to clean up
 | 
			
		||||
# sequences that may be copy/pasted from Web-sources
 | 
			
		||||
 | 
			
		||||
cat( s <- ">FASTA header will be removed
 | 
			
		||||
10         20         30         40         50
 | 
			
		||||
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
 | 
			
		||||
 | 
			
		||||
dbSanitizeSequence(s)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    7  Permuting and sampling  ==============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# An important aspect of working with strings is generating random strings
 | 
			
		||||
# with given statistical properties: reference items to evaluate significance.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   7.1  Permutations  ======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# One way to produce such reference items is to permute a string. A permuted
 | 
			
		||||
# string has the same composition as the original, but all positional
 | 
			
		||||
# information is lost. The sample() function can be used to permute:
 | 
			
		||||
 | 
			
		||||
# This is the sequence of the ompA secretion signal
 | 
			
		||||
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
 | 
			
		||||
 | 
			
		||||
(x <- sample(s, length(s)))  # permuted
 | 
			
		||||
 | 
			
		||||
# Here's a small example how such permuted strings may be useful. As you look
 | 
			
		||||
# at the ompA sequence, you suspect that the two lysines near the +-charged
 | 
			
		||||
# N-terminus may not be accidental, but selected for a positively charged
 | 
			
		||||
# N-terminus. What is the chance that such a sequence has two lysines close to
 | 
			
		||||
# the N-terminus simply by chance? Or put differently: what is the average
 | 
			
		||||
# distance of two lysines in such a sequence to the N-terminus? First, we
 | 
			
		||||
# need an expression that measures the distance. A simple use of the which()
 | 
			
		||||
# function will do just fine.
 | 
			
		||||
 | 
			
		||||
which(s == "K")        # shows they are in position 2 and 3, so ...
 | 
			
		||||
mean(which(s == "K"))  # ... gives us the average, and ...
 | 
			
		||||
mean(which(x == "K"))  # ... gives us the average of the permuted sequence.
 | 
			
		||||
 | 
			
		||||
# So what does the distribution look like? Let's do 10,000 trials.
 | 
			
		||||
 | 
			
		||||
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
 | 
			
		||||
N <- 10000
 | 
			
		||||
d <- numeric(N)
 | 
			
		||||
 | 
			
		||||
set.seed(112358)                       # set RNG seed for repeatable randomness
 | 
			
		||||
for (i in 1:N) {
 | 
			
		||||
  d[i] <- mean(which(sample(s, length(s)) == "K"))
 | 
			
		||||
}
 | 
			
		||||
set.seed(NULL)                         # reset the RNG
 | 
			
		||||
 | 
			
		||||
hist(d, breaks = 20)
 | 
			
		||||
abline(v = 2.5, lwd = 2, col = "firebrick")
 | 
			
		||||
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
 | 
			
		||||
              # N-terminus or more. That's just below the significance
 | 
			
		||||
              # threshold of 5 %. It's a trend, but to be sure we are looking
 | 
			
		||||
              # at a biological effect we would need to see more
 | 
			
		||||
              # sequences.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   7.2  Sampling  ==========================================================
 | 
			
		||||
 | 
			
		||||
# ===   7.2.1  Equiprobable characters
 | 
			
		||||
 | 
			
		||||
# Assume you need a large random-nucleotide string for some statistical model.
 | 
			
		||||
# How to create such a string? sample() can easily create it:
 | 
			
		||||
 | 
			
		||||
nuc <- c("A", "C", "G", "T")
 | 
			
		||||
N <- 100
 | 
			
		||||
 | 
			
		||||
set.seed(16818)                        # set RNG seed for repeatable randomness
 | 
			
		||||
v <- sample(nuc, N, replace = TRUE)
 | 
			
		||||
set.seed(NULL)                         # reset the RNG
 | 
			
		||||
 | 
			
		||||
(mySeq <- paste(v, collapse = ""))
 | 
			
		||||
 | 
			
		||||
# What's the GC content?
 | 
			
		||||
table(v)
 | 
			
		||||
sum(table(v)[c("G", "C")]) # 51 is close to expected
 | 
			
		||||
 | 
			
		||||
# What's the number of CpG motifs? Easy to check with the stringi
 | 
			
		||||
# stri_match_all() function
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("stringi", quietly = TRUE)) {
 | 
			
		||||
  install.packages("stringi")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = stringi)       # basic information
 | 
			
		||||
#  browseVignettes("stringi")    # available vignettes
 | 
			
		||||
#  data(package = "stringi")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
 | 
			
		||||
# Count actual matches: stri_match_all() reports "no match" as a single NA
# entry, so length(unlist(x)) alone would return 1 when there are no CpG motifs.
sum(!is.na(unlist(x)))
 | 
			
		||||
 | 
			
		||||
# Now you could compare that number with yeast DNA sequences, and determine
 | 
			
		||||
# whether there are more or less CpG motifs than expected by chance.
 | 
			
		||||
# (cf. https://en.wikipedia.org/wiki/CpG_site)
 | 
			
		||||
# But hold on: is that a fair comparison? sample() gives us all four nucleotides
 | 
			
		||||
# with the same probability. But the yeast genomic DNA GC content is only
 | 
			
		||||
# 38%. So you would expect fewer CpG motifs based on the statistical properties
 | 
			
		||||
# of the smaller number of Cs and Gs - before biology even comes into play. How
 | 
			
		||||
# do we account for that?
 | 
			
		||||
 | 
			
		||||
# ===   7.2.2  Defined probability vector
 | 
			
		||||
 | 
			
		||||
# This is where we need to know how to create samples with specific probability
 | 
			
		||||
# distributions. A crude hack would be to create a sampling source vector with
 | 
			
		||||
# 19 C, 19 G, 31 A and 31 T
 | 
			
		||||
c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
 | 
			
		||||
# ... but that doesn't scale if the numeric accuracy needs to be higher.
 | 
			
		||||
#
 | 
			
		||||
# However sample() has an argument that takes care of that: you can explicitly
 | 
			
		||||
# specify the probabilities with which each element of the sampling vector
 | 
			
		||||
# should be chosen:
 | 
			
		||||
 | 
			
		||||
nuc <- c("A", "C", "G", "T")
 | 
			
		||||
N <- 100
 | 
			
		||||
myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities
 | 
			
		||||
 | 
			
		||||
set.seed(16818)                       # set RNG seed for repeatable randomness
 | 
			
		||||
v <- sample(nuc, N, prob = myProb, replace = TRUE)
 | 
			
		||||
set.seed(NULL)                         # reset the RNG
 | 
			
		||||
 | 
			
		||||
(mySeq <- paste(v, collapse = ""))
 | 
			
		||||
 | 
			
		||||
# What's the GC content?
 | 
			
		||||
table(v)
 | 
			
		||||
sum(table(v)[c("G", "C")]) # Close to expected
 | 
			
		||||
 | 
			
		||||
# What's the number of CpG motifs?
 | 
			
		||||
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
 | 
			
		||||
# ... not a single one in this case.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "BIN-Sequence.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the BIN-Sequence unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.5
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-09  - 2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.5    2020 Updates
 | 
			
		||||
#           1.4    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.3    Update set.seed() usage
 | 
			
		||||
#           1.2    Removed irrelevant task. How did that even get in there? smh
 | 
			
		||||
#           1.1    Add chartr()
 | 
			
		||||
#           1.0    First live version 2017.
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                                Line
 | 
			
		||||
#TOC> ----------------------------------------------------
 | 
			
		||||
#TOC>   1        Prepare                                63
 | 
			
		||||
#TOC>   2        Storing Sequence                       80
 | 
			
		||||
#TOC>   3        String properties                     109
 | 
			
		||||
#TOC>   4        Substrings                            116
 | 
			
		||||
#TOC>   5        Creating strings: sprintf()           137
 | 
			
		||||
#TOC>   6        Changing strings                      172
 | 
			
		||||
#TOC>   6.1.1          Changing case                   174
 | 
			
		||||
#TOC>   6.1.2          Reverse                         179
 | 
			
		||||
#TOC>   6.1.3          Change characters               183
 | 
			
		||||
#TOC>   6.1.4          Substitute characters           211
 | 
			
		||||
#TOC>   6.2        stringi and stringr                 231
 | 
			
		||||
#TOC>   6.3        dbSanitizeSequence()                241
 | 
			
		||||
#TOC>   7        Permuting and sampling                253
 | 
			
		||||
#TOC>   7.1        Permutations                        260
 | 
			
		||||
#TOC>   7.2        Sampling                            306
 | 
			
		||||
#TOC>   7.2.1          Equiprobable characters         308
 | 
			
		||||
#TOC>   7.2.2          Defined probability vector      350
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Prepare  =============================================================
 | 
			
		||||
 | 
			
		||||
# Much basic sequence handling is supported by the Bioconductor package
 | 
			
		||||
# Biostrings.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Storing Sequence  ====================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Sequences can be represented and stored as vectors of single characters ...
 | 
			
		||||
(v <- c("D", "I", "V", "M", "T", "Q"))
 | 
			
		||||
 | 
			
		||||
# ... as strings ...
 | 
			
		||||
(s <- "DIVMTQ")
 | 
			
		||||
 | 
			
		||||
# ... or as more complex objects with rich metadata e.g. as a Biostrings
 | 
			
		||||
# DNAString, RNAString, AAString, etc.
 | 
			
		||||
(a <- Biostrings::AAString("DIVMTQ"))
 | 
			
		||||
 | 
			
		||||
# ... and all of these representations can be interconverted:
 | 
			
		||||
 | 
			
		||||
# string to vector ...
 | 
			
		||||
unlist(strsplit(s, ""))
 | 
			
		||||
 | 
			
		||||
# vector to string ...
 | 
			
		||||
paste(v, sep = "", collapse = "")
 | 
			
		||||
 | 
			
		||||
# ... and AAString to plain string.
 | 
			
		||||
as.character(a)
 | 
			
		||||
 | 
			
		||||
# Since operations with character vectors trivially follow all other vector
 | 
			
		||||
# conventions and syntax, and we will look at Biostrings methods in more
 | 
			
		||||
# detail in a later unit, we will focus on basic strings in the following.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  String properties  ===================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
length(s) # why ???
 | 
			
		||||
nchar(s)  # Aha!
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Substrings  ==========================================================
 | 
			
		||||
 | 
			
		||||
# Use the substr() function
 | 
			
		||||
substr(s, 2, 4)
 | 
			
		||||
 | 
			
		||||
# or the similar substring()
 | 
			
		||||
substring(s, 2, 4)
 | 
			
		||||
 | 
			
		||||
# Note: both functions are vectorized (i.e. they operate on vectors
 | 
			
		||||
# of arguments, you don't need to loop over input)...
 | 
			
		||||
myBiCodes <- c("HOMSA", "MUSMU", "FUGRU", "XENLA")
 | 
			
		||||
substr(   myBiCodes, 1, 3)
 | 
			
		||||
substring(myBiCodes, 1, 3)
 | 
			
		||||
 | 
			
		||||
# ... however only substring() will also use vectors for start and stop
 | 
			
		||||
s <- "gatattgtgatgacccagtaa"       # a DNA sequence
 | 
			
		||||
(vI <- seq(1, nchar(s), by = 3))   # an index vector
 | 
			
		||||
substr(   s, vI, vI+2)             # ... returns only the first nucleotide triplet
 | 
			
		||||
substring(s, vI, vI+2)             # ... returns all triplets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  Creating strings: sprintf()  =========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Sprintf is a very smart, very powerful function and has cognates in all
 | 
			
		||||
# other programming languages. It has a bit of a learning curve, but this is
 | 
			
		||||
# totally worth it:
 | 
			
		||||
# the function takes a format string, and a list of other arguments. It returns
 | 
			
		||||
# a formatted string. Here are some examples - watch carefully for sprintf()
 | 
			
		||||
# calls elsewhere in the code.
 | 
			
		||||
 | 
			
		||||
sprintf("Just a string.")
 | 
			
		||||
sprintf("A string and the number %d.", 5)
 | 
			
		||||
sprintf("More numbers: %d ate %d.", 7, 9) # Sorry
 | 
			
		||||
sprintf("Pi is ~ %1.2f ...", pi)
 | 
			
		||||
sprintf("or more accurately ~ %1.11f.", pi)
 | 
			
		||||
x <- "bottles of beer"
 | 
			
		||||
N <- 99
 | 
			
		||||
sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.",
 | 
			
		||||
        N, x, N, x, "one down, and pass it around", N - 1, x)
 | 
			
		||||
 | 
			
		||||
# Note that in the last example, the value of the string was displayed with
 | 
			
		||||
# R's usual print-formatting function and therefore the line-break "\n" did
 | 
			
		||||
# not actually break the line. To have line breaks, tabs etc, you need to use
 | 
			
		||||
# cat() to display the string:
 | 
			
		||||
 | 
			
		||||
for (i in N:(N-4)) {
 | 
			
		||||
  cat(sprintf("%d %s on the wall, %d %s - \ntake %s: %d %s on the wall.\n\n",
 | 
			
		||||
              i, x, i, x, "one down, and pass it around", i - 1, x))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# sprintf() is vectorized: if one of its parameters is a vector, it
 | 
			
		||||
# will generate one output string for each of the vector's elements:
 | 
			
		||||
cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    6  Changing strings  ====================================================
 | 
			
		||||
 | 
			
		||||
# ===   6.1.1  Changing case
 | 
			
		||||
tolower(s)
 | 
			
		||||
toupper(tolower(s))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   6.1.2  Reverse
 | 
			
		||||
# (This used to work in Biostrings, apparently it doesn't work anymore. Why?)
 | 
			
		||||
# Biostrings::str_rev(s)
 | 
			
		||||
# The following works, of course, but awkward:
 | 
			
		||||
s
 | 
			
		||||
paste0(rev(unlist(strsplit(s, ""))), collapse = "")
 | 
			
		||||
 | 
			
		||||
# reverse complement
 | 
			
		||||
COMP <- c("t", "g", "c", "a")
 | 
			
		||||
names(COMP) <- c("a", "c", "g", "t")     # mapping the complement via names
 | 
			
		||||
s
 | 
			
		||||
paste0(COMP[rev(unlist(strsplit(s, "")))], collapse = "")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   6.1.3  Change characters
 | 
			
		||||
# chartr(old, new, x) maps all characters in x that appear in "old" to the
 | 
			
		||||
# corresponding character in "new." Kind of like the COMP vector above ...
 | 
			
		||||
 | 
			
		||||
chartr("aeio", "uuuu", "We hold these truths to be self-evident ...")
 | 
			
		||||
 | 
			
		||||
# One could implement toupper() and tolower() with this - remember that R has
 | 
			
		||||
# character vectors of uppercase and lowercase letters as language constants.
 | 
			
		||||
chartr(paste0(letters, collapse = ""),
 | 
			
		||||
       paste0(LETTERS, collapse = ""),
 | 
			
		||||
       "Twinkle, twinkle little star, how I wonder what you are.")
 | 
			
		||||
 | 
			
		||||
# One amusing way to use the function  is for a reversible substitution
 | 
			
		||||
# cypher.
 | 
			
		||||
alBet <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz .,;:?0123456789"
 | 
			
		||||
set.seed(112358)                       # set RNG seed for repeatable randomness
 | 
			
		||||
( myCypher <- paste0(sample(unlist(strsplit(alBet, ""))), collapse = "") )
 | 
			
		||||
set.seed(NULL)                         # reset the RNG
 | 
			
		||||
 | 
			
		||||
# encode ...
 | 
			
		||||
(x <- chartr(alBet, myCypher, "... seven for a secret, never to be told."))
 | 
			
		||||
 | 
			
		||||
# decode ...
 | 
			
		||||
chartr(myCypher, alBet, x)
 | 
			
		||||
# (Nb. substitution cyphers are easy to crack!)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   6.1.4  Substitute characters
 | 
			
		||||
# gsub can change lengths.
 | 
			
		||||
#   Example: implementing the binary Fibonacci sequence:
 | 
			
		||||
#   0 -> 1; 1 -> 10 , in three nested gsub() statements
 | 
			
		||||
( s <- 1 )
 | 
			
		||||
( s <- gsub("2", "10", gsub("0", "1", gsub("1", "2", s))) )
 | 
			
		||||
 | 
			
		||||
# Iterate this line a few times ...
 | 
			
		||||
#
 | 
			
		||||
# cf. http://www.maths.surrey.ac.uk/hosted-sites/R.Knott/Fibonacci/fibrab.html
 | 
			
		||||
# for the features of the sequence.
 | 
			
		||||
 | 
			
		||||
# I use gsub() often to delete unwanted characters ...
 | 
			
		||||
# ... select something, and substitute the empty string for it.
 | 
			
		||||
(s <- gsub("-", "", s))
 | 
			
		||||
 | 
			
		||||
# For example: clean up a sequence
 | 
			
		||||
# copy/paste from UniProt
 | 
			
		||||
(s <- "        10         20         30         40         50
 | 
			
		||||
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR ")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# remove numbers
 | 
			
		||||
(s <- gsub("[0-9]", "", s))
 | 
			
		||||
 | 
			
		||||
# remove "whitespace" (spaces, tabs, line breaks)...
 | 
			
		||||
(s <- gsub("\\s", "", s))
 | 
			
		||||
 | 
			
		||||
# ==   6.2  stringi and stringr  ===============================================
 | 
			
		||||
 | 
			
		||||
# But there are also specialized functions eg. to remove leading/trailing
 | 
			
		||||
# whitespace which may be important to sanitize user input etc. Have a look at
 | 
			
		||||
# the function descriptions for the stringr and the stringi package. stringr is
 | 
			
		||||
# part of the tidyverse, and for the most part a wrapper for stringi functions.
 | 
			
		||||
# https://github.com/tidyverse/stringr
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   6.3  dbSanitizeSequence()  ==============================================
 | 
			
		||||
 | 
			
		||||
# In our learning units, we use a function dbSanitizeSequence() to clean up
 | 
			
		||||
# sequences that may be copy/pasted from Web-sources
 | 
			
		||||
 | 
			
		||||
cat( s <- ">FASTA header will be removed
 | 
			
		||||
10         20         30         40         50
 | 
			
		||||
MSNQIYSARY SGVDVYEFIH STGSIMKRKK DDWVNATHIL KAANFAKAKR " )
 | 
			
		||||
 | 
			
		||||
dbSanitizeSequence(s)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    7  Permuting and sampling  ==============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# An important aspect of working with strings is generating random strings
 | 
			
		||||
# with given statistical properties: reference items to evaluate significance.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   7.1  Permutations  ======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# One way to produce such reference items is to permute a string. A permuted
 | 
			
		||||
# string has the same composition as the original, but all positional
 | 
			
		||||
# information is lost. The sample() function can be used to permute:
 | 
			
		||||
 | 
			
		||||
# This is the sequence of the ompA secretion signal
 | 
			
		||||
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
 | 
			
		||||
 | 
			
		||||
(x <- sample(s, length(s)))  # permuted
 | 
			
		||||
 | 
			
		||||
# Here's a small example how such permuted strings may be useful. As you look
 | 
			
		||||
# at the ompA sequence, you suspect that the two lysines near the +-charged
 | 
			
		||||
# N-terminus may not be accidental, but selected for a positively charged
 | 
			
		||||
# N-terminus. What is the chance that such a sequence has two lysines close to
 | 
			
		||||
# the N-terminus simply by chance? Or put differently: what is the average
 | 
			
		||||
# distance of two lysines in such a sequence to the N-terminus? First, we
 | 
			
		||||
# need an expression that measures the distance. A simple use of the which()
 | 
			
		||||
# function will do just fine.
 | 
			
		||||
 | 
			
		||||
which(s == "K")        # shows they are in position 2 and 3, so ...
 | 
			
		||||
mean(which(s == "K"))  # ... gives us the average, and ...
 | 
			
		||||
mean(which(x == "K"))  # ... gives us the average of the permuted sequence.
 | 
			
		||||
 | 
			
		||||
# So what does the distribution look like? Let's do 10,000 trials.
 | 
			
		||||
 | 
			
		||||
(s <- unlist(strsplit("MKKTAIAVALAGFATVAQA", "")))
 | 
			
		||||
N <- 10000
 | 
			
		||||
d <- numeric(N)
 | 
			
		||||
 | 
			
		||||
set.seed(112358)                       # set RNG seed for repeatable randomness
 | 
			
		||||
for (i in 1:N) {
 | 
			
		||||
  d[i] <- mean(which(sample(s, length(s)) == "K"))
 | 
			
		||||
}
 | 
			
		||||
set.seed(NULL)                         # reset the RNG
 | 
			
		||||
 | 
			
		||||
hist(d, breaks = 20)
 | 
			
		||||
abline(v = 2.5, lwd = 2, col = "firebrick")
 | 
			
		||||
sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
 | 
			
		||||
              # N-terminus or more. That's just below the significance
 | 
			
		||||
              # threshold of 5 %. It's a trend, but to be sure we are looking
 | 
			
		||||
              # at a biological effect we would need to see more
 | 
			
		||||
              # sequences.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   7.2  Sampling  ==========================================================
 | 
			
		||||
 | 
			
		||||
# ===   7.2.1  Equiprobable characters
 | 
			
		||||
 | 
			
		||||
# Assume you need a large random-nucleotide string for some statistical model.
 | 
			
		||||
# How to create such a string? sample() can easily create it:
 | 
			
		||||
 | 
			
		||||
nuc <- c("A", "C", "G", "T")
 | 
			
		||||
N <- 100
 | 
			
		||||
 | 
			
		||||
set.seed(16818)                        # set RNG seed for repeatable randomness
 | 
			
		||||
v <- sample(nuc, N, replace = TRUE)
 | 
			
		||||
set.seed(NULL)                         # reset the RNG
 | 
			
		||||
 | 
			
		||||
(mySeq <- paste(v, collapse = ""))
 | 
			
		||||
 | 
			
		||||
# What's the GC content?
 | 
			
		||||
table(v)
 | 
			
		||||
sum(table(v)[c("G", "C")]) # 51 is close to expected
 | 
			
		||||
 | 
			
		||||
# What's the number of CpG motifs? Easy to check with the stringi
 | 
			
		||||
# stri_match_all() function
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("stringi", quietly = TRUE)) {
 | 
			
		||||
  install.packages("stringi")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = stringi)       # basic information
 | 
			
		||||
#  browseVignettes("stringi")    # available vignettes
 | 
			
		||||
#  data(package = "stringi")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
 | 
			
		||||
# Count the matches. When nothing matches, stri_match_all() returns a single
# NA cell, so count the non-NA cells instead of taking the length.
sum(! is.na(unlist(x)))
 | 
			
		||||
 | 
			
		||||
# Now you could compare that number with yeast DNA sequences, and determine
 | 
			
		||||
# whether there are more or less CpG motifs than expected by chance.
 | 
			
		||||
# (cf. https://en.wikipedia.org/wiki/CpG_site)
 | 
			
		||||
# But hold on: is that a fair comparison? sample() gives us all four nucleotides
 | 
			
		||||
# with the same probability. But the yeast genomic DNA GC content is only
 | 
			
		||||
# 38%. So you would expect fewer CpG motifs based on the statistical properties
 | 
			
		||||
# of the smaller number of Cs and Gs - before biology even comes into play. How
 | 
			
		||||
# do we account for that?
 | 
			
		||||
 | 
			
		||||
# ===   7.2.2  Defined probability vector
 | 
			
		||||
 | 
			
		||||
# This is where we need to know how to create samples with specific probability
 | 
			
		||||
# distributions. A crude hack would be to create a sampling source vector with
 | 
			
		||||
# 19 C, 19 G, 31 A and 31 T
 | 
			
		||||
c(rep("C", 19), rep("G", 19), rep(c("A"), 31), rep(c("T"), 31))
 | 
			
		||||
# ... but that doesn't scale if the numeric accuracy needs to be higher.
 | 
			
		||||
#
 | 
			
		||||
# However sample() has an argument that takes care of that: you can explicitly
 | 
			
		||||
# specify the probabilities with which each element of the sampling vector
 | 
			
		||||
# should be chosen:
 | 
			
		||||
 | 
			
		||||
nuc <- c("A", "C", "G", "T")
 | 
			
		||||
N <- 100
 | 
			
		||||
myProb <- c(0.31, 0.19, 0.19, 0.31)    # sampling probabilities
 | 
			
		||||
 | 
			
		||||
set.seed(16818)                       # set RNG seed for repeatable randomness
 | 
			
		||||
v <- sample(nuc, N, prob = myProb, replace = TRUE)
 | 
			
		||||
set.seed(NULL)                         # reset the RNG
 | 
			
		||||
 | 
			
		||||
(mySeq <- paste(v, collapse = ""))
 | 
			
		||||
 | 
			
		||||
# What's the GC content?
 | 
			
		||||
table(v)
 | 
			
		||||
sum(table(v)[c("G", "C")]) # Close to expected
 | 
			
		||||
 | 
			
		||||
# What's the number of CpG motifs?
 | 
			
		||||
(x <- stringi::stri_match_all(mySeq, regex = "CG"))
 | 
			
		||||
# ... not a single one in this case.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										1368
									
								
								BIN-Storing_data.R
									
									
									
									
									
								
							
							
						
						
									
										1368
									
								
								BIN-Storing_data.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,349 +1,349 @@
 | 
			
		||||
# tocID <- "FND-Genetic_code.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the FND-Genetic_code unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017  10  -  2019  01
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.0.1  Comment on "incomplete final line" warning in FASTA
 | 
			
		||||
#           1.0    First live version
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                                            Line
 | 
			
		||||
#TOC> ----------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Storing the genetic code                           45
 | 
			
		||||
#TOC>   1.1        Genetic code in Biostrings                       63
 | 
			
		||||
#TOC>   2        Working with the genetic code                      94
 | 
			
		||||
#TOC>   2.1        Translate a sequence.                           129
 | 
			
		||||
#TOC>   3        An alternative representation: 3D array           212
 | 
			
		||||
#TOC>   3.1        Print a Genetic code table                      246
 | 
			
		||||
#TOC>   4        Tasks                                             272
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Storing the genetic code  ============================================
 | 
			
		||||
 | 
			
		||||
# The genetic code maps trinucleotide codons to amino acids. To store it, we
 | 
			
		||||
# need some mechanism to associate the two representations. The most
 | 
			
		||||
# convenient way to do that is a "named vector" which holds the amino acid
 | 
			
		||||
# code and assigns the codons as names to its elements.
 | 
			
		||||
 | 
			
		||||
x <- c("M", "H", "H", "*", "*", "*")
 | 
			
		||||
names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA")
 | 
			
		||||
x
 | 
			
		||||
 | 
			
		||||
# Then we can access the vector by the codon as name, and retrieve the
 | 
			
		||||
# amino acid ...
 | 
			
		||||
 | 
			
		||||
x["ATG"]
 | 
			
		||||
x["CAC"]
 | 
			
		||||
x["TAA"]
 | 
			
		||||
 | 
			
		||||
# ... or the names of elements, to retrieve the codon(s)
 | 
			
		||||
names(x)[x == "M"]
 | 
			
		||||
names(x)[x == "H"]
 | 
			
		||||
names(x)[x == "*"]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Genetic code in Biostrings  ========================================
 | 
			
		||||
 | 
			
		||||
# Conveniently, the standard genetic code as well as its alternatives are
 | 
			
		||||
# available in the Bioconductor "Biostrings" package:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The standard genetic code vector
 | 
			
		||||
Biostrings::GENETIC_CODE
 | 
			
		||||
 | 
			
		||||
# The table of genetic codes. This information corresponds to this page
 | 
			
		||||
# at the NCBI:
 | 
			
		||||
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
 | 
			
		||||
Biostrings::GENETIC_CODE_TABLE
 | 
			
		||||
 | 
			
		||||
# Most of the alternative codes are mitochondrial codes. The id of the
 | 
			
		||||
# Alternative Yeast Nuclear code is "12"
 | 
			
		||||
Biostrings::getGeneticCode("12")  # Alternative Yeast Nuclear
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Working with the genetic code  =======================================
 | 
			
		||||
 | 
			
		||||
# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
 | 
			
		||||
# to a "local" variable, rather than retrieving it from the package all the
 | 
			
		||||
# time.
 | 
			
		||||
 | 
			
		||||
GC <- Biostrings::GENETIC_CODE
 | 
			
		||||
 | 
			
		||||
# This is a named vector of characters ...
 | 
			
		||||
 | 
			
		||||
str(GC)
 | 
			
		||||
 | 
			
		||||
# ... which also stores the alternative initiation codons TTG and CTG in
 | 
			
		||||
# an attribute of the vector. (Alternative initiation codons sometimes are
 | 
			
		||||
# used instead of ATG to initiate translation, if translation is not initiated
 | 
			
		||||
# at ATG these are still translated with fMet.)
 | 
			
		||||
 | 
			
		||||
attr(GC, "alt_init_codons")
 | 
			
		||||
 | 
			
		||||
# But the key to use this vector is in the "names" which we use for subsetting
 | 
			
		||||
# the list of amino acids in whatever way we need.
 | 
			
		||||
names(GC)
 | 
			
		||||
 | 
			
		||||
# The translation of "TGG" ...
 | 
			
		||||
GC["TGG"]
 | 
			
		||||
 | 
			
		||||
# All stop codons
 | 
			
		||||
names(GC)[GC == "*"]
 | 
			
		||||
 | 
			
		||||
# All start codons
 | 
			
		||||
names(GC)[GC == "M"] # ... or
 | 
			
		||||
c(names(GC)[GC == "M"],
 | 
			
		||||
  attr(GC, "alt_init_codons"))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Translate a sequence.  =============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# I have provided a gene sequence in the data directory:
 | 
			
		||||
# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
 | 
			
		||||
 | 
			
		||||
# read it
 | 
			
		||||
mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
 | 
			
		||||
 | 
			
		||||
# You will notice that this generates a Warning message:
 | 
			
		||||
#      Warning message:
 | 
			
		||||
#        In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
 | 
			
		||||
#        incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
 | 
			
		||||
 | 
			
		||||
# The reason for this is that the last character of the file is the letter "A"
 | 
			
		||||
# and not a "\n" line break. This file is exactly how it was sent from the
 | 
			
		||||
# NCBI server; I think good, defensive programming practice would have been to
 | 
			
		||||
# include some kind of an end-marker in the file, like a final "\n". This helps
 | 
			
		||||
# us recognize an incomplete transmission. Let's parse the actual sequence from
 | 
			
		||||
# the file, and then check for completeness.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
head(mbp1)
 | 
			
		||||
 | 
			
		||||
# drop the first line (header)
 | 
			
		||||
mbp1 <- mbp1[-1]
 | 
			
		||||
head(mbp1)
 | 
			
		||||
 | 
			
		||||
# concatenate it all to a single string
 | 
			
		||||
mbp1 <- paste(mbp1, sep = "", collapse = "")
 | 
			
		||||
 | 
			
		||||
# how long is it?
 | 
			
		||||
nchar(mbp1)
 | 
			
		||||
 | 
			
		||||
# how many codons?
 | 
			
		||||
nchar(mbp1)/3
 | 
			
		||||
 | 
			
		||||
# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
 | 
			
		||||
# first verification that the file we read is complete, the nucleotides of a
 | 
			
		||||
# complete ORF should be divisible by 3.
 | 
			
		||||
 | 
			
		||||
# Extract the codons. There are many ways to split a long string into chunks
 | 
			
		||||
# of three characters. Here we use the Biostrings  codons()  function. codons()
 | 
			
		||||
# requires an object of type DNAString - a special kind of string with
 | 
			
		||||
# attributes that are useful for Biostrings. Thus we convert the sequence first
 | 
			
		||||
# with DNAString(), then split it up, then convert it into a plain
 | 
			
		||||
# character vector.
 | 
			
		||||
mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
 | 
			
		||||
 | 
			
		||||
head(mbp1Codons)
 | 
			
		||||
 | 
			
		||||
# now translate each codon
 | 
			
		||||
 | 
			
		||||
mbp1AA <- character(length(mbp1Codons))  # preallocate one slot per codon
 | 
			
		||||
for (i in seq_along(mbp1Codons)) {
 | 
			
		||||
  mbp1AA[i] <- GC[mbp1Codons[i]]
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
head(mbp1Codons)
 | 
			
		||||
head(mbp1AA)
 | 
			
		||||
 | 
			
		||||
tail(mbp1Codons)
 | 
			
		||||
tail(mbp1AA) # Note the stop!
 | 
			
		||||
 | 
			
		||||
# The TAA "ochre" stop codon is our second verification that the nucleotide
 | 
			
		||||
# sequence is complete: a stop codon can't appear internally in an ORF.
 | 
			
		||||
 | 
			
		||||
# We can work with the mbp1AA vector, for example to tabulate the
 | 
			
		||||
# amino acid frequencies:
 | 
			
		||||
table(mbp1AA)
 | 
			
		||||
sort(table(mbp1AA), decreasing = TRUE)
 | 
			
		||||
 | 
			
		||||
# Or we can paste all elements together into a single string. But let's remove
 | 
			
		||||
# the stop, it's not actually a part of the sequence. To remove the last element
 | 
			
		||||
# of a vector, re-assign it with a vector minus the index of the last element:
 | 
			
		||||
mbp1AA <- mbp1AA[-(length(mbp1AA))]
 | 
			
		||||
tail(mbp1AA) # Note the stop is gone!
 | 
			
		||||
 | 
			
		||||
# paste it together, collapsing the elements using an empty string as the
 | 
			
		||||
# separation-character (i.e.: nothing)
 | 
			
		||||
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  An alternative representation: 3D array  =============================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We don't use 3D arrays often - usually just 2D tables and data frames, so
 | 
			
		||||
# here is a good opportunity to review the syntax of 3D arrays with a
 | 
			
		||||
# genetic code cube:
 | 
			
		||||
 | 
			
		||||
# Initialize, using A G C T as the names of the elements in each dimension
 | 
			
		||||
cCube <- array(data     = character(64),
 | 
			
		||||
               dim      = c(4, 4, 4),
 | 
			
		||||
               dimnames = list(c("A", "G", "C", "T"),
 | 
			
		||||
                               c("A", "G", "C", "T"),
 | 
			
		||||
                               c("A", "G", "C", "T")))
 | 
			
		||||
 | 
			
		||||
# fill it with amino acid codes using three nested loops
 | 
			
		||||
for (i in 1:4) {
 | 
			
		||||
  for (j in 1:4) {
 | 
			
		||||
    for (k in 1:4) {
 | 
			
		||||
      myCodon <- paste(dimnames(cCube)[[1]][i],
 | 
			
		||||
                       dimnames(cCube)[[2]][j],
 | 
			
		||||
                       dimnames(cCube)[[3]][k],
 | 
			
		||||
                       sep = "",
 | 
			
		||||
                       collapse = "")
 | 
			
		||||
      cCube[i, j, k] <- GC[myCodon]
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# confirm
 | 
			
		||||
cCube["A", "T", "G"] # methionine
 | 
			
		||||
cCube["T", "T", "T"] # phenylalanine
 | 
			
		||||
cCube["T", "A", "G"] # stop (amber)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Print a Genetic code table  ========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The data structure of our cCube is well suited to print a table. In the
 | 
			
		||||
# "standard" way to print the genetic code, we write codons with the same
 | 
			
		||||
# second nucleotide in columns, and arrange rows in blocks of same
 | 
			
		||||
# first nucleotide, varying the third nucleotide fastest. This maximizes the
 | 
			
		||||
# similarity of adjacent amino acids in the table if we print the
 | 
			
		||||
# nucleotides in the order T C A G. It's immediately obvious that the code
 | 
			
		||||
# is not random: the universal genetic code is exceptionally error tolerant in
 | 
			
		||||
# the sense that mutations (or single-nucleotide translation errors) are likely
 | 
			
		||||
# to result in an amino acid with similar biophysical properties as the
 | 
			
		||||
# original.
 | 
			
		||||
 | 
			
		||||
nuc <- c("T", "C", "A", "G")
 | 
			
		||||
 | 
			
		||||
# (calling variables f, s, t to indicate first, second, and third position ...)
 | 
			
		||||
for (f in nuc) {      # first varies in blocks
 | 
			
		||||
  for (t in nuc) {    # third varies in columns
 | 
			
		||||
    for (s in nuc) {  # second varies in rows
 | 
			
		||||
      cat(sprintf("%s%s%s: %s   ", f, s, t, cCube[f, s, t]))
 | 
			
		||||
    }
 | 
			
		||||
    cat("\n")
 | 
			
		||||
  }
 | 
			
		||||
  cat("\n")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Tasks  ===============================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: What do you need to change to print the table with U instead
 | 
			
		||||
#         of T? Try it.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: Point mutations are more often transitions (purine -> purine;
 | 
			
		||||
#         pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
 | 
			
		||||
#         pyrimidine -> purine), even though twice as many transversions
 | 
			
		||||
#         are possible in the code. This is most likely due to a deamination /
 | 
			
		||||
#         tautomerization process that favours C -> T changes. If the code
 | 
			
		||||
#         indeed minimizes the effect of mutations, you would expect that
 | 
			
		||||
#         codons that differ by a transition code for more similar amino acids
 | 
			
		||||
#         than codons that differ by a transversion. Is that true? List the set
 | 
			
		||||
#         of all amino acid pairs that are encoded by codons with a C -> T
 | 
			
		||||
#         transition. Then list the set of amino acid pairs with a C -> A
 | 
			
		||||
#         transversion. Which set of pairs is more similar?
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
 | 
			
		||||
#         have if you translate them in the 2. or the 3. frame?
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: How does the amino acid composition change if you translate the mbp1
 | 
			
		||||
#         gene with the Alternative Yeast Nuclear code that is used by the
 | 
			
		||||
#         "GTC clade" of fungi?
 | 
			
		||||
#         (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
 | 
			
		||||
 | 
			
		||||
# Solution:
 | 
			
		||||
 | 
			
		||||
    # Fetch the code
 | 
			
		||||
    Biostrings::GENETIC_CODE_TABLE
 | 
			
		||||
    Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
 | 
			
		||||
    altYcode <- Biostrings::getGeneticCode("12")
 | 
			
		||||
 | 
			
		||||
    # what's the difference?
 | 
			
		||||
    (delta <- which(Biostrings::GENETIC_CODE != altYcode))
 | 
			
		||||
 | 
			
		||||
    Biostrings::GENETIC_CODE[delta]
 | 
			
		||||
    altYcode[delta]
 | 
			
		||||
 | 
			
		||||
    # translate
 | 
			
		||||
    altYAA <- character(834)
 | 
			
		||||
    for (i in seq_along(mbp1Codons)) {
 | 
			
		||||
      altYAA[i] <- altYcode[mbp1Codons[i]]
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    table(mbp1AA)
 | 
			
		||||
    table(altYAA)
 | 
			
		||||
 | 
			
		||||
# Task: The genetic code has significant redundancy, i.e. there are up to six
 | 
			
		||||
#         codons that code for the same amino acid. Write code that lists how
 | 
			
		||||
#         many amino acids are present how often i.e. it should tell you that
 | 
			
		||||
#         two amino acids are encoded only with a single codon, three amino
 | 
			
		||||
#         acids have six codons, etc. Solution below, but don't peek. There
 | 
			
		||||
#         are many possible ways to do this.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Solution:
 | 
			
		||||
( x <- table(table(Biostrings::GENETIC_CODE)) )
 | 
			
		||||
 | 
			
		||||
# confirm
 | 
			
		||||
sum(x * as.numeric(names(x)))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "FND-Genetic_code.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the FND-Genetic_code unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017  10  -  2019  01
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.0.1  Comment on "incomplete final line" warning in FASTA
 | 
			
		||||
#           1.0    First live version
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                                            Line
 | 
			
		||||
#TOC> ----------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Storing the genetic code                           45
 | 
			
		||||
#TOC>   1.1        Genetic code in Biostrings                       63
 | 
			
		||||
#TOC>   2        Working with the genetic code                      94
 | 
			
		||||
#TOC>   2.1        Translate a sequence.                           129
 | 
			
		||||
#TOC>   3        An alternative representation: 3D array           212
 | 
			
		||||
#TOC>   3.1        Print a Genetic code table                      246
 | 
			
		||||
#TOC>   4        Tasks                                             272
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Storing the genetic code  ============================================
 | 
			
		||||
 | 
			
		||||
# The genetic code maps trinucleotide codons to amino acids. To store it, we
 | 
			
		||||
# need some mechanism to associate the two representations. The most
 | 
			
		||||
# convenient way to do that is a "named vector" which holds the amino acid
 | 
			
		||||
# code and assigns the codons as names to its elements.
 | 
			
		||||
 | 
			
		||||
x <- c("M", "H", "H", "*", "*", "*")
 | 
			
		||||
names(x) <- c("ATG", "CAC", "CAT", "TAA", "TAG", "TGA")
 | 
			
		||||
x
 | 
			
		||||
 | 
			
		||||
# Then we can access the vector by the codon as name, and retrieve the
 | 
			
		||||
# amino acid ...
 | 
			
		||||
 | 
			
		||||
x["ATG"]
 | 
			
		||||
x["CAC"]
 | 
			
		||||
x["TAA"]
 | 
			
		||||
 | 
			
		||||
# ... or the names of elements, to retrieve the codon(s)
 | 
			
		||||
names(x)[x == "M"]
 | 
			
		||||
names(x)[x == "H"]
 | 
			
		||||
names(x)[x == "*"]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Genetic code in Biostrings  ========================================
 | 
			
		||||
 | 
			
		||||
# Conveniently, the standard genetic code as well as its alternatives are
 | 
			
		||||
# available in the Bioconductor "Biostrings" package:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The standard genetic code vector
 | 
			
		||||
Biostrings::GENETIC_CODE
 | 
			
		||||
 | 
			
		||||
# The table of genetic codes. This information corresponds to this page
 | 
			
		||||
# at the NCBI:
 | 
			
		||||
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
 | 
			
		||||
Biostrings::GENETIC_CODE_TABLE
 | 
			
		||||
 | 
			
		||||
# Most of the alternative codes are mitochondrial codes. The id of the
 | 
			
		||||
# Alternative Yeast Nuclear code is "12"
 | 
			
		||||
Biostrings::getGeneticCode("12")  # Alternative Yeast Nuclear
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Working with the genetic code  =======================================
 | 
			
		||||
 | 
			
		||||
# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
 | 
			
		||||
# to a "local" variable, rather than retrieving it from the package all the
 | 
			
		||||
# time.
 | 
			
		||||
 | 
			
		||||
GC <- Biostrings::GENETIC_CODE
 | 
			
		||||
 | 
			
		||||
# This is a named vector of characters ...
 | 
			
		||||
 | 
			
		||||
str(GC)
 | 
			
		||||
 | 
			
		||||
# ... which also stores the alternative initiation codons TTG and CTG in
 | 
			
		||||
# an attribute of the vector. (Alternative initiation codons sometimes are
 | 
			
		||||
# used instead of ATG to initiate translation; if translation is not initiated
 | 
			
		||||
# at ATG, these are still translated with fMet.)
 | 
			
		||||
 | 
			
		||||
attr(GC, "alt_init_codons")
 | 
			
		||||
 | 
			
		||||
# But the key to use this vector is in the "names" which we use for subsetting
 | 
			
		||||
# the list of amino acids in whatever way we need.
 | 
			
		||||
names(GC)
 | 
			
		||||
 | 
			
		||||
# The translation of "TGG" ...
 | 
			
		||||
GC["TGG"]
 | 
			
		||||
 | 
			
		||||
# All stop codons
 | 
			
		||||
names(GC)[GC == "*"]
 | 
			
		||||
 | 
			
		||||
# All start codons
 | 
			
		||||
names(GC)[GC == "M"] # ... or
 | 
			
		||||
c(names(GC)[GC == "M"],
 | 
			
		||||
  attr(GC, "alt_init_codons"))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Translate a sequence.  =============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# I have provided a gene sequence in the data directory:
 | 
			
		||||
# S288C_YDL056W_MBP1_coding.fsa is the yeast Mbp1 FASTA sequence.
 | 
			
		||||
 | 
			
		||||
# read it
 | 
			
		||||
mbp1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
 | 
			
		||||
 | 
			
		||||
# You will notice that this generates a Warning message:
 | 
			
		||||
#      Warning message:
 | 
			
		||||
#        In readLines("./data/S288C_YDL056W_MBP1_coding.fsa") :
 | 
			
		||||
#        incomplete final line found on './data/S288C_YDL056W_MBP1_coding.fsa'
 | 
			
		||||
 | 
			
		||||
# The reason for this is that the last character of the file is the letter "A"
 | 
			
		||||
# and not a "\n" line break. This file is exactly how it was sent from the
 | 
			
		||||
# NCBI server; I think good, defensive programming practice would have been to
 | 
			
		||||
# include some kind of an end-marker in the file, like a final "\n". This helps
 | 
			
		||||
# us recognize an incomplete transmission. Let's parse the actual sequence from
 | 
			
		||||
# the file, and then check for completeness.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
head(mbp1)
 | 
			
		||||
 | 
			
		||||
# drop the first line (header)
 | 
			
		||||
mbp1 <- mbp1[-1]
 | 
			
		||||
head(mbp1)
 | 
			
		||||
 | 
			
		||||
# concatenate it all to a single string
 | 
			
		||||
mbp1 <- paste(mbp1, sep = "", collapse = "")
 | 
			
		||||
 | 
			
		||||
# how long is it?
 | 
			
		||||
nchar(mbp1)
 | 
			
		||||
 | 
			
		||||
# how many codons?
 | 
			
		||||
nchar(mbp1)/3
 | 
			
		||||
 | 
			
		||||
# That looks correct for the 833 aa sequence plus 1 stop codon. This gives us a
 | 
			
		||||
# first verification that the file we read is complete, the nucleotides of a
 | 
			
		||||
# complete ORF should be divisible by 3.
 | 
			
		||||
 | 
			
		||||
# Extract the codons. There are many ways to split a long string into chunks
 | 
			
		||||
# of three characters. Here we use the Biostrings  codons()  function. codons()
 | 
			
		||||
# requires an object of type DNAString - a special kind of string with
 | 
			
		||||
# attributes that are useful for Biostrings. Thus we convert the sequence first
 | 
			
		||||
# with DNAString(), then split it up, then convert it into a plain
 | 
			
		||||
# character vector.
 | 
			
		||||
mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
 | 
			
		||||
 | 
			
		||||
head(mbp1Codons)
 | 
			
		||||
 | 
			
		||||
# now translate each codon
 | 
			
		||||
 | 
			
		||||
mbp1AA <- character(834)
 | 
			
		||||
for (i in seq_along(mbp1Codons)) {
 | 
			
		||||
  mbp1AA[i] <- GC[mbp1Codons[i]]
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
head(mbp1Codons)
 | 
			
		||||
head(mbp1AA)
 | 
			
		||||
 | 
			
		||||
tail(mbp1Codons)
 | 
			
		||||
tail(mbp1AA) # Note the stop!
 | 
			
		||||
 | 
			
		||||
# The TAA "ochre" stop codon is our second verification that the nucleotide
 | 
			
		||||
# sequence is complete: a stop codon can't appear internally in an ORF.
 | 
			
		||||
 | 
			
		||||
# We can work with the mbp1AA vector, for example to tabulate the
 | 
			
		||||
# amino acid frequencies:
 | 
			
		||||
table(mbp1AA)
 | 
			
		||||
sort(table(mbp1AA), decreasing = TRUE)
 | 
			
		||||
 | 
			
		||||
# Or we can paste all elements together into a single string. But let's remove
 | 
			
		||||
# the stop, it's not actually a part of the sequence. To remove the last element
 | 
			
		||||
# of a vector, re-assign it with a vector minus the index of the last element:
 | 
			
		||||
mbp1AA <- mbp1AA[-(length(mbp1AA))]
 | 
			
		||||
tail(mbp1AA) # Note the stop is gone!
 | 
			
		||||
 | 
			
		||||
# paste it together, collapsing the elements using an empty string as the
 | 
			
		||||
# separation-character (i.e.: nothing)
 | 
			
		||||
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  An alternative representation: 3D array  =============================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We don't use 3D arrays often - usually just 2D tables and data frames, so
 | 
			
		||||
# here is a good opportunity to review the syntax of 3D arrays with a
 | 
			
		||||
# genetic code cube:
 | 
			
		||||
 | 
			
		||||
# Initialize, using A G C T as the names of the elements in each dimension
 | 
			
		||||
cCube <- array(data     = character(64),
 | 
			
		||||
               dim      = c(4, 4, 4),
 | 
			
		||||
               dimnames = list(c("A", "G", "C", "T"),
 | 
			
		||||
                               c("A", "G", "C", "T"),
 | 
			
		||||
                               c("A", "G", "C", "T")))
 | 
			
		||||
 | 
			
		||||
# fill it with amino acid codes using three nested loops
 | 
			
		||||
for (i in 1:4) {
 | 
			
		||||
  for (j in 1:4) {
 | 
			
		||||
    for (k in 1:4) {
 | 
			
		||||
      myCodon <- paste(dimnames(cCube)[[1]][i],
 | 
			
		||||
                       dimnames(cCube)[[2]][j],
 | 
			
		||||
                       dimnames(cCube)[[3]][k],
 | 
			
		||||
                       sep = "",
 | 
			
		||||
                       collapse = "")
 | 
			
		||||
      cCube[i, j, k] <- GC[myCodon]
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# confirm
 | 
			
		||||
cCube["A", "T", "G"] # methionine
 | 
			
		||||
cCube["T", "T", "T"] # phenylalanine
 | 
			
		||||
cCube["T", "A", "G"] # stop (amber)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Print a Genetic code table  ========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The data structure of our cCube is well suited to print a table. In the
 | 
			
		||||
# "standard" way to print the genetic code, we write codons with the same
 | 
			
		||||
# second nucleotide in columns, and arrange rows in blocks of same
 | 
			
		||||
# first nucleotide, varying the third nucleotide fastest. This maximizes the
 | 
			
		||||
# similarity of adjacent amino acids in the table if we print the
 | 
			
		||||
# nucleotides in the order T C A G. It's immediately obvious that the code
 | 
			
		||||
# is not random: the universal genetic code is exceptionally error tolerant in
 | 
			
		||||
# the sense that mutations (or single-nucleotide translation errors) are likely
 | 
			
		||||
# to result in an amino acid with similar biophysical properties as the
 | 
			
		||||
# original.
 | 
			
		||||
 | 
			
		||||
nuc <- c("T", "C", "A", "G")
 | 
			
		||||
 | 
			
		||||
# (calling variables f, s, t to indicate first, second, and third position ...)
 | 
			
		||||
for (f in nuc) {      # first varies in blocks
 | 
			
		||||
  for (t in nuc) {    # third varies in columns
 | 
			
		||||
    for (s in nuc) {  # second varies in rows
 | 
			
		||||
      cat(sprintf("%s%s%s: %s   ", f, s, t, cCube[f, s, t]))
 | 
			
		||||
    }
 | 
			
		||||
    cat("\n")
 | 
			
		||||
  }
 | 
			
		||||
  cat("\n")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Tasks  ===============================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: What do you need to change to print the table with U instead
 | 
			
		||||
#         of T? Try it.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: Point mutations are more often transitions (purine -> purine;
 | 
			
		||||
#         pyrimidine -> pyrimidine) than transversions (purine -> pyrimidine;
 | 
			
		||||
#         pyrimidine -> purine), even though twice as many transversions
 | 
			
		||||
#         are possible in the code. This is most likely due to a deamination /
 | 
			
		||||
#         tautomerization process that favours C -> T changes. If the code
 | 
			
		||||
#         indeed minimizes the effect of mutations, you would expect that
 | 
			
		||||
#         codons that differ by a transition code for more similar amino acids
 | 
			
		||||
#         than codons that differ by a transversion. Is that true? List the set
 | 
			
		||||
#         of all amino acid pairs that are encoded by codons with a C -> T
 | 
			
		||||
#         transition. Then list the set of amino acid pairs with a C -> A
 | 
			
		||||
#         transversion. Which set of pairs is more similar?
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: How many stop codons do the two mbp1-gene derived amino acid sequences
 | 
			
		||||
#         have if you translate them in the 2. or the 3. frame?
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: How does the amino acid composition change if you translate the mbp1
 | 
			
		||||
#         gene with the Alternative Yeast Nuclear code that is used by the
 | 
			
		||||
#         "GTC clade" of fungi?
 | 
			
		||||
#         (cf. https://en.wikipedia.org/wiki/Alternative_yeast_nuclear_code )
 | 
			
		||||
 | 
			
		||||
# Solution:
 | 
			
		||||
 | 
			
		||||
    # Fetch the code
 | 
			
		||||
    Biostrings::GENETIC_CODE_TABLE
 | 
			
		||||
    Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
 | 
			
		||||
    altYcode <- Biostrings::getGeneticCode("12")
 | 
			
		||||
 | 
			
		||||
    # what's the difference?
 | 
			
		||||
    (delta <- which(Biostrings::GENETIC_CODE != altYcode))
 | 
			
		||||
 | 
			
		||||
    Biostrings::GENETIC_CODE[delta]
 | 
			
		||||
    altYcode[delta]
 | 
			
		||||
 | 
			
		||||
    # translate
 | 
			
		||||
    altYAA <- character(834)
 | 
			
		||||
    for (i in seq_along(mbp1Codons)) {
 | 
			
		||||
      altYAA[i] <- altYcode[mbp1Codons[i]]
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    table(mbp1AA)
 | 
			
		||||
    table(altYAA)
 | 
			
		||||
 | 
			
		||||
# Task: The genetic code has significant redundancy, i.e. there are up to six
 | 
			
		||||
#         codons that code for the same amino acid. Write code that lists how
 | 
			
		||||
#         many amino acids are present how often i.e. it should tell you that
 | 
			
		||||
#         two amino acids are encoded only with a single codon, three amino
 | 
			
		||||
#         acids have six codons, etc. Solution below, but don't peek. There
 | 
			
		||||
#         are many possible ways to do this.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Solution:
 | 
			
		||||
( x <- table(table(Biostrings::GENETIC_CODE)) )
 | 
			
		||||
 | 
			
		||||
# confirm
 | 
			
		||||
sum(x * as.numeric(names(x)))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,224 +1,224 @@
 | 
			
		||||
# tocID <- "FND-STA-Information_theory.R"
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the FND-STA-Information_theory unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  0.2.1
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017 - 2021
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           0.2.1  Maintenance
 | 
			
		||||
#           0.2    Under development
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                  Line
 | 
			
		||||
#TOC> --------------------------------------
 | 
			
		||||
#TOC>   1        ___Section___            39
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  ___Section___  =======================================================
 | 
			
		||||
 | 
			
		||||
# What level of information is "significant"
 | 
			
		||||
 | 
			
		||||
# Assume the background distribution is the database frequencies of
 | 
			
		||||
# amino acids:
 | 
			
		||||
 | 
			
		||||
AAref <- numeric()  # Uniprot frequencies October 2017, slightly adjusted to
 | 
			
		||||
# sum to 1.0
 | 
			
		||||
AAref["A"] <- 0.0904
 | 
			
		||||
AAref["C"] <- 0.0123
 | 
			
		||||
AAref["D"] <- 0.0545
 | 
			
		||||
AAref["E"] <- 0.0617
 | 
			
		||||
AAref["F"] <- 0.0394
 | 
			
		||||
AAref["G"] <- 0.0724
 | 
			
		||||
AAref["H"] <- 0.0221
 | 
			
		||||
AAref["I"] <- 0.0573
 | 
			
		||||
AAref["K"] <- 0.0504
 | 
			
		||||
AAref["L"] <- 0.0986
 | 
			
		||||
AAref["M"] <- 0.0240
 | 
			
		||||
AAref["N"] <- 0.0392
 | 
			
		||||
AAref["P"] <- 0.0486
 | 
			
		||||
AAref["Q"] <- 0.0381
 | 
			
		||||
AAref["R"] <- 0.0570
 | 
			
		||||
AAref["S"] <- 0.0673
 | 
			
		||||
AAref["T"] <- 0.0558
 | 
			
		||||
AAref["V"] <- 0.0686
 | 
			
		||||
AAref["W"] <- 0.0129
 | 
			
		||||
AAref["Y"] <- 0.0294
 | 
			
		||||
sum(AAref)
 | 
			
		||||
 | 
			
		||||
# Function to calculate Shannon entropy
 | 
			
		||||
H <- function(pmf) {
  # Calculate Shannon entropy
  # Parameters:
  #   pmf (numeric) probability mass function: a vector of states and
  #                 associated probabilities. Each element of
  #                 pmf must be in (0, 1] and sum(pmf) must be 1
  #                 (within the default tolerance of all.equal()).
  # Value:
  #   Shannon entropy in bits.
  # Errors:
  #   stop()s if pmf is not numeric, contains NA, has elements outside
  #   (0, 1], or does not sum to 1.
  # Examples:
  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
  #                                         # nucleotide sequence
  #   H(1)     # If all elements are the same, entropy is zero
  #

  # Note: all.equal() returns either TRUE or a character string describing
  # the difference - it never returns FALSE. Near-equality must therefore be
  # tested with isTRUE(); isFALSE() would never detect a mismatch.
  # NA values are rejected first, since they would leave the condition of
  # if() undefined.
  if (! is.numeric(pmf) ||
      anyNA(pmf) ||
      any(pmf <= 0 | pmf > 1) ||
      ! isTRUE(all.equal(1.0, sum(pmf)))) {
    stop("Input is not a discrete probability distribution.")
  }

  # log2() gives the entropy in bits
  return(-sum(pmf * log2(pmf)))
}
 | 
			
		||||
 | 
			
		||||
# Why use all.equal()? Exact comparisons with floating point numbers are
 | 
			
		||||
# brittle. Consider for example:
 | 
			
		||||
1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
 | 
			
		||||
print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
 | 
			
		||||
# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Entropy of the database frequencies (in bits):
 | 
			
		||||
(Href <- H(AAref))
 | 
			
		||||
 | 
			
		||||
# for comparison: entropy if all amino acids are equiprobable
 | 
			
		||||
H(rep(0.05, 20))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Set up a simulation to estimate the distribution of Information values
 | 
			
		||||
# from random sequences drawn from AAref. This is the distribution for the
 | 
			
		||||
# statistical null hypothesis:
 | 
			
		||||
nObs <- 15                      # number of observations (e.g aligned sequences)
 | 
			
		||||
# nObs <- 80
 | 
			
		||||
nTrials <- 10000                # number of trials
 | 
			
		||||
IObs <- numeric(nTrials)        # vector to store Information in each trial
 | 
			
		||||
simCounts <- numeric(20)        # vector to tabulate our information ...
 | 
			
		||||
names(simCounts) <- names(AAref)# ... with the names of AAref
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
for (i in 1:nTrials) {  # simulate ...
 | 
			
		||||
 | 
			
		||||
  # sample AAref letters, nObs times, with the probabilities of AAref:
 | 
			
		||||
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
 | 
			
		||||
 | 
			
		||||
  x <- table(AAobs)                            # table simulated observations
 | 
			
		||||
  simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
 | 
			
		||||
  simCounts[names(x)] <- x                     # overwrite with observed counts
 | 
			
		||||
  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
 | 
			
		||||
  Hobs <- H(simCounts/sum(simCounts))          # counts to frequency, calc. H
 | 
			
		||||
  IObs[i] <- Href - Hobs                       # store information
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# evaluate
 | 
			
		||||
hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
 | 
			
		||||
abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
 | 
			
		||||
 | 
			
		||||
# The purple lines are drawn at the 5% and 95% quantiles of the IObs distribution -
 | 
			
		||||
# i.e. an actual observation that lies outside the purple lines is deemed
 | 
			
		||||
# "significant"(1)(2). Of course, this is only true to the degree that the
 | 
			
		||||
# database frequencies are a valid model for the null-hypothesis on the
 | 
			
		||||
# sequence position we are considering here.
 | 
			
		||||
 | 
			
		||||
#  (1) If we use 5% quantiles, this means a value is significantly larger
 | 
			
		||||
#      than expected, and we ignore cases when the value is < 0; if we
 | 
			
		||||
#      consider both smaller and larger values, we need to use 2.5% quantiles,
 | 
			
		||||
#      since 5% of all observations lie outside the 0.025 and 0.975
 | 
			
		||||
#      quantiles.
 | 
			
		||||
#
 | 
			
		||||
#  (2) For an actual observation of counts, we calculate its observed
 | 
			
		||||
#      _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# You can probably now appreciate that information is a bit of a shortcut for
 | 
			
		||||
# biological sequences, and does not really take the different inherent
 | 
			
		||||
# frequencies based on the character of the amino acids into account. For
 | 
			
		||||
# example, L is the most frequent and C is the least frequent, but if we have an
 | 
			
		||||
# alignment of 1000 sequences and we see that the frequencies for L and C are
 | 
			
		||||
# swapped, that would be _very_ surprising - nevertheless, the information would
 | 
			
		||||
# be 0. In order to take that into account, we should actually compute
 | 
			
		||||
# Kullback-Leibler divergences.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Swap C and L frequencies
 | 
			
		||||
p <- AAref
 | 
			
		||||
q <- AAref
 | 
			
		||||
q["L"] <- AAref["C"]
 | 
			
		||||
q["C"] <- AAref["L"]
 | 
			
		||||
H(p)
 | 
			
		||||
H(q)
 | 
			
		||||
 | 
			
		||||
KLdiv <- function(p, q) {
  # Calculate the Kullback-Leibler divergence between two pmfs
  # Parameters:
  #   p, q (numeric) pmfs of two discrete probability distributions with
  #                  the same outcomes, which are nowhere 0. Both vectors
  #                  must have the same length and contain only positive,
  #                  non-missing values.
  # Value:
  #   Kullback-Leibler divergence  sum(p * log(p / q)), computed with the
  #   natural logarithm (i.e. in nats, not bits).
  # Examples:
  #   KLdiv(c(0.5, 0.5), c(0.5, 0.5))    # 0: identical distributions
  #   KLdiv(c(0.5, 0.5), c(0.25, 0.75))  # > 0: distributions differ

  if (length(p) != length(q)) {
    stop("PANIC: input vector lengths differ!")
  }
  # Reject non-numeric or missing input explicitly: the comparisons below
  # would otherwise yield NA and a cryptic error in if().
  if (! is.numeric(p) || ! is.numeric(q) || anyNA(p) || anyNA(q)) {
    stop("PANIC: input vectors must be numeric and must not contain NA!")
  }
  if (any(c((p == 0), (q == 0)))) {
    stop("PANIC: 0's found in input vectors!")
  }
  # Negative values are not probabilities; log() of a negative ratio would
  # silently produce NaN.
  if (any(c((p < 0), (q < 0)))) {
    stop("PANIC: negative values found in input vectors!")
  }

  return(sum(p * log( p / q )))
}
 | 
			
		||||
 | 
			
		||||
KLdiv(p, p)
 | 
			
		||||
KLdiv(p, q)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
nObs <- 15                      # number of observations (e.g aligned sequences)
 | 
			
		||||
# nObs <- 80
 | 
			
		||||
nTrials <- 10000                # number of trials
 | 
			
		||||
KLdivObs <- numeric(nTrials)        # vector to store Information in each trial
 | 
			
		||||
simCounts <- numeric(20)        # vector to tabulate our information ...
 | 
			
		||||
names(simCounts) <- names(AAref)# ... with the names of AAref
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
for (i in 1:nTrials) {  # simulate ...
 | 
			
		||||
 | 
			
		||||
  # sample AAref letters, nObs times, with the probabilities of AAref:
 | 
			
		||||
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)
 | 
			
		||||
 | 
			
		||||
  x <- table(AAobs)                            # table simulated observations
 | 
			
		||||
  simCounts[1:20] <- rep(0, length(simCounts)) # initialize simCounts to 0
 | 
			
		||||
  simCounts[names(x)] <- x                     # overwrite with observed counts
 | 
			
		||||
  simCounts <- simCounts + 0.5                 # add Jeffreys' pseudocounts
 | 
			
		||||
  simCounts <- simCounts/sum(simCounts)        # counts to frequency
 | 
			
		||||
  KLdivObs[i] <- sum(simCounts * log( simCounts / AAref )) # store KLdiv
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# evaluate
 | 
			
		||||
hist(KLdivObs, col = "#C9F4E3", breaks = 25)
 | 
			
		||||
abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
 | 
			
		||||
quantile(KLdivObs, 0.992)
 | 
			
		||||
 | 
			
		||||
# Running the simulation with KL does not give a fundamentally
 | 
			
		||||
# different behaviour - since we are just randomly sampling. But KL would be
 | 
			
		||||
# more sensitive in case there is biological selection, where the sampling is no
 | 
			
		||||
# longer random. If I run the same simulation, with nObs <- 80 but calculating
 | 
			
		||||
# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
 | 
			
		||||
# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
 | 
			
		||||
# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
 | 
			
		||||
# nice addition to the toolbox.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "FND-STA-Information_theory.R"
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the FND-STA-Information_theory unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  0.2.1
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017 - 2021
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           0.2.1  Maintenance
 | 
			
		||||
#           0.2    Under development
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                  Line
 | 
			
		||||
#TOC> --------------------------------------
 | 
			
		||||
#TOC>   1        ___Section___            39
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  ___Section___  =======================================================
 | 
			
		||||
 | 
			
		||||
# What level of information is "significant"
 | 
			
		||||
 | 
			
		||||
# Assume the background distribution is the database frequencies of
 | 
			
		||||
# amino acids:
 | 
			
		||||
 | 
			
		||||
# Background distribution: Uniprot amino acid frequencies (October 2017),
# slightly adjusted to sum to 1.0. The element names are the one-letter
# amino acid codes, in alphabetical order.
AAref <- c(A = 0.0904, C = 0.0123, D = 0.0545, E = 0.0617, F = 0.0394,
           G = 0.0724, H = 0.0221, I = 0.0573, K = 0.0504, L = 0.0986,
           M = 0.0240, N = 0.0392, P = 0.0486, Q = 0.0381, R = 0.0570,
           S = 0.0673, T = 0.0558, V = 0.0686, W = 0.0129, Y = 0.0294)

sum(AAref)   # confirm: the frequencies add up to 1
 | 
			
		||||
 | 
			
		||||
# Function to calculate Shannon entropy
 | 
			
		||||
H <- function(pmf) {
  # Calculate Shannon entropy
  # Parameters:
  #   pmf (numeric) probability mass function: a vector of states and
  #                 associated probabilities. Each element of
  #                 pmf must be in (0, 1] and sum(pmf) must be 1.
  # Value:
  #   Shannon entropy in bits.
  # Errors:
  #   Stops with "Input is not a discrete probability distribution." if pmf
  #   is not numeric, contains NA, has an element outside (0, 1], or does
  #   not sum to 1 (within the tolerance of all.equal()).
  # Examples:
  #   H(c(A=0.25, C=0.25, G=0.25, T=0.25))  # 2 bits entropy in a random
  #                                         # nucleotide sequence
  #   H(1)     # If all elements are the same, entropy is zero
  #

  # all.equal() returns either TRUE or a character string that describes the
  # difference - it never returns FALSE. Its result must therefore be tested
  # with isTRUE(); a test with isFALSE() can never detect a mismatch.
  if (! is.numeric(pmf) ||
      anyNA(pmf) ||
      any(pmf <= 0 | pmf > 1) ||
      ! isTRUE(all.equal(1.0, sum(pmf)))) {
    stop("Input is not a discrete probability distribution.")
  }

  return(-sum(pmf * log2(pmf)))
}
 | 
			
		||||
 | 
			
		||||
# Why use all.equal()? Exact comparisons with floating point numbers are
 | 
			
		||||
# brittle. Consider for example:
 | 
			
		||||
1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
 | 
			
		||||
print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
 | 
			
		||||
# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Entropy of the database frequencies (in bits):
 | 
			
		||||
(Href <- H(AAref))
 | 
			
		||||
 | 
			
		||||
# for comparison: entropy if all amino acids are equiprobable
 | 
			
		||||
H(rep(0.05, 20))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Set up a simulation to estimate the distribution of Information values
 | 
			
		||||
# from random sequences drawn from AAref. This is the distribution for the
 | 
			
		||||
# statistical null hypothesis:
 | 
			
		||||
nObs <- 15                     # number of observations (e.g. aligned sequences)
# nObs <- 80
nTrials <- 10000               # number of trials
IObs <- numeric(nTrials)       # vector to store Information in each trial

# Vector to tabulate the simulated counts: one element per amino acid of
# AAref, named like AAref. The length is taken from AAref rather than being
# hard-coded.
simCounts <- numeric(length(AAref))
names(simCounts) <- names(AAref)


for (i in seq_len(nTrials)) {  # simulate ...

  # sample AAref letters, nObs times, with the probabilities of AAref:
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)

  x <- table(AAobs)                      # table simulated observations
  simCounts[] <- 0                       # reset all counts (names are kept)
  simCounts[names(x)] <- x               # overwrite with observed counts
  simCounts <- simCounts + 0.5           # add Jeffreys' pseudocounts
  Hobs <- H(simCounts / sum(simCounts))  # counts to frequency, calc. H
  IObs[i] <- Href - Hobs                 # store information
}
 | 
			
		||||
 | 
			
		||||
# evaluate
 | 
			
		||||
hist(IObs, col = "#C9F4E3", xlim = c(-0.2, 1.0), breaks = 25)
 | 
			
		||||
abline(v = quantile(IObs, c(0.05, 0.95)), col = "#AA00CC")
 | 
			
		||||
 | 
			
		||||
# The purple lines mark the 5% and 95% quantiles of the IObs distribution -
 | 
			
		||||
# i.e. an actual observation that lies outside the purple lines is deemed
 | 
			
		||||
# "significant"(1)(2). Of course, this is only true to the degree that the
 | 
			
		||||
# database frequencies are a valid model for the null-hypothesis on the
 | 
			
		||||
# sequence position we are considering here.
 | 
			
		||||
 | 
			
		||||
#  (1) If we use 5% quantiles, this means a value is significantly larger
 | 
			
		||||
#      than expected, and we ignore cases when the value is < 0; if we
 | 
			
		||||
#      consider both smaller and larger values, we need to use 2.5% quantiles,
 | 
			
		||||
#      since 5% of all observations lie outside the 0.025 and 0.975
 | 
			
		||||
#      quantiles.
 | 
			
		||||
#
 | 
			
		||||
#  (2) For an actual observation of counts, we calculate its observed
 | 
			
		||||
#      _empirical_p_Value_ as (nCounts + 1)/(nTotal + 1).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# You can probably now appreciate that information is a bit of a shortcut for
 | 
			
		||||
# biological sequences, and does not really take the different inherent
 | 
			
		||||
# frequencies based on the character of the amino acids into account. For
 | 
			
		||||
# example, L is the most frequent and C is the least frequent, but if we have an
 | 
			
		||||
# alignment of 1000 sequences and we see that the frequencies for L and C are
 | 
			
		||||
# swapped, that would be _very_ surprising - nevertheless, the information would
 | 
			
		||||
# be 0. In order to take that into account, we should actually compute
 | 
			
		||||
# Kullback-Leibler divergences.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Swap C and L frequencies
 | 
			
		||||
# p is the reference distribution; q is a copy of it in which the values
# stored under "L" and "C" have been exchanged.
p <- AAref
q <- p
q[c("L", "C")] <- AAref[c("C", "L")]

# Entropy of the reference and of the swapped distribution
H(p)
H(q)
 | 
			
		||||
 | 
			
		||||
KLdiv <- function(p, q) {
  # Kullback-Leibler divergence of two discrete probability distributions.
  # Parameters:
  #   p, q (numeric) pmfs over the same outcomes; both vectors must have the
  #                  same length and neither may contain a 0.
  # Value:
  #   sum(p * log(p / q)), computed with the natural logarithm.

  sameLength <- length(p) == length(q)
  if (! sameLength) {
    stop("PANIC: input vector lengths differ!")
  }

  hasZero <- any(p == 0, q == 0)
  if (hasZero) {
    stop("PANIC: 0's found in input vectors!")
  }

  logRatio <- log(p / q)
  return(sum(p * logRatio))
}
 | 
			
		||||
 | 
			
		||||
KLdiv(p, p)
 | 
			
		||||
KLdiv(p, q)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
nObs <- 15                     # number of observations (e.g. aligned sequences)
# nObs <- 80
nTrials <- 10000               # number of trials
KLdivObs <- numeric(nTrials)   # vector to store the KL divergence of each trial

# Vector to tabulate the simulated counts: one element per amino acid of
# AAref, named like AAref. The length is taken from AAref rather than being
# hard-coded.
simCounts <- numeric(length(AAref))
names(simCounts) <- names(AAref)


for (i in seq_len(nTrials)) {  # simulate ...

  # sample AAref letters, nObs times, with the probabilities of AAref:
  AAobs <- sample(names(AAref), size = nObs, prob = AAref, replace = TRUE)

  x <- table(AAobs)                        # table simulated observations
  simCounts[] <- 0                         # reset all counts (names are kept)
  simCounts[names(x)] <- x                 # overwrite with observed counts
  simCounts <- simCounts + 0.5             # add Jeffreys' pseudocounts
  simCounts <- simCounts / sum(simCounts)  # counts to frequency

  # Divergence of the simulated frequencies from the reference distribution.
  # KLdiv() computes sum(p * log(p / q)); the pseudocounts guarantee that
  # no frequency is 0.
  KLdivObs[i] <- KLdiv(simCounts, AAref)   # store KLdiv
}
 | 
			
		||||
 | 
			
		||||
# evaluate
 | 
			
		||||
hist(KLdivObs, col = "#C9F4E3", breaks = 25)
 | 
			
		||||
abline(v = quantile(KLdivObs, c(0.05, 0.95)), col = "#AA00CC")
 | 
			
		||||
quantile(KLdivObs, 0.992)
 | 
			
		||||
 | 
			
		||||
# Running the simulation with KL does not give a fundamentally
 | 
			
		||||
# different behaviour - since we are just randomly sampling. But KL would be
 | 
			
		||||
# more sensitive in case there is biological selection, where the sampling is no
 | 
			
		||||
# longer random. If I run the same simulation, with nObs <- 80 but calculating
 | 
			
		||||
# KLdiv instead of information, I get a 5% quantile at 0.15 - but the C/L
 | 
			
		||||
# frequency swap gives me a KL divergence of 0.18 - this is significant at p =
 | 
			
		||||
# 0.008 - (remember, Information is 0 in this case). So that's actually quite a
 | 
			
		||||
# nice addition to the toolbox.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,351 +1,351 @@
 | 
			
		||||
# tocID <- "FND-STA-Significance.R"
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the FND-STA-Significance unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.3
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-09  - 2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.3    2020 Maintenance. Add sample solution.
 | 
			
		||||
#           1.2    Update set.seed() usage
 | 
			
		||||
#           1.1    Corrected treatment of empirical p-value
 | 
			
		||||
#           1.0    First contents
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                              Line
 | 
			
		||||
#TOC> ------------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Significance and p-value                             49
 | 
			
		||||
#TOC>   1.1        Significance levels                                60
 | 
			
		||||
#TOC>   1.2        probability and p-value                            77
 | 
			
		||||
#TOC>   1.2.1          p-value illustrated                           109
 | 
			
		||||
#TOC>   2        One- or two-sided                                   165
 | 
			
		||||
#TOC>   3        Significance by integration                         209
 | 
			
		||||
#TOC>   4        Significance by simulation or permutation           215
 | 
			
		||||
#TOC>   5        Final tasks                                         327
 | 
			
		||||
#TOC>   6        Sample solutions                                    336
 | 
			
		||||
#TOC>   6.1                                                          338
 | 
			
		||||
#TOC>   6.2                                                          342
 | 
			
		||||
#TOC>   6.3                                                          346
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Significance and p-value  ============================================
 | 
			
		||||
 | 
			
		||||
# The idea of the probability of an event has a precise mathematical
 | 
			
		||||
# interpretation, but how is it useful to know the probability? Usually we are
 | 
			
		||||
# interested in whether we should accept or reject a hypothesis based on the
 | 
			
		||||
# observations we have. A rational way to do this is to say: if the probability
 | 
			
		||||
# of observing the data is very small under the null-hypothesis, then we will
 | 
			
		||||
# assume the observation is due to something other than the null-hypothesis. But
 | 
			
		||||
# what do we mean by the "probability of our observation"? And what is "very
 | 
			
		||||
# small"?
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Significance levels  ===============================================
 | 
			
		||||
 | 
			
		||||
# A "very small" probability is purely a matter of convention - a cultural
 | 
			
		||||
# convention. In the biomedical field we usually call probabilities of less than
 | 
			
		||||
# 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
 | 
			
		||||
# observations with a probability of less than 0.05 "significant" and if we want
 | 
			
		||||
# to highlight this in text or in a graph, we often mark them with an asterisk
 | 
			
		||||
# (*). Also we often call observations with a probability of less than 0.01
 | 
			
		||||
# "highly significant" and mark them with two asterisks (**). But there is no
 | 
			
		||||
# special significance in these numbers, the cutoff point for significance could
 | 
			
		||||
# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
 | 
			
		||||
# British statistician Ronald Fisher happened to propose for this purpose in
 | 
			
		||||
# 1925. Incidentally, Fisher later recommended to use different cutoffs for
 | 
			
		||||
# different purposes (cf.
 | 
			
		||||
# https://en.wikipedia.org/wiki/Statistical_significance).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.2  probability and p-value  ===========================================
 | 
			
		||||
 | 
			
		||||
# But what do we even mean by the probability of an observation?
 | 
			
		||||
# Assume I am drawing samples from a normal distribution with a mean of 0 and a
 | 
			
		||||
# standard deviation of 1. The sample I get is ...
 | 
			
		||||
 | 
			
		||||
set.seed(sqrt(5))
 | 
			
		||||
x <- rnorm(1)
 | 
			
		||||
set.seed(NULL)
 | 
			
		||||
 | 
			
		||||
print(x, digits = 22)
 | 
			
		||||
# [1] -0.8969145466249813791748
 | 
			
		||||
 | 
			
		||||
# So what's the probability of that number? Obviously, the probability of
 | 
			
		||||
# getting exactly this number is very, very, very small. But also obviously,
 | 
			
		||||
# this does not mean that observing this number is in any way significant - we
 | 
			
		||||
# always observe some number. That's not what we mean in this case. There are
 | 
			
		||||
# several implicit assumptions when we speak of the probability of an
 | 
			
		||||
# observation:
 | 
			
		||||
 | 
			
		||||
# 1: the observation can be compared to a probability distribution;
 | 
			
		||||
# 2: that distribution can be integrated between any specific value
 | 
			
		||||
#      and its upper and lower bounds (or +- infinity).
 | 
			
		||||
 | 
			
		||||
# Then what we really mean by the probability of an observation in the context
 | 
			
		||||
# of that distribution is: the probability of observing that value, or a value
 | 
			
		||||
# more extreme than the one we have. We call this the p-value. Note that we are
 | 
			
		||||
# not talking about an individual number anymore, we are talking about the area
 | 
			
		||||
# under the curve between our observation and the upper (or lower) bound of the
 | 
			
		||||
# curve, as a fraction of the whole.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   1.2.1  p-value illustrated                      
 | 
			
		||||
 | 
			
		||||
# Let's illustrate. First we draw a million random values from our
 | 
			
		||||
# standard, normal distribution:
 | 
			
		||||
 | 
			
		||||
N <- 1e6                             # one million
 | 
			
		||||
set.seed(112358)                     # set RNG seed for repeatable randomness
 | 
			
		||||
r <- rnorm(N)                        # N values from a normal distribution
 | 
			
		||||
set.seed(NULL)                       # reset the RNG
 | 
			
		||||
 | 
			
		||||
# Let's see what the distribution looks like:
 | 
			
		||||
 | 
			
		||||
(h <- hist(r))
 | 
			
		||||
 | 
			
		||||
# The histogram details are now available in the list h -  e.g. h$counts
 | 
			
		||||
 | 
			
		||||
# Where is the value we have drawn previously?
 | 
			
		||||
abline(v = x, col = "#EE0000")
 | 
			
		||||
 | 
			
		||||
# How many values are smaller?
 | 
			
		||||
sum(r < x)
 | 
			
		||||
 | 
			
		||||
# Let's color the bars:
 | 
			
		||||
#    first, make a vector of red and green colors for the bars with breaks
 | 
			
		||||
#    smaller and larger than x, white for the bar that contains x ...
 | 
			
		||||
# idx counts the histogram breaks that lie below x; it is used below as the
# index of the bar that is left white and that gets the two rectangles.
idx <- sum(h$breaks < x)

# Bar colors, left to right: red for the bars before idx, white for bar idx,
# green for the remaining bars.
hCol <- c(rep("#EE000044", idx - 1),
          "#FFFFFFFF",
          rep("#00EE0044", sum(h$breaks > x) - 1))

# ... then plot the histogram, with colored bars ...
hist(r, col = hCol)

# ... add two colored rectangles into the white bar: red from the bar's left
# edge up to x, green from x to the bar's right edge ...
xMin <- h$breaks[idx]
xMax <- h$breaks[idx + 1]
y <- h$counts[idx]
rect(xMin, 0, x, y, col = "#EE000044", border = TRUE)
rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE)

# ... and a red line for our observation.
abline(v = x, col = "#EE0000", lwd = 2)
 | 
			
		||||
 | 
			
		||||
# The p-value of our observation is the red area as a fraction of the
 | 
			
		||||
# whole histogram (red + green).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
#    Explain how the expression sum(r < x) works to give us a count of values
 | 
			
		||||
#    with the property we are looking for. E.g., examine -4:4 < x
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
#    Write an expression to estimate the probability that a value
 | 
			
		||||
#    drawn from the vector r is less-or-equal to x. The result you get
 | 
			
		||||
#    will depend on the exact values that went into the vector r but it should
 | 
			
		||||
#    be close to 0.185  That expression is the p-value associated with x.
 | 
			
		||||
#    (Sample solution 6.1)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  One- or two-sided  ===================================================
 | 
			
		||||
 | 
			
		||||
# The shape of our histogram confirms that the rnorm() function has returned
 | 
			
		||||
# values that appear distributed according to a normal distribution. In a normal
 | 
			
		||||
# distribution, readily available tables tell us that 5% of the values (i.e. our
 | 
			
		||||
# significance level) lie 1.96 (or approximately 2) standard deviations away
 | 
			
		||||
# from the mean. Is this the case here? How many values in our vector r are
 | 
			
		||||
# larger than 1.96?
 | 
			
		||||
 | 
			
		||||
sum(r > 1.96)
 | 
			
		||||
# [1] 24589
 | 
			
		||||
 | 
			
		||||
# Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why?
 | 
			
		||||
 | 
			
		||||
# The answer is: we have to be careful with two-sided distributions. 2 standard
 | 
			
		||||
# deviations away from the mean means either larger or smaller than 1.96 . This
 | 
			
		||||
# can give rise to errors. If we are simply interested in outliers, no
 | 
			
		||||
# matter larger or smaller, then the 1.96 SD cutoff for significance is correct.
 | 
			
		||||
# But if we are specifically interested in, say, larger values, because a
 | 
			
		||||
# smaller value is not meaningful, then the significance cutoff, expressed as
 | 
			
		||||
# standard deviations, is relaxed. We can use the quantile function to see what
 | 
			
		||||
# the cutoff values are:
 | 
			
		||||
 | 
			
		||||
quantile(r)
 | 
			
		||||
quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries
 | 
			
		||||
# close to ± 1.96, as expected
 | 
			
		||||
quantile(r, probs = 0.95) # for the single 5% boundary
 | 
			
		||||
# close to 1.64 . Check counts to confirm:
 | 
			
		||||
sum(r > quantile(r, probs = 0.95))
 | 
			
		||||
# [1] 50000
 | 
			
		||||
# which is 5%, as expected.
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
# Use abline() to add the p = 0.05 boundary for smaller values to the histogram.
 | 
			
		||||
# (Sample solution 6.2)
 | 
			
		||||
 | 
			
		||||
# To summarize: when we evaluate the significance of an event, we divide a
 | 
			
		||||
# probability distribution into two parts at the point where the event was
 | 
			
		||||
# observed. We then ask whether the integral over the more extreme part is less
 | 
			
		||||
# or more than 5% of the whole. If it is less, we deem the event to be
 | 
			
		||||
# significant.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Significance by integration  =========================================
 | 
			
		||||
 | 
			
		||||
# If the underlying probability distribution can be analytically or numerically
 | 
			
		||||
# integrated, the significance of an observation can be directly computed.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Significance by simulation or permutation  ===========================
 | 
			
		||||
 | 
			
		||||
# But whether the integration is correct, or relies on assumptions that may not
 | 
			
		||||
# be warranted for biological data, can be a highly technical question.
 | 
			
		||||
# Fortunately, we can often simply run a simulation, a random resampling, or a
 | 
			
		||||
# permutation and then count the number of outcomes, just as we did with our
 | 
			
		||||
# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
 | 
			
		||||
# p-value" is defined as (Nobs + 1) / (N + 1).  )
 | 
			
		||||
 | 
			
		||||
# Here is an example. Assume you have a protein sequence and
 | 
			
		||||
# you speculate that positively charged residues are close to negatively charged
 | 
			
		||||
# residues to balance charge locally. A statistic that would capture this is the
 | 
			
		||||
# mean minimum distance between all D,E residues and the closest R,K,H
 | 
			
		||||
# residue. Let's compute this for the sequence of yeast Mbp1.
 | 
			
		||||
 | 
			
		||||
MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
 | 
			
		||||
               "ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
 | 
			
		||||
               "SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
 | 
			
		||||
               "KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
 | 
			
		||||
               "QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
 | 
			
		||||
               "PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
 | 
			
		||||
               "FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
 | 
			
		||||
               "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
 | 
			
		||||
               "SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
 | 
			
		||||
               "ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
 | 
			
		||||
               "VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
 | 
			
		||||
               "IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
 | 
			
		||||
               "QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
 | 
			
		||||
               "IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA")
 | 
			
		||||
 | 
			
		||||
# first we split this string into individual characters:
 | 
			
		||||
v <- unlist(strsplit(MBP1, ""))   # one vector element per residue

# and find the positions of our charged residues

ED  <- grep("[ED]", v)            # indices of D and E in v
RKH <- grep("[RKH]", v)           # indices of R, K and H in v

# For each position in ED: the smallest absolute distance to any position
# in RKH. sep has one element per element of ED.
sep <- vapply(ED,
              function(pos) as.numeric(min(abs(RKH - pos))),
              numeric(1))
 | 
			
		||||
 | 
			
		||||
# Task: read and explain this bit of code
 | 
			
		||||
 | 
			
		||||
# Now that sep is computed, what does it look like?
 | 
			
		||||
 | 
			
		||||
table(sep)  # these are the minimum distances
 | 
			
		||||
# 24 of D,E residues are adjacent to R,K,H;
 | 
			
		||||
# the longest separation is 28 residues.
 | 
			
		||||
 | 
			
		||||
# What is the mean separation?
 | 
			
		||||
mean(sep)
 | 
			
		||||
 | 
			
		||||
# The value is 4.1 . Is this significant? Honestly, I would be hard pressed
 | 
			
		||||
# to solve this analytically. But by permutation it's soooo easy.
 | 
			
		||||
 | 
			
		||||
# First, we combine what we have done above into a function:
 | 
			
		||||
 | 
			
		||||
chSep <- function(v) {
  # Mean minimum separation of oppositely charged residues.
  # Parameter:
  #   v (char) a vector of amino acids in the one-letter code
  #            (upper or lower case letters are matched)
  # Value:
  #   (numeric) for every D or E position in v, the distance to the
  #             nearest R, K or H position is determined; the mean of
  #             these minimum distances is returned.

  acidic <- grep("[EDed]", v)      # positions of D, E
  basic  <- grep("[RKHrkh]", v)    # positions of R, K, H

  nearest <- vapply(acidic,
                    function(pos) as.numeric(min(abs(basic - pos))),
                    numeric(1))

  return(mean(nearest))
}
 | 
			
		||||
 | 
			
		||||
# Execute the function to define it.
 | 
			
		||||
 | 
			
		||||
# Confirm that the function gives the same result as the number we
 | 
			
		||||
# calculated above:
 | 
			
		||||
chSep(v)
 | 
			
		||||
 | 
			
		||||
# Now we can produce a random permutation of v, and recalculate
 | 
			
		||||
 | 
			
		||||
set.seed(pi)                       # set RNG seed for repeatable randomness
 | 
			
		||||
w <- sample(v, length(v))          # This shuffles the vector v. Memorize this
 | 
			
		||||
                                   # code paradigm. It is very useful.
 | 
			
		||||
set.seed(NULL)                     # reset the RNG
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
chSep(w)
 | 
			
		||||
# 3.773 ... that's actually less than what we had before.
 | 
			
		||||
 | 
			
		||||
# Let's do this 10000 times and record the results (takes a few seconds):
 | 
			
		||||
 | 
			
		||||
N <- 10000

# One value per trial: shuffle v, then compute the mean minimum charge
# separation of the shuffled sequence with chSep().
chs <- vapply(seq_len(N),
              function(trial) chSep(sample(v, length(v))),
              numeric(1))
 | 
			
		||||
 | 
			
		||||
hist(chs, breaks = 50)
 | 
			
		||||
abline(v = chSep(v), col = "#EE0000")
 | 
			
		||||
 | 
			
		||||
# Contrary to our expectations, the actual observed mean minimum charge
 | 
			
		||||
# separation seems to be larger than what we observe in randomly permuted
 | 
			
		||||
# sequences. But is this significant? Your task to find out.
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
# Calculate the empirical p-value for chSep(v)
 | 
			
		||||
# (Sample solution 6.3)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  Final tasks  =========================================================
 | 
			
		||||
 | 
			
		||||
# From chs, compute the empirical p-value of a mean minimum charge separation to
 | 
			
		||||
#   be larger or equal to the value observed for the yeast MBP1 sequence. Note
 | 
			
		||||
#   the result in your journal. Is it significant? Also note the result of
 | 
			
		||||
#   the following expression for validation:
 | 
			
		||||
seal(sum(chs))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    6  Sample solutions  ====================================================
 | 
			
		||||
 | 
			
		||||
# ==   6.1    ==================================================================
 | 
			
		||||
#
 | 
			
		||||
sum(r <= x) / length(r)
 | 
			
		||||
 | 
			
		||||
# ==   6.2    ==================================================================
 | 
			
		||||
#
 | 
			
		||||
abline(v = quantile(r, probs = c(0.05)))
 | 
			
		||||
 | 
			
		||||
# ==   6.3    ==================================================================
 | 
			
		||||
#
 | 
			
		||||
( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "FND-STA-Significance.R"
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the FND-STA-Significance unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.3
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-09  - 2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.3    2020 Maintenance. Add sample solution.
 | 
			
		||||
#           1.2    Update set.seed() usage
 | 
			
		||||
#           1.1    Corrected treatment of empirical p-value
 | 
			
		||||
#           1.0    First contents
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                              Line
 | 
			
		||||
#TOC> ------------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Significance and p-value                             49
 | 
			
		||||
#TOC>   1.1        Significance levels                                60
 | 
			
		||||
#TOC>   1.2        probability and p-value                            77
 | 
			
		||||
#TOC>   1.2.1          p-value illustrated                           109
 | 
			
		||||
#TOC>   2        One- or two-sided                                   165
 | 
			
		||||
#TOC>   3        Significance by integration                         209
 | 
			
		||||
#TOC>   4        Significance by simulation or permutation           215
 | 
			
		||||
#TOC>   5        Final tasks                                         327
 | 
			
		||||
#TOC>   6        Sample solutions                                    336
 | 
			
		||||
#TOC>   6.1                                                          338
 | 
			
		||||
#TOC>   6.2                                                          342
 | 
			
		||||
#TOC>   6.3                                                          346
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Significance and p-value  ============================================
 | 
			
		||||
 | 
			
		||||
# The idea of the probability of an event has a precise mathematical
 | 
			
		||||
# interpretation, but how is it useful to know the probability? Usually we are
 | 
			
		||||
# interested in whether we should accept or reject a hypothesis based on the
 | 
			
		||||
# observations we have. A rational way to do this is to say: if the probability
 | 
			
		||||
# of observing the data is very small under the null-hypothesis, then we will
 | 
			
		||||
# assume the observation is due to something other than the null-hypothesis. But
 | 
			
		||||
# what do we mean by the "probability of our observation"? And what is "very
 | 
			
		||||
# small"?
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Significance levels  ===============================================
 | 
			
		||||
 | 
			
		||||
# A "very small" probability is purely a matter of convention - a cultural
 | 
			
		||||
# convention. In the biomedical field we usually call probabilities of less than
 | 
			
		||||
# 0.05 (5%) small enough to reject the null-hypothesis. Thus we call
 | 
			
		||||
# observations with a probability of less than 0.05 "significant" and if we want
 | 
			
		||||
# to highlight this in text or in a graph, we often mark them with an asterisk
 | 
			
		||||
# (*). Also we often call observations with a probability of less than 0.01
 | 
			
		||||
# "highly significant" and mark them with two asterisks (**). But there is no
 | 
			
		||||
# special significance in these numbers, the cutoff point for significance could
 | 
			
		||||
# also be 0.0498631, or 0.03, or 1/(pi^3). 0.05 is just the value that the
 | 
			
		||||
# British statistician Ronald Fisher happened to propose for this purpose in
 | 
			
		||||
# 1925. Incidentally, Fisher later recommended to use different cutoffs for
 | 
			
		||||
# different purposes (cf.
 | 
			
		||||
# https://en.wikipedia.org/wiki/Statistical_significance).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.2  probability and p-value  ===========================================
 | 
			
		||||
 | 
			
		||||
# But what do we even mean by the probability of an observation?
 | 
			
		||||
# Assume I am drawing samples from a normal distribution with a mean of 0 and a
 | 
			
		||||
# standard deviation of 1. The sample I get is ...
 | 
			
		||||
 | 
			
		||||
set.seed(sqrt(5))
 | 
			
		||||
x <- rnorm(1)
 | 
			
		||||
set.seed(NULL)
 | 
			
		||||
 | 
			
		||||
print(x, digits = 22)
 | 
			
		||||
# [1] -0.8969145466249813791748
 | 
			
		||||
 | 
			
		||||
# So what's the probability of that number? Obviously, the probability of
 | 
			
		||||
# getting exactly this number is very, very, very small. But also obviously,
 | 
			
		||||
# this does not mean that observing this number is in any way significant - we
 | 
			
		||||
# always observe some number. That's not what we mean in this case. There are
 | 
			
		||||
# several implicit assumptions when we speak of the probability of an
 | 
			
		||||
# observation:
 | 
			
		||||
 | 
			
		||||
# 1: the observation can be compared to a probability distribution;
 | 
			
		||||
# 2: that distribution can be integrated between any specific value
 | 
			
		||||
#      and its upper and lower bounds (or +- infinity).
 | 
			
		||||
 | 
			
		||||
# Then what we really mean by the probability of an observation in the context
 | 
			
		||||
# of that distribution is: the probability of observing that value, or a value
 | 
			
		||||
# more extreme than the one we have. We call this the p-value. Note that we are
 | 
			
		||||
# not talking about an individual number anymore, we are talking about the area
 | 
			
		||||
# under the curve between our observation and the upper (or lower) bound of the
 | 
			
		||||
# curve, as a fraction of the whole.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   1.2.1  p-value illustrated                      
 | 
			
		||||
 | 
			
		||||
# Let's illustrate. First we draw a million random values from our
 | 
			
		||||
# standard, normal distribution:
 | 
			
		||||
 | 
			
		||||
N <- 1e6                             # one million
 | 
			
		||||
set.seed(112358)                     # set RNG seed for repeatable randomness
 | 
			
		||||
r <- rnorm(N)                        # N values from a normal distribution
 | 
			
		||||
set.seed(NULL)                       # reset the RNG
 | 
			
		||||
 | 
			
		||||
# Let's see what the distribution looks like:
 | 
			
		||||
 | 
			
		||||
(h <- hist(r))
 | 
			
		||||
 | 
			
		||||
# The histogram details are now available in the list h -  e.g. h$counts
 | 
			
		||||
 | 
			
		||||
# Where is the value we have drawn previously?
 | 
			
		||||
abline(v = x, col = "#EE0000")
 | 
			
		||||
 | 
			
		||||
# How many values are smaller?
 | 
			
		||||
sum(r < x)
 | 
			
		||||
 | 
			
		||||
# Let's color the bars:
 | 
			
		||||
#    first, make a vector of red and green colors for the bars with breaks
 | 
			
		||||
#    smaller and larger than x, white for the bar that contains x ...
 | 
			
		||||
hCol <- rep("#EE000044", sum(h$breaks < x) - 1)
 | 
			
		||||
hCol <- c(hCol, "#FFFFFFFF")
 | 
			
		||||
hCol <- c(hCol, rep("#00EE0044", sum(h$breaks > x) - 1))
 | 
			
		||||
# ... then plot the histogram, with colored bars ...
 | 
			
		||||
hist(r, col = hCol)
 | 
			
		||||
# ... add two colored rectangles into the white bar ...
 | 
			
		||||
idx <- sum(h$breaks < x)
 | 
			
		||||
xMin <- h$breaks[idx]
 | 
			
		||||
xMax <- h$breaks[idx + 1]
 | 
			
		||||
y <- h$counts[idx]
 | 
			
		||||
rect(xMin, 0, x, y, col = "#EE000044", border = TRUE)
 | 
			
		||||
rect(x, 0, xMax, y, col = "#00EE0044", border = TRUE)
 | 
			
		||||
# ... and a red line for our observation.
 | 
			
		||||
abline(v = x, col = "#EE0000", lwd = 2)
 | 
			
		||||
 | 
			
		||||
# The p-value of our observation is the red area as a fraction of the
 | 
			
		||||
# whole histogram (red + green).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
#    Explain how the expression sum(r < x) works to give us a count of values
 | 
			
		||||
#    with the property we are looking for. E.g., examine -4:4 < x
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
#    Write an expression to estimate the probability that a value
 | 
			
		||||
#    drawn from the vector r is less-or-equal to x. The result you get
 | 
			
		||||
#    will depend on the exact values that went into the vector r but it should
 | 
			
		||||
#    be close to 0.185. That expression is the p-value associated with x.
 | 
			
		||||
#    (Sample solution 6.1)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  One- or two-sided  ===================================================
 | 
			
		||||
 | 
			
		||||
# The shape of our histogram confirms that the rnorm() function has returned
 | 
			
		||||
# values that appear distributed according to a normal distribution. In a normal
 | 
			
		||||
# distribution, readily available tables tell us that 5% of the values (i.e. our
 | 
			
		||||
# significance level) lie 1.96 (or approximately 2) standard deviations away
 | 
			
		||||
# from the mean. Is this the case here? How many values in our vector r are
 | 
			
		||||
# larger than 1.96?
 | 
			
		||||
 | 
			
		||||
sum(r > 1.96)
 | 
			
		||||
# [1] 24589
 | 
			
		||||
 | 
			
		||||
# Wait - that's about 2.5% of 1,000,000, not 5% as expected. Why?
 | 
			
		||||
 | 
			
		||||
# The answer is: we have to be careful with two-sided distributions. 2 standard
 | 
			
		||||
# deviations away from the mean means either larger or smaller than 1.96 . This
 | 
			
		||||
# can give rise to errors. If we are simply interested in outliers, no
 | 
			
		||||
# matter larger or smaller, then the 1.96 SD cutoff for significance is correct.
 | 
			
		||||
# But if we are specifically interested in, say, larger values, because a
 | 
			
		||||
# smaller value is not meaningful, then the significance cutoff, expressed as
 | 
			
		||||
# standard deviations, is relaxed. We can use the quantile function to see what
 | 
			
		||||
# the cutoff values are:
 | 
			
		||||
 | 
			
		||||
quantile(r)
 | 
			
		||||
quantile(r, probs = c(0.025, 0.975)) # for the symmetric 2.5% boundaries
 | 
			
		||||
# close to ± 1.96, as expected
 | 
			
		||||
quantile(r, probs = 0.95) # for the single 5% boundary
 | 
			
		||||
# close to 1.64 . Check counts to confirm:
 | 
			
		||||
sum(r > quantile(r, probs = 0.95))
 | 
			
		||||
# [1] 50000
 | 
			
		||||
# which is 5%, as expected.
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
# Use abline() to add the p = 0.05 boundary for smaller values to the histogram.
 | 
			
		||||
# (Sample solution 6.2)
 | 
			
		||||
 | 
			
		||||
# To summarize: when we evaluate the significance of an event, we divide a
 | 
			
		||||
# probability distribution into two parts at the point where the event was
 | 
			
		||||
# observed. We then ask whether the integral over the more extreme part is less
 | 
			
		||||
# or more than 5% of the whole. If it is less, we deem the event to be
 | 
			
		||||
# significant.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Significance by integration  =========================================
 | 
			
		||||
 | 
			
		||||
# If the underlying probability distribution can be analytically or numerically
 | 
			
		||||
# integrated, the significance of an observation can be directly computed.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Significance by simulation or permutation  ===========================
 | 
			
		||||
 | 
			
		||||
# But whether the integration is correct, or relies on assumptions that may not
 | 
			
		||||
# be warranted for biological data, can be a highly technical question.
 | 
			
		||||
# Fortunately, we can often simply run a simulation, a random resampling, or a
 | 
			
		||||
# permutation and then count the number of outcomes, just as we did with our
 | 
			
		||||
# rnorm() samples. We call this an empirical p-value. (Actually, the "empirical
 | 
			
		||||
# p-value" is defined as (Nobs + 1) / (N + 1).  )
 | 
			
		||||
 | 
			
		||||
# Here is an example. Assume you have a protein sequence and
 | 
			
		||||
# you speculate that positively charged residues are close to negatively charged
 | 
			
		||||
# residues to balance charge locally. A statistic that would capture this is the
 | 
			
		||||
# mean minimum distance between all D,E residues and the closest R,K,H
 | 
			
		||||
# residue. Let's compute this for the sequence of yeast Mbp1.
 | 
			
		||||
 | 
			
		||||
MBP1 <- paste0("MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK",
 | 
			
		||||
               "ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA",
 | 
			
		||||
               "SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR",
 | 
			
		||||
               "KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ",
 | 
			
		||||
               "QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS",
 | 
			
		||||
               "PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY",
 | 
			
		||||
               "FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
 | 
			
		||||
               "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP",
 | 
			
		||||
               "SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT",
 | 
			
		||||
               "ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP",
 | 
			
		||||
               "VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK",
 | 
			
		||||
               "IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR",
 | 
			
		||||
               "QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK",
 | 
			
		||||
               "IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA")
 | 
			
		||||
 | 
			
		||||
# first we split this string into individual characters:
 | 
			
		||||
v <- unlist(strsplit(MBP1, ""))
 | 
			
		||||
 | 
			
		||||
# and find the positions of our charged residues
 | 
			
		||||
 | 
			
		||||
ED  <- grep("[ED]", v)
 | 
			
		||||
RKH <- grep("[RKH]", v)
 | 
			
		||||
 | 
			
		||||
sep <- numeric(length(ED)) # this vector will hold the distances
 | 
			
		||||
for (i in seq_along(ED)) {
 | 
			
		||||
  sep[i] <- min(abs(RKH - ED[i]))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Task: read and explain this bit of code
 | 
			
		||||
 | 
			
		||||
# Now that sep is computed, what does it look like?
 | 
			
		||||
 | 
			
		||||
table(sep)  # these are the minimum distances
 | 
			
		||||
# 24 of D,E residues are adjacent to R,K,H;
 | 
			
		||||
# the longest separation is 28 residues.
 | 
			
		||||
 | 
			
		||||
# What is the mean separation?
 | 
			
		||||
mean(sep)
 | 
			
		||||
 | 
			
		||||
# The value is 4.1 . Is this significant? Honestly, I would be hard pressed
 | 
			
		||||
# to solve this analytically. But by permutation it's soooo easy.
 | 
			
		||||
 | 
			
		||||
# First, we combine what we have done above into a function:
 | 
			
		||||
 | 
			
		||||
chSep <- function(v) {
  # Mean minimum separation between oppositely charged residues.
  # Parameter: v (char) a vector of amino acids in the one-letter code
  #              (upper- or lowercase letters are both recognized)
  # Value: (numeric) the mean, taken over all D/E positions, of the distance
  #          to the closest R/K/H position

  acidic <- grep("[EDed]", v)      # indices of the D and E residues
  basic  <- grep("[RKHrkh]", v)    # indices of the R, K and H residues

  # For every acidic position: distance to the nearest basic position.
  nearest <- vapply(acidic,
                    function(pos) min(abs(basic - pos)),
                    numeric(1))

  return(mean(nearest))
}
 | 
			
		||||
 | 
			
		||||
# Execute the function to define it.
 | 
			
		||||
 | 
			
		||||
# Confirm that the function gives the same result as the number we
 | 
			
		||||
# calculated above:
 | 
			
		||||
chSep(v)
 | 
			
		||||
 | 
			
		||||
# Now we can produce a random permutation of v, and recalculate
 | 
			
		||||
 | 
			
		||||
set.seed(pi)                       # set RNG seed for repeatable randomness
 | 
			
		||||
w <- sample(v, length(v))          # This shuffles the vector v. Memorize this
 | 
			
		||||
                                   # code paradigm. It is very useful.
 | 
			
		||||
set.seed(NULL)                     # reset the RNG
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
chSep(w)
 | 
			
		||||
# 3.773 ... that's actually less than what we had before.
 | 
			
		||||
 | 
			
		||||
# Let's do this 10000 times and record the results (takes a few seconds):
 | 
			
		||||
 | 
			
		||||
N <- 10000
 | 
			
		||||
chs <- numeric(N)
 | 
			
		||||
for (i in 1:N) {
 | 
			
		||||
  chs[i] <- chSep(sample(v, length(v))) # charge
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
hist(chs, breaks = 50)
 | 
			
		||||
abline(v = chSep(v), col = "#EE0000")
 | 
			
		||||
 | 
			
		||||
# Contrary to our expectations, the actual observed mean minimum charge
 | 
			
		||||
# separation seems to be larger than what we observe in randomly permuted
 | 
			
		||||
# sequences. But is this significant? Your task is to find out.
 | 
			
		||||
 | 
			
		||||
# Task:
 | 
			
		||||
# Calculate the empirical p-value for chSep(v)
 | 
			
		||||
# (Sample solution 6.3)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  Final tasks  =========================================================
 | 
			
		||||
 | 
			
		||||
# From chs, compute the empirical p-value of a mean minimum charge separation to
 | 
			
		||||
#   be larger or equal to the value observed for the yeast MBP1 sequence. Note
 | 
			
		||||
#   the result in your journal. Is it significant? Also note the result of
 | 
			
		||||
#   the following expression for validation:
 | 
			
		||||
seal(sum(chs))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    6  Sample solutions  ====================================================
 | 
			
		||||
 | 
			
		||||
# ==   6.1    ==================================================================
 | 
			
		||||
#
 | 
			
		||||
sum(r <= x) / length(r)
 | 
			
		||||
 | 
			
		||||
# ==   6.2    ==================================================================
 | 
			
		||||
#
 | 
			
		||||
abline(v = quantile(r, probs = c(0.05)))
 | 
			
		||||
 | 
			
		||||
# ==   6.3    ==================================================================
 | 
			
		||||
#
 | 
			
		||||
( x <- (sum(chs >= chSep(v)) + 1) / (length(chs) + 1) )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,3 +1,3 @@
 | 
			
		||||
# BCH441-WORK-ABC-units
 | 
			
		||||
 | 
			
		||||
# BCH441-WORK-ABC-units
 | 
			
		||||
 | 
			
		||||
This is a fork of the project [ABC-units](https://github.com/hyginn/ABC-units) designed for BCH441. This setup allows changes to be committed here but updates pushed to the original repository can be fetched and pulled to keep up to date.
 | 
			
		||||
							
								
								
									
										490
									
								
								RPR-Biostrings.R
									
									
									
									
									
								
							
							
						
						
									
										490
									
								
								RPR-Biostrings.R
									
									
									
									
									
								
							@@ -1,245 +1,245 @@
 | 
			
		||||
# tocID <- "RPR-Biostrings.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Biostrings unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Updates
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.0    2017 Revisions
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                             Line
 | 
			
		||||
#TOC> -----------------------------------------------------------------
 | 
			
		||||
#TOC>   1        The Biostrings:: Package                            56
 | 
			
		||||
#TOC>   2        Getting Data into Biostrings:: Objects              88
 | 
			
		||||
#TOC>   3        Working with Biostrings:: Objects                  110
 | 
			
		||||
#TOC>   3.1        Properties                                       127
 | 
			
		||||
#TOC>   3.2        Subsetting                                       168
 | 
			
		||||
#TOC>   3.3        Operators                                        180
 | 
			
		||||
#TOC>   3.4        Transformations                                  187
 | 
			
		||||
#TOC>   4        Getting Data out of Biostrings:: Objects           194
 | 
			
		||||
#TOC>   5        More                                               203
 | 
			
		||||
#TOC>   5.1        Views                                            205
 | 
			
		||||
#TOC>   5.2        Iranges                                          219
 | 
			
		||||
#TOC>   5.3        StringSets                                       225
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# This is a very brief introduction to the Biostrings:: package, other units will
 | 
			
		||||
# be using more of the Biostrings:: functions.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  The Biostrings:: Package  ============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# First, we install and load the Biostrings:: package from bioconductor (if we
 | 
			
		||||
# haven't done so already).
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Examine the package information:
 | 
			
		||||
library(help = Biostrings)       # basic information
 | 
			
		||||
browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# At its core, Biostrings:: objects are "classes" of type XString (you can think
 | 
			
		||||
# of a "class" in R as a special kind of list), that can take on particular
 | 
			
		||||
# flavours for RNA, DNA or amino acid sequence information.
 | 
			
		||||
 | 
			
		||||
class(Biostrings::RNAString("AUG"))
 | 
			
		||||
class(Biostrings::DNAString("ATG"))
 | 
			
		||||
class(Biostrings::AAString("M"))
 | 
			
		||||
 | 
			
		||||
# An essential property of Biostrings:: objects is that they only allow letters
 | 
			
		||||
# from the applicable IUPAC alphabet:
 | 
			
		||||
Biostrings::RNAString("AUG")
 | 
			
		||||
Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Getting Data into Biostrings:: Objects  ==============================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Example: read FASTA. Extract sequence. Convert to DNAString object.
 | 
			
		||||
rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
 | 
			
		||||
rawSeq <- dbSanitizeSequence(rawSeq)
 | 
			
		||||
biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
 | 
			
		||||
                                            # into an object of class DNAString
 | 
			
		||||
 | 
			
		||||
# Multi FASTA files can be read directly as an "XStringSet" ...
 | 
			
		||||
rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
 | 
			
		||||
(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
 | 
			
		||||
 | 
			
		||||
# ... and if you subset one sequence from the set, you get an XString object
 | 
			
		||||
# back again.
 | 
			
		||||
(Xseq <- biosDNASet[[1]])
 | 
			
		||||
 | 
			
		||||
biosDNAseq == Xseq           # the comparison evaluates to TRUE ...
 | 
			
		||||
identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Working with Biostrings:: Objects  ===================================
 | 
			
		||||
 | 
			
		||||
# Biostrings:: is a highly engineered package that is tightly integrated into
 | 
			
		||||
# the Bioconductor world - unfortunately that brings with it a somewhat
 | 
			
		||||
# undesirable level of computational overhead and dependencies. Using the
 | 
			
		||||
# package as we normally do - i.e. calling required functions with their
 | 
			
		||||
# explicit package prefix is therefore not advisable. There are generics
 | 
			
		||||
# that won't be properly dispatched. If you only need a small number of
 | 
			
		||||
# functions for a very specific context, you will probably get away with
 | 
			
		||||
# Biostrings::<function>() - but even in the demonstration code of this script
 | 
			
		||||
# not everything works out of the box. We'll therefore load the library,
 | 
			
		||||
# but we'll (redundantly) use the prefix anyway so as to emphasize where
 | 
			
		||||
# the functions come from.
 | 
			
		||||
 | 
			
		||||
library(Biostrings)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Properties  ========================================================
 | 
			
		||||
str(rawSeq)
 | 
			
		||||
str(biosDNAseq)
 | 
			
		||||
 | 
			
		||||
length(rawSeq)       # ... is 1: one string only. To get the number of
 | 
			
		||||
                     # characters in a string, you need nchar().
 | 
			
		||||
length(biosDNAseq)   # but the length of a "Bstring" is the number of elements
 | 
			
		||||
nchar(rawSeq)
 | 
			
		||||
nchar(biosDNAseq)    # ... but nchar() works too.
 | 
			
		||||
 | 
			
		||||
(uL <- Biostrings::uniqueLetters(biosDNAseq))
 | 
			
		||||
 | 
			
		||||
# Count frequencies - with strings, you would strsplit() into a character
 | 
			
		||||
# vector and then use table(). Biostrings:: provides alphabetFrequency():
 | 
			
		||||
Biostrings::alphabetFrequency(biosDNAseq)
 | 
			
		||||
 | 
			
		||||
# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
 | 
			
		||||
# returns.
 | 
			
		||||
Biostrings::letterFrequency(biosDNAseq, uL)
 | 
			
		||||
sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
 | 
			
		||||
  length(biosDNAseq) # GC contents
 | 
			
		||||
 | 
			
		||||
Biostrings::dinucleotideFrequency(biosDNAseq)
 | 
			
		||||
barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
 | 
			
		||||
 | 
			
		||||
(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
 | 
			
		||||
barplot(sort(triNuc), col="#4499EE33")
 | 
			
		||||
triNuc[triNuc == max(triNuc)]
 | 
			
		||||
triNuc[triNuc == min(triNuc)]
 | 
			
		||||
max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT
 | 
			
		||||
 | 
			
		||||
# compare to a shuffled sequence:
 | 
			
		||||
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
 | 
			
		||||
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
 | 
			
		||||
max(triNuc)
 | 
			
		||||
# Interpret this plot.
 | 
			
		||||
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
 | 
			
		||||
barplot(sort(triNuc), col="#EEEE4433")
 | 
			
		||||
max(triNuc)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.2  Subsetting  ========================================================
 | 
			
		||||
 | 
			
		||||
# Subsetting any XString object works as expected:
 | 
			
		||||
biosDNAseq[4:15]
 | 
			
		||||
 | 
			
		||||
# ... well - maybe not expected, because rawSeq[4:15] would not work.
 | 
			
		||||
 | 
			
		||||
# Alternatively to the "[" operator, use the subseq() function - especially for
 | 
			
		||||
# long sequences. This is far more efficient.
 | 
			
		||||
Biostrings::subseq(biosDNAseq, start = 1, end = 30)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.3  Operators  =========================================================
 | 
			
		||||
 | 
			
		||||
# RNAString and DNAString objects compare U and T as equals!
 | 
			
		||||
  Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
 | 
			
		||||
  Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.4  Transformations  ===================================================
 | 
			
		||||
 | 
			
		||||
biosDNAseq[4:15]
 | 
			
		||||
Biostrings::reverseComplement(biosDNAseq[4:15])
 | 
			
		||||
Biostrings::translate(biosDNAseq[4:15])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Getting Data out of Biostrings:: Objects  ============================
 | 
			
		||||
 | 
			
		||||
# If you need a character object, use toString():
 | 
			
		||||
 | 
			
		||||
Biostrings::toString(biosDNAseq[4:15])
 | 
			
		||||
 | 
			
		||||
# saveRDS() and readRDS() work like on all other R objects.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  More  ================================================================
 | 
			
		||||
 | 
			
		||||
# ==   5.1  Views  =============================================================
 | 
			
		||||
 | 
			
		||||
# Biostring "Views" are objects that store multiple substrings of one
 | 
			
		||||
# Biostring object.
 | 
			
		||||
 | 
			
		||||
(myView <- Biostrings::Views(biosDNAseq,
 | 
			
		||||
                             start = c(1, 19, 37),
 | 
			
		||||
                             end = c(15, 30, 45)))
 | 
			
		||||
 | 
			
		||||
# Views are convenient to store feature annotations
 | 
			
		||||
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
 | 
			
		||||
cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   5.2  Iranges  ===========================================================
 | 
			
		||||
 | 
			
		||||
# Biostrings:: Iranges are like Views with a common start point. These can be
 | 
			
		||||
# useful for feature annotations. Instead of start/end you store start/width.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   5.3  StringSets  ========================================================
 | 
			
		||||
 | 
			
		||||
# Biostring "StringSets" store multiple sequences.
 | 
			
		||||
#
 | 
			
		||||
ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
 | 
			
		||||
sample(ompA) # sample can work directly on a Biostring object to shuffle it
 | 
			
		||||
 | 
			
		||||
x <- Biostrings::toString(ompA)
 | 
			
		||||
for (i in 2:10) {
 | 
			
		||||
  x[i] <- Biostrings::toString(sample(ompA))
 | 
			
		||||
}
 | 
			
		||||
shuffledPeptideSet <- Biostrings::AAStringSet(x)
 | 
			
		||||
names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
 | 
			
		||||
shuffledPeptideSet
 | 
			
		||||
 | 
			
		||||
length(shuffledPeptideSet)
 | 
			
		||||
Biostrings::width(shuffledPeptideSet)
 | 
			
		||||
Biostrings::alphabetFrequency(shuffledPeptideSet)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "RPR-Biostrings.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Biostrings unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Updates
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.0    2017 Revisions
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                             Line
 | 
			
		||||
#TOC> -----------------------------------------------------------------
 | 
			
		||||
#TOC>   1        The Biostrings:: Package                            56
 | 
			
		||||
#TOC>   2        Getting Data into Biostrings:: Objects              88
 | 
			
		||||
#TOC>   3        Working with Biostrings:: Objects                  110
 | 
			
		||||
#TOC>   3.1        Properties                                       127
 | 
			
		||||
#TOC>   3.2        Subsetting                                       168
 | 
			
		||||
#TOC>   3.3        Operators                                        180
 | 
			
		||||
#TOC>   3.4        Transformations                                  187
 | 
			
		||||
#TOC>   4        Getting Data out of Biostrings:: Objects           194
 | 
			
		||||
#TOC>   5        More                                               203
 | 
			
		||||
#TOC>   5.1        Views                                            205
 | 
			
		||||
#TOC>   5.2        Iranges                                          219
 | 
			
		||||
#TOC>   5.3        StringSets                                       225
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# This is a very brief introduction to the Biostrings:: package, other units will
 | 
			
		||||
# be using more of the Biostrings:: functions.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  The Biostrings:: Package  ============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# First, we install and load the Biostrings:: package from bioconductor (if we
 | 
			
		||||
# haven't done so already).
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Examine the package information:
 | 
			
		||||
library(help = Biostrings)       # basic information
 | 
			
		||||
browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# At its core, Biostrings:: objects are "classes" of type XString (you can think
 | 
			
		||||
# of a "class" in R as a special kind of list), that can take on particular
 | 
			
		||||
# flavours for RNA, DNA or amino acid sequence information.
 | 
			
		||||
 | 
			
		||||
class(Biostrings::RNAString("AUG"))
 | 
			
		||||
class(Biostrings::DNAString("ATG"))
 | 
			
		||||
class(Biostrings::AAString("M"))
 | 
			
		||||
 | 
			
		||||
# An essential property of Biostrings:: objects is that they only allow letters
 | 
			
		||||
# from the applicable IUPAC alphabet:
 | 
			
		||||
Biostrings::RNAString("AUG")
 | 
			
		||||
Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Getting Data into Biostrings:: Objects  ==============================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Example: read FASTA. Extract sequence. Convert to DNAString object.
 | 
			
		||||
rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
 | 
			
		||||
rawSeq <- dbSanitizeSequence(rawSeq)
 | 
			
		||||
biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
 | 
			
		||||
                                            # into an object of class DNAString
 | 
			
		||||
 | 
			
		||||
# Multi FASTA files can be read directly as an "XStringSet" ...
 | 
			
		||||
rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
 | 
			
		||||
(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
 | 
			
		||||
 | 
			
		||||
# ... and if you subset one sequence from the set, you get an XString object
 | 
			
		||||
# back again.
 | 
			
		||||
(Xseq <- biosDNASet[[1]])
 | 
			
		||||
 | 
			
		||||
biosDNAseq == Xseq           # the comparison evaluates to TRUE ...
 | 
			
		||||
identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Working with Biostrings:: Objects  ===================================
 | 
			
		||||
 | 
			
		||||
# Biostrings:: is a highly engineered package that is tightly integrated into
 | 
			
		||||
# the Bioconductor world - unfortunately that brings with it a somewhat
 | 
			
		||||
# undesirable level of computational overhead and dependencies. Using the
 | 
			
		||||
# package as we normally do - i.e. calling required functions with their
 | 
			
		||||
# explicit package prefix is therefore not advisable. There are generics
 | 
			
		||||
# that won't be properly dispatched. If you only need a small number of
 | 
			
		||||
# functions for a very specific context, you will probably get away with
 | 
			
		||||
# Biostrings::<function>() - but even in the demonstration code of this script
 | 
			
		||||
# not everything works out of the box. We'll therefore load the library,
 | 
			
		||||
# but we'll (redundantly) use the prefix anyway so as to emphasize where
 | 
			
		||||
# the functions come from.
 | 
			
		||||
 | 
			
		||||
library(Biostrings)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Properties  ========================================================
 | 
			
		||||
str(rawSeq)
 | 
			
		||||
str(biosDNAseq)
 | 
			
		||||
 | 
			
		||||
length(rawSeq)       # ... is 1: one string only. To get the number of
 | 
			
		||||
                     # characters in a string, you need nchar().
 | 
			
		||||
length(biosDNAseq)   # but the length of an "XString" is the number of elements
 | 
			
		||||
nchar(rawSeq)
 | 
			
		||||
nchar(biosDNAseq)    # ... but nchar() works too.
 | 
			
		||||
 | 
			
		||||
(uL <- Biostrings::uniqueLetters(biosDNAseq))
 | 
			
		||||
 | 
			
		||||
# Count frequencies - with strings, you would strsplit() into a character
 | 
			
		||||
# vector and then use table(). Biostrings:: has alphabetFrequency() for this:
 | 
			
		||||
Biostrings::alphabetFrequency(biosDNAseq)
 | 
			
		||||
 | 
			
		||||
# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
 | 
			
		||||
# returns.
 | 
			
		||||
Biostrings::letterFrequency(biosDNAseq, uL)
 | 
			
		||||
sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
 | 
			
		||||
  length(biosDNAseq) # GC contents
 | 
			
		||||
 | 
			
		||||
Biostrings::dinucleotideFrequency(biosDNAseq)
 | 
			
		||||
barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
 | 
			
		||||
 | 
			
		||||
(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
 | 
			
		||||
barplot(sort(triNuc), col="#4499EE33")
 | 
			
		||||
triNuc[triNuc == max(triNuc)]
 | 
			
		||||
triNuc[triNuc == min(triNuc)]
 | 
			
		||||
max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT
 | 
			
		||||
 | 
			
		||||
# compare to a shuffled sequence:
 | 
			
		||||
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
 | 
			
		||||
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
 | 
			
		||||
max(triNuc)
 | 
			
		||||
# Interpret this plot.
 | 
			
		||||
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
 | 
			
		||||
barplot(sort(triNuc), col="#EEEE4433")
 | 
			
		||||
max(triNuc)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.2  Subsetting  ========================================================
 | 
			
		||||
 | 
			
		||||
# Subsetting any XString object works as expected:
 | 
			
		||||
biosDNAseq[4:15]
 | 
			
		||||
 | 
			
		||||
# ... well - maybe not expected, because rawSeq[4:15] would not work.
 | 
			
		||||
 | 
			
		||||
# Alternatively to the "[" operator, use the subseq() function - especially for
 | 
			
		||||
# long sequences. This is far more efficient.
 | 
			
		||||
Biostrings::subseq(biosDNAseq, start = 1, end = 30)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.3  Operators  =========================================================
 | 
			
		||||
 | 
			
		||||
# RNAString and DNAString objects compare U and T as equals!
 | 
			
		||||
  Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
 | 
			
		||||
  Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.4  Transformations  ===================================================
 | 
			
		||||
 | 
			
		||||
biosDNAseq[4:15]
 | 
			
		||||
Biostrings::reverseComplement(biosDNAseq[4:15])
 | 
			
		||||
Biostrings::translate(biosDNAseq[4:15])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Getting Data out of Biostrings:: Objects  ============================
 | 
			
		||||
 | 
			
		||||
# If you need a character object, use toString():
 | 
			
		||||
 | 
			
		||||
Biostrings::toString(biosDNAseq[4:15])
 | 
			
		||||
 | 
			
		||||
# saveRDS() and readRDS() work just as they do for all other R objects.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  More  ================================================================
 | 
			
		||||
 | 
			
		||||
# ==   5.1  Views  =============================================================
 | 
			
		||||
 | 
			
		||||
# Biostring "Views" are objects that store multiple substrings of one
 | 
			
		||||
# Biostring object.
 | 
			
		||||
 | 
			
		||||
(myView <- Biostrings::Views(biosDNAseq,
 | 
			
		||||
                             start = c(1, 19, 37),
 | 
			
		||||
                             end = c(15, 30, 45)))
 | 
			
		||||
 | 
			
		||||
# Views are convenient to store feature annotations
 | 
			
		||||
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
 | 
			
		||||
cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   5.2  Iranges  ===========================================================
 | 
			
		||||
 | 
			
		||||
# Biostrings:: Iranges are like Views with a common start point. These can be
 | 
			
		||||
# useful for feature annotations. Instead of start/end you store start/width.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   5.3  StringSets  ========================================================
 | 
			
		||||
 | 
			
		||||
# Biostring "StringSets" store multiple sequences.
 | 
			
		||||
#
 | 
			
		||||
ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
 | 
			
		||||
sample(ompA) # sample can work directly on a Biostring object to shuffle it
 | 
			
		||||
 | 
			
		||||
x <- Biostrings::toString(ompA)
 | 
			
		||||
for (i in 2:10) {
 | 
			
		||||
  x[i] <- Biostrings::toString(sample(ompA))
 | 
			
		||||
}
 | 
			
		||||
shuffledPeptideSet <- Biostrings::AAStringSet(x)
 | 
			
		||||
# Label the set: the original peptide first, then its nine shuffled variants.
names(shuffledPeptideSet) <- c("ompA", paste0("shuffle.", 1:9))
 | 
			
		||||
shuffledPeptideSet
 | 
			
		||||
 | 
			
		||||
length(shuffledPeptideSet)
 | 
			
		||||
Biostrings::width(shuffledPeptideSet)
 | 
			
		||||
Biostrings::alphabetFrequency(shuffledPeptideSet)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,165 +1,165 @@
 | 
			
		||||
# tocID <- "RPR-ChimeraX_remote.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code demonstrating remote scripting of ChimeraX.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.0.1
 | 
			
		||||
#
 | 
			
		||||
# Date:     2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.0.1  2021 Minimal updates
 | 
			
		||||
#           1.0    First ABC units version
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#    %-encode and escape quotes, or just pass-through?
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                  Line
 | 
			
		||||
#TOC> ------------------------------------------------------
 | 
			
		||||
#TOC>   1        ChimeraX REMOTE SCRIPTING                41
 | 
			
		||||
#TOC>   1.1        Defining a Port                        59
 | 
			
		||||
#TOC>   1.2        Open ChimeraX                          81
 | 
			
		||||
#TOC>   2        WORKED EXAMPLE: SUPERPOSITION           113
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  ChimeraX REMOTE SCRIPTING  ===========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# One of the cool features of ChimeraX is that it can be driven by Python code,
 | 
			
		||||
# both within a running session and through Python scripts. What I find even
 | 
			
		||||
# cooler though is that ChimeraX can be driven from any programming language via
 | 
			
		||||
# its remote control function that can listen to commands sent from any other
 | 
			
		||||
# application. The interface that is used here is the standard REST (method) -
 | 
			
		||||
# the GET and POST verbs that ubiquitously underlie the communication of clients
 | 
			
		||||
# and servers on the Web.
 | 
			
		||||
 | 
			
		||||
# In order to establish the communication between this script and ChimeraX, all
 | 
			
		||||
# we need to do is:
 | 
			
		||||
#  - open ChimeraX;
 | 
			
		||||
#  - tell it to listen on a specific "port";
 | 
			
		||||
#  - send commands to that port via httr::
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Defining a Port  ===================================================
 | 
			
		||||
 | 
			
		||||
# The httr:: package needs to be available
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("httr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = httr)       # basic information
 | 
			
		||||
#  browseVignettes("httr")    # available vignettes
 | 
			
		||||
#  data(package = "httr")     # available datasets
 | 
			
		||||
 | 
			
		||||
# We need to think of a port. Any available port number between 49152-65535 is
 | 
			
		||||
# fine. We'll choose 61803 because that's the fractional part of the golden
 | 
			
		||||
# ratio. But one could choose another.
 | 
			
		||||
 | 
			
		||||
CXPORT <- 61803
 | 
			
		||||
 | 
			
		||||
# Check that our current version of R supports sockets (default since V 3.3)
 | 
			
		||||
capabilities("sockets")   # MUST be TRUE. If not, don't continue.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.2  Open ChimeraX  =====================================================
 | 
			
		||||
 | 
			
		||||
#  - Open a fresh, new session of a recently updated version of ChimeraX
 | 
			
		||||
#  - type:
 | 
			
		||||
#
 | 
			
		||||
#       remotecontrol rest start port 61803
 | 
			
		||||
#
 | 
			
		||||
#    ... or whatever the value of CXPORT is.
 | 
			
		||||
 | 
			
		||||
# Now watch what happens in ChimeraX when you execute the following line:
 | 
			
		||||
# Build the URL from CXPORT (defined above), so the request is sent to the
# same port that ChimeraX was told to listen on - even if you chose a port
# other than 61803. The outer parentheses print the response object.
( x <- httr::GET(paste0("http://127.0.0.1:", CXPORT,
                        "/run?command=open+1BM8")) )
 | 
			
		||||
 | 
			
		||||
# The .utilities.R script includes the function CX(), based on this principle,
 | 
			
		||||
# through which you can send commands to ChimeraX
 | 
			
		||||
 | 
			
		||||
CX("camera sbs")
 | 
			
		||||
CX("lighting soft")
 | 
			
		||||
CX("color sequential #1 & protein target abc palette powderblue:orchid:white")
 | 
			
		||||
 | 
			
		||||
# The command echoes Chimera's response if the parameter "quietly" is
 | 
			
		||||
# FALSE (default), and we can silence output with quietly = TRUE :
 | 
			
		||||
CX("info models #1 attribute num_residues")
 | 
			
		||||
CX("info models #1 attribute num_residues", quietly = TRUE)
 | 
			
		||||
 | 
			
		||||
# Either way, the command also returns Chimera's responses "invisibly";
 | 
			
		||||
# i.e. we can use the results by assigning the output to a variable:
 | 
			
		||||
hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE)
 | 
			
		||||
x <- read.table(file = textConnection(hBonds), skip = 9,
 | 
			
		||||
                blank.lines.skip = TRUE, fill = TRUE)
 | 
			
		||||
hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  WORKED EXAMPLE: SUPERPOSITION  =======================================
 | 
			
		||||
 | 
			
		||||
# We superimpose the 1BM8 structure with the 1DUX crystal structure to be able
 | 
			
		||||
# to explore possible DNA binding regions in 1BM8
 | 
			
		||||
 | 
			
		||||
# The model for 1BM8 is already open as model 1  (#1)
 | 
			
		||||
CX("hide #1 cartoons")        # hide model 1 cartoon representation
 | 
			
		||||
CX("open 1DUX")               # assume this is opened as model #2
 | 
			
		||||
CX("hide #2")                 # hide everything ...
 | 
			
		||||
CX("select #2/C")             # chain c (protein)
 | 
			
		||||
CX("show sel cartoons")       # ... and show cartoons of chain c (protein)
 | 
			
		||||
CX("color sequential sel target c palette steelblue:darkmagenta")
 | 
			
		||||
CX("view #2/C")               # re-center the display
 | 
			
		||||
CX("cofr #2/C:62@CA")         # set pivot to an interface residue
 | 
			
		||||
CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA
 | 
			
		||||
CX("style sel stick")
 | 
			
		||||
CX("show sel target ab")      # show atoms/bonds
 | 
			
		||||
CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan")
 | 
			
		||||
CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan")
 | 
			
		||||
CX("surface sel enclose sel") # compute joint accessible surface of both chains
 | 
			
		||||
CX("transparency 50")
 | 
			
		||||
CX("select clear")
 | 
			
		||||
 | 
			
		||||
# Now superimpose the 1BM8 chain onto 1DUX chain C
 | 
			
		||||
CX("show #1 cartoons")
 | 
			
		||||
CX("matchmaker #1/A to #2/C pairing ss")  # the actual superposition
 | 
			
		||||
 | 
			
		||||
# study the general layout, and the position of the 1BM8 secondary structure
 | 
			
		||||
# elements relative to 1DUX
 | 
			
		||||
 | 
			
		||||
# Let's examine side chain orientations in more detail
 | 
			
		||||
CX("hide #2/C cartoons")  # hide the 1DUX protein
 | 
			
		||||
 | 
			
		||||
# select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b)
 | 
			
		||||
CX("select zone #2/A,B 3.5 #1 & protein residues true")
 | 
			
		||||
CX("~select sel & H")  # de-select H atoms
 | 
			
		||||
CX("show sel target ab")
 | 
			
		||||
CX("size stickRadius 0.4")
 | 
			
		||||
CX("select clear")
 | 
			
		||||
 | 
			
		||||
# The overall architecture of the Mbp1 APSES domain is a good match for the Elk
 | 
			
		||||
# transcription factor binding mode; the detailed conformations of side chains
 | 
			
		||||
# would need to change only to a minor degree. There is a very significant
 | 
			
		||||
# degree of structural similarity; remarkable, given that the DNA is not the
 | 
			
		||||
# target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was
 | 
			
		||||
# determined without a DNA ligand.
 | 
			
		||||
 | 
			
		||||
CX("remotecontrol rest stop")  # release the socket
 | 
			
		||||
# Done.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "RPR-ChimeraX_remote.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code demonstrating remote scripting of ChimeraX.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.0.1
 | 
			
		||||
#
 | 
			
		||||
# Date:     2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.0.1  2021 Minimal updates
 | 
			
		||||
#           1.0    First ABC units version
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#    %-encode and escape quotes, or just pass-through?
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                  Line
 | 
			
		||||
#TOC> ------------------------------------------------------
 | 
			
		||||
#TOC>   1        ChimeraX REMOTE SCRIPTING                41
 | 
			
		||||
#TOC>   1.1        Defining a Port                        59
 | 
			
		||||
#TOC>   1.2        Open ChimeraX                          81
 | 
			
		||||
#TOC>   2        WORKED EXAMPLE: SUPERPOSITION           113
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  ChimeraX REMOTE SCRIPTING  ===========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# One of the cool features of ChimeraX is that it can be driven by Python code,
 | 
			
		||||
# both within a running session and through Python scripts. What I find even
 | 
			
		||||
# cooler though is that ChimeraX can be driven from any programming language via
 | 
			
		||||
# its remote control function that can listen to commands sent from any other
 | 
			
		||||
# application. The interface that is used here is the standard REST (method) -
 | 
			
		||||
# the GET and POST verbs that ubiquitously underlie the communication of clients
 | 
			
		||||
# and servers on the Web.
 | 
			
		||||
 | 
			
		||||
# In order to establish the communication between this script and ChimeraX, all
 | 
			
		||||
# we need to do is:
 | 
			
		||||
#  - open ChimeraX;
 | 
			
		||||
#  - tell it to listen on a specific "port";
 | 
			
		||||
#  - send commands to that port via httr::
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Defining a Port  ===================================================
 | 
			
		||||
 | 
			
		||||
# The httr:: package needs to be available
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("httr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = httr)       # basic information
 | 
			
		||||
#  browseVignettes("httr")    # available vignettes
 | 
			
		||||
#  data(package = "httr")     # available datasets
 | 
			
		||||
 | 
			
		||||
# We need to think of a port. Any available port number between 49152-65535 is
 | 
			
		||||
# fine. We'll choose 61803 because that's the fractional part of the golden
 | 
			
		||||
# ratio. But one could choose another.
 | 
			
		||||
 | 
			
		||||
CXPORT <- 61803
 | 
			
		||||
 | 
			
		||||
# Check that our current version of R supports sockets (default since V 3.3)
 | 
			
		||||
capabilities("sockets")   # MUST be TRUE. If not, don't continue.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.2  Open ChimeraX  =====================================================
 | 
			
		||||
 | 
			
		||||
#  - Open a fresh, new session of a recently updated version of ChimeraX
 | 
			
		||||
#  - type:
 | 
			
		||||
#
 | 
			
		||||
#       remotecontrol rest start port 61803
 | 
			
		||||
#
 | 
			
		||||
#    ... or whatever the value of CXPORT is.
 | 
			
		||||
 | 
			
		||||
# Now watch what happens in ChimeraX when you execute the following line:
 | 
			
		||||
# Build the URL from CXPORT (defined above), so the request is sent to the
# same port that ChimeraX was told to listen on - even if you chose a port
# other than 61803. The outer parentheses print the response object.
( x <- httr::GET(paste0("http://127.0.0.1:", CXPORT,
                        "/run?command=open+1BM8")) )
 | 
			
		||||
 | 
			
		||||
# The .utilities.R script includes the function CX(), based on this principle,
 | 
			
		||||
# through which you can send commands to ChimeraX
 | 
			
		||||
 | 
			
		||||
CX("camera sbs")
 | 
			
		||||
CX("lighting soft")
 | 
			
		||||
CX("color sequential #1 & protein target abc palette powderblue:orchid:white")
 | 
			
		||||
 | 
			
		||||
# The command echoes Chimera's response if the parameter "quietly" is
 | 
			
		||||
# FALSE (default), and we can silence output with quietly = TRUE :
 | 
			
		||||
CX("info models #1 attribute num_residues")
 | 
			
		||||
CX("info models #1 attribute num_residues", quietly = TRUE)
 | 
			
		||||
 | 
			
		||||
# Either way, the command also returns Chimera's responses "invisibly";
 | 
			
		||||
# i.e. we can use the results by assigning the output to a variable:
 | 
			
		||||
hBonds <- CX("hbonds #1 & protein makePseudobonds false log true", quietly=TRUE)
 | 
			
		||||
x <- read.table(file = textConnection(hBonds), skip = 9,
 | 
			
		||||
                blank.lines.skip = TRUE, fill = TRUE)
 | 
			
		||||
hist(x[,13], main="H-bonds", xlab="D···A (Å)", ylab="counts", col="#c9dcff")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  WORKED EXAMPLE: SUPERPOSITION  =======================================
 | 
			
		||||
 | 
			
		||||
# We superimpose the 1BM8 structure with the 1DUX crystal structure to be able
 | 
			
		||||
# to explore possible DNA binding regions in 1BM8
 | 
			
		||||
 | 
			
		||||
# The model for 1BM8 is already open as model 1  (#1)
 | 
			
		||||
CX("hide #1 cartoons")        # hide model 1 cartoon representation
 | 
			
		||||
CX("open 1DUX")               # assume this is opened as model #2
 | 
			
		||||
CX("hide #2")                 # hide everything ...
 | 
			
		||||
CX("select #2/C")             # chain c (protein)
 | 
			
		||||
CX("show sel cartoons")       # ... and show cartoons of chain c (protein)
 | 
			
		||||
CX("color sequential sel target c palette steelblue:darkmagenta")
 | 
			
		||||
CX("view #2/C")               # re-center the display
 | 
			
		||||
CX("cofr #2/C:62@CA")         # set pivot to an interface residue
 | 
			
		||||
CX("select #2/A,B & nucleic-acid") # chains A, B are the cognate DNA
 | 
			
		||||
CX("style sel stick")
 | 
			
		||||
CX("show sel target ab")      # show atoms/bonds
 | 
			
		||||
CX("color sequential #2/A & nucleic-acid target ab palette teal:lightcyan")
 | 
			
		||||
CX("color sequential #2/B & nucleic-acid target ab palette teal:lightcyan")
 | 
			
		||||
CX("surface sel enclose sel") # compute joint accessible surface of both chains
 | 
			
		||||
CX("transparency 50")
 | 
			
		||||
CX("select clear")
 | 
			
		||||
 | 
			
		||||
# Now superimpose the 1BM8 chain onto 1DUX chain C
 | 
			
		||||
CX("show #1 cartoons")
 | 
			
		||||
CX("matchmaker #1/A to #2/C pairing ss")  # the actual superposition
 | 
			
		||||
 | 
			
		||||
# study the general layout, and the position of the 1BM8 secondary structure
 | 
			
		||||
# elements relative to 1DUX
 | 
			
		||||
 | 
			
		||||
# Let's examine side chain orientations in more detail
 | 
			
		||||
CX("hide #2/C cartoons")  # hide the 1DUX protein
 | 
			
		||||
 | 
			
		||||
# select all residues in 1BM8 that are within 3.5 A of the DNA chains (a, b)
 | 
			
		||||
CX("select zone #2/A,B 3.5 #1 & protein residues true")
 | 
			
		||||
CX("~select sel & H")  # de-select H atoms
 | 
			
		||||
CX("show sel target ab")
 | 
			
		||||
CX("size stickRadius 0.4")
 | 
			
		||||
CX("select clear")
 | 
			
		||||
 | 
			
		||||
# The overall architecture of the Mbp1 APSES domain is a good match for the Elk
 | 
			
		||||
# transcription factor binding mode; the detailed conformations of side chains
 | 
			
		||||
# would need to change only to a minor degree. There is a very significant
 | 
			
		||||
# degree of structural similarity; remarkable, given that the DNA is not the
 | 
			
		||||
# target sequence of the Mbp1 transcription factor, AND the 1BM8 structure was
 | 
			
		||||
# determined without a DNA ligand.
 | 
			
		||||
 | 
			
		||||
CX("remotecontrol rest stop")  # release the socket
 | 
			
		||||
# Done.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										644
									
								
								RPR-FASTA.R
									
									
									
									
									
								
							
							
						
						
									
										644
									
								
								RPR-FASTA.R
									
									
									
									
									
								
							@@ -1,322 +1,322 @@
 | 
			
		||||
# tocID <- "RPR-FASTA.R"
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-FASTA unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2021-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.1.2  style update
 | 
			
		||||
#           1.1.1  bugfix - wrong function name
 | 
			
		||||
#           1.1    2020 Maintenance. Rewrite validation logic. Add data
 | 
			
		||||
#                  to utilities. Define AACOLS
 | 
			
		||||
#           1.0    New unit.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO: Make a simple solution first, then extend it to error checking, and
 | 
			
		||||
#       to handle .mfa files.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                                 Line
 | 
			
		||||
#TOC> -----------------------------------------------------
 | 
			
		||||
#TOC>   1        Reading and validating FASTA            45
 | 
			
		||||
#TOC>   1.1        Validating FASTA                      81
 | 
			
		||||
#TOC>   2        Parsing FASTA                          227
 | 
			
		||||
#TOC>   3        Interpreting FASTA                     247
 | 
			
		||||
#TOC>   4        Writing FASTA                          274
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Reading and validating FASTA  ========================================
 | 
			
		||||
 | 
			
		||||
# FASTA is a text based format, structured in lines that are separated by
 | 
			
		||||
# line-feed or paragraph-break characters. Which one of these is used, depends
 | 
			
		||||
# on your operating system. But R's readLines() function knows how to handle
 | 
			
		||||
# these correctly, across platforms. Don't try to read such files "by hand".
 | 
			
		||||
# Here is the yeast Mbp1 gene, via SGD.
 | 
			
		||||
 | 
			
		||||
file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
 | 
			
		||||
faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
 | 
			
		||||
 | 
			
		||||
# The warning is generated because the programmer at the NCBI who implemented
 | 
			
		||||
# the code to write this FASTA file neglected to place a line-break character
 | 
			
		||||
# after the last sequence character. While this is not technically incorrect,
 | 
			
		||||
# it is poor practice: the resulting file can't be distinguished from one that
 | 
			
		||||
# has been truncated in transmission.
 | 
			
		||||
 | 
			
		||||
head(faMBP1)
 | 
			
		||||
 | 
			
		||||
# Note that there are NO line-break characters ("\n") at the end of these
 | 
			
		||||
# strings, even though they were present in the original file. readLines()
 | 
			
		||||
# has "consumed" these characters while reading - but every single line is in
 | 
			
		||||
# an element of its own.
 | 
			
		||||
 | 
			
		||||
tail(faMBP1)
 | 
			
		||||
 | 
			
		||||
# Also note that the last line has fewer characters - this means readLines()
 | 
			
		||||
# imported the whole line, despite it not being terminated by "\n".
 | 
			
		||||
 | 
			
		||||
# It's very straightforward to work with such data, for example by collapsing
 | 
			
		||||
# everything except the first line into a single string ...
 | 
			
		||||
 | 
			
		||||
# Element 1: the header line, unchanged. Element 2: all remaining lines joined
# into one string. Only "collapse" is needed for that: "sep" separates
# different arguments of paste(), and here there is just one.
f <- c(faMBP1[1], paste(faMBP1[-1], collapse = ""))
 | 
			
		||||
 | 
			
		||||
f[1]
 | 
			
		||||
nchar(f[2])
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Validating FASTA  ==================================================
 | 
			
		||||
 | 
			
		||||
# The code above is making the assumption that everything from line 2 until
 | 
			
		||||
#  the end IS sequence, the whole sequence and nothing but sequence.
 | 
			
		||||
#  That assumption can break down in many ways:
 | 
			
		||||
#
 | 
			
		||||
#  - there could be more than one header line. The specification says otherwise,
 | 
			
		||||
#       but some older files use multiple, consecutive header lines. You don't
 | 
			
		||||
#       want that to end up in your sequence.
 | 
			
		||||
#  - this could be not a FASTA file at all. It could be raw sequence, a
 | 
			
		||||
#       different sequence file format, or a wholly different file altogether.
 | 
			
		||||
#       If you look at the file, you can immediately tell, but if you are
 | 
			
		||||
#       reading the file in a complex workflow, you could easily import wrong
 | 
			
		||||
#       data into your analysis.
 | 
			
		||||
#  - there could be more than one sequence in the file. Such Multi-FASTA files
 | 
			
		||||
#       occur commonly, as downloads of ORFs from genome regions or other
 | 
			
		||||
#       sets of genes or proteins, or as the input / output for multiple
 | 
			
		||||
#       sequence alignment programs.
 | 
			
		||||
#
 | 
			
		||||
# Data "from the wild" can (and usually does) have the most unexpected
 | 
			
		||||
# variations and it is really, really important to be clear about the
 | 
			
		||||
# assumptions that you are making. It is possible to "fix" things, according
 | 
			
		||||
# to the "Robustness Principle" :
 | 
			
		||||
#      "Be conservative in what you send,
 | 
			
		||||
#       be liberal in what you accept".
 | 
			
		||||
#       (cf. https://en.wikipedia.org/wiki/Robustness_principle )
 | 
			
		||||
# ... but if you think about this, that's actually a really poor idea,
 | 
			
		||||
# which is much more likely to dilute standards, make unwarranted
 | 
			
		||||
# assumptions, and allow errors to pass silently and corrupt data.
 | 
			
		||||
#
 | 
			
		||||
# Let's discard this principle on the trash-heap of
 | 
			
		||||
# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
 | 
			
		||||
# identify problems, and follow the principle: "crash early, crash often". Of
 | 
			
		||||
# course I can write code that would reformat any possible input as a FASTA
 | 
			
		||||
# file - but what good will it do me if it parses the file I receive
 | 
			
		||||
# from a server into FASTA format like:
 | 
			
		||||
#
 | 
			
		||||
#   >404- Page Not Found</title</head>
 | 
			
		||||
#   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
 | 
			
		||||
#   spellingrcntacttheadministratrsdyhtml
 | 
			
		||||
#
 | 
			
		||||
# Therefore, we write ourselves a FASTA checker that will enforce the following:
 | 
			
		||||
#   (1) a FASTA file contains one or more sequences separated by zero or
 | 
			
		||||
#       more empty lines
 | 
			
		||||
#   (2) a sequence contains one header line followed by
 | 
			
		||||
#       one or more sequence lines
 | 
			
		||||
#   (3) a sequence line contains one or more uppercase or lowercase single
 | 
			
		||||
#       letter amino acid codes, hyphens (gap character), or * (stop).
 | 
			
		||||
#
 | 
			
		||||
#   Anything else should generate an error.
 | 
			
		||||
 | 
			
		||||
#   (Case 1): Header(s) exist
 | 
			
		||||
fX <- c("ABC",
 | 
			
		||||
        "defghi",
 | 
			
		||||
        "klmnpq")
 | 
			
		||||
sel <- grepl("^>", fX)  # "^>" is a regular expression that
 | 
			
		||||
                        # means: the exact character ">" at the
 | 
			
		||||
                        # beginning ("^") of the line.
 | 
			
		||||
if ( ! any(sel) ) { stop("no header lines in input.") }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#   (Case 2) No adjacent header lines
 | 
			
		||||
fX <- c(">ABC",
 | 
			
		||||
        ">123",
 | 
			
		||||
        "defghi",
 | 
			
		||||
        "klmnpq")
 | 
			
		||||
sel <- grepl("^>", fX)
 | 
			
		||||
sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
 | 
			
		||||
if ( any(sel)) { stop("adjacent header lines in input.") }
 | 
			
		||||
 | 
			
		||||
#   (Case 3.1) all sequence lines contain only valid characters
 | 
			
		||||
#              (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
 | 
			
		||||
#               are defined with the .utilities.R script)
 | 
			
		||||
AAVALID
 | 
			
		||||
fX <- c(">ABC",
 | 
			
		||||
        "def ;-) ghi",
 | 
			
		||||
        "klmnpq")
 | 
			
		||||
myRegex <- sprintf("[^%s]", AAVALID)  # NOT a valid character
 | 
			
		||||
sel <- ! grepl("^>", fX)              # NOT headers
 | 
			
		||||
# Stop if any non-header line (selected by sel) contains a character that
# matches myRegex, i.e. one that is not in the valid set.
if (any(grepl(myRegex, fX[sel]))) {
  stop("invalid character(s) outside of header lines.")
}
 | 
			
		||||
 | 
			
		||||
#   (Case 3.2) all headers are followed directly by
 | 
			
		||||
#              at least one letter of sequence
 | 
			
		||||
fX <- c(">ABC",
 | 
			
		||||
        "",
 | 
			
		||||
        ">123",
 | 
			
		||||
        "defghi",
 | 
			
		||||
        "klmnpq")
 | 
			
		||||
sel <- grep("^>", fX) + 1             # indexes of headers + 1
 | 
			
		||||
myRegex <- sprintf("[%s]+", AAVALID)  # at least one valid character
 | 
			
		||||
if (! all(grepl(myRegex, fX[sel]))) {
 | 
			
		||||
  stop("a header has no adjacent sequence.")
 | 
			
		||||
}
 | 
			
		||||
# Ah, you might ask - couldn't we just have dropped all empty lines, and
 | 
			
		||||
# then caught this in Case 2? No - for two reasons: we would still miss headers
 | 
			
		||||
# at the end of file, and, we would have changed the line numbering - and
 | 
			
		||||
# ideally our "production" function will create information about where the
 | 
			
		||||
# error is to be found.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Now combine this into a function ...
 | 
			
		||||
 | 
			
		||||
val <- function(fa, valid = AAVALID) {
  # Purpose:
  #     Minimal FASTA validator. Stops with an error if the lines in fa
  #     violate one of the rules checked below; otherwise returns silently.
  # Parameters:
  #     fa     chr  vector of lines, e.g. the result of readLines()
  #     valid  chr  a single string listing the characters that are allowed
  #                 in sequence lines. It is inserted into a regular
  #                 expression character class. Defaults to the global
  #                 AAVALID; the default is only evaluated when it is used.
  # Value:
  #     NULL, invisibly. Called for its side effect of raising an error.

  isHeader <- grepl("^>", fa)   # TRUE for every line that begins with ">"

  # (Case 1) at least one header line exists
  if ( ! any(isHeader)) {
    stop("no header lines in input.")
  }

  # (Case 2) no two header lines are adjacent: compare the vector with a
  # copy of itself that is shifted by one position
  if (any(isHeader[-length(isHeader)] & isHeader[-1])) {
    stop("adjacent header lines in input.")
  }

  # (Case 3.1) lines that are not headers contain only valid characters
  if (any(grepl(sprintf("[^%s]", valid), fa[ ! isHeader]))) {
    stop("invalid character(s) outside of header lines.")
  }

  # (Case 3.2) the line directly after each header contains at least one
  # valid character. For a header on the last line, the index points past
  # the end of fa and yields NA; grepl() returns FALSE for NA, so a
  # trailing header is reported here as well.
  afterHeader <- fa[which(isHeader) + 1]
  if ( ! all(grepl(sprintf("[%s]+", valid), afterHeader))) {
    stop("a header has no adjacent sequence.")
  }

  return(invisible(NULL))
}
 | 
			
		||||
 | 
			
		||||
# Here is an example
 | 
			
		||||
FA <- c(">head1",
 | 
			
		||||
        "acdef",
 | 
			
		||||
        "ghi",
 | 
			
		||||
        "",
 | 
			
		||||
        ">head2",
 | 
			
		||||
        "kl",
 | 
			
		||||
        ">head3",
 | 
			
		||||
        "mn",
 | 
			
		||||
        "pqrs")
 | 
			
		||||
val(FA)     # ... should not create an error
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# A somewhat more elaborate validateFA() function was loaded with the
 | 
			
		||||
# ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
 | 
			
		||||
# fasta files have space-characters in their spacer lines. Try it ...
 | 
			
		||||
validateFA(FA)
 | 
			
		||||
 | 
			
		||||
# =    2  Parsing FASTA  =======================================================
 | 
			
		||||
 | 
			
		||||
# Once we have validated our assumptions about our input, it's quite
 | 
			
		||||
# painless to parse it. I have put this together as a function and the function
 | 
			
		||||
# gets loaded from ./.utilities.R
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# Let's try this:
 | 
			
		||||
#   - the first 3 elements of faMBP1:
 | 
			
		||||
readFASTA(faMBP1[1:3])
 | 
			
		||||
 | 
			
		||||
#   - a multi FASTA file of aligned APSES domain sequences:
 | 
			
		||||
 | 
			
		||||
refAPSES <- readFASTA("./data/refAPSES.mfa")
 | 
			
		||||
 | 
			
		||||
# Subset the sequence with "P39678" in the header
 | 
			
		||||
refAPSES[grep("P39678", refAPSES$head) ,]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Interpreting FASTA  ==================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# FASTA files are straightforward to interpret - just one thing may be of note:
 | 
			
		||||
# when working with strings, we can use substr(<string>, <start>, <stop>) to
 | 
			
		||||
# extract substrings, but more often we expand the string into a vector of
 | 
			
		||||
# single characters with strsplit(<string>, ""). strsplit() returns a list,
 | 
			
		||||
# to accommodate that <string> could be a vector of many elements, therefore
 | 
			
		||||
# we usually unlist() the result if we use it only on a single string.
 | 
			
		||||
 | 
			
		||||
# Example: How many positively charged residues in "MBP1_SACCE"?
 | 
			
		||||
 | 
			
		||||
s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
 | 
			
		||||
s
 | 
			
		||||
 | 
			
		||||
sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
 | 
			
		||||
                       # for the characters, sum() coerces to 1 and 0
 | 
			
		||||
                       # respectively, and that gives us the result.
 | 
			
		||||
 | 
			
		||||
100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
 | 
			
		||||
 | 
			
		||||
# residue distribution
 | 
			
		||||
x <- factor(s, levels = names(AACOLS))
 | 
			
		||||
pie(table(x)[names(AACOLS)], col = AACOLS)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Writing FASTA  =======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Writing FASTA files is mostly just the reverse of reading, with one
 | 
			
		||||
# twist: we need to break the long sequence string into chunks of the desired
 | 
			
		||||
# width. The FASTA specification calls for a maximum of 120 characters per line,
 | 
			
		||||
# but writing out much less than that is common, since that lets us comfortably
 | 
			
		||||
# view lines on the console, or print them on a sheet of paper (do we still
 | 
			
		||||
# do that actually?). How do we break a string into chunks? A combination of
 | 
			
		||||
# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
 | 
			
		||||
# nicely. (Note that substring() is vectorized, whereas substr() is not!) As we
 | 
			
		||||
# loop through our FASTA object in memory, we can build the output by c()'ing
 | 
			
		||||
# blocks of header + sequence to each other. For VERY large objects this might
 | 
			
		||||
# be slow - in that case, we might want to precalculate the size of the output
 | 
			
		||||
# object. But that's more of a hypothetical consideration.
 | 
			
		||||
 | 
			
		||||
( s <- refAPSES$seq[2] )
 | 
			
		||||
nchar(s)
 | 
			
		||||
w <- 30     # width of chunk
 | 
			
		||||
(starts <- seq(1, nchar(s), by = w))      # starting index of chunk
 | 
			
		||||
(ends <- c((starts - 1)[-1], nchar(s)))   # ending index of chunk
 | 
			
		||||
 | 
			
		||||
# Task: Is this safe? What happens if nchar(s) is shorter than w?
 | 
			
		||||
#       What happens if nchar(s) is an exact multiple of w?
 | 
			
		||||
 | 
			
		||||
substring(s, starts, ends)
 | 
			
		||||
# confirm that the output contains the first and last residue, and both
 | 
			
		||||
# residues adjacent to the breaks
 | 
			
		||||
 | 
			
		||||
# As always, the function has been defined in ".utilities.R" for you to use
 | 
			
		||||
# any time...  type   writeFASTA  to examine it.
 | 
			
		||||
 | 
			
		||||
# Let's try this...
 | 
			
		||||
 | 
			
		||||
writeFASTA(refAPSES, width = 40)
 | 
			
		||||
 | 
			
		||||
# roundtrip for validation: write refAPSES with a different format,
 | 
			
		||||
# read it back in - the new dataframe must be identical
 | 
			
		||||
# to the original dataframe.
 | 
			
		||||
fname <- tempfile()
 | 
			
		||||
writeFASTA(refAPSES, fn = fname, width = 30)
 | 
			
		||||
identical(refAPSES, readFASTA(fname))
 | 
			
		||||
 | 
			
		||||
# ...works for me  :-)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "RPR-FASTA.R"
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-FASTA unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2021-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.1.2  style update
 | 
			
		||||
#           1.1.1  bugfix - wrong function name
 | 
			
		||||
#           1.1    2020 Maintenance. Rewrite validation logic. Add data
 | 
			
		||||
#                  to utilities. Define AACOLS
 | 
			
		||||
#           1.0    New unit.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO: Make a simple solution first, then extend it to error checking, and
 | 
			
		||||
#       to handle .mfa files.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                                 Line
 | 
			
		||||
#TOC> -----------------------------------------------------
 | 
			
		||||
#TOC>   1        Reading and validating FASTA            45
 | 
			
		||||
#TOC>   1.1        Validating FASTA                      81
 | 
			
		||||
#TOC>   2        Parsing FASTA                          227
 | 
			
		||||
#TOC>   3        Interpreting FASTA                     247
 | 
			
		||||
#TOC>   4        Writing FASTA                          274
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Reading and validating FASTA  ========================================
 | 
			
		||||
 | 
			
		||||
# FASTA is a text based format, structured in lines that are separated by
 | 
			
		||||
# line-feed or paragraph-break characters. Which one of these is used, depends
 | 
			
		||||
# on your operating system. But R's readLines() function knows how to handle
 | 
			
		||||
# these correctly, across platforms. Don't try to read such files "by hand".
 | 
			
		||||
# Here is the yeast Mbp1 gene, via SGD.
 | 
			
		||||
 | 
			
		||||
file.show("./data/S288C_YDL056W_MBP1_coding.fsa")
 | 
			
		||||
faMBP1 <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
 | 
			
		||||
 | 
			
		||||
# The warning is generated because the programmer at the NCBI who implemented
 | 
			
		||||
# the code to write this FASTA file neglected to place a line-break character
 | 
			
		||||
# after the last sequence character. While this is not technically incorrect,
 | 
			
		||||
# it is poor practice: the resulting file can't be distinguished from one that
 | 
			
		||||
# has been truncated in transmission.
 | 
			
		||||
 | 
			
		||||
head(faMBP1)
 | 
			
		||||
 | 
			
		||||
# Note that there are NO line-break characters ("\n") at the end of these
 | 
			
		||||
# strings, even though they were present in the original file. readLines()
 | 
			
		||||
# has "consumed" these characters while reading - but every single line is in
 | 
			
		||||
# an element of its own.
 | 
			
		||||
 | 
			
		||||
tail(faMBP1)
 | 
			
		||||
 | 
			
		||||
# Also note that the last line has fewer characters - this means readLines()
 | 
			
		||||
# imported the whole line, despite it not being terminated by "\n".
 | 
			
		||||
 | 
			
		||||
# It's very straightforward to work with such data, for example by collapsing
 | 
			
		||||
# everything except the first line into a single string ...
 | 
			
		||||
 | 
			
		||||
# Element 1: the header line, unchanged. Element 2: all remaining lines joined
# into one string. Only "collapse" is needed for that: "sep" separates
# different arguments of paste(), and here there is just one.
f <- c(faMBP1[1], paste(faMBP1[-1], collapse = ""))
 | 
			
		||||
 | 
			
		||||
f[1]
 | 
			
		||||
nchar(f[2])
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Validating FASTA  ==================================================
 | 
			
		||||
 | 
			
		||||
# The code above is making the assumption that everything from line 2 until
 | 
			
		||||
#  the end IS sequence, the whole sequence and nothing but sequence.
 | 
			
		||||
#  That assumption can break down in many ways:
 | 
			
		||||
#
 | 
			
		||||
#  - there could be more than one header line. The specification says otherwise,
 | 
			
		||||
#       but some older files use multiple, consecutive header lines. You don't
 | 
			
		||||
#       want that to end up in your sequence.
 | 
			
		||||
#  - this could be not a FASTA file at all. It could be raw sequence, a
 | 
			
		||||
#       different sequence file format, or a wholly different file altogether.
 | 
			
		||||
#       If you look at the file, you can immediately tell, but if you are
 | 
			
		||||
#       reading the file in a complex workflow, you could easily import wrong
 | 
			
		||||
#       data into your analysis.
 | 
			
		||||
#  - there could be more than one sequence in the file. Such Multi-FASTA files
 | 
			
		||||
#       occur commonly, as downloads of ORFs from genome regions or other
 | 
			
		||||
#       sets of genes or proteins, or as the input / output for multiple
 | 
			
		||||
#       sequence alignment programs.
 | 
			
		||||
#
 | 
			
		||||
# Data "from the wild" can (and usually does) have the most unexpected
 | 
			
		||||
# variations and it is really, really important to be clear about the
 | 
			
		||||
# assumptions that you are making. It is possible to "fix" things, according
 | 
			
		||||
# to the "Robustness Principle" :
 | 
			
		||||
#      "Be conservative in what you send,
 | 
			
		||||
#       be liberal in what you accept".
 | 
			
		||||
#       (cf. https://en.wikipedia.org/wiki/Robustness_principle )
 | 
			
		||||
# ... but if you think about this, that's actually a really poor idea,
 | 
			
		||||
# which is much more likely to dilute standards, make unwarranted
 | 
			
		||||
# assumptions, and allow errors to pass silently and corrupt data.
 | 
			
		||||
#
 | 
			
		||||
# Let's discard this principle on the trash-heap of
 | 
			
		||||
# things-that-sound-like-a-good-idea-but-aren't. What we do instead is test,
 | 
			
		||||
# identify problems, and follow the principle: "crash early, crash often". Of
 | 
			
		||||
# course I can write code that would reformat any possible input as a FASTA
 | 
			
		||||
# file - but what good will it do me if it parses the file I receive
 | 
			
		||||
# from a server into FASTA format like:
 | 
			
		||||
#
 | 
			
		||||
#   >404- Page Not Found</title</head>
 | 
			
		||||
#   dyh-PagentfndhpThepageyreqesteddesnteistnthisserverCheckthe
 | 
			
		||||
#   spellingrcntacttheadministratrsdyhtml
 | 
			
		||||
#
 | 
			
		||||
# Therefore, we write ourselves a FASTA checker that will enforce the following:
 | 
			
		||||
#   (1) a FASTA file contains one or more sequences separated by zero or
 | 
			
		||||
#       more empty lines
 | 
			
		||||
#   (2) a sequence contains one header line followed by
 | 
			
		||||
#       one or more sequence lines
 | 
			
		||||
#   (3) a sequence line contains one or more uppercase or lowercase single
 | 
			
		||||
#       letter amino acid codes, hyphens (gap character), or * (stop).
 | 
			
		||||
#
 | 
			
		||||
#   Anything else should generate an error.
 | 
			
		||||
 | 
			
		||||
#   (Case 1): Header(s) exist
 | 
			
		||||
fX <- c("ABC",
 | 
			
		||||
        "defghi",
 | 
			
		||||
        "klmnpq")
 | 
			
		||||
sel <- grepl("^>", fX)  # "^>" is a regular expression that
 | 
			
		||||
                        # means: the exact character ">" at the
 | 
			
		||||
                        # beginning ("^") of the line.
 | 
			
		||||
if ( ! any(sel) ) { stop("no header lines in input.") }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#   (Case 2) No adjacent header lines
 | 
			
		||||
fX <- c(">ABC",
 | 
			
		||||
        ">123",
 | 
			
		||||
        "defghi",
 | 
			
		||||
        "klmnpq")
 | 
			
		||||
sel <- grepl("^>", fX)
 | 
			
		||||
sel <- sel[- length(sel)] & sel[-1] # comparing shifted vectors
 | 
			
		||||
if ( any(sel)) { stop("adjacent header lines in input.") }
 | 
			
		||||
 | 
			
		||||
#   (Case 3.1) all sequence lines contain only valid characters
 | 
			
		||||
#              (constants for valid characters AAVALID, NUCVALID, and NUCAMBIG
 | 
			
		||||
#               are defined with the .utilities.R script)
 | 
			
		||||
AAVALID
 | 
			
		||||
fX <- c(">ABC",
 | 
			
		||||
        "def ;-) ghi",
 | 
			
		||||
        "klmnpq")
 | 
			
		||||
myRegex <- sprintf("[^%s]", AAVALID)  # NOT a valid character
 | 
			
		||||
sel <- ! grepl("^>", fX)              # NOT headers
 | 
			
		||||
# Stop if any non-header line (selected by sel) contains a character that
# matches myRegex, i.e. one that is not in the valid set.
if (any(grepl(myRegex, fX[sel]))) {
  stop("invalid character(s) outside of header lines.")
}
 | 
			
		||||
 | 
			
		||||
#   (Case 3.2) all headers are followed directly by
 | 
			
		||||
#              at least one letter of sequence
 | 
			
		||||
fX <- c(">ABC",
 | 
			
		||||
        "",
 | 
			
		||||
        ">123",
 | 
			
		||||
        "defghi",
 | 
			
		||||
        "klmnpq")
 | 
			
		||||
sel <- grep("^>", fX) + 1             # indexes of headers + 1
 | 
			
		||||
myRegex <- sprintf("[%s]+", AAVALID)  # at least one valid character
 | 
			
		||||
if (! all(grepl(myRegex, fX[sel]))) {
 | 
			
		||||
  stop("a header has no adjacent sequence.")
 | 
			
		||||
}
 | 
			
		||||
# Ah, you might ask - couldn't we just have dropped all empty lines, and
 | 
			
		||||
# then caught this in Case 2? No - for two reasons: we would still miss headers
 | 
			
		||||
# at the end of file, and, we would have changed the line numbering - and
 | 
			
		||||
# ideally our "production" function will create information about where the
 | 
			
		||||
# error is to be found.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Now combine this into a function ...
 | 
			
		||||
 | 
			
		||||
val <- function(fa, valid = AAVALID) {
  # Purpose:
  #     Minimal FASTA validator. Stops with an error if the lines in fa
  #     violate one of the rules checked below; otherwise returns silently.
  # Parameters:
  #     fa     chr  vector of lines, e.g. the result of readLines()
  #     valid  chr  a single string listing the characters that are allowed
  #                 in sequence lines. It is inserted into a regular
  #                 expression character class. Defaults to the global
  #                 AAVALID; the default is only evaluated when it is used.
  # Value:
  #     NULL, invisibly. Called for its side effect of raising an error.

  isHeader <- grepl("^>", fa)   # TRUE for every line that begins with ">"

  # (Case 1) at least one header line exists
  if ( ! any(isHeader)) {
    stop("no header lines in input.")
  }

  # (Case 2) no two header lines are adjacent: compare the vector with a
  # copy of itself that is shifted by one position
  if (any(isHeader[-length(isHeader)] & isHeader[-1])) {
    stop("adjacent header lines in input.")
  }

  # (Case 3.1) lines that are not headers contain only valid characters
  if (any(grepl(sprintf("[^%s]", valid), fa[ ! isHeader]))) {
    stop("invalid character(s) outside of header lines.")
  }

  # (Case 3.2) the line directly after each header contains at least one
  # valid character. For a header on the last line, the index points past
  # the end of fa and yields NA; grepl() returns FALSE for NA, so a
  # trailing header is reported here as well.
  afterHeader <- fa[which(isHeader) + 1]
  if ( ! all(grepl(sprintf("[%s]+", valid), afterHeader))) {
    stop("a header has no adjacent sequence.")
  }

  return(invisible(NULL))
}
 | 
			
		||||
 | 
			
		||||
# Here is an example
 | 
			
		||||
FA <- c(">head1",
 | 
			
		||||
        "acdef",
 | 
			
		||||
        "ghi",
 | 
			
		||||
        "",
 | 
			
		||||
        ">head2",
 | 
			
		||||
        "kl",
 | 
			
		||||
        ">head3",
 | 
			
		||||
        "mn",
 | 
			
		||||
        "pqrs")
 | 
			
		||||
val(FA)     # ... should not create an error
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# A somewhat more elaborate validateFA() function was loaded with the
 | 
			
		||||
# ./.utilities.R script. It needs a bit more bookkeeping, since NCBI multi-
 | 
			
		||||
# fasta files have space-characters in their spacer lines. Try it ...
 | 
			
		||||
validateFA(FA)
 | 
			
		||||
 | 
			
		||||
# =    2  Parsing FASTA  =======================================================
 | 
			
		||||
 | 
			
		||||
# Once we have validated our assumptions about our input, it's quite
 | 
			
		||||
# painless to parse it. I have put this together as a function and the function
 | 
			
		||||
# gets loaded from ./.utilities.R
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# Let's try this:
 | 
			
		||||
#   - the first 3 elements of faMBP1:
 | 
			
		||||
readFASTA(faMBP1[1:3])
 | 
			
		||||
 | 
			
		||||
#   - a multi FASTA file of aligned APSES domain sequences:
 | 
			
		||||
 | 
			
		||||
refAPSES <- readFASTA("./data/refAPSES.mfa")
 | 
			
		||||
 | 
			
		||||
# Subset the sequence with "P39678" in the header
 | 
			
		||||
refAPSES[grep("P39678", refAPSES$head) ,]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Interpreting FASTA  ==================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# FASTA files are straightforward to interpret - just one thing may be of note:
 | 
			
		||||
# when working with strings, we can use substr(<string>, <start>, <stop>) to
 | 
			
		||||
# extract substrings, but more often we expand the string into a vector of
 | 
			
		||||
# single characters with strsplit(<string>, ""). strsplit() returns a list,
 | 
			
		||||
# to accommodate that <string> could be a vector of many elements, therefore
 | 
			
		||||
# we usually unlist() the result if we use it only on a single string.
 | 
			
		||||
 | 
			
		||||
# Example: How many positively charged residues in "MBP1_SACCE"?
 | 
			
		||||
 | 
			
		||||
s <- unlist(strsplit(refAPSES$seq[grep("MBP1_SACCE", refAPSES$head)], ""))
 | 
			
		||||
s
 | 
			
		||||
 | 
			
		||||
sum(grepl("[HKR]", s)) # 20 (+) charged residues. grepl() returns TRUE and FALSE
 | 
			
		||||
                       # for the characters, sum() coerces to 1 and 0
 | 
			
		||||
                       # respectively, and that gives us the result.
 | 
			
		||||
 | 
			
		||||
100 * sum(grepl("[HKR]", s)) / length(s) # in percent: 20.2 %
 | 
			
		||||
 | 
			
		||||
# residue distribution
 | 
			
		||||
x <- factor(s, levels = names(AACOLS))
 | 
			
		||||
pie(table(x)[names(AACOLS)], col = AACOLS)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Writing FASTA  =======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Writing FASTA files is mostly just the reverse of reading, with one
 | 
			
		||||
# twist: we need to break the long sequence string into chunks of the desired
 | 
			
		||||
# width. The FASTA specification calls for a maximum of 120 characters per line,
 | 
			
		||||
# but writing out much less than that is common, since that lets us comfortably
 | 
			
		||||
# view lines on the console, or print them on a sheet of paper (do we still
 | 
			
		||||
# do that actually?). How do we break a string into chunks? A combination of
 | 
			
		||||
# seq(<from>, <to>, <by>) with substring(<string>, <start>, <stop>) will work
 | 
			
		||||
# nicely. (Note that substring() is vectorized, whereas substr() is not!) As we
 | 
			
		||||
# loop through our FASTA object in memory, we can build the output by c()'ing
 | 
			
		||||
# blocks of header + sequence to each other. For VERY large objects this might
 | 
			
		||||
# be slow - in that case, we might want to precalculate the size of the output
 | 
			
		||||
# object. But that's more of a hypothetical consideration.
 | 
			
		||||
 | 
			
		||||
( s <- refAPSES$seq[2] )
 | 
			
		||||
nchar(s)
 | 
			
		||||
w <- 30     # width of chunk
 | 
			
		||||
(starts <- seq(1, nchar(s), by = w))      # starting index of chunk
 | 
			
		||||
(ends <- c((starts - 1)[-1], nchar(s)))   # ending index of chunk
 | 
			
		||||
 | 
			
		||||
# Task: Is this safe? What happens if nchar(s) is shorter than w?
 | 
			
		||||
#       What happens if nchar(s) is an exact multiple of w?
 | 
			
		||||
 | 
			
		||||
substring(s, starts, ends)
 | 
			
		||||
# confirm that the output contains the first and last residue, and both
 | 
			
		||||
# residues adjacent to the breaks
 | 
			
		||||
 | 
			
		||||
# As always, the function has been defined in ".utilities.R" for you to use
 | 
			
		||||
# any time...  type   writeFASTA  to examine it.
 | 
			
		||||
 | 
			
		||||
# Let's try this...
 | 
			
		||||
 | 
			
		||||
writeFASTA(refAPSES, width = 40)
 | 
			
		||||
 | 
			
		||||
# roundtrip for validation: write refAPSES with a different format,
 | 
			
		||||
# read it back in - the new dataframe must be identical
 | 
			
		||||
# to the original dataframe.
 | 
			
		||||
fname <- tempfile()
 | 
			
		||||
writeFASTA(refAPSES, fn = fname, width = 30)
 | 
			
		||||
identical(refAPSES, readFASTA(fname))
 | 
			
		||||
 | 
			
		||||
# ...works for me  :-)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										1348
									
								
								RPR-GEO2R.R
									
									
									
									
									
								
							
							
						
						
									
										1348
									
								
								RPR-GEO2R.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,385 +1,385 @@
 | 
			
		||||
# tocID <- "RPR-Genetic_code_optimality.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Genetic_code_optimality unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.3
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.3    2020 Maintenance
 | 
			
		||||
#           1.2    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.1      Update set.seed() usage
 | 
			
		||||
#           1.0.1    Fixed two bugs discovered by Suan Chin Yeo.
 | 
			
		||||
#           1.0      New material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                          Line
 | 
			
		||||
#TOC> --------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Designing a computational experiment             58
 | 
			
		||||
#TOC>   2        Setting up the tools                             74
 | 
			
		||||
#TOC>   2.1        Natural and alternative genetic codes          77
 | 
			
		||||
#TOC>   2.2        Effect of mutations                           135
 | 
			
		||||
#TOC>   2.2.1          reverse-translate                         146
 | 
			
		||||
#TOC>   2.2.2          Randomly mutate                           171
 | 
			
		||||
#TOC>   2.2.3          Forward- translate                        196
 | 
			
		||||
#TOC>   2.2.4          measure effect                            213
 | 
			
		||||
#TOC>   3        Run the experiment                              267
 | 
			
		||||
#TOC>   4        Task solutions                                  363
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# This unit demonstrates R code to simulate alternate genetic codes and evaluate
 | 
			
		||||
# their robustness to code changes. The approaches are quite simple and you
 | 
			
		||||
# will be able to come up with obvious refinements; the point of this code is to
 | 
			
		||||
# demonstrate some R programming techniques, in preparation for more
 | 
			
		||||
# sophisticated questions later.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Designing a computational experiment  ================================
 | 
			
		||||
 | 
			
		||||
# Computational experiments are conducted like wet-lab experiments. We begin
 | 
			
		||||
# with a hypothesis, then define the observables that relate to the hypothesis,
 | 
			
		||||
# then define the measures we apply to observations, and finally we interpret
 | 
			
		||||
# our observations. If we want to learn something about the evolution of the
 | 
			
		||||
# genetic code ...
 | 
			
		||||
 | 
			
		||||
#  - we construct a hypothesis such as: the genetic code has evolved so as to
 | 
			
		||||
#      minimize the effect of mutations;
 | 
			
		||||
#  - we define the observables: the effect of mutations in
 | 
			
		||||
#      sequences, given the natural and possible alternative codes;
 | 
			
		||||
#  - we define the measures to quantify the effect of mutations;
 | 
			
		||||
#  - then we compute alternatives and interpret the results.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Setting up the tools  ================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Natural and alternative genetic codes  =============================
 | 
			
		||||
 | 
			
		||||
# Load genetic code tables from the Biostrings package
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# There are many ways to generate alternative codes. The simplest way is to
 | 
			
		||||
# randomly assign amino acids to codons. A more sophisticated way is to keep the
 | 
			
		||||
# redundancy of codons intact, since it may reflect some form of symmetry
 | 
			
		||||
# breaking that ignores the third nucleotide of a codon for the most part;
 | 
			
		||||
# therefore we only replace the amino acids of the existing code with random
 | 
			
		||||
# others. Here are two functions that implement these two ideas about alternate
 | 
			
		||||
# codes.
 | 
			
		||||
 | 
			
		||||
randomGC <- function(GC) {
  # Return a genetic code with randomly assigned amino acids.
  # Parameters:
  #    GC   named chr  character vector of amino acid one-letter codes plus
  #                       "*" (stop), named with the codon triplet. The
  #                       standard code has length 64, but a code of any
  #                       length is handled.
  # Value:  named chr  vector with the same names and length as GC, with
  #                       random amino acid assignments in which every symbol
  #                       that occurs in GC is encoded at least once.

  aa <- unique(GC)              # the amino acids (and "*") in the input code
  nCodons <- length(GC)         # 64 for a complete triplet code

  # Rejection sampling: draw a random code, and draw again as long as at
  # least one of the symbols did not make it into the code.
  repeat {
    GC[] <- sample(aa, nCodons, replace = TRUE)  # GC[] keeps the codon names
    if (length(unique(GC)) == length(aa)) {
      break
    }
  }

  return(GC)
}
 | 
			
		||||
 | 
			
		||||
swappedGC <- function(GC) {
  # Return a genetic code with randomly swapped amino acids. The codon blocks
  # of the input are kept: codons that encode the same symbol in GC encode
  # the same symbol in the result; only the symbol of each block is permuted.
  # Parameters:
  #    GC   named chr  character vector of amino acid one-letter codes plus
  #                       "*" (stop), named with the codon triplet. The
  #                       standard code has length 64, but a code of any
  #                       length is handled.
  # Value:  named chr  vector with the same names and length as GC, where the
  #                       amino acids have been swapped.

  aaOrig <- unique(GC)                       # the amino acids in the input code
  aaSwap <- sample(aaOrig, length(aaOrig))   # a random permutation of them
  names(aaSwap) <- aaOrig                    # lookup: original -> replacement

  GC[] <- aaSwap[GC]                         # GC[] keeps the codon names and
                                             # works for any code length

  return(GC)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.2  Effect of mutations  ===============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# To evaluate the effects of mutations we will do the following:
 | 
			
		||||
#   - we take an amino acid sequence (Mbp1 will do just nicely);
 | 
			
		||||
#   - we reverse-translate it into a nucleotide sequence;
 | 
			
		||||
#   - we mutate it randomly;
 | 
			
		||||
#   - we translate it back to amino acids;
 | 
			
		||||
#   - we count the number of mutations and evaluate their severity.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   2.2.1  reverse-translate                    
 | 
			
		||||
 | 
			
		||||
# To reverse-translate an amino acid vector, we randomly pick one of its
 | 
			
		||||
# codons from a genetic code, and assemble all codons to a sequence.
 | 
			
		||||
 | 
			
		||||
traRev <- function(s, GC) {
  # Reverse-translate an amino acid sequence: for every residue, one of the
  # codons that encode it in GC is chosen at random.
  # Parameters:
  #      s   chr   a sequence vector of one-letter amino acid codes
  #      GC  chr   a genetic code: amino acid codes, named with their codons
  # Value:
  #      A reverse-translated vector of codons, one for each element of s
  # Errors:
  #      stops if s contains a residue for which GC has no codon

  # Check first: without this, a residue that is missing from the code fails
  # inside the loop with the unhelpful "replacement has length zero".
  noCodon <- setdiff(s, GC)
  if (length(noCodon) > 0) {
    stop("No codon in GC for: ", paste(noCodon, collapse = ", "),
         call. = FALSE)
  }

  vC <- character(length(s))

  for (i in seq_along(s)) {
    codon <- names(GC)[GC == s[i]]   # get all codons for this AA
    if (length(codon) > 1) {         # if there's more than one ...
      codon <- sample(codon, 1)      # pick one at random ...
    }
    vC[i] <- codon                   # store it
  }

  return(vC)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   2.2.2  Randomly mutate                      
 | 
			
		||||
 | 
			
		||||
# To mutate, we split a codon into its three nucleotides, then randomly replace
 | 
			
		||||
# one of the three with another nucleotide.
 | 
			
		||||
 | 
			
		||||
randMut <- function(vC) {
  # Introduce one random point mutation into every codon of vC.
  # Parameter:
  #    vC   chr     a vector of codons
  # Value:  chr     vC, in which every codon has one randomly chosen position
  #                 replaced with a different nucleotide

  bases <- c("A", "C", "G", "T")

  mutateOne <- function(codon) {
    # Mutate a single codon: choose the position first, then the replacement.
    letters3 <- strsplit(codon, "")[[1]]               # the three nucleotides
    pos <- sample(1:3, 1)                              # position to change
    alternatives <- bases[bases != letters3[pos]]      # anything but the old
    letters3[pos] <- sample(alternatives, 1)
    paste0(letters3, collapse = "")                    # back to one string
  }

  for (k in seq_along(vC)) {
    vC[k] <- mutateOne(vC[k])
  }

  return(vC)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   2.2.3  Forward- translate                   
 | 
			
		||||
 | 
			
		||||
traFor <- function(vC, GC) {
  # Translate a vector of codons into a vector of amino acids.
  # Parameters:
  #      vC   chr   a codon vector
  #      GC   chr   a genetic code: amino acid codes, named with their codons
  # Value:
  #      A vector of amino acids, one for each codon in vC; NA for a codon
  #      that is not among the names of GC

  # Subsetting a named vector by name is vectorized, no loop is needed.
  # unname() removes the codon names, so that a plain chr vector is returned.
  vAA <- unname(GC[vC])

  return(vAA)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   2.2.4  measure effect                       
 | 
			
		||||
 | 
			
		||||
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
 | 
			
		||||
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
 | 
			
		||||
# categories, according to their free energy of transfer from water to octanol:
 | 
			
		||||
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
aaNeutral <- c("A", "H", "T", "S", "V", "G")

# Then we will penalize as follows:
# Changes within one category: 0.1
# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
# Changes from hydrophobic to hydrophilic or back: 1.0
# Changes to stop-codon: 3.0

evalMut <- function(nat, mut) {
  # Evaluate severity of mutations between amino acid sequence vectors nat and
  # mut in an ad hoc approach based on hydrophobicity changes.
  # Parameters:
  #    nat   chr   the original sequence, one amino acid per element
  #    mut   chr   the mutated sequence, same length as nat; "*" is a stop
  # Value:   num   the sum of the penalties over all positions. Positions
  #                that did not change contribute 0.
  # Note: the change of an amino acid to a stop is scored with 3.0, as the
  #       penalty scheme requires. Totals that were computed without this
  #       penalty will be lower.
  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
  aaNeutral <- c("A", "H", "T", "S", "V", "G")

  # Positions are compared pairwise; unequal lengths would be recycled.
  stopifnot(length(nat) == length(mut))

  penalties <- numeric(length(nat))
  lMut <- nat != mut    # logical TRUE for all mutated positions

  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3

  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3

  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1

  # "*" is in none of the three categories, so none of the rules above
  # applies to a new stop codon: it needs its own rule.
  penalties[lMut & (mut %in% "*")] <- 3.0

  return(sum(penalties))
}
 | 
			
		||||
 | 
			
		||||
# A more sophisticated approach could take additional quantities into account,
 | 
			
		||||
# such as charge, size, or flexibility - and it could add heuristics, such as:
 | 
			
		||||
# proline is always bad in secondary structure, charged amino acids are terrible
 | 
			
		||||
# in the folded core of a protein, replacing a small by a large amino acid in
 | 
			
		||||
# the core is very disruptive ... etc.
 | 
			
		||||
#
 | 
			
		||||
# For our experiment, we should not  use a mutation data matrix however:
 | 
			
		||||
# empirical mutation probabilities are superbly suited to estimate evolutionary
 | 
			
		||||
# relationships. Here however, as we are trying to evaluate effects of random
 | 
			
		||||
# mutations on genetic codes, our reasoning would be circular - we would
 | 
			
		||||
# discover that the natural genetic code is optimal ... because it is most
 | 
			
		||||
# similar to the natural genetic code. That would be Cargo Cult bioinformatics.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Run the experiment  ==================================================
 | 
			
		||||
 | 
			
		||||
# Fetch the standard Genetic code from Biostrings::
 | 
			
		||||
 | 
			
		||||
stdCode <- Biostrings::GENETIC_CODE

# Fetch the nucleotide sequence for MBP1: drop the first line of the file,
# join the remaining lines into one string, and cut the string into codons.

myDNA <- paste0(readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1],
                collapse = "")
myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
myDNA <- myDNA[-length(myDNA)]     # the last codon is the stop codon: drop it

myAA <- traFor(myDNA, stdCode)     # the unmutated amino acid sequence

# Mutate and evaluate. The seed is set for the mutation step only, then the
# RNG is reset before translating.
set.seed(112358)
x <- randMut(myDNA)
set.seed(NULL)
x <- traFor(x, stdCode)
evalMut(myAA, x)  # 166.4
 | 
			
		||||
 | 
			
		||||
# Try this 200 times, and see how the values are distributed.
 | 
			
		||||
N <- 200
valSTDC <- numeric(N)              # one summed penalty per trial

set.seed(112358)                   # set RNG seed for repeatable randomness
for (i in seq_len(N)) {            # this takes a few seconds ...
  x <- randMut(myDNA)              # mutate
  x <- traFor(x, stdCode)          # translate
  valSTDC[i] <- evalMut(myAA, x)   # evaluate
}
set.seed(NULL)                     # reset the RNG

# This histogram opens the plot; xlim and ylim are set explicitly here.
hist(valSTDC,
     breaks = 15,
     col = "palegoldenrod",
     xlim = c(0, 400),
     ylim = c(0, N/4),
     main = "Standard vs. Synthetic Genetic Code",
     xlab = "Mutation penalty")

# This looks like a normal distribution. Let's assume the effect of mutations
# under the standard genetic code is the mean of this distribution:
effectSTDC <- mean(valSTDC)  # 178.1
 | 
			
		||||
 | 
			
		||||
# Now we can look at the effects of alternate genetic codes:
 | 
			
		||||
 | 
			
		||||
# randomGC(), traRev() and randMut() all draw random numbers. The seed has to
# stay in effect until the last of them has returned, otherwise the result of
# this run can not be reproduced.
set.seed(112358)

# choose a new code
GC <- randomGC(stdCode)

# reverse translate hypothetical sequence according to the new code
x <- traRev(myAA, GC)

x <- randMut(x)        # randomly mutate hypothetical nucleotide sequence
set.seed(NULL)         # reset the RNG

x <- traFor(x, GC)     # translate back, with the new code
evalMut(myAA, x)       # evaluate mutation effects (original run: 298.5)
 | 
			
		||||
 | 
			
		||||
# That seems a fair bit higher than what we saw as "effectSTDC".
 | 
			
		||||
# Let's try with different genetic codes. 200 trials - but this time every trial
 | 
			
		||||
# is with a different, synthetic genetic code.
 | 
			
		||||
 | 
			
		||||
N <- 200
valXGC <- numeric(N)             # one summed penalty per synthetic code

set.seed(1414214)                # set RNG seed for repeatable randomness
for (i in seq_len(N)) {
  GC <- randomGC(stdCode)        # Choose code
  x <- traRev(myAA, GC)          # reverse translate
  x <- randMut(x)                # mutate
  x <- traFor(x, GC)             # translate
  valXGC[i] <- evalMut(myAA, x)  # evaluate
}
set.seed(NULL)                   # reset the RNG

# add = TRUE draws into the plot that is currently open
hist(valXGC,
     col = "plum",
     breaks = 15,
     add = TRUE)
 | 
			
		||||
 | 
			
		||||
# These two distributions are very widely separated!
 | 
			
		||||
 | 
			
		||||
# Task: Perform the same experiment with the swapped genetic code.
 | 
			
		||||
#       Compare the distributions. Interpret the result.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# These are simple experiments, under assumptions that can be refined in
 | 
			
		||||
# meaningful ways. Yet, even those simple computational experiments show
 | 
			
		||||
# that the Universal Genetic Code has features that one would predict if
 | 
			
		||||
# it has evolved under selective pressure to minimize the effects of mutations.
 | 
			
		||||
# Gradual change under mutation is beneficial to evolution, disruptive
 | 
			
		||||
# change is not.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Task solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
N <- 200
valSGC <- numeric(N)             # one summed penalty per swapped code

set.seed(2718282)                # set RNG seed for repeatable randomness
for (i in seq_len(N)) {
  GC <- swappedGC(stdCode)       # Choose code
  x <- traRev(myAA, GC)          # reverse translate
  x <- randMut(x)                # mutate
  x <- traFor(x, GC)             # translate
  valSGC[i] <- evalMut(myAA, x)  # evaluate
}
set.seed(NULL)                   # reset the RNG

# add = TRUE draws into the plot that is currently open; the last two digits
# of the colour are the alpha channel, which makes the bars semi-transparent
hist(valSGC,
     col = "#6688FF88",
     breaks = 15,
     add = TRUE)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "RPR-Genetic_code_optimality.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Genetic_code_optimality unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.3
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.3    2020 Maintenance
 | 
			
		||||
#           1.2    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#                      use BiocManager:: not biocLite()
 | 
			
		||||
#           1.1      Update set.seed() usage
 | 
			
		||||
#           1.0.1    Fixed two bugs discovered by Suan Chin Yeo.
 | 
			
		||||
#           1.0      New material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                          Line
 | 
			
		||||
#TOC> --------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Designing a computational experiment             58
 | 
			
		||||
#TOC>   2        Setting up the tools                             74
 | 
			
		||||
#TOC>   2.1        Natural and alternative genetic codes          77
 | 
			
		||||
#TOC>   2.2        Effect of mutations                           135
 | 
			
		||||
#TOC>   2.2.1          reverse-translate                         146
 | 
			
		||||
#TOC>   2.2.2          Randomly mutate                           171
 | 
			
		||||
#TOC>   2.2.3          Forward- translate                        196
 | 
			
		||||
#TOC>   2.2.4          measure effect                            213
 | 
			
		||||
#TOC>   3        Run the experiment                              267
 | 
			
		||||
#TOC>   4        Task solutions                                  363
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# This unit demonstrates R code to simulate alternate genetic codes and evaluate
 | 
			
		||||
# their robustness to code changes. The approaches are quite simple and you
 | 
			
		||||
# will be able to come up with obvious refinements; the point of this code is to
 | 
			
		||||
# demonstrate some R programming techniques, in preparation for more
 | 
			
		||||
# sophisticated questions later.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Designing a computational experiment  ================================
 | 
			
		||||
 | 
			
		||||
# Computational experiments are conducted like wet-lab experiments. We begin
 | 
			
		||||
# with a hypothesis, then define the observables that relate to the hypothesis,
 | 
			
		||||
# then define the measures we apply to observations, and finally we interpret
 | 
			
		||||
# our observations. If we want to learn something about the evolution of the
 | 
			
		||||
# genetic code ...
 | 
			
		||||
 | 
			
		||||
#  - we construct a hypothesis such as: the genetic code has evolved so as to
 | 
			
		||||
#      minimize the effect of mutations;
 | 
			
		||||
#  - we define the observables: the effect of mutations in
 | 
			
		||||
#      sequences, given the natural and possible alternative codes;
 | 
			
		||||
#  - we define the measures to quantify the effect of mutations;
 | 
			
		||||
#  - then we compute alternatives and interpret the results.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Setting up the tools  ================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Natural and alternative genetic codes  =============================
 | 
			
		||||
 | 
			
		||||
# Load genetic code tables from the Biostrings package
 | 
			
		||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
 | 
			
		||||
  install.packages("BiocManager")
 | 
			
		||||
}
 | 
			
		||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
 | 
			
		||||
  BiocManager::install("Biostrings")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = Biostrings)       # basic information
 | 
			
		||||
#  browseVignettes("Biostrings")    # available vignettes
 | 
			
		||||
#  data(package = "Biostrings")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# There are many ways to generate alternative codes. The simplest way is to
 | 
			
		||||
# randomly assign amino acids to codons. A more sophisticated way is to keep the
 | 
			
		||||
# redundancy of codons intact, since it may reflect some form of symmetry
 | 
			
		||||
# breaking that ignores the third nucleotide of a codon for the most part;
 | 
			
		||||
# therefore we only replace the amino acids of the existing code with random
 | 
			
		||||
# others. Here are two functions that implement these two ideas about alternate
 | 
			
		||||
# codes.
 | 
			
		||||
 | 
			
		||||
randomGC <- function(GC) {
  # Return a genetic code with randomly assigned amino acids.
  # Parameters:
  #    GC   named chr  character vector of amino acid one-letter codes plus
  #                       "*" (stop), named with the codon triplet. The
  #                       standard code has length 64, but a code of any
  #                       length is handled.
  # Value:  named chr  vector with the same names and length as GC, with
  #                       random amino acid assignments in which every symbol
  #                       that occurs in GC is encoded at least once.

  aa <- unique(GC)              # the amino acids (and "*") in the input code
  nCodons <- length(GC)         # 64 for a complete triplet code

  # Rejection sampling: draw a random code, and draw again as long as at
  # least one of the symbols did not make it into the code.
  repeat {
    GC[] <- sample(aa, nCodons, replace = TRUE)  # GC[] keeps the codon names
    if (length(unique(GC)) == length(aa)) {
      break
    }
  }

  return(GC)
}
 | 
			
		||||
 | 
			
		||||
swappedGC <- function(GC) {
  # Return a genetic code with randomly swapped amino acids. The codon blocks
  # of the input are kept: codons that encode the same symbol in GC encode
  # the same symbol in the result; only the symbol of each block is permuted.
  # Parameters:
  #    GC   named chr  character vector of amino acid one-letter codes plus
  #                       "*" (stop), named with the codon triplet. The
  #                       standard code has length 64, but a code of any
  #                       length is handled.
  # Value:  named chr  vector with the same names and length as GC, where the
  #                       amino acids have been swapped.

  aaOrig <- unique(GC)                       # the amino acids in the input code
  aaSwap <- sample(aaOrig, length(aaOrig))   # a random permutation of them
  names(aaSwap) <- aaOrig                    # lookup: original -> replacement

  GC[] <- aaSwap[GC]                         # GC[] keeps the codon names and
                                             # works for any code length

  return(GC)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.2  Effect of mutations  ===============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# To evaluate the effects of mutations we will do the following:
 | 
			
		||||
#   - we take an amino acid sequence (Mbp1 will do just nicely);
 | 
			
		||||
#   - we reverse-translate it into a nucleotide sequence;
 | 
			
		||||
#   - we mutate it randomly;
 | 
			
		||||
#   - we translate it back to amino acids;
 | 
			
		||||
#   - we count the number of mutations and evaluate their severity.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   2.2.1  reverse-translate                    
 | 
			
		||||
 | 
			
		||||
# To reverse-translate an amino acid vector, we randomly pick one of its
 | 
			
		||||
# codons from a genetic code, and assemble all codons to a sequence.
 | 
			
		||||
 | 
			
		||||
traRev <- function(s, GC) {
  # Reverse-translate an amino acid sequence: for every residue, one of the
  # codons that encode it in GC is chosen at random.
  # Parameters:
  #      s   chr   a sequence vector of one-letter amino acid codes
  #      GC  chr   a genetic code: amino acid codes, named with their codons
  # Value:
  #      A reverse-translated vector of codons, one for each element of s
  # Errors:
  #      stops if s contains a residue for which GC has no codon

  # Check first: without this, a residue that is missing from the code fails
  # inside the loop with the unhelpful "replacement has length zero".
  noCodon <- setdiff(s, GC)
  if (length(noCodon) > 0) {
    stop("No codon in GC for: ", paste(noCodon, collapse = ", "),
         call. = FALSE)
  }

  vC <- character(length(s))

  for (i in seq_along(s)) {
    codon <- names(GC)[GC == s[i]]   # get all codons for this AA
    if (length(codon) > 1) {         # if there's more than one ...
      codon <- sample(codon, 1)      # pick one at random ...
    }
    vC[i] <- codon                   # store it
  }

  return(vC)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   2.2.2  Randomly mutate                      
 | 
			
		||||
 | 
			
		||||
# To mutate, we split a codon into its three nucleotides, then randomly replace
 | 
			
		||||
# one of the three with another nucleotide.
 | 
			
		||||
 | 
			
		||||
randMut <- function(vC) {
  # Introduce one random point mutation into every codon of vC.
  # Parameter:
  #    vC   chr     a vector of codons
  # Value:  chr     vC, in which every codon has one randomly chosen position
  #                 replaced with a different nucleotide

  bases <- c("A", "C", "G", "T")

  mutateOne <- function(codon) {
    # Mutate a single codon: choose the position first, then the replacement.
    letters3 <- strsplit(codon, "")[[1]]               # the three nucleotides
    pos <- sample(1:3, 1)                              # position to change
    alternatives <- bases[bases != letters3[pos]]      # anything but the old
    letters3[pos] <- sample(alternatives, 1)
    paste0(letters3, collapse = "")                    # back to one string
  }

  for (k in seq_along(vC)) {
    vC[k] <- mutateOne(vC[k])
  }

  return(vC)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   2.2.3  Forward- translate                   
 | 
			
		||||
 | 
			
		||||
traFor <- function(vC, GC) {
  # Translate a vector of codons into a vector of amino acids.
  # Parameters:
  #      vC   chr   a codon vector
  #      GC   chr   a genetic code: amino acid codes, named with their codons
  # Value:
  #      A vector of amino acids, one for each codon in vC; NA for a codon
  #      that is not among the names of GC

  # Subsetting a named vector by name is vectorized, no loop is needed.
  # unname() removes the codon names, so that a plain chr vector is returned.
  vAA <- unname(GC[vC])

  return(vAA)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ===   2.2.4  measure effect                       
 | 
			
		||||
 | 
			
		||||
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
 | 
			
		||||
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
 | 
			
		||||
# categories, according to their free energy of transfer from water to octanol:
 | 
			
		||||
aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
aaNeutral <- c("A", "H", "T", "S", "V", "G")

# Then we will penalize as follows:
# Changes within one category: 0.1
# Changes from hydrophobic or hydrophilic to neutral or back: 0.3
# Changes from hydrophobic to hydrophilic or back: 1.0
# Changes to stop-codon: 3.0

evalMut <- function(nat, mut) {
  # Evaluate severity of mutations between amino acid sequence vectors nat and
  # mut in an ad hoc approach based on hydrophobicity changes.
  # Parameters:
  #    nat   chr   the original sequence, one amino acid per element
  #    mut   chr   the mutated sequence, same length as nat; "*" is a stop
  # Value:   num   the sum of the penalties over all positions. Positions
  #                that did not change contribute 0.
  # Note: the change of an amino acid to a stop is scored with 3.0, as the
  #       penalty scheme requires. Totals that were computed without this
  #       penalty will be lower.
  aaHphobic <- c("M", "I", "L", "C", "W", "Y", "F")
  aaHphilic <- c("E", "D", "Q", "N", "P", "K", "R")
  aaNeutral <- c("A", "H", "T", "S", "V", "G")

  # Positions are compared pairwise; unequal lengths would be recycled.
  stopifnot(length(nat) == length(mut))

  penalties <- numeric(length(nat))
  lMut <- nat != mut    # logical TRUE for all mutated positions

  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphobic)] <- 0.1
  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaHphilic)] <- 1.0
  penalties[lMut & (nat %in% aaHphobic) & (mut %in% aaNeutral)] <- 0.3

  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphobic)] <- 1.0
  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaHphilic)] <- 0.1
  penalties[lMut & (nat %in% aaHphilic) & (mut %in% aaNeutral)] <- 0.3

  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphobic)] <- 0.3
  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaHphilic)] <- 0.3
  penalties[lMut & (nat %in% aaNeutral) & (mut %in% aaNeutral)] <- 0.1

  # "*" is in none of the three categories, so none of the rules above
  # applies to a new stop codon: it needs its own rule.
  penalties[lMut & (mut %in% "*")] <- 3.0

  return(sum(penalties))
}
 | 
			
		||||
 | 
			
		||||
# A more sophisticated approach could take additional quantities into account,
 | 
			
		||||
# such as charge, size, or flexibility - and it could add heuristics, such as:
 | 
			
		||||
# proline is always bad in secondary structure, charged amino acids are terrible
 | 
			
		||||
# in the folded core of a protein, replacing a small by a large amino acid in
 | 
			
		||||
# the core is very disruptive ... etc.
 | 
			
		||||
#
 | 
			
		||||
# For our experiment, however, we should not use a mutation data matrix:
 | 
			
		||||
# empirical mutation probabilities are superbly suited to estimate evolutionary
 | 
			
		||||
# relationships. Here however, as we are trying to evaluate effects of random
 | 
			
		||||
# mutations on genetic codes, our reasoning would be circular - we would
 | 
			
		||||
# discover that the natural genetic code is optimal ... because it is most
 | 
			
		||||
# similar to the natural genetic code. That would be Cargo Cult bioinformatics.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Run the experiment  ==================================================
 | 
			
		||||
 | 
			
		||||
# Fetch the standard Genetic code from Biostrings::
 | 
			
		||||
 | 
			
		||||
stdCode <- Biostrings::GENETIC_CODE
 | 
			
		||||
 | 
			
		||||
# Fetch the nucleotide sequence for MBP1:
 | 
			
		||||
 | 
			
		||||
# Read the file, discard its first line, and join the remaining lines into
# one single string.
myDNA <- paste0(readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1],
                collapse = "")

# Split the sequence into codons and keep them as a plain character vector.
myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))

# The last codon is the stop codon: keep everything except the last element.
myDNA <- head(myDNA, -1)
 | 
			
		||||
 | 
			
		||||
myAA <- traFor(myDNA, stdCode)
 | 
			
		||||
 | 
			
		||||
# Mutate and evaluate
 | 
			
		||||
set.seed(112358)
 | 
			
		||||
x <- randMut(myDNA)
 | 
			
		||||
set.seed(NULL)
 | 
			
		||||
x <- traFor(x, stdCode)
 | 
			
		||||
evalMut(myAA, x)  # 166.4
 | 
			
		||||
 | 
			
		||||
# Try this 200 times, and see how the values are distributed.
 | 
			
		||||
# Repeat the mutate -> translate -> evaluate trial N times under the standard
# genetic code, store each penalty sum in valSTDC, and plot the distribution.
N <- 200
valSTDC <- numeric(N)              # preallocated result vector

set.seed(112358)                   # set RNG seed for repeatable randomness
for (i in seq_len(N)) {            # this takes a few seconds ...
  x <- randMut(myDNA)              # mutate
  x <- traFor(x, stdCode)          # translate
  valSTDC[i] <- evalMut(myAA, x)   # evaluate
}
set.seed(NULL)                     # reset the RNG

# The axis limits are set explicitly here; the later histograms are drawn
# into this same plot with add = TRUE.
hist(valSTDC,
     breaks = 15,
     col = "palegoldenrod",
     xlim = c(0, 400),
     ylim = c(0, N/4),
     main = "Standard vs. Synthetic Genetic Code",
     xlab = "Mutation penalty")
 | 
			
		||||
 | 
			
		||||
# This looks like a normal distribution. Let's assume the effect of mutations
 | 
			
		||||
# under the standard genetic code is the mean of this distribution:
 | 
			
		||||
effectSTDC <- mean(valSTDC)  # 178.1
 | 
			
		||||
 | 
			
		||||
# Now we can look at the effects of alternate genetic codes:
 | 
			
		||||
 | 
			
		||||
set.seed(112358)
 | 
			
		||||
# choose a new code
 | 
			
		||||
GC <- randomGC(stdCode)
 | 
			
		||||
set.seed(NULL)
 | 
			
		||||
 | 
			
		||||
# reverse translate hypothetical sequence according to the new code
 | 
			
		||||
x <- traRev(myAA, GC)
 | 
			
		||||
 | 
			
		||||
x <- randMut(x)        # randomly mutate hypothetical nucleotide sequence
 | 
			
		||||
x <- traFor(x, GC)     # translate back, with the new code
 | 
			
		||||
evalMut(myAA, x)       # evaluate mutation effects: 298.5
 | 
			
		||||
 | 
			
		||||
# That seems a fair bit higher than what we saw as "effectSTDC".
 | 
			
		||||
# Let's try with different genetic codes. 200 trials - but this time every trial
 | 
			
		||||
# is with a different, synthetic genetic code.
 | 
			
		||||
 | 
			
		||||
# Repeat the trial N times; every trial uses a different code produced by
# randomGC(). The penalty sums are stored in valXGC and added to the plot.
N <- 200
valXGC <- numeric(N)             # preallocated result vector

set.seed(1414214)                # set RNG seed for repeatable randomness
for (i in seq_len(N)) {
  GC <- randomGC(stdCode)        # choose code
  x <- traRev(myAA, GC)          # reverse translate
  x <- randMut(x)                # mutate
  x <- traFor(x, GC)             # translate
  valXGC[i] <- evalMut(myAA, x)  # evaluate
}
set.seed(NULL)                   # reset the RNG

# add = TRUE draws this histogram into the existing plot.
hist(valXGC,
     col = "plum",
     breaks = 15,
     add = TRUE)
 | 
			
		||||
 | 
			
		||||
# These two distributions are very widely separated!
 | 
			
		||||
 | 
			
		||||
# Task: Perform the same experiment with the swapped genetic code.
 | 
			
		||||
#       Compare the distributions. Interpret the result.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# These are simple experiments, under assumptions that can be refined in
 | 
			
		||||
# meaningful ways. Yet, even those simple computational experiments show
 | 
			
		||||
# that the Universal Genetic Code has features that one would predict if
 | 
			
		||||
# it has evolved under selective pressure to minimize the effects of mutations.
 | 
			
		||||
# Gradual change under mutation is beneficial to evolution, disruptive
 | 
			
		||||
# change is not.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Task solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
# Task solution: repeat the trial N times; every trial uses a code produced
# by swappedGC(). The penalty sums are stored in valSGC and added to the plot.
N <- 200
valSGC <- numeric(N)             # preallocated result vector

set.seed(2718282)                # set RNG seed for repeatable randomness
for (i in seq_len(N)) {
  GC <- swappedGC(stdCode)       # choose code
  x <- traRev(myAA, GC)          # reverse translate
  x <- randMut(x)                # mutate
  x <- traFor(x, GC)             # translate
  valSGC[i] <- evalMut(myAA, x)  # evaluate
}
set.seed(NULL)                   # reset the RNG

# The colour has an alpha channel, so the histograms drawn earlier remain
# visible underneath; add = TRUE draws into the existing plot.
hist(valSGC,
     col = "#6688FF88",
     breaks = 15,
     add = TRUE)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,50 +1,50 @@
 | 
			
		||||
# tocID <- "RPR-Introduction.R"
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Purpose: A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Introduction unit
 | 
			
		||||
#
 | 
			
		||||
# Version: 1.0
 | 
			
		||||
#
 | 
			
		||||
# Date:    2020-09-18
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# V 1.0    Updated workflow; live
 | 
			
		||||
# V 0.1    First code
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 | 
			
		||||
#
 | 
			
		||||
# DO NOT SIMPLY  source()  THESE FILES!
 | 
			
		||||
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
#  going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
# === TASK: Local script
 | 
			
		||||
#
 | 
			
		||||
# - Open the file myScript.R
 | 
			
		||||
#
 | 
			
		||||
# - Create a section header with a date.
 | 
			
		||||
# - Enter an R-expression that will produce the first 11 powers of 2 (starting
 | 
			
		||||
#     from 0). Not a loop - a single expression. The first number you get must
 | 
			
		||||
#     be 1. The last number you get must be 1024.
 | 
			
		||||
#
 | 
			
		||||
# - Save the file in the myScripts folder, and close it.
 | 
			
		||||
#
 | 
			
		||||
# - Open the file again, select the expression and type Cmd+Enter (or Cmd+R)
 | 
			
		||||
#   to execute it.
 | 
			
		||||
#
 | 
			
		||||
# - Done
 | 
			
		||||
 | 
			
		||||
# (This task is meant to make sure that writing R expressions, saving
 | 
			
		||||
#  them in scripts, opening script files and executing code in the file works
 | 
			
		||||
#  for you. If there is an issue, get in touch.)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "RPR-Introduction.R"
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Purpose: A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Introduction unit
 | 
			
		||||
#
 | 
			
		||||
# Version: 1.0
 | 
			
		||||
#
 | 
			
		||||
# Date:    2020-09-18
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# V 1.0    Updated workflow; live
 | 
			
		||||
# V 0.1    First code
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 | 
			
		||||
#
 | 
			
		||||
# DO NOT SIMPLY  source()  THESE FILES!
 | 
			
		||||
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
#  going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
# === TASK: Local script
 | 
			
		||||
#
 | 
			
		||||
# - Open the file myScript.R
 | 
			
		||||
#
 | 
			
		||||
# - Create a section header with a date.
 | 
			
		||||
# - Enter an R-expression that will produce the first 11 powers of 2 (starting
 | 
			
		||||
#     from 0). Not a loop - a single expression. The first number you get must
 | 
			
		||||
#     be 1. The last number you get must be 1024.
 | 
			
		||||
#
 | 
			
		||||
# - Save the file in the myScripts folder, and close it.
 | 
			
		||||
#
 | 
			
		||||
# - Open the file again, select the expression and type Cmd+Enter (or Cmd+R)
 | 
			
		||||
#   to execute it.
 | 
			
		||||
#
 | 
			
		||||
# - Done
 | 
			
		||||
 | 
			
		||||
# (This task is meant to make sure that writing R expressions, saving
 | 
			
		||||
#  them in scripts, opening script files and executing code in the file works
 | 
			
		||||
#  for you. If there is an issue, get in touch.)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,168 +1,168 @@
 | 
			
		||||
# tocID <- "RPR-PROSITE_POST.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Scripting_data_downloads unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#           1.0.1  Updates for slightly changed interfaces
 | 
			
		||||
#           1.0    First ABC units version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                                 Line
 | 
			
		||||
#TOC> ---------------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Constructing a POST command from a Web query            43
 | 
			
		||||
#TOC>   1.1        Task - fetchPrositeFeatures() function               148
 | 
			
		||||
#TOC>   2        Task solutions                                         156
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Constructing a POST command from a Web query  ========================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("httr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = httr)       # basic information
 | 
			
		||||
#  browseVignettes("httr")    # available vignettes
 | 
			
		||||
#  data(package = "httr")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We have reverse engineered the Web form for a ScanProsite request, and can
 | 
			
		||||
# construct a valid POST request from knowing the required field names. The POST
 | 
			
		||||
# command is similar to GET(), but we need an explicit request body that
 | 
			
		||||
# contains a list of key/value pairs
 | 
			
		||||
 | 
			
		||||
UniProtID <- "P39678"
 | 
			
		||||
 | 
			
		||||
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
 | 
			
		||||
 | 
			
		||||
response <- httr::POST(URL,
 | 
			
		||||
                       body = list(meta = "opt1",
 | 
			
		||||
                                   meta1_protein = "opt1",
 | 
			
		||||
                                   seq = UniProtID,
 | 
			
		||||
                                   skip = "on",
 | 
			
		||||
                                   output = "tabular"))
 | 
			
		||||
 | 
			
		||||
# Send off this request, and you should have a response in a few
 | 
			
		||||
# seconds. Let's check the status first:
 | 
			
		||||
 | 
			
		||||
httr::status_code(response)  # If this is not 200, something went wrong and it
 | 
			
		||||
                             # makes no sense to continue. If this persists, ask
 | 
			
		||||
                             # on the Discussion Board what to do.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The text content of the response is available with the
 | 
			
		||||
# content() function:
 | 
			
		||||
httr::content(response, "text")
 | 
			
		||||
 | 
			
		||||
# ... should show you the same as the page contents that you have seen in the
 | 
			
		||||
# browser. Now we need to extract the data from the page. For this simple
 | 
			
		||||
# example we can get away with using regular expressions, but in general we need
 | 
			
		||||
# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
 | 
			
		||||
# strsplit() the response into individual lines, since each of our data elements
 | 
			
		||||
# is on its own line, and then capture the contents. The way Prosite has
 | 
			
		||||
# formatted their HTML we can simply split on the "\\n" newline character - but
 | 
			
		||||
# they could write the same valid HTML without any newline-characters at all.
 | 
			
		||||
# Understand that we are working with a bit of a "hack" here: exploiting
 | 
			
		||||
# empirical assumptions rather than a formal specification. But sometimes quick
 | 
			
		||||
# and dirty is fine, because quick.
 | 
			
		||||
 | 
			
		||||
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
 | 
			
		||||
head(lines)
 | 
			
		||||
 | 
			
		||||
# Now we define a query pattern for the lines we want:
 | 
			
		||||
# we can use the uID, bracketed by two "|" pipe
 | 
			
		||||
# characters:
 | 
			
		||||
 | 
			
		||||
patt <- sprintf("\\|%s\\|", UniProtID)
 | 
			
		||||
 | 
			
		||||
# ... and select only the lines that match this
 | 
			
		||||
# pattern:
 | 
			
		||||
 | 
			
		||||
( lines <- lines[grep(patt, lines)] )
 | 
			
		||||
 | 
			
		||||
# ... captures the three lines of output.
 | 
			
		||||
 | 
			
		||||
# Now we break the lines apart into tokens: this is another application of
 | 
			
		||||
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
 | 
			
		||||
# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
 | 
			
		||||
 | 
			
		||||
unlist(strsplit(lines[1], "\\t|\\|"))
 | 
			
		||||
 | 
			
		||||
# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
 | 
			
		||||
# with a backslash. "t" has to be escaped because we want to match a tab (\t),
 | 
			
		||||
# not the literal character "t". And "|" has to be escaped because we mean the
 | 
			
		||||
# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
 | 
			
		||||
# backslash turns a special meaning off, and sometimes it turns a special
 | 
			
		||||
# meaning on. Unfortunately there's no easy way to tell - you just need to
 | 
			
		||||
# remember the characters - or have a reference handy. The metacharacters are
 | 
			
		||||
# (){}[]^$?*+.|&-   ... and some of them have different meanings depending on
 | 
			
		||||
# where in the regex they are.
 | 
			
		||||
 | 
			
		||||
# Let's put the tokens into named slots of a data frame
 | 
			
		||||
 | 
			
		||||
# Build one single-row data frame per result line and bind all rows in one
# step at the end. Growing a data frame with rbind() inside the loop copies
# the whole object on every iteration.
featureRows <- vector("list", length(lines))
for (i in seq_along(lines)) {
  # split the line on tabs or on literal pipe characters
  tokens <- unlist(strsplit(lines[i], "\\t|\\|"))
  featureRows[[i]] <- data.frame(uID   =  tokens[2],
                                 start =  as.numeric(tokens[4]),
                                 end   =  as.numeric(tokens[5]),
                                 psID  =  tokens[6],
                                 psName = tokens[7],
                                 psSeq  = tokens[11])
}

# If no lines matched, the result is an empty data frame.
if (length(featureRows) > 0) {
  features <- do.call(rbind, featureRows)
} else {
  features <- data.frame()
}
features
 | 
			
		||||
 | 
			
		||||
#  This forms the base of a function that collects the features automatically
 | 
			
		||||
#  from a PrositeScan result. You can write this!
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Task - fetchPrositeFeatures() function  ============================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: write a function that takes as input a UniProt ID, fetches the
 | 
			
		||||
# features it contains from ScanProsite and returns a data frame as given above, or
 | 
			
		||||
# an empty data frame if there is an error.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Task solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# I have placed such a function into the ABC-dbUtilities.R script: look it up by
 | 
			
		||||
# clicking on  dbFetchPrositeFeatures() in the Environment pane.
 | 
			
		||||
 | 
			
		||||
# Test:
 | 
			
		||||
dbFetchPrositeFeatures("Q5KMQ9")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "RPR-PROSITE_POST.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Scripting_data_downloads unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout,
 | 
			
		||||
#           1.0.1  Updates for slightly changed interfaces
 | 
			
		||||
#           1.0    First ABC units version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                                 Line
 | 
			
		||||
#TOC> ---------------------------------------------------------------------
 | 
			
		||||
#TOC>   1        Constructing a POST command from a Web query            43
 | 
			
		||||
#TOC>   1.1        Task - fetchPrositeFeatures() function               148
 | 
			
		||||
#TOC>   2        Task solutions                                         156
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Constructing a POST command from a Web query  ========================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("httr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = httr)       # basic information
 | 
			
		||||
#  browseVignettes("httr")    # available vignettes
 | 
			
		||||
#  data(package = "httr")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We have reverse engineered the Web form for a ScanProsite request, and can
 | 
			
		||||
# construct a valid POST request from knowing the required field names. The POST
 | 
			
		||||
# command is similar to GET(), but we need an explicit request body that
 | 
			
		||||
# contains a list of key/value pairs
 | 
			
		||||
 | 
			
		||||
UniProtID <- "P39678"
 | 
			
		||||
 | 
			
		||||
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
 | 
			
		||||
 | 
			
		||||
response <- httr::POST(URL,
 | 
			
		||||
                       body = list(meta = "opt1",
 | 
			
		||||
                                   meta1_protein = "opt1",
 | 
			
		||||
                                   seq = UniProtID,
 | 
			
		||||
                                   skip = "on",
 | 
			
		||||
                                   output = "tabular"))
 | 
			
		||||
 | 
			
		||||
# Send off this request, and you should have a response in a few
 | 
			
		||||
# seconds. Let's check the status first:
 | 
			
		||||
 | 
			
		||||
httr::status_code(response)  # If this is not 200, something went wrong and it
 | 
			
		||||
                             # makes no sense to continue. If this persists, ask
 | 
			
		||||
                             # on the Discussion Board what to do.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The text content of the response is available with the
 | 
			
		||||
# content() function:
 | 
			
		||||
httr::content(response, "text")
 | 
			
		||||
 | 
			
		||||
# ... should show you the same as the page contents that you have seen in the
 | 
			
		||||
# browser. Now we need to extract the data from the page. For this simple
 | 
			
		||||
# example we can get away with using regular expressions, but in general we need
 | 
			
		||||
# a real XML parser to parse HTML. We'll cover that in a later unit. Here, we
 | 
			
		||||
# strsplit() the response into individual lines, since each of our data elements
 | 
			
		||||
# is on its own line, and then capture the contents. The way Prosite has
 | 
			
		||||
# formatted their HTML we can simply split on the "\\n" newline character - but
 | 
			
		||||
# they could write the same valid HTML without any newline-characters at all.
 | 
			
		||||
# Understand that we are working with a bit of a "hack" here: exploiting
 | 
			
		||||
# empirical assumptions rather than a formal specification. But sometimes quick
 | 
			
		||||
# and dirty is fine, because quick.
 | 
			
		||||
 | 
			
		||||
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
 | 
			
		||||
head(lines)
 | 
			
		||||
 | 
			
		||||
# Now we define a query pattern for the lines we want:
 | 
			
		||||
# we can use the uID, bracketed by two "|" pipe
 | 
			
		||||
# characters:
 | 
			
		||||
 | 
			
		||||
patt <- sprintf("\\|%s\\|", UniProtID)
 | 
			
		||||
 | 
			
		||||
# ... and select only the lines that match this
 | 
			
		||||
# pattern:
 | 
			
		||||
 | 
			
		||||
( lines <- lines[grep(patt, lines)] )
 | 
			
		||||
 | 
			
		||||
# ... captures the three lines of output.
 | 
			
		||||
 | 
			
		||||
# Now we break the lines apart into tokens: this is another application of
 | 
			
		||||
# strsplit(), but this time we split either on "pipe" characters, "|" OR on tabs
 | 
			
		||||
# "\t". Look at the regex "\\t|\\|" in the strsplit() call:
 | 
			
		||||
 | 
			
		||||
unlist(strsplit(lines[1], "\\t|\\|"))
 | 
			
		||||
 | 
			
		||||
# Its parts are (\\t)=tab (|)=or (\\|)=pipe. Both "t" and "|" need to be escaped
 | 
			
		||||
# with a backslash. "t" has to be escaped because we want to match a tab (\t),
 | 
			
		||||
# not the literal character "t". And "|" has to be escaped because we mean the
 | 
			
		||||
# literal pipe character, not its metacharacter meaning OR. Thus sometimes the
 | 
			
		||||
# backslash turns a special meaning off, and sometimes it turns a special
 | 
			
		||||
# meaning on. Unfortunately there's no easy way to tell - you just need to
 | 
			
		||||
# remember the characters - or have a reference handy. The metacharacters are
 | 
			
		||||
# (){}[]^$?*+.|&-   ... and some of them have different meanings depending on
 | 
			
		||||
# where in the regex they are.
 | 
			
		||||
 | 
			
		||||
# Let's put the tokens into named slots of a data frame
 | 
			
		||||
 | 
			
		||||
# Build one single-row data frame per result line and bind all rows in one
# step at the end. Growing a data frame with rbind() inside the loop copies
# the whole object on every iteration.
featureRows <- vector("list", length(lines))
for (i in seq_along(lines)) {
  # split the line on tabs or on literal pipe characters
  tokens <- unlist(strsplit(lines[i], "\\t|\\|"))
  featureRows[[i]] <- data.frame(uID   =  tokens[2],
                                 start =  as.numeric(tokens[4]),
                                 end   =  as.numeric(tokens[5]),
                                 psID  =  tokens[6],
                                 psName = tokens[7],
                                 psSeq  = tokens[11])
}

# If no lines matched, the result is an empty data frame.
if (length(featureRows) > 0) {
  features <- do.call(rbind, featureRows)
} else {
  features <- data.frame()
}
features
 | 
			
		||||
 | 
			
		||||
#  This forms the base of a function that collects the features automatically
 | 
			
		||||
#  from a PrositeScan result. You can write this!
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Task - fetchPrositeFeatures() function  ============================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: write a function that takes as input a UniProt ID, fetches the
 | 
			
		||||
# features it contains from ScanProsite and returns a data frame as given above, or
 | 
			
		||||
# an empty data frame if there is an error.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Task solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# I have placed such a function into the ABC-dbUtilities.R script: look it up by
 | 
			
		||||
# clicking on  dbFetchPrositeFeatures() in the Environment pane.
 | 
			
		||||
 | 
			
		||||
# Test:
 | 
			
		||||
dbFetchPrositeFeatures("Q5KMQ9")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										270
									
								
								RPR-Pipe.R
									
									
									
									
									
								
							
							
						
						
									
										270
									
								
								RPR-Pipe.R
									
									
									
									
									
								
							@@ -1,135 +1,135 @@
 | 
			
		||||
# tocID <- "RPR-Pipe.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              Discussing pipe operators.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.0
 | 
			
		||||
#
 | 
			
		||||
# Date:     2021  10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.0    New code
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#   - find more interesting examples
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                            Line
 | 
			
		||||
#TOC> ------------------------------------------------
 | 
			
		||||
#TOC>   1        Pipe  Concept                      41
 | 
			
		||||
#TOC>   2        Nested Expression                  73
 | 
			
		||||
#TOC>   3        magrittr:: Pipe                    78
 | 
			
		||||
#TOC>   4        Base R Pipe                        93
 | 
			
		||||
#TOC>   5        Intermediate Assignment           108
 | 
			
		||||
#TOC>   6        Postscript                        127
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Pipe  Concept  =======================================================
 | 
			
		||||
 | 
			
		||||
# Pipes are actually an awesome idea for any code that implements a workflow -
 | 
			
		||||
# a sequence of operations, each of which transforms data in a specialized way.
 | 
			
		||||
#
 | 
			
		||||
# This principle is familiar from maths: chained functions. If I have a function
 | 
			
		||||
# y = f(x) and want to use those results as in z = g(y), I can just write
 | 
			
		||||
# z = g(f(x))
 | 
			
		||||
#
 | 
			
		||||
# On the unix command line, pipes were used from the very beginning, implemented
 | 
			
		||||
# with the "|" pipe character.
 | 
			
		||||
#
 | 
			
		||||
# In R, the magrittr package provided the %>% operator, and recently the |>
 | 
			
		||||
# operator has been introduced into base R.
 | 
			
		||||
#
 | 
			
		||||
# However there are alternatives: intermediate assignment, and nested functions
 | 
			
		||||
# that have always existed in base R anyway.
 | 
			
		||||
#
 | 
			
		||||
# Let us look at an example. In writing this, I found out that virtually
 | 
			
		||||
# ALL non-trivial examples I came up with don't translate well into this idiom
 | 
			
		||||
# at all. It is actually quite limited to simple filtering operations on
 | 
			
		||||
# data. A more interesting example might be added in the future, let me know if
 | 
			
		||||
# you have a good idea.
 | 
			
		||||
#
 | 
			
		||||
# A somewhat contrived example is to sort a list of files by the
 | 
			
		||||
# length of the file names:
 | 
			
		||||
 | 
			
		||||
myFiles <- list.files(pattern = "\\.R$")
 | 
			
		||||
 | 
			
		||||
# nchar() gives the number of characters in a string, order() produces indices
 | 
			
		||||
# that map an array to its sorted form.
 | 
			
		||||
#
 | 
			
		||||
# =    2  Nested Expression  ===================================================
 | 
			
		||||
 | 
			
		||||
myFiles[order(nchar(myFiles))]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  magrittr:: Pipe  =====================================================
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("magrittr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("magrittr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = magrittr)       # basic information
 | 
			
		||||
#  browseVignettes("magrittr")    # available vignettes
 | 
			
		||||
#  data(package = "magrittr")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
library(magrittr)
 | 
			
		||||
 | 
			
		||||
myFiles  %>% nchar %>% order %>% myFiles[.]
 | 
			
		||||
 | 
			
		||||
# =    4  Base R Pipe  =========================================================
 | 
			
		||||
 | 
			
		||||
# Since version 4.1, base R now supports a pipe operator without the need
 | 
			
		||||
# to load a special package. Such an introduction of external functionality
 | 
			
		||||
# into the language is very rare.
 | 
			
		||||
#
 | 
			
		||||
# Unfortunately it won't (yet) work with the '[' function, so we need to write
 | 
			
		||||
# an intermediate function for this example
 | 
			
		||||
extract <- function(x, v) {
  # Subset a vector by an index vector. The index is the FIRST argument so
  # that the function can receive it from the left-hand side of a |> pipe.
  # Parameters:
  #     x   index vector that goes inside the square brackets
  #     v   the object to be subset
  # Value:
  #         the elements of v selected by x, i.e. v[x]
  picked <- v[x]
  picked
}
 | 
			
		||||
 | 
			
		||||
myFiles |> nchar() |> order() |> extract(myFiles)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  Intermediate Assignment  =============================================
 | 
			
		||||
 | 
			
		||||
# So what's the problem? As you can see, the piped code may be concise and
 | 
			
		||||
# expressive. But there is also a large amount of implicit assignment and
 | 
			
		||||
# processing going on and that is usually a bad idea because it makes code hard
 | 
			
		||||
# to maintain. I am NOT a big fan of the nested syntax, but I don't think that
 | 
			
		||||
# replacing it with the pipe makes things much better. My preferred idiom is
 | 
			
		||||
# to use intermediate assignments. Only then is it convenient to examine
 | 
			
		||||
# the code step by step and validate every single step. And that is the most
 | 
			
		||||
# important objective of all: no code is good if it does not compute
 | 
			
		||||
# correctly.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
x <- nchar(myFiles)
 | 
			
		||||
x <- order(x)
 | 
			
		||||
myFiles[x]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    6  Postscript  ==========================================================
 | 
			
		||||
 | 
			
		||||
# I tried to write an example that strips all comments from a list of files, and
 | 
			
		||||
# another example that finds all files that were not yet updated this year
 | 
			
		||||
# (according to the "# Date:" in the header). Neither example can be well
 | 
			
		||||
# written without intermediate assignments, or at least sapply() functions
 | 
			
		||||
# that are not simpler at all than the intermediate assignment.
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "RPR-Pipe.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              Discussing pipe operators.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.0
 | 
			
		||||
#
 | 
			
		||||
# Date:     2021  10
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.0    New code
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#   - find more interesting examples
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                            Line
 | 
			
		||||
#TOC> ------------------------------------------------
 | 
			
		||||
#TOC>   1        Pipe  Concept                      41
 | 
			
		||||
#TOC>   2        Nested Expression                  73
 | 
			
		||||
#TOC>   3        magrittr:: Pipe                    78
 | 
			
		||||
#TOC>   4        Base R Pipe                        93
 | 
			
		||||
#TOC>   5        Intermediate Assignment           108
 | 
			
		||||
#TOC>   6        Postscript                        127
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Pipe  Concept  =======================================================
 | 
			
		||||
 | 
			
		||||
# Pipes are actually an awesome idea for any code that implements a workflow -
 | 
			
		||||
# a sequence of operations, each of which transforms data in a specialized way.
 | 
			
		||||
#
 | 
			
		||||
# This principle is familiar from maths: chained functions. If I have a function
 | 
			
		||||
# y = f(x) and want to use those results as in z = g(y), I can just write
 | 
			
		||||
# z = g(f(x))
 | 
			
		||||
#
 | 
			
		||||
# On the unix command line, pipes were used from the very beginning, implemented
 | 
			
		||||
# with the "|" pipe character.
 | 
			
		||||
#
 | 
			
		||||
# In R, the magrittr package provided the %>% operator, and recently the |>
 | 
			
		||||
# operator has been introduced into base R.
 | 
			
		||||
#
 | 
			
		||||
# However there are alternatives: intermediate assignment, and nested functions
 | 
			
		||||
# that have always existed in base R anyway.
 | 
			
		||||
#
 | 
			
		||||
# Let us look at an example. In writing this, I found out that virtually
 | 
			
		||||
# ALL non-trivial examples I came up with don't translate well into this idiom
 | 
			
		||||
# at all. It is actually quite limited to simple filtering operations on
 | 
			
		||||
# data. A more interesting example might be added in the future, let me know if
 | 
			
		||||
# you have a good idea.
 | 
			
		||||
#
 | 
			
		||||
# A somewhat contrived example is to sort a list of files by the
 | 
			
		||||
# length of the file names:
 | 
			
		||||
 | 
			
		||||
myFiles <- list.files(pattern = "\\.R$")
 | 
			
		||||
 | 
			
		||||
# nchar() gives the number of characters in a string, order() produces indices
 | 
			
		||||
# that map an array to its sorted form.
 | 
			
		||||
#
 | 
			
		||||
# =    2  Nested Expression  ===================================================
 | 
			
		||||
 | 
			
		||||
myFiles[order(nchar(myFiles))]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  magrittr:: Pipe  =====================================================
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("magrittr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("magrittr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = magrittr)       # basic information
 | 
			
		||||
#  browseVignettes("magrittr")    # available vignettes
 | 
			
		||||
#  data(package = "magrittr")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
library(magrittr)
 | 
			
		||||
 | 
			
		||||
myFiles  %>% nchar %>% order %>% myFiles[.]
 | 
			
		||||
 | 
			
		||||
# =    4  Base R Pipe  =========================================================
 | 
			
		||||
 | 
			
		||||
# Since version 4.1, base R now supports a pipe operator without the need
 | 
			
		||||
# to load a special package. Such an introduction of external functionality
 | 
			
		||||
# into the language is very rare.
 | 
			
		||||
#
 | 
			
		||||
# Unfortunately it won't (yet) work with the '[' function, so we need to write
 | 
			
		||||
# an intermediate function for this example
 | 
			
		||||
extract <- function(x, v) {
  # Subset a vector by an index vector. The index is the FIRST argument so
  # that the function can receive it from the left-hand side of a |> pipe.
  # Parameters:
  #     x   index vector that goes inside the square brackets
  #     v   the object to be subset
  # Value:
  #         the elements of v selected by x, i.e. v[x]
  picked <- v[x]
  picked
}
 | 
			
		||||
 | 
			
		||||
myFiles |> nchar() |> order() |> extract(myFiles)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  Intermediate Assignment  =============================================
 | 
			
		||||
 | 
			
		||||
# So what's the problem? As you can see, the piped code may be concise and
 | 
			
		||||
# expressive. But there is also a large amount of implicit assignment and
 | 
			
		||||
# processing going on and that is usually a bad idea because it makes code hard
 | 
			
		||||
# to maintain. I am NOT a big fan of the nested syntax, but I don't think that
 | 
			
		||||
# replacing it with the pipe makes things much better. My preferred idiom is
 | 
			
		||||
# to use intermediate assignments. Only then is it convenient to examine
 | 
			
		||||
# the code step by step and validate every single step. And that is the most
 | 
			
		||||
# important objective of all: no code is good if it does not compute
 | 
			
		||||
# correctly.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
x <- nchar(myFiles)
 | 
			
		||||
x <- order(x)
 | 
			
		||||
myFiles[x]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    6  Postscript  ==========================================================
 | 
			
		||||
 | 
			
		||||
# I tried to write an example that strips all comments from a list of files, and
 | 
			
		||||
# another example that finds all files that were not yet updated this year
 | 
			
		||||
# (according to the "# Date:" in the header). Neither example can be well
 | 
			
		||||
# written without intermediate assignments, or at least sapply() functions
 | 
			
		||||
# that are not simpler at all than the intermediate assignment.
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										360
									
								
								RPR-RegEx.R
									
									
									
									
									
								
							
							
						
						
									
										360
									
								
								RPR-RegEx.R
									
									
									
									
									
								
							@@ -1,180 +1,180 @@
 | 
			
		||||
# tocID <- "RPR-RegEx.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose: A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-RegEx unit
 | 
			
		||||
#
 | 
			
		||||
# Version: 1.0
 | 
			
		||||
#
 | 
			
		||||
# Date:    2017-08  -  2020-09
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# V 1.0    Maintenance 2020
 | 
			
		||||
# V 0.1    First code
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 | 
			
		||||
#
 | 
			
		||||
# DO NOT SIMPLY  source()  THESE FILES!
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
#  going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                                Line
 | 
			
		||||
#TOC> ----------------------------------------------------
 | 
			
		||||
#TOC>   1        A regex example                        41
 | 
			
		||||
#TOC>   2        Counting lines                        108
 | 
			
		||||
#TOC>   2.1        Counting C-alpha atoms only         126
 | 
			
		||||
#TOC>   3        Code Solutions                        142
 | 
			
		||||
#TOC>   3.1        Counting atoms                      144
 | 
			
		||||
#TOC>   3.2        Counting C-alpha records            160
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  A regex example  =====================================================
 | 
			
		||||
 | 
			
		||||
# The canonical FASTA version of yeast Mbp1 at Uniprot
 | 
			
		||||
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
 | 
			
		||||
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
 | 
			
		||||
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
 | 
			
		||||
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
 | 
			
		||||
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
 | 
			
		||||
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
 | 
			
		||||
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
 | 
			
		||||
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
 | 
			
		||||
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
 | 
			
		||||
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
 | 
			
		||||
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
 | 
			
		||||
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
 | 
			
		||||
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
 | 
			
		||||
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
 | 
			
		||||
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
 | 
			
		||||
 | 
			
		||||
nchar(s)
 | 
			
		||||
# Must be 969
 | 
			
		||||
 | 
			
		||||
# Task: Fetch the Uniprot ID by retrieving the first string that appears between
 | 
			
		||||
# two vertical bars ("pipes") in the header record.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# Develop the regular expression:
 | 
			
		||||
                      # Just five characters returned, so we know we are using
 | 
			
		||||
patt <- "^>(.{5})"    # the right functions
 | 
			
		||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
 | 
			
		||||
 | 
			
		||||
patt <- "^>(.*)|"    # everything to the pipe character
 | 
			
		||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
 | 
			
		||||
 | 
			
		||||
# Ooops - "|" is a metacharacter - we must escape it
 | 
			
		||||
 | 
			
		||||
patt <- "^>(.*)\|"    # using "\|"
 | 
			
		||||
# Ooops - that's not how we escape: must double the \ to send a literal
 | 
			
		||||
# "\" plus the character "|" to the regex engine.
 | 
			
		||||
 | 
			
		||||
patt <- "^>(.*)\\|"    # using "\\|"
 | 
			
		||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
 | 
			
		||||
 | 
			
		||||
# Good. Now let's first match everything that is not a "|", then match a "|"
 | 
			
		||||
patt <- "^>([^|]*)\\|"
 | 
			
		||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
 | 
			
		||||
 | 
			
		||||
# the same thing again, but capture the second match. And insist that there
 | 
			
		||||
# must be at least one character captured
 | 
			
		||||
 | 
			
		||||
patt <- "^>[^|]*\\|([^|]+)\\|"
 | 
			
		||||
# Analyze this pattern:
 | 
			
		||||
#    ^           anchor the match at the beginning of the line
 | 
			
		||||
#    >           ">" must be the first character
 | 
			
		||||
#    [^|]*       all-characters-except-a-vertical-bar, 0 or more times because
 | 
			
		||||
#                  we don't know what other versions of the string "sp"
 | 
			
		||||
#                  might appear. Note that within the brackets "|" is NOT a
 | 
			
		||||
#                  metacharacter.
 | 
			
		||||
#    \\|         "|" character: outside of square brackets "|" is a metacharacter
 | 
			
		||||
#                  and means "OR"; we need to escape it to match a literal "|".
 | 
			
		||||
#    (           open parenthesis: capture what comes next ...
 | 
			
		||||
#       [^|]+    all-characters-except-a-vertical-bar, 1 or more times
 | 
			
		||||
#    )           close parenthesis: stop capturing here
 | 
			
		||||
#    \\|           second "|" character, escaped
 | 
			
		||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Counting lines  ======================================================
 | 
			
		||||
 | 
			
		||||
# Task: Write a function that returns the number of atoms in a PDB file. Call it
 | 
			
		||||
#       atomCount(). Sample data is here:
 | 
			
		||||
myPDB <- readLines("./data/0TST.pdb")
 | 
			
		||||
 | 
			
		||||
#       Specification:
 | 
			
		||||
#       Read a file from its path given as the only argument.
 | 
			
		||||
#       Return the number of lines in that file that begin with "ATOM  "
 | 
			
		||||
#       or with "HETATM".
 | 
			
		||||
 | 
			
		||||
#       Try this. Write a function. Solution code is at the end of this file.
 | 
			
		||||
#       Don't peek.
 | 
			
		||||
 | 
			
		||||
atomCount("./data/0TST.pdb")  # must return 6
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Counting C-alpha atoms only  =======================================
 | 
			
		||||
 | 
			
		||||
# Task: write a function based on the previous one that matches only CA records,
 | 
			
		||||
#       i.e. it can be used to count the number of amino acids. Don't get
 | 
			
		||||
#       fooled by calcium atoms, or the string CA appearing elsewhere.
 | 
			
		||||
#       cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
 | 
			
		||||
 | 
			
		||||
#       Specification:
 | 
			
		||||
#       Read a file from its path given as the only argument.
 | 
			
		||||
#       Return the number of lines in that file that have a C-alpha atom.
 | 
			
		||||
 | 
			
		||||
#       Try this. Solution code is at the end of this file. Don't peek.
 | 
			
		||||
 | 
			
		||||
CAcount("./data/0TST.pdb")  # must return 1
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Code Solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Counting atoms  ====================================================
 | 
			
		||||
 | 
			
		||||
atomCount <- function(IN) {
  # Count the ATOM and HETATM records of a PDB formatted file.
  # Parameters:
  #     IN  chr  path of the file to read
  # Value:
  #         integer  number of lines that start with "ATOM  " or "HETATM"
  # Note: both alternatives of the regex carry a "^" anchor, so a record
  #       name is only counted at the very beginning of a line and not when
  #       the same text appears later in the line, e.g. in a comment.
  isAtomRecord <- grepl("(^ATOM  )|(^HETATM)", readLines(IN))
  sum(isAtomRecord)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.2  Counting C-alpha records  ==========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
CAcount <- function(IN) {
  # Count the C-alpha ATOM records of a PDB formatted file.
  # Parameters:
  #     IN  chr  path of the file to read
  # Value:
  #         integer  number of lines that begin with "ATOM  " and have " CA "
  #                  in positions 13 - 16.
  # Note: the six "." wildcards skip positions 7 - 12, which pins " CA " to
  #       positions 13 - 16. Without this alignment the pattern might match
  #       Calcium records, or "CA" appearing elsewhere in the line.
  isCalpha <- grepl("^ATOM  ...... CA ", readLines(IN))
  sum(isCalpha)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "RPR-RegEx.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose: A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-RegEx unit
 | 
			
		||||
#
 | 
			
		||||
# Version: 1.0
 | 
			
		||||
#
 | 
			
		||||
# Date:    2017-08  -  2020-09
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# V 1.0    Maintenance 2020
 | 
			
		||||
# V 0.1    First code
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
 | 
			
		||||
#
 | 
			
		||||
# DO NOT SIMPLY  source()  THESE FILES!
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
#  going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC>   Section  Title                                Line
 | 
			
		||||
#TOC> ----------------------------------------------------
 | 
			
		||||
#TOC>   1        A regex example                        41
 | 
			
		||||
#TOC>   2        Counting lines                        108
 | 
			
		||||
#TOC>   2.1        Counting C-alpha atoms only         126
 | 
			
		||||
#TOC>   3        Code Solutions                        142
 | 
			
		||||
#TOC>   3.1        Counting atoms                      144
 | 
			
		||||
#TOC>   3.2        Counting C-alpha records            160
 | 
			
		||||
#TOC>
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  A regex example  =====================================================
 | 
			
		||||
 | 
			
		||||
# The canonical FASTA version of yeast Mbp1 at Uniprot
 | 
			
		||||
s <- ">sp|P39678|MBP1_YEAST Transcription factor MBP1 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) GN=MBP1 PE=1 SV=1
 | 
			
		||||
MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLK
 | 
			
		||||
ETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHA
 | 
			
		||||
SKVDRKKAIRSASTSAIMETKRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRR
 | 
			
		||||
KLGVNLQRSQSDMGFPRPAIPNSSISTTQLPSIRSTMGPQSPTLGILEEERHDSRQQQPQ
 | 
			
		||||
QNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQQSSLIQTQQTESMATSVSSS
 | 
			
		||||
PSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKVNKYLSKLVDY
 | 
			
		||||
FISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS
 | 
			
		||||
IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTP
 | 
			
		||||
SAVYYLDVVLSKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTT
 | 
			
		||||
ISNKEGLTANEIMNQQYEQMMIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSP
 | 
			
		||||
VSPSDYITYPSQIATNISRNIPNVVNSMKQMASIYNDLHEQHDNEIKSLQKTLKSISKTK
 | 
			
		||||
IQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTKKLRKRLIRYKRLIKQKLEYR
 | 
			
		||||
QTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSSLVKKFEDNAK
 | 
			
		||||
IHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"
 | 
			
		||||
 | 
			
		||||
nchar(s)
 | 
			
		||||
# Must be 969
 | 
			
		||||
 | 
			
		||||
# Task: Fetch the Uniprot ID by retrieving the first string that appears between
 | 
			
		||||
# two vertical bars ("pipes") in the header record.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# Develop the regular expression:
 | 
			
		||||
                      # Just five characters returned, so we know we are using
 | 
			
		||||
patt <- "^>(.{5})"    # the right functions
 | 
			
		||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
 | 
			
		||||
 | 
			
		||||
patt <- "^>(.*)|"    # everything to the pipe character
 | 
			
		||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
 | 
			
		||||
 | 
			
		||||
# Ooops - "|" is a metacharacter - we must escape it
 | 
			
		||||
 | 
			
		||||
patt <- "^>(.*)\|"    # using "\|"
 | 
			
		||||
# Ooops - that's not how we escape: must double the \ to send a literal
 | 
			
		||||
# "\" plus the character "|" to the regex engine.
 | 
			
		||||
 | 
			
		||||
patt <- "^>(.*)\\|"    # using "\\|"
 | 
			
		||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
 | 
			
		||||
 | 
			
		||||
# Good. Now let's first match everything that is not a "|", then match a "|"
 | 
			
		||||
patt <- "^>([^|]*)\\|"
 | 
			
		||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
 | 
			
		||||
 | 
			
		||||
# the same thing again, but capture the second match. And insist that there
 | 
			
		||||
# must be at least one character captured
 | 
			
		||||
 | 
			
		||||
patt <- "^>[^|]*\\|([^|]+)\\|"
 | 
			
		||||
# Analyze this pattern:
 | 
			
		||||
#    ^           anchor the match at the beginning of the line
 | 
			
		||||
#    >           ">" must be the first character
 | 
			
		||||
#    [^|]*       all-characters-except-a-vertical-bar, 0 or more times because
 | 
			
		||||
#                  we don't know what other versions of the string "sp"
 | 
			
		||||
#                  might appear. Note that within the brackets "|" is NOT a
 | 
			
		||||
#                  metacharacter.
 | 
			
		||||
#    \\|         "|" character: outside of square brackets "|" is a metacharacter
 | 
			
		||||
#                  and means "OR"; we need to escape it to match a literal "|".
 | 
			
		||||
#    (           open parenthesis: capture what comes next ...
 | 
			
		||||
#       [^|]+    all-characters-except-a-vertical-bar, 1 or more times
 | 
			
		||||
#    )           close parenthesis: stop capturing here
 | 
			
		||||
#    \\|           second "|" character, escaped
 | 
			
		||||
regmatches(s, regexec(patt, s, perl = TRUE))[[1]][2]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Counting lines  ======================================================
 | 
			
		||||
 | 
			
		||||
# Task: Write a function that returns the number of atoms in a PDB file. Call it
 | 
			
		||||
#       atomCount(). Sample data is here:
 | 
			
		||||
myPDB <- readLines("./data/0TST.pdb")
 | 
			
		||||
 | 
			
		||||
#       Specification:
 | 
			
		||||
#       Read a file from its path given as the only argument.
 | 
			
		||||
#       Return the number of lines in that file that begin with "ATOM  "
 | 
			
		||||
#       or with "HETATM".
 | 
			
		||||
 | 
			
		||||
#       Try this. Write a function. Solution code is at the end of this file.
 | 
			
		||||
#       Don't peek.
 | 
			
		||||
 | 
			
		||||
atomCount("./data/0TST.pdb")  # must return 6
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Counting C-alpha atoms only  =======================================
 | 
			
		||||
 | 
			
		||||
# Task: write a function based on the previous one that matches only CA records,
 | 
			
		||||
#       i.e. it can be used to count the number of amino acids. Don't get
 | 
			
		||||
#       fooled by calcium atoms, or the string CA appearing elsewhere.
 | 
			
		||||
#       cf. https://www.wwpdb.org/documentation/file-format-content/format33/sect9.html#ATOM
 | 
			
		||||
 | 
			
		||||
#       Specification:
 | 
			
		||||
#       Read a file from its path given as the only argument.
 | 
			
		||||
#       Return the number of lines in that file that have a C-alpha atom.
 | 
			
		||||
 | 
			
		||||
#       Try this. Solution code is at the end of this file. Don't peek.
 | 
			
		||||
 | 
			
		||||
CAcount("./data/0TST.pdb")  # must return 1
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Code Solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Counting atoms  ====================================================
 | 
			
		||||
 | 
			
		||||
atomCount <- function(IN) {
  # Count the ATOM and HETATM records of a PDB formatted file.
  # Parameters:
  #     IN  chr  path of the file to read
  # Value:
  #         integer  number of lines that start with "ATOM  " or "HETATM"
  # Note: both alternatives of the regex carry a "^" anchor, so a record
  #       name is only counted at the very beginning of a line and not when
  #       the same text appears later in the line, e.g. in a comment.
  isAtomRecord <- grepl("(^ATOM  )|(^HETATM)", readLines(IN))
  sum(isAtomRecord)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.2  Counting C-alpha records  ==========================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
CAcount <- function(IN) {
  # Count the C-alpha ATOM records of a PDB formatted file.
  # Parameters:
  #     IN  chr  path of the file to read
  # Value:
  #         integer  number of lines that begin with "ATOM  " and have " CA "
  #                  in positions 13 - 16.
  # Note: the six "." wildcards skip positions 7 - 12, which pins " CA " to
  #       positions 13 - 16. Without this alignment the pattern might match
  #       Calcium records, or "CA" appearing elsewhere in the line.
  isCalpha <- grepl("^ATOM  ...... CA ", readLines(IN))
  sum(isCalpha)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										1658
									
								
								RPR-SX-PDB.R
									
									
									
									
									
								
							
							
						
						
									
										1658
									
								
								RPR-SX-PDB.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,135 +1,135 @@
 | 
			
		||||
# tocID <- "RPR-UniProt_GET.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Scripting_data_downloads unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
 | 
			
		||||
#                  added FASTA headers as attribute
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.0    First ABC units version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                      Line
 | 
			
		||||
#TOC> ----------------------------------------------------------
 | 
			
		||||
#TOC>   1        UniProt files via GET                        43
 | 
			
		||||
#TOC>   1.1        Task - fetchUniProtSeq() function         105
 | 
			
		||||
#TOC>   2        Task solutions                              118
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  UniProt files via GET  ===============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Perhaps the simplest example of scripted download is to retrieve a protein
 | 
			
		||||
# FASTA sequence from UniProt. All we need is to construct an URL with the
 | 
			
		||||
# correct UniProt ID.
 | 
			
		||||
 | 
			
		||||
# An interface between R scripts and Web servers is provided by the httr::
 | 
			
		||||
# package. This sends and receives information via the http protocol, just like
 | 
			
		||||
# a Web browser. Since this is a short and simple request, the GET verb is the
 | 
			
		||||
# right tool:
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("httr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = httr)       # basic information
 | 
			
		||||
#  browseVignettes("httr")    # available vignettes
 | 
			
		||||
#  data(package = "httr")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The UniProt ID for Mbp1 is ...
 | 
			
		||||
 | 
			
		||||
UniProtID <- "P39678"
 | 
			
		||||
 | 
			
		||||
# and the base URL to retrieve data is  ...
 | 
			
		||||
# http://www.uniprot.org/uniprot/ . We can construct a simple URL to
 | 
			
		||||
# retrieve a FASTA sequence:
 | 
			
		||||
 | 
			
		||||
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
 | 
			
		||||
 | 
			
		||||
# the GET() function from httr will get the data.
 | 
			
		||||
response <- httr::GET(URL)
 | 
			
		||||
 | 
			
		||||
str(response) # the response object is a bit complex ...
 | 
			
		||||
as.character(response) # ... but it is easy to pull out the data.
 | 
			
		||||
 | 
			
		||||
# to process  ...
 | 
			
		||||
x <- as.character(response)
 | 
			
		||||
x <- strsplit(x, "\n")
 | 
			
		||||
dbSanitizeSequence(x)
 | 
			
		||||
 | 
			
		||||
# Simple.
 | 
			
		||||
# But what happens if there is an error, e.g. the uniprot ID does not exist?
 | 
			
		||||
 | 
			
		||||
response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
 | 
			
		||||
as.character(response)
 | 
			
		||||
# this is a large HTML page that tells us the URL was not found. So we need to
 | 
			
		||||
# check for errors.  The Right Way to do this is to evaluate the status code that
 | 
			
		||||
# every Web server returns for every transaction.
 | 
			
		||||
#
 | 
			
		||||
httr::status_code(response)  # 404 == Page Not Found
 | 
			
		||||
 | 
			
		||||
# There are many possible codes, but the only code we will be happy with
 | 
			
		||||
# is 200 - oK.
 | 
			
		||||
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
 | 
			
		||||
 | 
			
		||||
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
 | 
			
		||||
response <- httr::GET(URL)
 | 
			
		||||
httr::status_code(response)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Task - fetchUniProtSeq() function  =================================
 | 
			
		||||
 | 
			
		||||
# Task: write a function that
 | 
			
		||||
#   - takes as input a vector of UniProt IDs,
 | 
			
		||||
#   - fetches the FASTA sequence for each
 | 
			
		||||
#   - returns a vector of the same length as the input, where an element is:
 | 
			
		||||
#   -  ...  the sequence, if the query was successful
 | 
			
		||||
#   -  ...  NA if there was an error
 | 
			
		||||
#   - each element has the UniProt ID as the name()
 | 
			
		||||
#   - bonus: the output has an attribute "headers" that is a vector of the
 | 
			
		||||
#            FASTA headers ( cf. ?attr )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Task solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# I have placed such a function - dbFetchUniProtSeq() - into
 | 
			
		||||
# "./scripts/ABC-dbUtilities.R": look it up by clicking on  dbFetchUniProtSeq()
 | 
			
		||||
# in the Environment pane.
 | 
			
		||||
 | 
			
		||||
# Test this:
 | 
			
		||||
( x <- dbFetchUniProtSeq("P39678") )
 | 
			
		||||
names(x)[1]
 | 
			
		||||
attr(x, "headers")[1]
 | 
			
		||||
x[1]
 | 
			
		||||
cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq  =x[1]),
 | 
			
		||||
               width = 40), sep = "\n")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "RPR-UniProt_GET.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Scripting_data_downloads unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Maintenance. Made dbFetchUniProtSeq() vector-safe and
 | 
			
		||||
#                  added FASTA headers as attribute
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.0    First ABC units version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                      Line
 | 
			
		||||
#TOC> ----------------------------------------------------------
 | 
			
		||||
#TOC>   1        UniProt files via GET                        43
 | 
			
		||||
#TOC>   1.1        Task - fetchUniProtSeq() function         105
 | 
			
		||||
#TOC>   2        Task solutions                              118
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  UniProt files via GET  ===============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Perhaps the simplest example of scripted download is to retrieve a protein
 | 
			
		||||
# FASTA sequence from UniProt. All we need is to construct an URL with the
 | 
			
		||||
# correct UniProt ID.
 | 
			
		||||
 | 
			
		||||
# An interface between R scripts and Web servers is provided by the httr::
 | 
			
		||||
# package. This sends and receives information via the http protocol, just like
 | 
			
		||||
# a Web browser. Since this is a short and simple request, the GET verb is the
 | 
			
		||||
# right tool:
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("httr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = httr)       # basic information
 | 
			
		||||
#  browseVignettes("httr")    # available vignettes
 | 
			
		||||
#  data(package = "httr")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# The UniProt ID for Mbp1 is ...
 | 
			
		||||
 | 
			
		||||
UniProtID <- "P39678"
 | 
			
		||||
 | 
			
		||||
# and the base URL to retrieve data is  ...
 | 
			
		||||
# http://www.uniprot.org/uniprot/ . We can construct a simple URL to
 | 
			
		||||
# retrieve a FASTA sequence:
 | 
			
		||||
 | 
			
		||||
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
 | 
			
		||||
 | 
			
		||||
# the GET() function from httr will get the data.
 | 
			
		||||
response <- httr::GET(URL)
 | 
			
		||||
 | 
			
		||||
str(response) # the response object is a bit complex ...
 | 
			
		||||
as.character(response) # ... but it is easy to pull out the data.
 | 
			
		||||
 | 
			
		||||
# to process  ...
 | 
			
		||||
x <- as.character(response)
 | 
			
		||||
x <- strsplit(x, "\n")
 | 
			
		||||
dbSanitizeSequence(x)
 | 
			
		||||
 | 
			
		||||
# Simple.
 | 
			
		||||
# But what happens if there is an error, e.g. the uniprot ID does not exist?
 | 
			
		||||
 | 
			
		||||
response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
 | 
			
		||||
as.character(response)
 | 
			
		||||
# this is a large HTML page that tells us the URL was not found. So we need to
 | 
			
		||||
# check for errors.  The Right Way to do this is to evaluate the status code that
 | 
			
		||||
# every Web server returns for every transaction.
 | 
			
		||||
#
 | 
			
		||||
httr::status_code(response)  # 404 == Page Not Found
 | 
			
		||||
 | 
			
		||||
# There are many possible codes, but the only code we will be happy with
 | 
			
		||||
# is 200 - oK.
 | 
			
		||||
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
 | 
			
		||||
 | 
			
		||||
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
 | 
			
		||||
response <- httr::GET(URL)
 | 
			
		||||
httr::status_code(response)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Task - fetchUniProtSeq() function  =================================
 | 
			
		||||
 | 
			
		||||
# Task: write a function that
 | 
			
		||||
#   - takes as input a vector of UniProt IDs,
 | 
			
		||||
#   - fetches the FASTA sequence for each
 | 
			
		||||
#   - returns a vector of the same length as the input, where an element is:
 | 
			
		||||
#   -  ...  the sequence, if the query was successful
 | 
			
		||||
#   -  ...  NA if there was an error
 | 
			
		||||
#   - each element has the UniProt ID as the name()
 | 
			
		||||
#   - bonus: the output has an attribute "headers" that is a vector of the
 | 
			
		||||
#            FASTA headers ( cf. ?attr )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Task solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# I have placed such a function - dbFetchUniProtSeq() - into
 | 
			
		||||
# "./scripts/ABC-dbUtilities.R": look it up by clicking on  dbFetchUniProtSeq()
 | 
			
		||||
# in the Environment pane.
 | 
			
		||||
 | 
			
		||||
# Test this:
 | 
			
		||||
( x <- dbFetchUniProtSeq("P39678") )
 | 
			
		||||
names(x)[1]
 | 
			
		||||
attr(x, "headers")[1]
 | 
			
		||||
x[1]
 | 
			
		||||
cat(writeFASTA(data.frame(head = attr(x, "headers")[1], seq  =x[1]),
 | 
			
		||||
               width = 40), sep = "\n")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,234 +1,234 @@
 | 
			
		||||
# tocID <- "RPR-Unit_testing.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Unit_testing unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017  10  -  2019  01
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Updates. Discuss local tests.
 | 
			
		||||
#           1.1    Change from require() to requireNamespace()
 | 
			
		||||
#           1.0    New code
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                             Line
 | 
			
		||||
#TOC> -------------------------------------------------
 | 
			
		||||
#TOC>   1        Unit Tests with testthat            42
 | 
			
		||||
#TOC>   2        Organizing your tests              165
 | 
			
		||||
#TOC>   2.1        Testing scripts                  189
 | 
			
		||||
#TOC>   2.2        Rethinking testing               202
 | 
			
		||||
#TOC>   3        Task solutions                     220
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Unit Tests with testthat  ============================================
 | 
			
		||||
 | 
			
		||||
# The testthat package supports writing and executing unit tests in many ways.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("testthat", quietly = TRUE)) {
 | 
			
		||||
  install.packages("testthat")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = testthat)       # basic information
 | 
			
		||||
#  browseVignettes("testthat")    # available vignettes
 | 
			
		||||
#  data(package = "testthat")     # available datasets
 | 
			
		||||
 | 
			
		||||
# testthat is one of those packages that we either use A LOT in a script,
 | 
			
		||||
# or not at all. Therefore it's more reasonable to depart from our usual
 | 
			
		||||
# <package>::<function>() idiom, and load the entire library. In fact, if
 | 
			
		||||
# we author packages, it is common practice to load testthat in the part
 | 
			
		||||
# of the package that automates testing.
 | 
			
		||||
 | 
			
		||||
library(testthat)
 | 
			
		||||
 | 
			
		||||
# An atomic test consists of an expectation about the behaviour of a function or
 | 
			
		||||
# the existence of an object. testthat provides a number of useful expectations:
 | 
			
		||||
 | 
			
		||||
# At the most basic level, you can use expect_true() and expect_false():
 | 
			
		||||
 | 
			
		||||
expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa"))
 | 
			
		||||
expect_true(file.exists("NO-SUCH-FILE.txt"))
 | 
			
		||||
 | 
			
		||||
expect_false(is.integer(NA))
 | 
			
		||||
 | 
			
		||||
# More commonly, you will test for equality of an output with a given result.
 | 
			
		||||
# But you need to consider what it means for two numbers to be "equal" on a
 | 
			
		||||
# digital computer. Consider:
 | 
			
		||||
 | 
			
		||||
49*(1/49) == 1      # Surprised? Read FAQ 7.31
 | 
			
		||||
                    # https://cran.r-project.org/doc/FAQ/R-FAQ.html
 | 
			
		||||
49*(1/49) - 1       # NOT zero (but almost)
 | 
			
		||||
 | 
			
		||||
# This is really unpredictable ...
 | 
			
		||||
0.1 + 0.05 == 0.15
 | 
			
		||||
0.2 + 0.07 == 0.27
 | 
			
		||||
 | 
			
		||||
# It's easy to be caught on the wrong foot with numeric comparisons, therefore
 | 
			
		||||
# R uses the function all.equal() to test whether two numbers are equal for
 | 
			
		||||
# practical purposes up to machine precision.
 | 
			
		||||
49*(1/49) == 1
 | 
			
		||||
all.equal(49*(1/49), 1)
 | 
			
		||||
 | 
			
		||||
# The testthat function expect_equal() uses all.equal internally:
 | 
			
		||||
expect_equal(49*(1/49), 1)
 | 
			
		||||
 | 
			
		||||
# ... which is reasonable, or, if things MUST be exactly the same ...
 | 
			
		||||
expect_identical(49*(1/49), 1)
 | 
			
		||||
 | 
			
		||||
# ... but consider:
 | 
			
		||||
expect_identical(2, 2L) # one is typeof() "double", the other is "integer"
 | 
			
		||||
 | 
			
		||||
# Some very useful expectations are expect_warning(), and expect_error(), for
 | 
			
		||||
# constructing tests that check for erroneous output:
 | 
			
		||||
 | 
			
		||||
as.integer(c("1", "2", "three"))
 | 
			
		||||
expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT
 | 
			
		||||
                                                 # printed.
 | 
			
		||||
1/"x"
 | 
			
		||||
expect_warning(1/"x")
 | 
			
		||||
expect_error(1/"x")      # Again: note that the error is NOT printed, as well
 | 
			
		||||
                         # code execution will continue.
 | 
			
		||||
 | 
			
		||||
# Even better, you can check if the warning or error is what you expect it
 | 
			
		||||
# to be - because it could actually have occurred somewhere else in your code.
 | 
			
		||||
 | 
			
		||||
v <- c("1", "x")
 | 
			
		||||
log(v[1:2])
 | 
			
		||||
expect_error(log(v[1:2]), "non-numeric argument to mathematical function")
 | 
			
		||||
expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message.
 | 
			
		||||
expect_error(log(v[1,2]))                # This appears oK, but ...
 | 
			
		||||
expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!
 | 
			
		||||
 | 
			
		||||
# Producing unit tests simply means: we define a function, and then we check
 | 
			
		||||
# whether all tests pass. Consider a function that is loaded on startup from
 | 
			
		||||
# the .utilities.R script:
 | 
			
		||||
 | 
			
		||||
biCode
 | 
			
		||||
 | 
			
		||||
# We could test it like so:
 | 
			
		||||
 | 
			
		||||
expect_equal(biCode(""), ".....")
 | 
			
		||||
expect_equal(biCode(" "), ".....")
 | 
			
		||||
expect_equal(biCode("123 12"), ".....")
 | 
			
		||||
expect_equal(biCode("h sapiens"), "H..SA")
 | 
			
		||||
expect_equal(biCode("homo sapiens"), "HOMSA")
 | 
			
		||||
expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
 | 
			
		||||
expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
 | 
			
		||||
             c("PHACI", "MACRU"))
 | 
			
		||||
expect_error(biCode(), "argument \"s\" is missing, with no default")
 | 
			
		||||
 | 
			
		||||
# The test_that() function allows to group related tests, include an informative
 | 
			
		||||
# message which test is being executed, and run a number of tests that are
 | 
			
		||||
# passed to the function inside a code block - i.e. {...}
 | 
			
		||||
# test_that("<descriptive string>", {<code block>})
 | 
			
		||||
 | 
			
		||||
test_that("NA values are preserved", {
 | 
			
		||||
  # biCode() respects vector length: input and output must have the same length.
 | 
			
		||||
  # Therefore NA's can't be simply skipped, but must be properly passed
 | 
			
		||||
  # into output:
 | 
			
		||||
  expect_true(is.na((biCode(NA))))
 | 
			
		||||
  expect_equal(biCode(c("first", NA, "last")),
 | 
			
		||||
               c("FIRST", NA, "LAST."))
 | 
			
		||||
})
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: Write a function calcGC() that calculates GC content in a sequence.
 | 
			
		||||
#       Hint: you could strsplit() the sequence into a vector, and count
 | 
			
		||||
#       G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove
 | 
			
		||||
#       A's and T's, and use nchar() before and after to calculate the content
 | 
			
		||||
#       from the length difference.
 | 
			
		||||
#       Then write tests that:
 | 
			
		||||
#          confirm that calcGC("AATT") is 0;
 | 
			
		||||
#          confirm that calcGC("ATGC") is 0.5;
 | 
			
		||||
#          confirm that calcGC("AC")   is 0.5;
 | 
			
		||||
#          confirm that calcGC("CGCG") is 1;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Organizing your tests  ===============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Tests are only useful if they are actually executed and we need to make sure
 | 
			
		||||
# there are no barriers to do that. The testthat package supports automatic
 | 
			
		||||
# execution of tests:
 | 
			
		||||
#  - put your tests into an R-script,
 | 
			
		||||
#  - save your tests in a file called "test_<my-function-name>.R"
 | 
			
		||||
#  - execute the test with test_file("test_<my-function-name>.R") ...
 | 
			
		||||
#  ... or, if you are working on a project ...
 | 
			
		||||
#  - place the file in a test-directory (e.g. the directory "test" in this
 | 
			
		||||
#      project),
 | 
			
		||||
#  - execute all your tests with test_dir("<my-test-directory>")
 | 
			
		||||
 | 
			
		||||
# For example I have provided a "tests" directory with this project, and
 | 
			
		||||
# placed the file "test_biCode.R" inside.
 | 
			
		||||
file.show("./tests/test_biCode.R")
 | 
			
		||||
 | 
			
		||||
# Execute the file ...
 | 
			
		||||
test_file("./tests/test_biCode.R")
 | 
			
		||||
 | 
			
		||||
# .. or execute all the test files in the directory:
 | 
			
		||||
test_dir("./tests")
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Testing scripts  ===================================================
 | 
			
		||||
 | 
			
		||||
# Scripts need special consideration since we do not necessarily source() them
 | 
			
		||||
# entirely. Therefore automated testing is not reasonable. What you can do
 | 
			
		||||
# instead is to place a conditional block at the end of your script, that
 | 
			
		||||
# never gets executed - then you can manually execute the code in the block
 | 
			
		||||
# whenever you wish to test your functions. For example:
 | 
			
		||||
 | 
			
		||||
if (FALSE) {
 | 
			
		||||
  # ... your tests go here
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# ==   2.2  Rethinking testing  ================================================
 | 
			
		||||
 | 
			
		||||
# However, it is important to keep in mind that different objectives lead to
 | 
			
		||||
# different ideas of what works best. There is never a "best" in and of itself,
 | 
			
		||||
# the question is always: "Best for what?" While automated unit testing is a
 | 
			
		||||
# great way to assure the integrity of packages and larger software artefacts as
 | 
			
		||||
# they are being developed, more loosely conceived aggregates of code - like the
 | 
			
		||||
# scripts for this course for example - have different objectives and in this
 | 
			
		||||
# case I find the testthat approach to actually be inferior. The reason is its
 | 
			
		||||
# tendency to physically separate code and tests. Keeping assets, and functions
 | 
			
		||||
# that operate on those assets separated is always poor design. I have found
 | 
			
		||||
# over time that a more stable approach is to move individual functions into
 | 
			
		||||
# their individual scripts, all in one folder, one function (and its helpers)
 | 
			
		||||
# per file, and examples, demos and tests in an if (FALSE) { ... } block, as
 | 
			
		||||
# explained above.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Task solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
# calcGC() - fraction of G and C among the nucleotides of a sequence.
#
# Parameters:
#   s   chr   one or more sequence strings. Every character that is not one of
#             A, C, G, T (upper or lower case) is discarded before counting.
# Value:
#       num   for each element of s: (number of G and C) / (number of A, C, G
#             and T). If no A, C, G or T remains, the ratio is 0/0, i.e. NaN.
calcGC <- function(s) {
  # Keep only the four nucleotide letters ...
  nucleotides <- gsub("[^agctAGCT]", "", s)
  # ... then drop A and T, so that only G and C are left.
  gcOnly <- gsub("[atAT]", "", nucleotides)
  nGC  <- nchar(gcOnly)
  nAll <- nchar(nucleotides)
  return(nGC / nAll)
}
 | 
			
		||||
 | 
			
		||||
expect_equal(calcGC("AATT"), 0)
 | 
			
		||||
expect_equal(calcGC("ATGC"), 0.5)
 | 
			
		||||
expect_equal(calcGC("AC"),   0.5)
 | 
			
		||||
expect_equal(calcGC("CGCG"), 1)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "RPR-Unit_testing.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Unit_testing unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017  10  -  2019  01
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Updates. Discuss local tests.
 | 
			
		||||
#           1.1    Change from require() to requireNamespace()
 | 
			
		||||
#           1.0    New code
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                             Line
 | 
			
		||||
#TOC> -------------------------------------------------
 | 
			
		||||
#TOC>   1        Unit Tests with testthat            42
 | 
			
		||||
#TOC>   2        Organizing your tests              165
 | 
			
		||||
#TOC>   2.1        Testing scripts                  189
 | 
			
		||||
#TOC>   2.2        Rethinking testing               202
 | 
			
		||||
#TOC>   3        Task solutions                     220
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Unit Tests with testthat  ============================================
 | 
			
		||||
 | 
			
		||||
# The testthat package supports writing and executing unit tests in many ways.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("testthat", quietly = TRUE)) {
 | 
			
		||||
  install.packages("testthat")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = testthat)       # basic information
 | 
			
		||||
#  browseVignettes("testthat")    # available vignettes
 | 
			
		||||
#  data(package = "testthat")     # available datasets
 | 
			
		||||
 | 
			
		||||
# testthat is one of those packages that we either use A LOT in a script,
 | 
			
		||||
# or not at all. Therefore it's more reasonable to depart from our usual
 | 
			
		||||
# <package>::<function>() idiom, and load the entire library. In fact, if
 | 
			
		||||
# we author packages, it is common practice to load testthat in the part
 | 
			
		||||
# of the package that automates testing.
 | 
			
		||||
 | 
			
		||||
library(testthat)
 | 
			
		||||
 | 
			
		||||
# An atomic test consists of an expectation about the behaviour of a function or
 | 
			
		||||
# the existence of an object. testthat provides a number of useful expectations:
 | 
			
		||||
 | 
			
		||||
# At the most basic level, you can use expect_true() and expect_false():
 | 
			
		||||
 | 
			
		||||
expect_true(file.exists("./data/S288C_YDL056W_MBP1_coding.fsa"))
 | 
			
		||||
expect_true(file.exists("NO-SUCH-FILE.txt"))
 | 
			
		||||
 | 
			
		||||
expect_false(is.integer(NA))
 | 
			
		||||
 | 
			
		||||
# More commonly, you will test for equality of an output with a given result.
 | 
			
		||||
# But you need to consider what it means for two numbers to be "equal" on a
 | 
			
		||||
# digital computer. Consider:
 | 
			
		||||
 | 
			
		||||
49*(1/49) == 1      # Surprised? Read FAQ 7.31
 | 
			
		||||
                    # https://cran.r-project.org/doc/FAQ/R-FAQ.html
 | 
			
		||||
49*(1/49) - 1       # NOT zero (but almost)
 | 
			
		||||
 | 
			
		||||
# This is really unpredictable ...
 | 
			
		||||
0.1 + 0.05 == 0.15
 | 
			
		||||
0.2 + 0.07 == 0.27
 | 
			
		||||
 | 
			
		||||
# It's easy to be caught on the wrong foot with numeric comparisons, therefore
 | 
			
		||||
# R uses the function all.equal() to test whether two numbers are equal for
 | 
			
		||||
# practical purposes up to machine precision.
 | 
			
		||||
49*(1/49) == 1
 | 
			
		||||
all.equal(49*(1/49), 1)
 | 
			
		||||
 | 
			
		||||
# The testthat function expect_equal() uses all.equal internally:
 | 
			
		||||
expect_equal(49*(1/49), 1)
 | 
			
		||||
 | 
			
		||||
# ... which is reasonable, or, if things MUST be exactly the same ...
 | 
			
		||||
expect_identical(49*(1/49), 1)
 | 
			
		||||
 | 
			
		||||
# ... but consider:
 | 
			
		||||
expect_identical(2, 2L) # one is typeof() "double", the other is "integer"
 | 
			
		||||
 | 
			
		||||
# Some very useful expectations are expect_warning(), and expect_error(), for
 | 
			
		||||
# constructing tests that check for erroneous output:
 | 
			
		||||
 | 
			
		||||
as.integer(c("1", "2", "three"))
 | 
			
		||||
expect_warning(as.integer(c("1", "2", "three"))) # Note that the warning is NOT
 | 
			
		||||
                                                 # printed.
 | 
			
		||||
1/"x"
 | 
			
		||||
expect_warning(1/"x")
 | 
			
		||||
expect_error(1/"x")      # Again: note that the error is NOT printed, as well
 | 
			
		||||
                         # code execution will continue.
 | 
			
		||||
 | 
			
		||||
# Even better, you can check if the warning or error is what you expect it
 | 
			
		||||
# to be - because it could actually have occurred somewhere else in your code.
 | 
			
		||||
 | 
			
		||||
v <- c("1", "x")
 | 
			
		||||
log(v[1:2])
 | 
			
		||||
expect_error(log(v[1:2]), "non-numeric argument to mathematical function")
 | 
			
		||||
expect_error(log(v[1:2]), "non-numeric") # We can abbreviate the error message.
 | 
			
		||||
expect_error(log(v[1,2]))                # This appears OK, but ...
 | 
			
		||||
expect_error(log(v[1,2]), "non-numeric") # ... it's actually a different error!
 | 
			
		||||
 | 
			
		||||
# Producing unit tests simply means: we define a function, and then we check
 | 
			
		||||
# whether all tests pass. Consider a function that is loaded on startup from
 | 
			
		||||
# the .utilities.R script:
 | 
			
		||||
 | 
			
		||||
biCode
 | 
			
		||||
 | 
			
		||||
# We could test it like so:
 | 
			
		||||
 | 
			
		||||
expect_equal(biCode(""), ".....")
 | 
			
		||||
expect_equal(biCode(" "), ".....")
 | 
			
		||||
expect_equal(biCode("123 12"), ".....")
 | 
			
		||||
expect_equal(biCode("h sapiens"), "H..SA")
 | 
			
		||||
expect_equal(biCode("homo sapiens"), "HOMSA")
 | 
			
		||||
expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
 | 
			
		||||
expect_equal(biCode(c("Phascolarctos cinereus", "Macropus rufus")),
 | 
			
		||||
             c("PHACI", "MACRU"))
 | 
			
		||||
expect_error(biCode(), "argument \"s\" is missing, with no default")
 | 
			
		||||
 | 
			
		||||
# The test_that() function allows to group related tests, include an informative
 | 
			
		||||
# message which test is being executed, and run a number of tests that are
 | 
			
		||||
# passed to the function inside a code block - i.e. {...}
 | 
			
		||||
# test_that("<descriptive string>", {<code block>})
 | 
			
		||||
 | 
			
		||||
test_that("NA values are preserved", {
 | 
			
		||||
  # biCode() respects vector length: input and output must have the same length.
 | 
			
		||||
  # Therefore NA's can't be simply skipped, but must be properly passed
 | 
			
		||||
  # into output:
 | 
			
		||||
  expect_true(is.na((biCode(NA))))
 | 
			
		||||
  expect_equal(biCode(c("first", NA, "last")),
 | 
			
		||||
               c("FIRST", NA, "LAST."))
 | 
			
		||||
})
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Task: Write a function calcGC() that calculates GC content in a sequence.
 | 
			
		||||
#       Hint: you could strsplit() the sequence into a vector, and count
 | 
			
		||||
#       G's and C's; or you could use gsub("[AT]", "", <sequence>) to remove
 | 
			
		||||
#       A's and T's, and use nchar() before and after to calculate the content
 | 
			
		||||
#       from the length difference.
 | 
			
		||||
#       Then write tests that:
 | 
			
		||||
#          confirm that calcGC("AATT") is 0;
 | 
			
		||||
#          confirm that calcGC("ATGC") is 0.5;
 | 
			
		||||
#          confirm that calcGC("AC")   is 0.5;
 | 
			
		||||
#          confirm that calcGC("CGCG") is 1;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Organizing your tests  ===============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Tests are only useful if they are actually executed and we need to make sure
 | 
			
		||||
# there are no barriers to do that. The testthat package supports automatic
 | 
			
		||||
# execution of tests:
 | 
			
		||||
#  - put your tests into an R-script,
 | 
			
		||||
#  - save your tests in a file called "test_<my-function-name>.R"
 | 
			
		||||
#  - execute the test with test_file("test_<my-function-name>.R") ...
 | 
			
		||||
#  ... or, if you are working on a project ...
 | 
			
		||||
#  - place the file in a test-directory (e.g. the directory "tests" in this
 | 
			
		||||
#      project),
 | 
			
		||||
#  - execute all your tests with test_dir("<my-test-directory>")
 | 
			
		||||
 | 
			
		||||
# For example I have provided a "tests" directory with this project, and
 | 
			
		||||
# placed the file "test_biCode.R" inside.
 | 
			
		||||
file.show("./tests/test_biCode.R")
 | 
			
		||||
 | 
			
		||||
# Execute the file ...
 | 
			
		||||
test_file("./tests/test_biCode.R")
 | 
			
		||||
 | 
			
		||||
# .. or execute all the test files in the directory:
 | 
			
		||||
test_dir("./tests")
 | 
			
		||||
 | 
			
		||||
# ==   2.1  Testing scripts  ===================================================
 | 
			
		||||
 | 
			
		||||
# Scripts need special consideration since we do not necessarily source() them
 | 
			
		||||
# entirely. Therefore automated testing is not reasonable. What you can do
 | 
			
		||||
# instead is to place a conditional block at the end of your script, that
 | 
			
		||||
# never gets executed - then you can manually execute the code in the block
 | 
			
		||||
# whenever you wish to test your functions. For example:
 | 
			
		||||
 | 
			
		||||
if (FALSE) {
 | 
			
		||||
  # ... your tests go here
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# ==   2.2  Rethinking testing  ================================================
 | 
			
		||||
 | 
			
		||||
# However, it is important to keep in mind that different objectives lead to
 | 
			
		||||
# different ideas of what works best. There is never a "best" in and of itself,
 | 
			
		||||
# the question is always: "Best for what?" While automated unit testing is a
 | 
			
		||||
# great way to assure the integrity of packages and larger software artefacts as
 | 
			
		||||
# they are being developed, more loosely conceived aggregates of code - like the
 | 
			
		||||
# scripts for this course for example - have different objectives and in this
 | 
			
		||||
# case I find the testthat approach to actually be inferior. The reason is its
 | 
			
		||||
# tendency to physically separate code and tests. Keeping assets, and functions
 | 
			
		||||
# that operate on those assets separated is always poor design. I have found
 | 
			
		||||
# over time that a more stable approach is to move individual functions into
 | 
			
		||||
# their individual scripts, all in one folder, one function (and its helpers)
 | 
			
		||||
# per file, and examples, demos and tests in an if (FALSE) { ... } block, as
 | 
			
		||||
# explained above.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Task solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
calcGC <- function(s) {
  # Purpose: calculate the GC content of the sequence(s) in s.
  # Parameters:
  #     s   chr   one or more nucleotide sequences
  # Value:
  #     num   for each element of s, the number of G and C characters divided
  #           by the number of A, C, G and T characters (upper or lower case).
  #           All other characters are ignored.

  # Keep only the characters a, c, g, t (either case) ...
  nucleotides <- gsub("[^agctAGCT]", "", s)

  # ... then drop a and t, so that only g and c remain.
  gcOnly <- gsub("[atAT]", "", nucleotides)

  nchar(gcOnly) / nchar(nucleotides)
}
 | 
			
		||||
 | 
			
		||||
expect_equal(calcGC("AATT"), 0)
 | 
			
		||||
expect_equal(calcGC("ATGC"), 0.5)
 | 
			
		||||
expect_equal(calcGC("AC"),   0.5)
 | 
			
		||||
expect_equal(calcGC("CGCG"), 1)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										332
									
								
								RPR-eUtils_XML.R
									
									
									
									
									
								
							
							
						
						
									
										332
									
								
								RPR-eUtils_XML.R
									
									
									
									
									
								
							@@ -1,166 +1,166 @@
 | 
			
		||||
# tocID <- "RPR-eUtils_XML.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Scripting_data_downloads unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2.1
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2021-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2.1  2021 Maintenance
 | 
			
		||||
#           1.2    2020 Updates
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.0    First ABC units version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                       Line
 | 
			
		||||
#TOC> -----------------------------------------------------------
 | 
			
		||||
#TOC>   1        Working with NCBI eUtils                      43
 | 
			
		||||
#TOC>   1.1        Task - fetchNCBItaxData() function         145
 | 
			
		||||
#TOC>   2        Task solutions                               152
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Working with NCBI eUtils  ============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# To begin, we load the xml2 package that contains functions
 | 
			
		||||
# we need to receive and parse html data. NCBI's eUtils send information in
 | 
			
		||||
# XML format so we need to be able to parse XML.
 | 
			
		||||
if (! requireNamespace("xml2", quietly=TRUE)) {
 | 
			
		||||
  install.packages("xml2")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = xml2)       # basic information
 | 
			
		||||
#  browseVignettes("xml2")    # available vignettes
 | 
			
		||||
#  data(package = "xml2")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We will walk through the process with the refSeqID
 | 
			
		||||
# of yeast Mbp1
 | 
			
		||||
refSeqID <- "NP_010227"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# First we build a query URL...
 | 
			
		||||
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Then we assemble an URL that will search for and get the
 | 
			
		||||
# unique, NCBI internal identifier,
 | 
			
		||||
# for our refSeqID...
 | 
			
		||||
URL <- paste(eUtilsBase,
 | 
			
		||||
             "esearch.fcgi?",     # ...using the esearch program
 | 
			
		||||
                                  # that finds an entry in an
 | 
			
		||||
                                  # NCBI database
 | 
			
		||||
             "db=protein",
 | 
			
		||||
             "&term=", refSeqID,
 | 
			
		||||
             sep="")
 | 
			
		||||
# Copy the URL and paste it into your browser to see
 | 
			
		||||
# what the response should look like.
 | 
			
		||||
URL
 | 
			
		||||
 | 
			
		||||
# To fetch a response in R, we use the function read_xml()
 | 
			
		||||
# with our URL as its argument.
 | 
			
		||||
( myXML <- xml2::read_xml(URL) )
 | 
			
		||||
 | 
			
		||||
# This is XML. We can take the response apart into
 | 
			
		||||
# its individual components with the as_list() function.
 | 
			
		||||
 | 
			
		||||
xml2::as_list(myXML)
 | 
			
		||||
 | 
			
		||||
# Note how the XML "tree" is represented as a list of
 | 
			
		||||
# lists of lists ...
 | 
			
		||||
# If we know exactly what element we are looking for,
 | 
			
		||||
# we can extract it from this structure:
 | 
			
		||||
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
 | 
			
		||||
 | 
			
		||||
# But this is not very robust, it would break with the
 | 
			
		||||
# slightest change that the NCBI makes to their data format -
 | 
			
		||||
# and the NCBI changes things A LOT!
 | 
			
		||||
 | 
			
		||||
# Somewhat more robust is to specify the type of element
 | 
			
		||||
# we want - it's the text contained in an <Id>...</Id>
 | 
			
		||||
# element, and use the XPath XML parsing language to
 | 
			
		||||
# retrieve it.
 | 
			
		||||
 | 
			
		||||
xml2::xml_find_all(myXML, "//Id") # returns a "node set"
 | 
			
		||||
 | 
			
		||||
xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
 | 
			
		||||
                                                  # of the node set
 | 
			
		||||
 | 
			
		||||
# We will need to do this more than once, so we write a function
 | 
			
		||||
# for it...
 | 
			
		||||
node2text <- function(doc, tag) {
  # Purpose: extract the text contents of all elements named <tag> from an
  #          XML response.
  # Parameters:
  #     doc   the XML document to search, e.g. the result of xml2::read_xml()
  #     tag   chr   the element name, without angle brackets
  # Value:
  #     chr   the contents of all matching elements, as a vector of strings

  # "//<tag>" is the XPath expression that selects matching elements anywhere
  # in the document.
  xml2::xml_text(xml2::xml_find_all(doc, paste0("//", tag)))
}
 | 
			
		||||
 | 
			
		||||
# using node2text() ...
 | 
			
		||||
(GID <- node2text(myXML, "Id"))
 | 
			
		||||
 | 
			
		||||
# The GI is the pivot for data requests at the
 | 
			
		||||
# NCBI.
 | 
			
		||||
 | 
			
		||||
# Let's first get the associated data for this GI
 | 
			
		||||
URL <- paste0(eUtilsBase,
 | 
			
		||||
              "esummary.fcgi?",
 | 
			
		||||
              "db=protein",
 | 
			
		||||
              "&id=",
 | 
			
		||||
              GID,
 | 
			
		||||
              "&version=2.0")
 | 
			
		||||
(myXML <- xml2::read_xml(URL))
 | 
			
		||||
 | 
			
		||||
(taxID <- node2text(myXML, "TaxId"))
 | 
			
		||||
(organism <- node2text(myXML, "Organism"))
 | 
			
		||||
 | 
			
		||||
#  This forms the base of a function that gets taxonomy data
 | 
			
		||||
#  from an Entrez result. You can write this!
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Task - fetchNCBItaxData() function  ================================
 | 
			
		||||
 | 
			
		||||
# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
 | 
			
		||||
# information, returns a list with taxID and organism, if the operation is
 | 
			
		||||
# successful, or a list of length 0 if there is an error.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Task solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
# I have placed such a function into the dbUtilities script: look it up by
 | 
			
		||||
# clicking on  dbFetchNCBItaxData() in the Environment pane.
 | 
			
		||||
 | 
			
		||||
# Test:
 | 
			
		||||
dbFetchNCBItaxData("XP_001837394")
 | 
			
		||||
 | 
			
		||||
# Expected output:
 | 
			
		||||
# ----------------
 | 
			
		||||
# taxID                         organism
 | 
			
		||||
# 1 240176 Coprinopsis cinerea okayama7#130
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "RPR-eUtils_XML.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  A Bioinformatics Course:
 | 
			
		||||
#              R code accompanying the RPR-Scripting_data_downloads unit.
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2.1
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2021-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2.1  2021 Maintenance
 | 
			
		||||
#           1.2    2020 Updates
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.0    First ABC units version
 | 
			
		||||
#           0.1    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# == DO NOT SIMPLY  source()  THIS FILE! =======================================
 | 
			
		||||
#
 | 
			
		||||
# If there are portions you don't understand, use R's help system, Google for an
 | 
			
		||||
# answer, or ask your instructor. Don't continue if you don't understand what's
 | 
			
		||||
# going on. That's not how it works ...
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                       Line
 | 
			
		||||
#TOC> -----------------------------------------------------------
 | 
			
		||||
#TOC>   1        Working with NCBI eUtils                      43
 | 
			
		||||
#TOC>   1.1        Task - fetchNCBItaxData() function         145
 | 
			
		||||
#TOC>   2        Task solutions                               152
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Working with NCBI eUtils  ============================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# To begin, we load the xml2 package that contains functions
 | 
			
		||||
# we need to receive and parse html data. NCBI's eUtils send information in
 | 
			
		||||
# XML format so we need to be able to parse XML.
 | 
			
		||||
if (! requireNamespace("xml2", quietly=TRUE)) {
 | 
			
		||||
  install.packages("xml2")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = xml2)       # basic information
 | 
			
		||||
#  browseVignettes("xml2")    # available vignettes
 | 
			
		||||
#  data(package = "xml2")     # available datasets
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We will walk through the process with the refSeqID
 | 
			
		||||
# of yeast Mbp1
 | 
			
		||||
refSeqID <- "NP_010227"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# First we build a query URL...
 | 
			
		||||
eUtilsBase <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Then we assemble an URL that will search for and get the
 | 
			
		||||
# unique, NCBI internal identifier,
 | 
			
		||||
# for our refSeqID...
 | 
			
		||||
URL <- paste(eUtilsBase,
 | 
			
		||||
             "esearch.fcgi?",     # ...using the esearch program
 | 
			
		||||
                                  # that finds an entry in an
 | 
			
		||||
                                  # NCBI database
 | 
			
		||||
             "db=protein",
 | 
			
		||||
             "&term=", refSeqID,
 | 
			
		||||
             sep="")
 | 
			
		||||
# Copy the URL and paste it into your browser to see
 | 
			
		||||
# what the response should look like.
 | 
			
		||||
URL
 | 
			
		||||
 | 
			
		||||
# To fetch a response in R, we use the function read_xml()
 | 
			
		||||
# with our URL as its argument.
 | 
			
		||||
( myXML <- xml2::read_xml(URL) )
 | 
			
		||||
 | 
			
		||||
# This is XML. We can take the response apart into
 | 
			
		||||
# its individual components with the as_list() function.
 | 
			
		||||
 | 
			
		||||
xml2::as_list(myXML)
 | 
			
		||||
 | 
			
		||||
# Note how the XML "tree" is represented as a list of
 | 
			
		||||
# lists of lists ...
 | 
			
		||||
# If we know exactly what element we are looking for,
 | 
			
		||||
# we can extract it from this structure:
 | 
			
		||||
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
 | 
			
		||||
 | 
			
		||||
# But this is not very robust, it would break with the
 | 
			
		||||
# slightest change that the NCBI makes to their data format -
 | 
			
		||||
# and the NCBI changes things A LOT!
 | 
			
		||||
 | 
			
		||||
# Somewhat more robust is to specify the type of element
 | 
			
		||||
# we want - it's the text contained in an <Id>...</Id>
 | 
			
		||||
# element, and use the XPath XML parsing language to
 | 
			
		||||
# retrieve it.
 | 
			
		||||
 | 
			
		||||
xml2::xml_find_all(myXML, "//Id") # returns a "node set"
 | 
			
		||||
 | 
			
		||||
xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
 | 
			
		||||
                                                  # of the node set
 | 
			
		||||
 | 
			
		||||
# We will need to do this more than once, so we write a function
 | 
			
		||||
# for it...
 | 
			
		||||
node2text <- function(doc, tag) {
  # Purpose: extract the text contents of all elements named <tag> from an
  #          XML response.
  # Parameters:
  #     doc   the XML document to search, e.g. the result of xml2::read_xml()
  #     tag   chr   the element name, without angle brackets
  # Value:
  #     chr   the contents of all matching elements, as a vector of strings

  # "//<tag>" is the XPath expression that selects matching elements anywhere
  # in the document.
  xml2::xml_text(xml2::xml_find_all(doc, paste0("//", tag)))
}
 | 
			
		||||
 | 
			
		||||
# using node2text() ...
 | 
			
		||||
(GID <- node2text(myXML, "Id"))
 | 
			
		||||
 | 
			
		||||
# The GI is the pivot for data requests at the
 | 
			
		||||
# NCBI.
 | 
			
		||||
 | 
			
		||||
# Let's first get the associated data for this GI
 | 
			
		||||
URL <- paste0(eUtilsBase,
 | 
			
		||||
              "esummary.fcgi?",
 | 
			
		||||
              "db=protein",
 | 
			
		||||
              "&id=",
 | 
			
		||||
              GID,
 | 
			
		||||
              "&version=2.0")
 | 
			
		||||
(myXML <- xml2::read_xml(URL))
 | 
			
		||||
 | 
			
		||||
(taxID <- node2text(myXML, "TaxId"))
 | 
			
		||||
(organism <- node2text(myXML, "Organism"))
 | 
			
		||||
 | 
			
		||||
#  This forms the base of a function that gets taxonomy data
 | 
			
		||||
#  from an Entrez result. You can write this!
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   1.1  Task - fetchNCBItaxData() function  ================================
 | 
			
		||||
 | 
			
		||||
# Task: write a function that takes as input a RefSeq ID, fetches the taxonomy
 | 
			
		||||
# information, returns a list with taxID and organism, if the operation is
 | 
			
		||||
# successful, or a list of length 0 if there is an error.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Task solutions  ======================================================
 | 
			
		||||
 | 
			
		||||
# I have placed such a function into the dbUtilities script: look it up by
 | 
			
		||||
# clicking on  dbFetchNCBItaxData() in the Environment pane.
 | 
			
		||||
 | 
			
		||||
# Test:
 | 
			
		||||
dbFetchNCBItaxData("XP_001837394")
 | 
			
		||||
 | 
			
		||||
# Expected output:
 | 
			
		||||
# ----------------
 | 
			
		||||
# taxID                         organism
 | 
			
		||||
# 1 240176 Coprinopsis cinerea okayama7#130
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,10 +1,10 @@
 | 
			
		||||
HEADER   TEST                                                 0TST      0TST   1
 | 
			
		||||
REMARK     A CATALOGUE OF ATOM AND HETATM RECORDS                       0TST   2
 | 
			
		||||
ATOM      1  N   GLY     1      -6.253  75.745  53.559  1.00 36.34      0TST   3
 | 
			
		||||
ATOM      2  CA  GLY     1      -5.789  75.223  52.264  1.00 44.94      0TST   4
 | 
			
		||||
ATOM      3  C   GLY     1      -5.592  73.702  52.294  1.00 32.28      0TST   5
 | 
			
		||||
ATOM      4  O   GLY     1      -5.140  73.148  53.304  1.00 19.32      0TST   6
 | 
			
		||||
TER       5      GLY     1                                              0TST   7
 | 
			
		||||
HETATM    6  O   HOH     1      -4.169  60.050  40.145  1.00  3.00      0TST   8
 | 
			
		||||
HETATM    7 CA   CA      1      -1.258 -71.579  50.253  1.00  3.00      0TST   9
 | 
			
		||||
END                                                                     0TST  10
 | 
			
		||||
HEADER   TEST                                                 0TST      0TST   1
 | 
			
		||||
REMARK     A CATALOGUE OF ATOM AND HETATM RECORDS                       0TST   2
 | 
			
		||||
ATOM      1  N   GLY     1      -6.253  75.745  53.559  1.00 36.34      0TST   3
 | 
			
		||||
ATOM      2  CA  GLY     1      -5.789  75.223  52.264  1.00 44.94      0TST   4
 | 
			
		||||
ATOM      3  C   GLY     1      -5.592  73.702  52.294  1.00 32.28      0TST   5
 | 
			
		||||
ATOM      4  O   GLY     1      -5.140  73.148  53.304  1.00 19.32      0TST   6
 | 
			
		||||
TER       5      GLY     1                                              0TST   7
 | 
			
		||||
HETATM    6  O   HOH     1      -4.169  60.050  40.145  1.00  3.00      0TST   8
 | 
			
		||||
HETATM    7 CA   CA      1      -1.258 -71.579  50.253  1.00  3.00      0TST   9
 | 
			
		||||
END                                                                     0TST  10
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										3104
									
								
								data/1BM8.pdb
									
									
									
									
									
								
							
							
						
						
									
										3104
									
								
								data/1BM8.pdb
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,5 +1,5 @@
 | 
			
		||||
>2F1C:X|PDBID|CHAIN|SEQUENCE
 | 
			
		||||
EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN
 | 
			
		||||
DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT
 | 
			
		||||
FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY
 | 
			
		||||
>2F1C:X|PDBID|CHAIN|SEQUENCE
 | 
			
		||||
EERNDWHFNIGAMYEIENVEGYGEDMDGLAEPSVYFNAANGPWRIALAYYQEGPVDYSAGKRGTWFDRPELEVHYQFLEN
 | 
			
		||||
DDFSFGLTGGFRNYGYHYVDEPGKDTANMQRWKIAPDWDVKLTDDLRFNGWLSMYKFANDLNTTGYADTRVETETGLQYT
 | 
			
		||||
FNETVALRVNYYLERGFNMDDSRNNGEFSTQEIRAYLPLTLGNHSVTPYTRIGLDRWSNWDWQDDIEREGHDFNRVGLFY
 | 
			
		||||
GYDFQNGLSVSLEYAFEWQDHDEGDSDKFHYAGVGVNYSFHHHHHH
 | 
			
		||||
							
								
								
									
										12
									
								
								data/3FG7.fa
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								data/3FG7.fa
									
									
									
									
									
								
							@@ -1,6 +1,6 @@
 | 
			
		||||
>3FG7:A|PDBID|CHAIN|SEQUENCE
 | 
			
		||||
MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK
 | 
			
		||||
WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM
 | 
			
		||||
VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT
 | 
			
		||||
ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD
 | 
			
		||||
QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN
 | 
			
		||||
>3FG7:A|PDBID|CHAIN|SEQUENCE
 | 
			
		||||
MAEEHHHHHHHHLEVLFQGPGRPKTHTVGSVAKVEQVKFDATSMHVKPQVAAQQKMVDDGSGEVQVWRIENLELVPVDSK
 | 
			
		||||
WLGHFYGGDCYLLLYTYLIGEKQHYLLYVWQGSQASQDEITASAYQAVILDQKYNGEPVQIRVPMGKEPPHLMSIFKGRM
 | 
			
		||||
VVYQGGTSRTNNLETGPSTRLFQVQGTGANNTKAFEVPARANFLNSNDVFVLKTQSCCYLWCGKGCSGDEREMAKMVADT
 | 
			
		||||
ISRTEKQVVVEGQEPANFWMALGGKAPYANTKRLQEENLVITPRLFECSNKTGRFLATEIPDFNQDDLEEDDVFLLDVWD
 | 
			
		||||
QVFFWIGKHANEEEKKAAATTAQEYLKTHPSGRDPETPIIVVKQGHEPPTFTGWFLAWDPFKWSGIHVVPNLSPLSNN
 | 
			
		||||
 
 | 
			
		||||
@@ -1,20 +1,20 @@
 | 
			
		||||
[
 | 
			
		||||
  { "name" : "MBP1_SACCE",
 | 
			
		||||
    "RefSeqID" : "NP_010227",
 | 
			
		||||
    "UniProtID" : "P39678",
 | 
			
		||||
    "taxonomyID" : 559292,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF",
 | 
			
		||||
       "GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET",
 | 
			
		||||
       "KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL",
 | 
			
		||||
       "PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ",
 | 
			
		||||
       "QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV",
 | 
			
		||||
       "NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
 | 
			
		||||
       "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL",
 | 
			
		||||
       "SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM",
 | 
			
		||||
       "MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ",
 | 
			
		||||
       "MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK",
 | 
			
		||||
       "KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS",
 | 
			
		||||
       "LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"]
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
[
 | 
			
		||||
  { "name" : "MBP1_SACCE",
 | 
			
		||||
    "RefSeqID" : "NP_010227",
 | 
			
		||||
    "UniProtID" : "P39678",
 | 
			
		||||
    "taxonomyID" : 559292,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MSNQIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRILEKEVLKETHEKVQGGF",
 | 
			
		||||
       "GKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASPPPAPKHHHASKVDRKKAIRSASTSAIMET",
 | 
			
		||||
       "KRNNKKAEENQFQSSKILGNPTAAPRKRGRPVGSTRGSRRKLGVNLQRSQSDMGFPRPAIPNSSISTTQL",
 | 
			
		||||
       "PSIRSTMGPQSPTLGILEEERHDSRQQQPQQNNSAQFKEIDLEDGLSSDVEPSQQLQQVFNQNTGFVPQQ",
 | 
			
		||||
       "QSSLIQTQQTESMATSVSSSPSLPTSPGDFADSNPFEERFPGGGTSPIISMIPRYPVTSRPQTSDINDKV",
 | 
			
		||||
       "NKYLSKLVDYFISNEMKSNKSLPQVLLHPPPHSAPYIDAPIDPELHTAFHWACSMGNLPIAEALYEAGTS",
 | 
			
		||||
       "IRSTNSQGQTPLMRSSLFHNSYTRRTFPRIFQLLHETVFDIDSQSQTVIHHIVKRKSTTPSAVYYLDVVL",
 | 
			
		||||
       "SKIKDFSPQYRIELLLNTQDKNGDTALHIASKNGDVVFFNTLVKMGALTTISNKEGLTANEIMNQQYEQM",
 | 
			
		||||
       "MIQNGTNQHVNSSNTDLNIHVNTNNIETKNDVNSMVIMSPVSPSDYITYPSQIATNISRNIPNVVNSMKQ",
 | 
			
		||||
       "MASIYNDLHEQHDNEIKSLQKTLKSISKTKIQVSLKTLEVLKESSKDENGEAQTNDDFEILSRLQEQNTK",
 | 
			
		||||
       "KLRKRLIRYKRLIKQKLEYRQTVLLNKLIEDETQATTNNTVEKDNNTLERLELAQELTMLQLQRKNKLSS",
 | 
			
		||||
       "LVKKFEDNAKIHKYRRIIREGTEMNIEEVDSSLDVILQTLIANNNKNKGAEQIITISNANSHA"]
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,30 +1,30 @@
 | 
			
		||||
>PTPN5-201 cds:protein_coding (ENST00000358540.7)
 | 
			
		||||
ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA
 | 
			
		||||
GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG
 | 
			
		||||
GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT
 | 
			
		||||
CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC
 | 
			
		||||
TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT
 | 
			
		||||
GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC
 | 
			
		||||
TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG
 | 
			
		||||
TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC
 | 
			
		||||
CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC
 | 
			
		||||
AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG
 | 
			
		||||
ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG
 | 
			
		||||
ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG
 | 
			
		||||
GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG
 | 
			
		||||
GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC
 | 
			
		||||
GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG
 | 
			
		||||
CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT
 | 
			
		||||
GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT
 | 
			
		||||
CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG
 | 
			
		||||
GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC
 | 
			
		||||
GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC
 | 
			
		||||
ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC
 | 
			
		||||
GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC
 | 
			
		||||
TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC
 | 
			
		||||
GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG
 | 
			
		||||
GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT
 | 
			
		||||
GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT
 | 
			
		||||
GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG
 | 
			
		||||
ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC
 | 
			
		||||
CACCAGTCCCCAGAATGA
 | 
			
		||||
>PTPN5-201 cds:protein_coding (ENST00000358540.7)
 | 
			
		||||
ATGAATTATGAGGGAGCCAGGAGTGAGAGAGAGAACCACGCTGCTGATGACTCCGAGGGA
 | 
			
		||||
GGGGCCCTGGACATGTGCTGCAGTGAGAGGCTACCGGGTCTCCCCCAGCCGATAGTGATG
 | 
			
		||||
GAGGCACTGGACGAGGCTGAAGGGCTCCAGGACTCACAGAGAGAGATGCCGCCACCCCCT
 | 
			
		||||
CCTCCCTCGCCGCCCTCAGATCCAGCTCAGAAGCCACCACCTCGAGGCGCTGGGAGCCAC
 | 
			
		||||
TCCCTCACTGTCAGGAGCAGCCTGTGCCTGTTCGCTGCCTCACAGTTCCTGCTTGCCTGT
 | 
			
		||||
GGGGTGCTCTGGTTCAGCGGTTATGGCCACATCTGGTCACAGAACGCCACAAACCTCGTC
 | 
			
		||||
TCCTCTTTGCTGACGCTCCTGAAACAGCTGGAACCCACGGCCTGGCTTGACTCTGGGACG
 | 
			
		||||
TGGGGAGTCCCCAGTCTGCTGCTGGTCTTTCTGTCCGTGGGCCTGGTCCTCGTTACCACC
 | 
			
		||||
CTGGTGTGGCACCTCCTGAGGACACCCCCAGAGCCACCCACCCCACTGCCCCCTGAGGAC
 | 
			
		||||
AGGCGCCAGTCAGTGAGCCGCCAGCCCTCCTTCACCTACTCAGAGTGGATGGAGGAGAAG
 | 
			
		||||
ATCGAGGATGACTTCCTGGACCTCGACCCGGTGCCCGAGACTCCTGTGTTTGATTGTGTG
 | 
			
		||||
ATGGACATCAAGCCTGAGGCTGACCCCACCTCACTCACCGTCAAGTCCATGGGTCTGCAG
 | 
			
		||||
GAGAGGAGGGGTTCCAATGTCTCCCTGACCCTGGACATGTGCACTCCGGGCTGCAACGAG
 | 
			
		||||
GAGGGCTTTGGCTATCTCATGTCCCCACGTGAGGAGTCCGCCCGCGAGTACCTGCTCAGC
 | 
			
		||||
GCCTCCCGTGTCCTCCAAGCAGAAGAGCTTCATGAAAAGGCCCTGGACCCTTTCCTGCTG
 | 
			
		||||
CAGGCGGAATTCTTTGAAATCCCCATGAACTTTGTGGATCCGAAAGAGTACGACATCCCT
 | 
			
		||||
GGGCTGGTGCGGAAGAACCGGTACAAAACCATACTTCCCAACCCTCACAGCAGAGTGTGT
 | 
			
		||||
CTGACCTCACCAGACCCTGACGACCCTCTGAGTTCCTACATCAATGCCAACTACATCCGG
 | 
			
		||||
GGCTATGGTGGGGAGGAGAAGGTGTACATCGCCACTCAGGGACCCATCGTCAGCACGGTC
 | 
			
		||||
GCCGACTTCTGGCGCATGGTGTGGCAGGAGCACACGCCCATCATTGTCATGATCACCAAC
 | 
			
		||||
ATCGAGGAGATGAACGAGAAATGCACCGAGTATTGGCCGGAGGAGCAGGTGGCGTACGAC
 | 
			
		||||
GGTGTTGAGATCACTGTGCAGAAAGTCATTCACACGGAGGATTACCGGCTGCGACTCATC
 | 
			
		||||
TCCCTCAAGAGTGGGACTGAGGAGCGAGGCCTGAAGCATTACTGGTTCACATCCTGGCCC
 | 
			
		||||
GACCAGAAGACCCCAGACCGGGCCCCCCCACTCCTGCACCTGGTGCGGGAGGTGGAGGAG
 | 
			
		||||
GCAGCCCAGCAGGAGGGGCCCCACTGTGCCCCCATCATCGTCCACTGCAGTGCAGGGATT
 | 
			
		||||
GGGAGGACCGGCTGCTTCATTGCCACCAGCATCTGCTGCCAGCAGCTGCGGCAGGAGGGT
 | 
			
		||||
GTGGTGGACATCCTGAAGACCACGTGCCAGCTCCGTCAGGACAGGGGCGGCATGATCCAG
 | 
			
		||||
ACATGCGAGCAGTACCAGTTTGTGCACCACGTCATGAGCCTCTACGAAAAGCAGCTGTCC
 | 
			
		||||
CACCAGTCCCCAGAATGA
 | 
			
		||||
 
 | 
			
		||||
@@ -1,12 +1,12 @@
 | 
			
		||||
>RAB39B cds:protein_coding (ENST00000369454.4)
 | 
			
		||||
ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC
 | 
			
		||||
AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC
 | 
			
		||||
GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC
 | 
			
		||||
CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG
 | 
			
		||||
AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC
 | 
			
		||||
CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG
 | 
			
		||||
GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA
 | 
			
		||||
CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG
 | 
			
		||||
GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT
 | 
			
		||||
ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT
 | 
			
		||||
TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG
 | 
			
		||||
>RAB39B cds:protein_coding (ENST00000369454.4)
 | 
			
		||||
ATGGAGGCCATCTGGCTGTACCAGTTCCGGCTCATTGTCATCGGGGATTCCACAGTGGGC
 | 
			
		||||
AAGTCCTGCCTGATCCGCCGCTTCACCGAGGGTCGCTTTGCCCAGGTTTCTGACCCCACC
 | 
			
		||||
GTGGGGGTGGATTTTTTCTCCCGCTTGGTGGAGATCGAGCCAGGAAAACGCATCAAGCTC
 | 
			
		||||
CAGATCTGGGATACCGCGGGTCAAGAGAGGTTCAGATCCATCACTCGCGCCTACTACAGG
 | 
			
		||||
AACTCAGTAGGTGGTCTTCTCTTATTTGACATTACCAACCGCAGGTCCTTCCAGAATGTC
 | 
			
		||||
CATGAGTGGTTAGAAGAGACCAAAGTACACGTTCAGCCCTACCAAATTGTATTTGTTCTG
 | 
			
		||||
GTGGGTCACAAGTGTGACCTGGATACACAGAGGCAAGTGACTCGCCACGAGGCCGAGAAA
 | 
			
		||||
CTGGCTGCTGCATACGGCATGAAGTACATTGAAACGTCAGCCCGAGATGCCATTAATGTG
 | 
			
		||||
GAGAAAGCCTTCACAGACCTGACAAGAGACATATATGAGCTGGTTAAAAGGGGGGAGATT
 | 
			
		||||
ACAATCCAGGAGGGCTGGGAAGGGGTGAAGAGTGGATTTGTACCAAATGTGGTTCACTCT
 | 
			
		||||
TCAGAAGAGGTTGTCAAATCAGAGAGGAGATGTTTGTGCTAG
 | 
			
		||||
 
 | 
			
		||||
@@ -1,131 +1,131 @@
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```{css, echo = FALSE}
 | 
			
		||||
 | 
			
		||||
.striped tr:nth-child(even) {
 | 
			
		||||
  background: #eaf1ff;
 | 
			
		||||
}
 | 
			
		||||
.striped {
 | 
			
		||||
  padding: 5px;
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
<!-- Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020 -->
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```{r setup, include=FALSE}
 | 
			
		||||
knitr::opts_chunk$set(echo = TRUE)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Phobias! ##
 | 
			
		||||
We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>.
 | 
			
		||||
 | 
			
		||||
To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` package and the `xml2` package from CRAN, if we don't have them.
 | 
			
		||||
```{r packages}
 | 
			
		||||
# Make sure both scraping packages are available: for each one, ask
# requireNamespace() whether it can be loaded and install it if not.
invisible(lapply(c("rvest", "xml2"), function(pkg) {
  if (! requireNamespace(pkg, quietly=TRUE)) {
    install.packages(pkg)
  }
}))
 | 
			
		||||
```
 | 
			
		||||
As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable.
 | 
			
		||||
 | 
			
		||||
`xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames.
 | 
			
		||||
 | 
			
		||||
```{r getPageData, cache=TRUE}
 | 
			
		||||
webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias")
 | 
			
		||||
allTables <- rvest::html_table(webPage, fill = TRUE)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`.
 | 
			
		||||
 | 
			
		||||
```{r collateTables, cache=TRUE}
 | 
			
		||||
# Flag the scraped tables whose column names are exactly "Phobia" and
# "Condition". identical() is used instead of `==` because `==` recycles:
# a table with no columns, or with four columns that repeat the pair,
# would otherwise pass the test, and odd-width tables would raise warnings.
isPhobiaTable <- vapply(allTables,
                        function(tbl) {
                          identical(colnames(tbl), c("Phobia", "Condition"))
                        },
                        logical(1))

# Bind all matching tables in a single call rather than growing the frame
# inside a loop. The empty frame comes first so that the result is a
# well-formed two-column data frame even if no table matched.
phobiaTable <- do.call(rbind,
                       c(list(data.frame(Phobia = character(),
                                         Condition = character())),
                         allTables[isPhobiaTable]))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them.
 | 
			
		||||
 | 
			
		||||
<p> 
 | 
			
		||||
<p>
 | 
			
		||||
 | 
			
		||||
```{r , ref.label="randRow", echo=FALSE}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
**Table**: seven random phobias<br/>
 | 
			
		||||
```{r renderPhobiaTable, echo=FALSE, results='asis'}
 | 
			
		||||
# Pick up to seven distinct rows at random. Capping the sample size at the
# number of available rows keeps sample() from failing when fewer than
# seven phobias were collected; seq_len() stays correct for an empty table.
nRows <- nrow(phobiaTable)
sel <- sample(seq_len(nRows), min(7, nRows))
# Render the selection as an HTML table that carries the "striped" CSS class.
knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
<p> 
 | 
			
		||||
<p>
 | 
			
		||||
To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the number of rows in the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function.
 | 
			
		||||
 | 
			
		||||
```{r randRow}
 | 
			
		||||
randRow <- function(M, seed = FALSE) {
  # Return one randomly chosen row of the matrix or data frame M.
  #
  # Parameters:
  #   M     a matrix or data frame.
  #   seed  FALSE (the default): draw from the current RNG stream.
  #         Any other value is converted with as.integer() and handed to
  #         set.seed(), which makes the choice reproducible. In that case
  #         the caller's RNG state is put back when the function exits.
  # Value:
  #   M[i, ] for a random row index i. If M has no rows, a zero-row subset
  #   of M is returned and the RNG is not touched.

  if (nrow(M) < 1) {                       # nothing to sample from
    return(M[0, , drop = FALSE])
  }

  if (! isFALSE(seed)) {                   # note: 0 is a legitimate seed
    # .Random.seed lives in the global environment and does not exist
    # before the RNG is used for the first time. Assigning to it with a
    # plain `<-` inside a function would only create a local variable,
    # so the state is saved and restored explicitly.
    hadSeed <- exists(".Random.seed", envir = globalenv(), inherits = FALSE)
    if (hadSeed) {
      oldSeed <- get(".Random.seed", envir = globalenv(), inherits = FALSE)
    }
    set.seed(as.integer(seed))
    on.exit({
      if (hadSeed) {
        assign(".Random.seed", oldSeed, envir = globalenv())
      } else {
        rm(".Random.seed", envir = globalenv())   # back to "never used"
      }
    }, add = TRUE)
  }

  M[sample.int(nrow(M), 1), ]              # fetch one random row
}
 | 
			
		||||
```
 | 
			
		||||
<p> 
 | 
			
		||||
<p>
 | 
			
		||||
With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`.
 | 
			
		||||
 | 
			
		||||
_`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful.
 | 
			
		||||
 | 
			
		||||
<p> 
 | 
			
		||||
<p>
 | 
			
		||||
 | 
			
		||||
Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up.
 | 
			
		||||
 | 
			
		||||
```{r preProcess}
 | 
			
		||||
 | 
			
		||||
# Pass 1: keep the entries that are already a single word (no blank) ending
# in "phobia"; the leading "." requires at least one character before it.
sel <- grepl(".phobia$", phobiaTable$Phobia) & ! grepl(" ", phobiaTable$Phobia)
names <- phobiaTable$Phobia[sel]

# Pass 2: take every entry that pass 1 rejected and break it into pieces,
# splitting in turn on ", ", on blanks and on "/". Each step flattens the
# list that strsplit() returns back into a character vector.
x <- Reduce(function(s, sep) unlist(strsplit(s, sep)),
            c(", ", " ", "/"),
            phobiaTable$Phobia[! sel])

# Apply the identical filter to the pieces and add the survivors to "names".
sel <- grepl(".phobia$", x) & ! grepl(" ", x)
names <- c(names, x[sel])
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths.
 | 
			
		||||
 | 
			
		||||
```{r showHist}
 | 
			
		||||
 | 
			
		||||
# Number of characters in each collected name; this is what gets plotted.
x <- nchar(names)
pShort <- names[match(min(x), x)]   # first name that has the minimal length
pLong  <- names[match(max(x), x)]   # first name that has the maximal length

# Histogram of the name lengths; the subtitle reports both extremes
# together with their lengths.
hist(x,
     col = "#aef5ee",
     main = "Length of phobia-names",
     xlab = "name",
     ylab = "counts",
     cex.sub = 0.8,
     sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
                   pShort, nchar(pShort), pLong, nchar(pLong)))
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
That's all.
 | 
			
		||||
 | 
			
		||||
<!-- [END] -->
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```{css, echo = FALSE}
 | 
			
		||||
 | 
			
		||||
.striped tr:nth-child(even) {
 | 
			
		||||
  background: #eaf1ff;
 | 
			
		||||
}
 | 
			
		||||
.striped {
 | 
			
		||||
  padding: 5px;
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
<!-- Random Phobias - .Rmd sample code for BCH441 at the University of Toronto. (c) Boris Steipe 2020 -->
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```{r setup, include=FALSE}
 | 
			
		||||
knitr::opts_chunk$set(echo = TRUE)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Phobias! ##
 | 
			
		||||
We all have some, but we could always use more. How to know them all? With this code we access the [Wikipedia list of phobias](https://en.wikipedia.org/wiki/List_of_phobias), scrape the contents and assemble a dataframe. Then we write a function to retrieve a random phobia, which we can subsequently ponder on - either to delight in the fact that we don't have that fear, or to add to our daily quota of anxieties <small>(like our well-founded [fear of bad programming practice](http://xkcd.com/292/))</small>.
 | 
			
		||||
 | 
			
		||||
To load the list, we will "screenscrape" the contents of Wikipedia's [List of Phobias](https://en.wikipedia.org/wiki/List_of_phobias). First, we install the `rvest` package and the `xml2` package from CRAN, if we don't have them.
 | 
			
		||||
```{r packages}
 | 
			
		||||
# Make sure both scraping packages are available: for each one, ask
# requireNamespace() whether it can be loaded and install it if not.
invisible(lapply(c("rvest", "xml2"), function(pkg) {
  if (! requireNamespace(pkg, quietly=TRUE)) {
    install.packages(pkg)
  }
}))
 | 
			
		||||
```
 | 
			
		||||
As we customarily do, we avoid using the `library()` function to make the package contents accessible, but use the `package::` syntax instead. This makes our code more explicit and maintainable.
 | 
			
		||||
 | 
			
		||||
`xml2` handles reading and parsing of documents. The `rvest` package was designed for screenscraping and has functions to make our life very easy: it accesses the response of an `xml2` query, looks for all HTML formatted tables, parses them with an XPATH expression and returns them as lists from which we can get data frames.
 | 
			
		||||
 | 
			
		||||
```{r getPageData, cache=TRUE}
 | 
			
		||||
webPage <- xml2::read_html("https://en.wikipedia.org/wiki/List_of_phobias")
 | 
			
		||||
allTables <- rvest::html_table(webPage, fill = TRUE)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
There are ```r length(allTables)``` tables in the list, but the ones we are interested in are data frames with two columns named `Phobia` and `Condition`.
 | 
			
		||||
 | 
			
		||||
```{r collateTables, cache=TRUE}
 | 
			
		||||
# Flag the scraped tables whose column names are exactly "Phobia" and
# "Condition". identical() is used instead of `==` because `==` recycles:
# a table with no columns, or with four columns that repeat the pair,
# would otherwise pass the test, and odd-width tables would raise warnings.
isPhobiaTable <- vapply(allTables,
                        function(tbl) {
                          identical(colnames(tbl), c("Phobia", "Condition"))
                        },
                        logical(1))

# Bind all matching tables in a single call rather than growing the frame
# inside a loop. The empty frame comes first so that the result is a
# well-formed two-column data frame even if no table matched.
phobiaTable <- do.call(rbind,
                       c(list(data.frame(Phobia = character(),
                                         Condition = character())),
                         allTables[isPhobiaTable]))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Done, we collected ```r nrow(phobiaTable)``` phobias. Let's randomly select a few and print them.
 | 
			
		||||
 | 
			
		||||
<p> 
 | 
			
		||||
<p>
 | 
			
		||||
 | 
			
		||||
```{r , ref.label="randRow", echo=FALSE}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
**Table**: seven random phobias<br/>
 | 
			
		||||
```{r renderPhobiaTable, echo=FALSE, results='asis'}
 | 
			
		||||
# Pick up to seven distinct rows at random. Capping the sample size at the
# number of available rows keeps sample() from failing when fewer than
# seven phobias were collected; seq_len() stays correct for an empty table.
nRows <- nrow(phobiaTable)
sel <- sample(seq_len(nRows), min(7, nRows))
# Render the selection as an HTML table that carries the "striped" CSS class.
knitr::kable(phobiaTable[sel, ], table.attr = "class=\"striped\"", format = "html")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
<p> 
 | 
			
		||||
<p>
 | 
			
		||||
To pick a single random phobia from the list, we take a (pseudo) random sample of size 1 from the number of rows in the `phobiaTable` object. Our function thus returns a random row from a matrix or dataframe, and it uses an optional argument: `seed`. This can either be Boolean `FALSE` (the default), or an integer that is used in R's `set.seed()` function.
 | 
			
		||||
 | 
			
		||||
```{r randRow}
 | 
			
		||||
randRow <- function(M, seed = FALSE) {
  # Return one randomly chosen row of the matrix or data frame M.
  #
  # Parameters:
  #   M     a matrix or data frame.
  #   seed  FALSE (the default): draw from the current RNG stream.
  #         Any other value is converted with as.integer() and handed to
  #         set.seed(), which makes the choice reproducible. In that case
  #         the caller's RNG state is put back when the function exits.
  # Value:
  #   M[i, ] for a random row index i. If M has no rows, a zero-row subset
  #   of M is returned and the RNG is not touched.

  if (nrow(M) < 1) {                       # nothing to sample from
    return(M[0, , drop = FALSE])
  }

  if (! isFALSE(seed)) {                   # note: 0 is a legitimate seed
    # .Random.seed lives in the global environment and does not exist
    # before the RNG is used for the first time. Assigning to it with a
    # plain `<-` inside a function would only create a local variable,
    # so the state is saved and restored explicitly.
    hadSeed <- exists(".Random.seed", envir = globalenv(), inherits = FALSE)
    if (hadSeed) {
      oldSeed <- get(".Random.seed", envir = globalenv(), inherits = FALSE)
    }
    set.seed(as.integer(seed))
    on.exit({
      if (hadSeed) {
        assign(".Random.seed", oldSeed, envir = globalenv())
      } else {
        rm(".Random.seed", envir = globalenv())   # back to "never used"
      }
    }, add = TRUE)
  }

  M[sample.int(nrow(M), 1), ]              # fetch one random row
}
 | 
			
		||||
```
 | 
			
		||||
<p> 
 | 
			
		||||
<p>
 | 
			
		||||
With this useful tool we can ponder on our favourite phobia of the day. For today, let it be **`r randRow(phobiaTable, seed=1123581321)[2]`**, the `r randRow(phobiaTable, seed=1123581321)[1]`.
 | 
			
		||||
 | 
			
		||||
_`r randRow(phobiaTable, seed=1123581321)[1]`_! Really!!? Awful.
 | 
			
		||||
 | 
			
		||||
<p> 
 | 
			
		||||
<p>
 | 
			
		||||
 | 
			
		||||
Finally: let's plot a histogram of phobia name lengths just to illustrate plots. A little preprocessing is required, since some names collate synonyms, like _"Hypnophobia, somniphobia"_. We'll break these up.
 | 
			
		||||
 | 
			
		||||
```{r preProcess}
 | 
			
		||||
 | 
			
		||||
# Pass 1: keep the entries that are already a single word (no blank) ending
# in "phobia"; the leading "." requires at least one character before it.
sel <- grepl(".phobia$", phobiaTable$Phobia) & ! grepl(" ", phobiaTable$Phobia)
names <- phobiaTable$Phobia[sel]

# Pass 2: take every entry that pass 1 rejected and break it into pieces,
# splitting in turn on ", ", on blanks and on "/". Each step flattens the
# list that strsplit() returns back into a character vector.
x <- Reduce(function(s, sep) unlist(strsplit(s, sep)),
            c(", ", " ", "/"),
            phobiaTable$Phobia[! sel])

# Apply the identical filter to the pieces and add the survivors to "names".
sel <- grepl(".phobia$", x) & ! grepl(" ", x)
names <- c(names, x[sel])
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Done, we collected ```r length(names)``` names for phobias. Here is a histogram of their lengths.
 | 
			
		||||
 | 
			
		||||
```{r showHist}
 | 
			
		||||
 | 
			
		||||
# Number of characters in each collected name; this is what gets plotted.
x <- nchar(names)
pShort <- names[match(min(x), x)]   # first name that has the minimal length
pLong  <- names[match(max(x), x)]   # first name that has the maximal length

# Histogram of the name lengths; the subtitle reports both extremes
# together with their lengths.
hist(x,
     col = "#aef5ee",
     main = "Length of phobia-names",
     xlab = "name",
     ylab = "counts",
     cex.sub = 0.8,
     sub = sprintf("Shortest: %s (%d), Longest: %s (%d)",
                   pShort, nchar(pShort), pLong, nchar(pLong)))
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
That's all.
 | 
			
		||||
 | 
			
		||||
<!-- [END] -->
 | 
			
		||||
 
 | 
			
		||||
@@ -1,43 +1,43 @@
 | 
			
		||||
>MBP1 YDL056W SGDID:S000002214
 | 
			
		||||
ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
 | 
			
		||||
TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
 | 
			
		||||
AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
 | 
			
		||||
GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
 | 
			
		||||
AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
 | 
			
		||||
GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
 | 
			
		||||
TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
 | 
			
		||||
AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
 | 
			
		||||
CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
 | 
			
		||||
AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
 | 
			
		||||
CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
 | 
			
		||||
TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
 | 
			
		||||
CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
 | 
			
		||||
GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
 | 
			
		||||
CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
 | 
			
		||||
CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
 | 
			
		||||
CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
 | 
			
		||||
CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
 | 
			
		||||
TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
 | 
			
		||||
CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
 | 
			
		||||
TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
 | 
			
		||||
ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
 | 
			
		||||
TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
 | 
			
		||||
ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
 | 
			
		||||
TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
 | 
			
		||||
AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
 | 
			
		||||
TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
 | 
			
		||||
ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
 | 
			
		||||
ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
 | 
			
		||||
GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
 | 
			
		||||
GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
 | 
			
		||||
ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
 | 
			
		||||
CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
 | 
			
		||||
ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
 | 
			
		||||
GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
 | 
			
		||||
AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
 | 
			
		||||
CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
 | 
			
		||||
GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
 | 
			
		||||
CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
 | 
			
		||||
ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
 | 
			
		||||
AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
 | 
			
		||||
>MBP1 YDL056W SGDID:S000002214
 | 
			
		||||
ATGTCTAACCAAATATACTCAGCGAGATATTCGGGGGTTGATGTTTATGAATTCATTCAT
 | 
			
		||||
TCTACAGGATCTATCATGAAAAGGAAAAAGGATGATTGGGTCAATGCTACACATATTTTA
 | 
			
		||||
AAGGCCGCCAATTTTGCCAAGGCTAAAAGAACAAGGATTCTAGAGAAGGAAGTACTTAAG
 | 
			
		||||
GAAACTCATGAAAAAGTTCAGGGTGGATTTGGTAAATATCAGGGTACATGGGTCCCACTG
 | 
			
		||||
AACATAGCGAAACAACTGGCAGAAAAATTTAGTGTCTACGATCAGCTGAAACCGTTGTTC
 | 
			
		||||
GACTTTACGCAAACAGATGGGTCTGCTTCTCCACCTCCTGCTCCAAAACATCACCATGCC
 | 
			
		||||
TCGAAGGTGGATAGGAAAAAGGCTATTAGAAGTGCAAGTACTTCCGCAATTATGGAAACA
 | 
			
		||||
AAAAGAAACAACAAGAAAGCCGAGGAAAATCAATTTCAAAGCAGCAAAATATTGGGAAAT
 | 
			
		||||
CCCACGGCTGCACCAAGGAAAAGAGGTAGACCGGTAGGATCTACGAGGGGAAGTAGGCGG
 | 
			
		||||
AAGTTAGGTGTCAATTTACAACGTTCTCAAAGTGATATGGGATTTCCTAGACCGGCGATA
 | 
			
		||||
CCGAATTCTTCAATATCGACAACGCAACTTCCCTCTATTAGATCCACCATGGGACCACAA
 | 
			
		||||
TCCCCTACATTGGGTATTCTGGAAGAAGAAAGGCACGATTCTCGACAGCAGCAGCCGCAA
 | 
			
		||||
CAAAATAATTCTGCACAGTTCAAAGAAATTGATCTTGAGGACGGCTTATCAAGCGATGTG
 | 
			
		||||
GAACCTTCACAACAATTACAACAAGTTTTTAATCAAAATACTGGATTTGTACCCCAACAA
 | 
			
		||||
CAATCTTCCTTGATACAGACACAGCAAACAGAATCAATGGCCACGTCCGTATCTTCCTCT
 | 
			
		||||
CCTTCATTACCTACGTCACCGGGCGATTTTGCCGATAGTAATCCATTTGAAGAGCGATTT
 | 
			
		||||
CCCGGTGGTGGAACATCTCCTATTATTTCCATGATCCCGCGTTATCCTGTAACTTCAAGG
 | 
			
		||||
CCTCAAACATCGGATATTAATGATAAAGTTAACAAATACCTTTCAAAATTGGTTGATTAT
 | 
			
		||||
TTTATTTCCAATGAAATGAAGTCAAATAAGTCCCTACCACAAGTGTTATTGCACCCACCT
 | 
			
		||||
CCACACAGCGCTCCCTATATAGATGCTCCAATCGATCCAGAATTACATACTGCCTTCCAT
 | 
			
		||||
TGGGCTTGTTCTATGGGTAATTTACCAATTGCTGAGGCGTTGTACGAAGCCGGAACAAGT
 | 
			
		||||
ATCAGATCGACAAATTCTCAAGGCCAAACTCCATTGATGAGAAGTTCCTTATTCCACAAT
 | 
			
		||||
TCATACACTAGAAGAACTTTCCCTAGAATTTTCCAGCTACTGCACGAGACCGTATTTGAT
 | 
			
		||||
ATCGATTCGCAATCACAAACAGTAATTCACCATATTGTGAAACGAAAATCAACAACACCT
 | 
			
		||||
TCTGCAGTTTATTATCTTGATGTTGTGCTATCTAAGATCAAGGATTTTTCCCCACAGTAT
 | 
			
		||||
AGAATTGAATTACTTTTAAACACACAAGACAAAAATGGCGATACCGCACTTCATATTGCT
 | 
			
		||||
TCTAAAAATGGAGATGTTGTTTTTTTTAATACACTGGTCAAAATGGGTGCATTAACTACT
 | 
			
		||||
ATTTCCAATAAGGAAGGATTAACCGCCAATGAAATAATGAATCAACAATATGAGCAAATG
 | 
			
		||||
ATGATACAAAATGGTACAAATCAACATGTCAATTCTTCAAACACGGACTTGAATATCCAC
 | 
			
		||||
GTTAATACAAACAACATTGAAACGAAAAATGATGTTAATTCAATGGTAATCATGTCGCCT
 | 
			
		||||
GTTTCTCCTTCGGATTACATAACCTATCCATCTCAAATTGCCACCAATATATCAAGAAAT
 | 
			
		||||
ATTCCAAATGTAGTGAATTCTATGAAGCAAATGGCTAGCATATACAACGATCTTCATGAA
 | 
			
		||||
CAGCATGACAACGAAATAAAAAGTTTGCAAAAAACTTTAAAAAGCATTTCTAAGACGAAA
 | 
			
		||||
ATACAGGTAAGCCTAAAAACTTTAGAGGTATTGAAAGAGAGCAGTAAAGATGAAAACGGC
 | 
			
		||||
GAAGCTCAGACTAATGATGACTTCGAAATTTTATCTCGTCTACAAGAACAAAATACTAAG
 | 
			
		||||
AAATTGAGAAAAAGGCTCATACGATACAAACGGTTGATAAAACAAAAGCTGGAATACAGG
 | 
			
		||||
CAAACGGTTTTATTGAACAAATTAATAGAAGATGAAACTCAGGCTACCACCAATAACACA
 | 
			
		||||
GTTGAGAAAGATAATAATACGCTGGAAAGGTTGGAATTGGCTCAAGAACTAACGATGTTG
 | 
			
		||||
CAATTACAAAGGAAAAACAAATTGAGTTCCTTGGTGAAGAAATTTGAAGACAATGCCAAG
 | 
			
		||||
ATTCATAAATATAGACGGATTATCAGGGAAGGTACGGAAATGAATATTGAAGAAGTAGAT
 | 
			
		||||
AGTTCGCTGGATGTAATACTACAGACATTGATAGCCAACAATAATAAAAATAAGGGCGCA
 | 
			
		||||
GAACAGATCATCACAATCTCAAACGCGAATAGTCATGCATAA
 | 
			
		||||
@@ -1,47 +1,47 @@
 | 
			
		||||
SGD_features.tab
 | 
			
		||||
 | 
			
		||||
The latest version of the SGD_features.tab file is based on Genome Version R64-2-1.
 | 
			
		||||
 | 
			
		||||
The SGD_features.tab file is updated weekly (Saturday).
 | 
			
		||||
 | 
			
		||||
NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously
 | 
			
		||||
used chromosomal_feature.tab file.
 | 
			
		||||
 | 
			
		||||
File contents:
 | 
			
		||||
 | 
			
		||||
1. Information on current chromosomal features in SGD, including Dubious ORFs. 
 | 
			
		||||
Also contains coordinates of intron, exons, and other subfeatures that are located
 | 
			
		||||
within a chromosomal feature.
 | 
			
		||||
 | 
			
		||||
2. The relationship between subfeatures and the feature in which they
 | 
			
		||||
are located is identified by the feature name in column #7 (parent
 | 
			
		||||
feature). For example, the parent feature of the intron found in
 | 
			
		||||
ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is
 | 
			
		||||
chromosome 6.
 | 
			
		||||
 | 
			
		||||
3. The coordinates of all features are in chromosomal coordinates.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Columns within SGD_features.tab:
 | 
			
		||||
 | 
			
		||||
1.   Primary SGDID (mandatory)
 | 
			
		||||
2.   Feature type (mandatory)
 | 
			
		||||
3.   Feature qualifier (optional)
 | 
			
		||||
4.   Feature name (optional)
 | 
			
		||||
5.   Standard gene name (optional)
 | 
			
		||||
6.   Alias (optional, multiples separated by |)
 | 
			
		||||
7.   Parent feature name (optional)
 | 
			
		||||
8.   Secondary SGDID (optional, multiples separated by |)
 | 
			
		||||
9.   Chromosome (optional)
 | 
			
		||||
10.  Start_coordinate (optional)
 | 
			
		||||
11.  Stop_coordinate (optional)
 | 
			
		||||
12.  Strand (optional)
 | 
			
		||||
13.  Genetic position (optional)
 | 
			
		||||
14.  Coordinate version (optional)
 | 
			
		||||
15.  Sequence version (optional)
 | 
			
		||||
16.  Description (optional)
 | 
			
		||||
 | 
			
		||||
Note that "chromosome 17" is the mitochondrial chromosome.
 | 
			
		||||
 | 
			
		||||
The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff
 | 
			
		||||
 | 
			
		||||
SGD_features.tab
 | 
			
		||||
 | 
			
		||||
The latest version of the SGD_features.tab file is based on Genome Version R64-2-1.
 | 
			
		||||
 | 
			
		||||
The SGD_features.tab file is updated weekly (Saturday).
 | 
			
		||||
 | 
			
		||||
NOTE: On 4 September 2004, the SGD_features.tab file replaced the previously
 | 
			
		||||
used chromosomal_feature.tab file.
 | 
			
		||||
 | 
			
		||||
File contents:
 | 
			
		||||
 | 
			
		||||
1. Information on current chromosomal features in SGD, including Dubious ORFs. 
 | 
			
		||||
Also contains coordinates of intron, exons, and other subfeatures that are located
 | 
			
		||||
within a chromosomal feature.
 | 
			
		||||
 | 
			
		||||
2. The relationship between subfeatures and the feature in which they
 | 
			
		||||
are located is identified by the feature name in column #7 (parent
 | 
			
		||||
feature). For example, the parent feature of the intron found in
 | 
			
		||||
ACT1/YFL039C will be YFL039C. The parent feature of YFL039C is
 | 
			
		||||
chromosome 6.
 | 
			
		||||
 | 
			
		||||
3. The coordinates of all features are in chromosomal coordinates.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Columns within SGD_features.tab:
 | 
			
		||||
 | 
			
		||||
1.   Primary SGDID (mandatory)
 | 
			
		||||
2.   Feature type (mandatory)
 | 
			
		||||
3.   Feature qualifier (optional)
 | 
			
		||||
4.   Feature name (optional)
 | 
			
		||||
5.   Standard gene name (optional)
 | 
			
		||||
6.   Alias (optional, multiples separated by |)
 | 
			
		||||
7.   Parent feature name (optional)
 | 
			
		||||
8.   Secondary SGDID (optional, multiples separated by |)
 | 
			
		||||
9.   Chromosome (optional)
 | 
			
		||||
10.  Start_coordinate (optional)
 | 
			
		||||
11.  Stop_coordinate (optional)
 | 
			
		||||
12.  Strand (optional)
 | 
			
		||||
13.  Genetic position (optional)
 | 
			
		||||
14.  Coordinate version (optional)
 | 
			
		||||
15.  Sequence version (optional)
 | 
			
		||||
16.  Description (optional)
 | 
			
		||||
 | 
			
		||||
Note that "chromosome 17" is the mitochondrial chromosome.
 | 
			
		||||
 | 
			
		||||
The SGD_features.tab file is complemented by GFF3 file saccharomyces_cerevisiae.gff
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										32908
									
								
								data/SGD_features.tab
									
									
									
									
									
								
							
							
						
						
									
										32908
									
								
								data/SGD_features.tab
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										2030
									
								
								data/Species.csv
									
									
									
									
									
								
							
							
						
						
									
										2030
									
								
								data/Species.csv
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,179 +1,179 @@
 | 
			
		||||
MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
 | 
			
		||||
93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000256078
 | 
			
		||||
7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000311936
 | 
			
		||||
7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000557334
 | 
			
		||||
4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000556131
 | 
			
		||||
3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000556131
 | 
			
		||||
3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000557334
 | 
			
		||||
3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000556131
 | 
			
		||||
2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000557334
 | 
			
		||||
2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000311936
 | 
			
		||||
2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000256078
 | 
			
		||||
2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000256078
 | 
			
		||||
2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000311936
 | 
			
		||||
2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000311936
 | 
			
		||||
2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	72	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	185	ENSG00000133703	ENST00000311936
 | 
			
		||||
0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	183-184	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000256078
 | 
			
		||||
0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000256078
 | 
			
		||||
0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000256078
 | 
			
		||||
0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000256078
 | 
			
		||||
0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000256078
 | 
			
		||||
0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000311936
 | 
			
		||||
0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000556131
 | 
			
		||||
0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	72	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	70-71	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000311936
 | 
			
		||||
0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000311936
 | 
			
		||||
0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	185	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25368454	1	C	1	T	R/Q	12	1	164	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25368473	1	T	1	C	T/A	12	1	158	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
 | 
			
		||||
93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
93	+	missense_variant	25398284	93	C	93	T	G/D	12	93	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
86	+	missense_variant	25398284	86	C	86	A	G/V	12	86	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
72	+	missense_variant	25398285	72	C	72	A	G/C	12	72	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
63	-	missense_variant	25398284	63	G	63	A	G/D	12	63	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
36	-	missense_variant	25398284	36	G	36	T	G/V	12	36	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
24	+	missense_variant	25398281	24	C	24	T	G/D	12	24	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
23	+	missense_variant	25398284	23	C	23	G	G/A	12	23	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
16	-	missense_variant	25398285	16	G	16	C	G/R	12	16	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
13	+	missense_variant	25398285	13	C	13	G	G/R	12	13	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
11	+	missense_variant	25380275	11	T	11	G	Q/H	12	11	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
10	+	missense_variant	25398282	10	C	10	A	G/C	12	10	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
9	+	missense_variant	25398285	9	C	9	T	G/S	12	9	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000256078
 | 
			
		||||
7	+	missense_variant	25378562	7	C	7	T	A/T	12	7	146	ENSG00000133703	ENST00000311936
 | 
			
		||||
7	+	missense_variant	25380276	7	T	7	A	Q/L	12	7	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
5	+	missense_variant	25398284	5	CC	5	AA	G/F	12	5	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
5	+	missense_variant	25380276	5	T	5	C	Q/R	12	5	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000557334
 | 
			
		||||
4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
4	+	missense_variant	25398284	4	C	4	A	G/V	12	4	12.0	ENSG00000133703	ENST00000556131
 | 
			
		||||
3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	+	missense_variant	25380275	3	T	3	A	Q/H	12	3	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	+	missense_variant	25378647	3	T	3	G	K/N	12	3	117	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
3	+	missense_variant	25380277	3	G	3	T	Q/K	12	3	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	-	missense_variant	25380275	3	A	3	C	Q/H	12	3	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000556131
 | 
			
		||||
3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
3	+	missense_variant	25398284	3	C	3	T	G/D	12	3	12.0	ENSG00000133703	ENST00000557334
 | 
			
		||||
3	-	missense_variant	25398281	3	G	3	A	G/D	12	3	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000556131
 | 
			
		||||
2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000311936
 | 
			
		||||
2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000557334
 | 
			
		||||
2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000311936
 | 
			
		||||
2	-	missense_variant	25378562	2	G	2	A	A/T	12	2	146	ENSG00000133703	ENST00000256078
 | 
			
		||||
2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000256078
 | 
			
		||||
2	-	missense_variant	25380276	2	A	2	G	Q/R	12	2	61	ENSG00000133703	ENST00000256078
 | 
			
		||||
2	+	missense_variant	25398255	2	G	2	T	Q/K	12	2	22	ENSG00000133703	ENST00000311936
 | 
			
		||||
2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000311936
 | 
			
		||||
2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
2	-	missense_variant	25398285	2	G	2	A	G/S	12	2	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
2	+	missense_variant	25378561	2	G	2	A	A/V	12	2	146	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	72	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	185	ENSG00000133703	ENST00000311936
 | 
			
		||||
0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	183-184	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380271	1	C	1	T	E/K	12	1	63.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380274	1	C	1	T	E/K	12	1	62	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380275	1	T	1	G	Q/H	12	1	61.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000256078
 | 
			
		||||
0	+	synonymous_variant	25380278	0	A	1	G	-	12	1	60	ENSG00000133703	ENST00000256078
 | 
			
		||||
0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000256078
 | 
			
		||||
0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000256078
 | 
			
		||||
0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000256078
 | 
			
		||||
0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380277	1	GA	1	TT	GQ/GK	12	1	60-61	ENSG00000133703	ENST00000311936
 | 
			
		||||
0	+	synonymous_variant	25380278	0	A	1	T	-	12	1	60	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380240	1	C	1	A	R/M	12	1	73.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25380282	1	G	1	C	A/G	12	1	59	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000556131
 | 
			
		||||
0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25362743	1	A	1	G	C/R	12	1	72	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	inframe_deletion	25362744	0	CTTTGT	1	-	-	12	1	70-71	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000557334
 | 
			
		||||
1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000557334
 | 
			
		||||
0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000556131
 | 
			
		||||
1	+	missense_variant	25398281	1	C	1	A	G/V	12	1	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380282	1	G	1	T	A/E	12	1	59	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398211	1	T	1	C	I/M	12	1	36	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398213	1	T	1	A	I/L	12	1	36	ENSG00000133703	ENST00000311936
 | 
			
		||||
0	+	synonymous_variant	25398250	0	T	1	C	-	12	1	23	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398260	1	G	1	C	T/R	12	1	20	ENSG00000133703	ENST00000311936
 | 
			
		||||
0	+	synonymous_variant	25398280	0	G	1	A	-	12	1	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
0	+	synonymous_variant	25398283	0	A	1	C	-	12	1	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	-	missense_variant	25398284	1	G	1	C	G/A	12	1	12	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	A	G/C	12	1	12.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398285	1	C	1	G	G/R	12	1	12.0	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25398306	1	T	1	C	K/E	12	1	5	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	-	missense_variant	25362743	1	A	1	T	S/C	12	1	185	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	-	missense_variant	25378647	1	A	1	T	K/N	12	1	117	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	-	missense_variant	25398282	1	G	1	T	G/C	12	1	13	ENSG00000133703	ENST00000311936
 | 
			
		||||
1	+	missense_variant	25380254	1	C	1	A	R/S	12	1	68	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25378645	1	C	1	G	C/S	12	1	118	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25378594	1	C	1	G	R/T	12	1	135	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25368454	1	C	1	T	R/Q	12	1	164	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25368473	1	T	1	C	T/A	12	1	158	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25378557	1	C	1	G	K/N	12	1	147	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25378562	1	C	1	G	A/P	12	1	146	ENSG00000133703	ENST00000256078
 | 
			
		||||
1	+	missense_variant	25378562	1	C	1	T	A/T	12	1	146.0	ENSG00000133703	ENST00000256078
 | 
			
		||||
 
 | 
			
		||||
		
		
			
  | 
@@ -1,49 +1,49 @@
 | 
			
		||||
MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
 | 
			
		||||
2	+	missense_variant	3119330	2	G	2	A	R/Q	17	2	139	ENSG00000172146	ENST00000304094
 | 
			
		||||
2	+	missense_variant	3119138	2	C	2	T	S/L	17	2	75	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119772	0	C	2	T	-	17	2	286	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119791	1	C	1	T	R/W	17	1	293	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119799	1	G	1	A	M/I	17	1	295	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119805	0	T	1	C	-	17	1	297	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119823	0	C	1	T	-	17	1	303	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119786	1	G	1	A	R/K	17	1	291	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119744	1	C	1	G	T/R	17	1	277	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119691	0	C	1	T	-	17	1	259	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119589	0	C	1	T	-	17	1	225	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119408	1	G	1	A	S/N	17	1	165	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119431	1	G	1	A	E/K	17	1	173	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119462	1	C	1	T	P/L	17	1	183	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	stop_gained	3119514	1	C	1	G	-	17	1	200	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119530	1	T	1	G	F/V	17	1	206	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119581	1	A	1	G	T/A	17	1	223	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	stop_gained	3119590	1	C	1	T	-	17	1	226	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119679	1	G	1	T	M/I	17	1	255	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119592	0	G	1	A	-	17	1	226	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119596	1	C	1	T	P/S	17	1	228	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119610	0	C	1	T	-	17	1	232	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119627	1	C	1	T	S/F	17	1	238	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119640	0	C	1	A	-	17	1	242	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119672	1	C	1	T	T/I	17	1	253	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119395	1	C	1	A	L/M	17	1	161	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119403	0	A	1	G	-	17	1	163	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119386	1	C	1	T	P/S	17	1	158	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119289	0	C	1	A	-	17	1	125	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	stop_gained	3118972	1	C	1	T	-	17	1	20	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3118978	1	G	1	A	E/K	17	1	22	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3118986	1	A	1	C	E/D	17	1	24	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119002	1	C	1	T	L/F	17	1	30	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119029	0	T	1	C	-	17	1	39	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119074	1	C	1	T	R/C	17	1	54	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119075	1	G	1	A	R/H	17	1	54	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119076	0	C	1	T	-	17	1	54	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119115	0	C	1	T	-	17	1	67	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119139	0	G	1	A	-	17	1	75	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119187	0	C	1	T	-	17	1	91	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119210	1	C	1	T	T/M	17	1	99	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119217	1	G	1	A	M/I	17	1	101	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119264	1	C	1	T	A/V	17	1	117	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119269	1	G	1	A	A/T	17	1	119	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3118961	1	G	1	A	G/E	17	1	16	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3118956	0	C	1	A	-	17	1	14	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3118944	0	G	1	A	-	17	1	10	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3118928	1	A	1	C	N/T	17	1	5	ENSG00000172146	ENST00000304094
 | 
			
		||||
MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
 | 
			
		||||
2	+	missense_variant	3119330	2	G	2	A	R/Q	17	2	139	ENSG00000172146	ENST00000304094
 | 
			
		||||
2	+	missense_variant	3119138	2	C	2	T	S/L	17	2	75	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119772	0	C	2	T	-	17	2	286	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119791	1	C	1	T	R/W	17	1	293	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119799	1	G	1	A	M/I	17	1	295	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119805	0	T	1	C	-	17	1	297	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119823	0	C	1	T	-	17	1	303	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119786	1	G	1	A	R/K	17	1	291	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119744	1	C	1	G	T/R	17	1	277	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119691	0	C	1	T	-	17	1	259	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119589	0	C	1	T	-	17	1	225	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119408	1	G	1	A	S/N	17	1	165	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119431	1	G	1	A	E/K	17	1	173	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119462	1	C	1	T	P/L	17	1	183	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	stop_gained	3119514	1	C	1	G	-	17	1	200	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119530	1	T	1	G	F/V	17	1	206	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119581	1	A	1	G	T/A	17	1	223	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	stop_gained	3119590	1	C	1	T	-	17	1	226	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119679	1	G	1	T	M/I	17	1	255	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119592	0	G	1	A	-	17	1	226	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119596	1	C	1	T	P/S	17	1	228	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119610	0	C	1	T	-	17	1	232	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119627	1	C	1	T	S/F	17	1	238	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119640	0	C	1	A	-	17	1	242	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119672	1	C	1	T	T/I	17	1	253	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119395	1	C	1	A	L/M	17	1	161	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119403	0	A	1	G	-	17	1	163	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119386	1	C	1	T	P/S	17	1	158	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119289	0	C	1	A	-	17	1	125	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	stop_gained	3118972	1	C	1	T	-	17	1	20	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3118978	1	G	1	A	E/K	17	1	22	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3118986	1	A	1	C	E/D	17	1	24	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119002	1	C	1	T	L/F	17	1	30	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119029	0	T	1	C	-	17	1	39	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119074	1	C	1	T	R/C	17	1	54	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119075	1	G	1	A	R/H	17	1	54	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119076	0	C	1	T	-	17	1	54	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119115	0	C	1	T	-	17	1	67	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119139	0	G	1	A	-	17	1	75	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3119187	0	C	1	T	-	17	1	91	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119210	1	C	1	T	T/M	17	1	99	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119217	1	G	1	A	M/I	17	1	101	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119264	1	C	1	T	A/V	17	1	117	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3119269	1	G	1	A	A/T	17	1	119	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3118961	1	G	1	A	G/E	17	1	16	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3118956	0	C	1	A	-	17	1	14	ENSG00000172146	ENST00000304094
 | 
			
		||||
0	+	synonymous_variant	3118944	0	G	1	A	-	17	1	10	ENSG00000172146	ENST00000304094
 | 
			
		||||
1	+	missense_variant	3118928	1	A	1	C	N/T	17	1	5	ENSG00000172146	ENST00000304094
 | 
			
		||||
 
 | 
			
		||||
		
		
			
  | 
@@ -1,113 +1,113 @@
 | 
			
		||||
MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
 | 
			
		||||
5	+	missense_variant	112926888	5	G	5	T	G/V	12	5	503	ENSG00000179295	ENST00000351677
 | 
			
		||||
4	+	missense_variant	112926270	4	C	4	T	T/M	12	4	468	ENSG00000179295	ENST00000351677
 | 
			
		||||
3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000392597
 | 
			
		||||
3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112926910	2	G	2	C	Q/H	12	2	510	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112926909	2	A	2	T	Q/L	12	2	510	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112926900	2	C	2	A	T/K	12	2	507	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000392597
 | 
			
		||||
2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000392597
 | 
			
		||||
2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000392597
 | 
			
		||||
2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000392597
 | 
			
		||||
2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	82	ENSG00000179295	ENST00000530818
 | 
			
		||||
1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	26	ENSG00000179295	ENST00000530818
 | 
			
		||||
0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	34	ENSG00000179295	ENST00000530818
 | 
			
		||||
1	+	stop_gained	112893784	1	G	1	T	-	12	1	70	ENSG00000179295	ENST00000530818
 | 
			
		||||
0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	74	ENSG00000179295	ENST00000530818
 | 
			
		||||
1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	76	ENSG00000179295	ENST00000530818
 | 
			
		||||
1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926887	1	G	1	C	G/R	12	1	503	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926908	1	C	1	G	Q/E	12	1	510.0	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112939963	1	G	1	C	G/R	12	1	539	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112939970	1	A	1	T	E/V	12	1	541	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112939981	1	A	1	C	I/L	12	1	545	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112939993	0	C	1	T	-	12	1	549	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112939999	1	G	1	A	D/N	12	1	551	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112940012	1	G	1	A	G/E	12	1	555	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112940025	0	T	1	C	-	12	1	559	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112940027	1	T	1	C	L/P	12	1	560	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112940031	0	G	1	A	-	12	1	561	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112940036	1	G	1	T	C/F	12	1	563	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112940052	0	C	1	T	-	12	1	568	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112926885	1	C	1	T	S/L	12	1	502	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926884	1	T	1	C	S/P	12	1	502	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112926862	0	C	1	T	-	12	1	494	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926852	1	C	1	T	P/L	12	1	491	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926248	1	G	1	A	A/T	12	1	461	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926249	1	C	1	G	A/G	12	1	461	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926291	1	TT	1	CA	L/P	12	1	475	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926839	1	G	1	T	D/Y	12	1	487	ENSG00000179295	ENST00000351677
 | 
			
		||||
MUTS_PAM	STRAND	MOST_SEVERE	START	MUTS_PAM_SAMPLES	REF	MUTS_CS	ALT	AA_CHANGE	CHR	MUTS_CS_SAMPLES	PROTEIN_POS	GENE	TRANSCRIPT
 | 
			
		||||
5	+	missense_variant	112926888	5	G	5	T	G/V	12	5	503	ENSG00000179295	ENST00000351677
 | 
			
		||||
4	+	missense_variant	112926270	4	C	4	T	T/M	12	4	468	ENSG00000179295	ENST00000351677
 | 
			
		||||
3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000392597
 | 
			
		||||
3	+	missense_variant	112888198	3	G	3	A	A/T	12	3	72	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112926910	2	G	2	C	Q/H	12	2	510	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112926909	2	A	2	T	Q/L	12	2	510	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112926900	2	C	2	A	T/K	12	2	507	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000392597
 | 
			
		||||
2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000392597
 | 
			
		||||
2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000392597
 | 
			
		||||
2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000392597
 | 
			
		||||
2	+	missense_variant	112891006	2	C	2	T	H/Y	12	2	114	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112888210	2	G	2	A	E/K	12	2	76	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112888199	2	C	2	T	A/V	12	2	72	ENSG00000179295	ENST00000351677
 | 
			
		||||
2	+	missense_variant	112888199	2	C	2	A	A/D	12	2	72	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	82	ENSG00000179295	ENST00000530818
 | 
			
		||||
1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	26	ENSG00000179295	ENST00000530818
 | 
			
		||||
0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	34	ENSG00000179295	ENST00000530818
 | 
			
		||||
1	+	stop_gained	112893784	1	G	1	T	-	12	1	70	ENSG00000179295	ENST00000530818
 | 
			
		||||
0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	74	ENSG00000179295	ENST00000530818
 | 
			
		||||
1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	76	ENSG00000179295	ENST00000530818
 | 
			
		||||
1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000392597
 | 
			
		||||
0	+	synonymous_variant	112893822	0	T	1	C	-	12	1	237	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888165	1	G	1	T	D/Y	12	1	61	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69.0	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888189	1	G	1	A	E/K	12	1	69	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888195	1	T	1	C	F/L	12	1	71	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888197	1	T	1	A	F/L	12	1	71	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76.0	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888211	1	A	1	C	E/A	12	1	76	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112891015	1	C	1	T	L/F	12	1	117	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112891073	1	T	1	A	L/H	12	1	136	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112891116	0	T	1	C	-	12	1	150	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112891129	1	G	1	T	D/Y	12	1	155	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112892383	1	G	1	C	V/L	12	1	181	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112892409	0	T	1	C	-	12	1	189	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	stop_gained	112893784	1	G	1	T	-	12	1	225	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112893798	0	A	1	G	-	12	1	229	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888165	1	G	1	A	D/N	12	1	61	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888163	1	G	1	T	G/V	12	1	60	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888162	1	G	1	C	G/R	12	1	60	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112893802	0	C	1	A	-	12	1	231	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112910775	1	C	1	T	L/F	12	1	262	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112888161	0	T	1	C	-	12	1	59	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112910837	1	C	1	G	I/M	12	1	282	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926887	1	G	1	C	G/R	12	1	503	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926908	1	C	1	G	Q/E	12	1	510.0	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112939963	1	G	1	C	G/R	12	1	539	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112939970	1	A	1	T	E/V	12	1	541	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112939981	1	A	1	C	I/L	12	1	545	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112939993	0	C	1	T	-	12	1	549	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112939999	1	G	1	A	D/N	12	1	551	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112940012	1	G	1	A	G/E	12	1	555	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112940025	0	T	1	C	-	12	1	559	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112940027	1	T	1	C	L/P	12	1	560	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112940031	0	G	1	A	-	12	1	561	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112940036	1	G	1	T	C/F	12	1	563	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112940052	0	C	1	T	-	12	1	568	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112884103	1	G	1	A	G/D	12	1	13	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112888139	1	C	1	G	T/S	12	1	52	ENSG00000179295	ENST00000392597
 | 
			
		||||
1	+	missense_variant	112926885	1	C	1	T	S/L	12	1	502	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926884	1	T	1	C	S/P	12	1	502	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112926862	0	C	1	T	-	12	1	494	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112924286	1	C	1	T	T/M	12	1	411	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112910844	1	T	1	G	F/V	12	1	285.0	ENSG00000179295	ENST00000351677
 | 
			
		||||
0	+	synonymous_variant	112915507	0	A	1	G	-	12	1	302	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112915523	1	A	1	G	N/D	12	1	308	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112915743	1	A	1	G	N/S	12	1	339	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112919908	1	T	1	G	Y/D	12	1	375	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	frameshift_variant	112920002	1	-	1	T	-	12	1	406	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	stop_gained	112924308	1	C	1	A	-	12	1	418	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926852	1	C	1	T	P/L	12	1	491	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112924331	1	A	1	T	H/L	12	1	426	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112924336	1	G	1	A	V/M	12	1	428	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926248	1	G	1	A	A/T	12	1	461	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926249	1	C	1	G	A/G	12	1	461	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926291	1	TT	1	CA	L/P	12	1	475	ENSG00000179295	ENST00000351677
 | 
			
		||||
1	+	missense_variant	112926839	1	G	1	T	D/Y	12	1	487	ENSG00000179295	ENST00000351677
 | 
			
		||||
 
 | 
			
		||||
		
		
			
  | 
@@ -1,39 +1,39 @@
 | 
			
		||||
>MBP1_ASPNI AN3154 XP_660758 Q5B8H6
 | 
			
		||||
-VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI
 | 
			
		||||
LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY
 | 
			
		||||
 | 
			
		||||
>MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86
 | 
			
		||||
KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI
 | 
			
		||||
LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY
 | 
			
		||||
 | 
			
		||||
>MBP1_COPCI  - XP_001837394 A8NYC6
 | 
			
		||||
QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV
 | 
			
		||||
LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF
 | 
			
		||||
 | 
			
		||||
>MBP1_CRYNE  - XP_569090 Q5KMQ9
 | 
			
		||||
DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA
 | 
			
		||||
LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV
 | 
			
		||||
 | 
			
		||||
>MBP1_NEUCR Swi4 XP_955821 Q7RW59
 | 
			
		||||
-IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI
 | 
			
		||||
LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF
 | 
			
		||||
 | 
			
		||||
>MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4
 | 
			
		||||
-IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV
 | 
			
		||||
LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF
 | 
			
		||||
 | 
			
		||||
>MBP1_SACCE Mbp1 NP_010227 P39678
 | 
			
		||||
QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI
 | 
			
		||||
LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF
 | 
			
		||||
 | 
			
		||||
>MBP1_SCHPO Res2 NP_593032 P41412
 | 
			
		||||
-VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV
 | 
			
		||||
LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS-
 | 
			
		||||
 | 
			
		||||
>MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35
 | 
			
		||||
-IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV
 | 
			
		||||
LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY
 | 
			
		||||
 | 
			
		||||
>MBP1_WALME  - XP_006957051 I4YGC0
 | 
			
		||||
-IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI
 | 
			
		||||
LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY
 | 
			
		||||
>MBP1_ASPNI AN3154 XP_660758 Q5B8H6
 | 
			
		||||
-VYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRI
 | 
			
		||||
LEREVQKGVHEKVQGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDY
 | 
			
		||||
 | 
			
		||||
>MBP1_BIPOR COCMIDRAFT_338 XP_007682304 W6ZM86
 | 
			
		||||
KIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRI
 | 
			
		||||
LEREVQKGVHEKVQGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDY
 | 
			
		||||
 | 
			
		||||
>MBP1_COPCI  - XP_001837394 A8NYC6
 | 
			
		||||
QIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRV
 | 
			
		||||
LEREVQKGEHEKVQGGYGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEF
 | 
			
		||||
 | 
			
		||||
>MBP1_CRYNE  - XP_569090 Q5KMQ9
 | 
			
		||||
DYVPTSVSPPPAPKHSVA--PPSKARRDKEKETGRTKATPSRTGPTSAAA
 | 
			
		||||
LQAQAQLN-RAKMHDSTPDADASFRSFEERVSLTEDDSSSDTPSPVASV
 | 
			
		||||
 | 
			
		||||
>MBP1_NEUCR Swi4 XP_955821 Q7RW59
 | 
			
		||||
-IYSATYSGIPVWEYQFGVDHVMRRRHDDWVNATHILKAAGFDKPARTRI
 | 
			
		||||
LEREVQKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEF
 | 
			
		||||
 | 
			
		||||
>MBP1_PUCGR PGTG_08863 XP_003327086 E3KED4
 | 
			
		||||
-IYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRV
 | 
			
		||||
LEREIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNF
 | 
			
		||||
 | 
			
		||||
>MBP1_SACCE Mbp1 NP_010227 P39678
 | 
			
		||||
QIYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHILKAANFAKAKRTRI
 | 
			
		||||
LEKEVLKETHEKVQGGFGKYQGTWVPLNIAKQLAEKFSVYDQLKPLFDF
 | 
			
		||||
 | 
			
		||||
>MBP1_SCHPO Res2 NP_593032 P41412
 | 
			
		||||
-VHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRV
 | 
			
		||||
LERQVQIGAHEKVQGGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILS-
 | 
			
		||||
 | 
			
		||||
>MBP1_USTMA UMAG_11222 XP_011392621 A0A0D1DP35
 | 
			
		||||
-IFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRV
 | 
			
		||||
LEREIQKGIHEKVQGGYGKYQGTWIPLDVAIELAERYNIQGLLQPITSY
 | 
			
		||||
 | 
			
		||||
>MBP1_WALME  - XP_006957051 I4YGC0
 | 
			
		||||
-IYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRI
 | 
			
		||||
LEREVQKGTHEKVQGGYGKYQGTWIPMERSVELARQYRIELLLDPIINY
 | 
			
		||||
 
 | 
			
		||||
@@ -1,490 +1,490 @@
 | 
			
		||||
[
 | 
			
		||||
  { "name" : "68476_WALME",
 | 
			
		||||
    "RefSeqID" : "XP_006957790",
 | 
			
		||||
    "UniProtID" : "I4YDD8",
 | 
			
		||||
    "taxonomyID" : "671144",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ",
 | 
			
		||||
             "PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER",
 | 
			
		||||
             "GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG",
 | 
			
		||||
             "LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE",
 | 
			
		||||
             "LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA",
 | 
			
		||||
             "LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD",
 | 
			
		||||
             "FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE",
 | 
			
		||||
             "IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE",
 | 
			
		||||
             "GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "00846_COPCI",
 | 
			
		||||
    "RefSeqID" : "XP_001831299",
 | 
			
		||||
    "UniProtID" : "A8N8X1",
 | 
			
		||||
    "taxonomyID" : "240176",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG",
 | 
			
		||||
             "GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS",
 | 
			
		||||
             "NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH",
 | 
			
		||||
             "SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP",
 | 
			
		||||
             "PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF",
 | 
			
		||||
             "DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK",
 | 
			
		||||
             "KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN",
 | 
			
		||||
             "KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT",
 | 
			
		||||
             "RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS",
 | 
			
		||||
             "FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE",
 | 
			
		||||
             "SEAQVVDIGRVSGFMQKVRDGII"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "8533_BIPOR",
 | 
			
		||||
    "RefSeqID" : "XP_007691662",
 | 
			
		||||
    "UniProtID" : "W6ZE71",
 | 
			
		||||
    "taxonomyID" : "930090",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR",
 | 
			
		||||
             "SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS",
 | 
			
		||||
             "HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT",
 | 
			
		||||
             "RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT",
 | 
			
		||||
             "EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT",
 | 
			
		||||
             "ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA",
 | 
			
		||||
             "IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME",
 | 
			
		||||
             "VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID",
 | 
			
		||||
             "QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE",
 | 
			
		||||
             "DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML",
 | 
			
		||||
             "KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "PGTG_02039",
 | 
			
		||||
    "RefSeqID" : "XP_003320997",
 | 
			
		||||
    "UniProtID" : "E3JX03",
 | 
			
		||||
    "taxonomyID" : "418459",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV",
 | 
			
		||||
             "AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS",
 | 
			
		||||
             "ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP",
 | 
			
		||||
             "HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP",
 | 
			
		||||
             "NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL",
 | 
			
		||||
             "LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH",
 | 
			
		||||
             "LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI",
 | 
			
		||||
             "RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN",
 | 
			
		||||
             "GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC",
 | 
			
		||||
             "TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK",
 | 
			
		||||
             "LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBPA_ASPNI",
 | 
			
		||||
    "RefSeqID" : "XP_664319",
 | 
			
		||||
    "UniProtID" : "Q5AYB5",
 | 
			
		||||
    "taxonomyID" : "227321",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG",
 | 
			
		||||
             "FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK",
 | 
			
		||||
             "VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP",
 | 
			
		||||
             "QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG",
 | 
			
		||||
             "RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER",
 | 
			
		||||
             "FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK",
 | 
			
		||||
             "AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS",
 | 
			
		||||
             "QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA",
 | 
			
		||||
             "GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS",
 | 
			
		||||
             "IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP",
 | 
			
		||||
             "SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL",
 | 
			
		||||
             "MRDRGQDW"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "05520_CRYNE",
 | 
			
		||||
    "RefSeqID" : "XP_570545",
 | 
			
		||||
    "UniProtID" : "Q5KHS0",
 | 
			
		||||
    "taxonomyID" : "214684",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH",
 | 
			
		||||
             "FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART",
 | 
			
		||||
             "KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL",
 | 
			
		||||
             "PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM",
 | 
			
		||||
             "SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL",
 | 
			
		||||
             "DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS",
 | 
			
		||||
             "ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG",
 | 
			
		||||
             "VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD",
 | 
			
		||||
             "AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT",
 | 
			
		||||
             "EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP",
 | 
			
		||||
             "GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS",
 | 
			
		||||
             "VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "RES1_SCHPO",
 | 
			
		||||
    "RefSeqID" : "NP_595496",
 | 
			
		||||
    "UniProtID" : "P33520",
 | 
			
		||||
    "taxonomyID" : "284812",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP",
 | 
			
		||||
             "SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS",
 | 
			
		||||
             "GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA",
 | 
			
		||||
             "LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS",
 | 
			
		||||
             "KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS",
 | 
			
		||||
             "KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL",
 | 
			
		||||
             "AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA",
 | 
			
		||||
             "DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "CDC10_SCHPO",
 | 
			
		||||
    "RefSeqID" : "NP_596132",
 | 
			
		||||
    "UniProtID" : "P01129",
 | 
			
		||||
    "taxonomyID" : "284812",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL",
 | 
			
		||||
             "SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP",
 | 
			
		||||
             "LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ",
 | 
			
		||||
             "NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS",
 | 
			
		||||
             "KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV",
 | 
			
		||||
             "TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ",
 | 
			
		||||
             "DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR",
 | 
			
		||||
             "QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ",
 | 
			
		||||
             "KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS",
 | 
			
		||||
             "LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "05338_USTMA",
 | 
			
		||||
    "RefSeqID" : "XP_011392041",
 | 
			
		||||
    "UniProtID" : "A0A0D1BWD8",
 | 
			
		||||
    "taxonomyID" : "237631",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT",
 | 
			
		||||
             "SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI",
 | 
			
		||||
             "PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR",
 | 
			
		||||
             "AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL",
 | 
			
		||||
             "RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA",
 | 
			
		||||
             "DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI",
 | 
			
		||||
             "TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA",
 | 
			
		||||
             "DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT",
 | 
			
		||||
             "NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL",
 | 
			
		||||
             "GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL",
 | 
			
		||||
             "TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN",
 | 
			
		||||
             "LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG",
 | 
			
		||||
             "TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP",
 | 
			
		||||
             "DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "SWI4_SACCE",
 | 
			
		||||
    "RefSeqID" : "NP_011036",
 | 
			
		||||
    "UniProtID" : "P25302",
 | 
			
		||||
    "taxonomyID" : "559292",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF",
 | 
			
		||||
             "SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP",
 | 
			
		||||
             "GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN",
 | 
			
		||||
             "NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS",
 | 
			
		||||
             "SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK",
 | 
			
		||||
             "QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY",
 | 
			
		||||
             "KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK",
 | 
			
		||||
             "SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN",
 | 
			
		||||
             "IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ",
 | 
			
		||||
             "QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS",
 | 
			
		||||
             "KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV",
 | 
			
		||||
             "SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL",
 | 
			
		||||
             "EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL",
 | 
			
		||||
             "QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "SWI6_NEUCR",
 | 
			
		||||
    "RefSeqID" : "XP_962967",
 | 
			
		||||
    "UniProtID" : "Q7SBG9",
 | 
			
		||||
    "taxonomyID" : "367110",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN",
 | 
			
		||||
             "ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT",
 | 
			
		||||
             "PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF",
 | 
			
		||||
             "PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT",
 | 
			
		||||
             "EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA",
 | 
			
		||||
             "LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV",
 | 
			
		||||
             "KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG",
 | 
			
		||||
             "ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL",
 | 
			
		||||
             "HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT",
 | 
			
		||||
             "QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES",
 | 
			
		||||
             "EKPELEIARVRRFLGGVEGVVH"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "15042_USTMA",
 | 
			
		||||
    "RefSeqID" : "XP_011388143",
 | 
			
		||||
    "UniProtID" : "A0A0D1CVS5",
 | 
			
		||||
    "taxonomyID" : "237631",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA",
 | 
			
		||||
             "HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN",
 | 
			
		||||
             "GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE",
 | 
			
		||||
             "DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI",
 | 
			
		||||
             "ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS",
 | 
			
		||||
             "SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT",
 | 
			
		||||
             "AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR",
 | 
			
		||||
             "SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "04778_USTMA",
 | 
			
		||||
    "RefSeqID" : "XP_011391646",
 | 
			
		||||
    "UniProtID" : "A0A0D1DQM4",
 | 
			
		||||
    "taxonomyID" : "237631",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK",
 | 
			
		||||
             "VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS",
 | 
			
		||||
             "QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG",
 | 
			
		||||
             "LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG",
 | 
			
		||||
             "YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD",
 | 
			
		||||
             "PESAQLFTIHDFGSDPFYAEQVERG"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "STUA_ASPNI",
 | 
			
		||||
    "RefSeqID" : "XP_663440",
 | 
			
		||||
    "UniProtID" : "P36011",
 | 
			
		||||
    "taxonomyID" : "227321",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS",
 | 
			
		||||
             "SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM",
 | 
			
		||||
             "INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ",
 | 
			
		||||
             "RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL",
 | 
			
		||||
             "SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR",
 | 
			
		||||
             "GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP",
 | 
			
		||||
             "RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD",
 | 
			
		||||
             "SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "STUA_NEUCR",
 | 
			
		||||
    "RefSeqID" : "XP_960837",
 | 
			
		||||
    "UniProtID" : "Q1K6U0",
 | 
			
		||||
    "taxonomyID" : "367110",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG",
 | 
			
		||||
             "QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT",
 | 
			
		||||
             "RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK",
 | 
			
		||||
             "DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG",
 | 
			
		||||
             "NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ",
 | 
			
		||||
             "SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV",
 | 
			
		||||
             "NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM",
 | 
			
		||||
             "PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH",
 | 
			
		||||
             "RRR"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "PHD1_SACCE",
 | 
			
		||||
    "RefSeqID" : "NP_012881",
 | 
			
		||||
    "UniProtID" : "P36093",
 | 
			
		||||
    "taxonomyID" : "559292",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA",
 | 
			
		||||
             "VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS",
 | 
			
		||||
             "IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS",
 | 
			
		||||
             "EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI",
 | 
			
		||||
             "ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "08099_COPCI",
 | 
			
		||||
    "RefSeqID" : "XP_001836714",
 | 
			
		||||
    "UniProtID" : "A8NVH3",
 | 
			
		||||
    "taxonomyID" : "240176",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS",
 | 
			
		||||
             "KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS",
 | 
			
		||||
             "NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS",
 | 
			
		||||
             "HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV",
 | 
			
		||||
             "LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ",
 | 
			
		||||
             "EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA",
 | 
			
		||||
             "PHRPW"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "68479_WALME",
 | 
			
		||||
    "RefSeqID" : "XP_006957792",
 | 
			
		||||
    "UniProtID" : "I4YDE0",
 | 
			
		||||
    "taxonomyID" : "671144",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF",
 | 
			
		||||
             "ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS",
 | 
			
		||||
             "PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS",
 | 
			
		||||
             "SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "11943_PUCGR",
 | 
			
		||||
    "RefSeqID" : "XP_003330006",
 | 
			
		||||
    "UniProtID" : "E3KMR2",
 | 
			
		||||
    "taxonomyID" : "418459",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV",
 | 
			
		||||
             "HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP",
 | 
			
		||||
             "GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA",
 | 
			
		||||
             "RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS",
 | 
			
		||||
             "MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN",
 | 
			
		||||
             "QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA",
 | 
			
		||||
             "GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL",
 | 
			
		||||
             "TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "03082_PUCGR",
 | 
			
		||||
    "RefSeqID" : "XP_003321545",
 | 
			
		||||
    "UniProtID" : "E3JYK1",
 | 
			
		||||
    "taxonomyID" : "418459",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV",
 | 
			
		||||
             "STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP",
 | 
			
		||||
             "PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL",
 | 
			
		||||
             "FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK",
 | 
			
		||||
             "RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP",
 | 
			
		||||
             "RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA",
 | 
			
		||||
             "QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD",
 | 
			
		||||
             "AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR",
 | 
			
		||||
             "LIMEWNPSC"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "SOK2_SACCE",
 | 
			
		||||
    "RefSeqID" : "NP_013729",
 | 
			
		||||
    "UniProtID" : "P53438",
 | 
			
		||||
    "taxonomyID" : "559292",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS",
 | 
			
		||||
             "QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS",
 | 
			
		||||
             "MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP",
 | 
			
		||||
             "PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY",
 | 
			
		||||
             "NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG",
 | 
			
		||||
             "SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM",
 | 
			
		||||
             "HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS",
 | 
			
		||||
             "AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA",
 | 
			
		||||
             "TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ",
 | 
			
		||||
             "LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "14426_COPCI",
 | 
			
		||||
    "RefSeqID" : "XP_002911429",
 | 
			
		||||
    "UniProtID" : "D6RMB0",
 | 
			
		||||
    "taxonomyID" : "240176",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA",
 | 
			
		||||
             "EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP",
 | 
			
		||||
             "TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT",
 | 
			
		||||
             "KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK",
 | 
			
		||||
             "KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA",
 | 
			
		||||
             "ITYLPNFL"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "BQT4_SCHPO",
 | 
			
		||||
    "RefSeqID" : "NP_596166",
 | 
			
		||||
    "UniProtID" : "O60158",
 | 
			
		||||
    "taxonomyID" : "284812",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA",
 | 
			
		||||
             "FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG",
 | 
			
		||||
             "EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS",
 | 
			
		||||
             "SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN",
 | 
			
		||||
             "KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK",
 | 
			
		||||
             "LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "PGTG_05590",
 | 
			
		||||
    "RefSeqID" : "XP_003323688",
 | 
			
		||||
    "UniProtID" : "E3K4V4",
 | 
			
		||||
    "taxonomyID" : "418459",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT",
 | 
			
		||||
             "KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS",
 | 
			
		||||
             "SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS",
 | 
			
		||||
             "PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND",
 | 
			
		||||
             "ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN",
 | 
			
		||||
             "TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG",
 | 
			
		||||
             "GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "06560_NEUCR",
 | 
			
		||||
    "RefSeqID" : "XP_962267",
 | 
			
		||||
    "UniProtID" : "Q7S9H5",
 | 
			
		||||
    "taxonomyID" : "367110",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP",
 | 
			
		||||
             "PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP",
 | 
			
		||||
             "SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT",
 | 
			
		||||
             "VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV",
 | 
			
		||||
             "DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE",
 | 
			
		||||
             "TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV",
 | 
			
		||||
             "ANVL"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "81480_BIPOR",
 | 
			
		||||
    "RefSeqID" : "XP_007682909",
 | 
			
		||||
    "UniProtID" : "W6ZKJ4",
 | 
			
		||||
    "taxonomyID" : "930090",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN",
 | 
			
		||||
             "RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI",
 | 
			
		||||
             "EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP",
 | 
			
		||||
             "DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE",
 | 
			
		||||
             "TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK",
 | 
			
		||||
             "KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "01622_ASPNI",
 | 
			
		||||
    "RefSeqID" : "XP_657766",
 | 
			
		||||
    "UniProtID" : "Q5BH18",
 | 
			
		||||
    "taxonomyID" : "227321",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP",
 | 
			
		||||
             "QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA",
 | 
			
		||||
             "LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL",
 | 
			
		||||
             "QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV",
 | 
			
		||||
             "EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE",
 | 
			
		||||
             "RVRNRALMGVTAAFALAKPALVLLEA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "05405_ASPNI",
 | 
			
		||||
    "RefSeqID" : "XP_663009",
 | 
			
		||||
    "UniProtID" : "Q5B225",
 | 
			
		||||
    "taxonomyID" : "227321",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD",
 | 
			
		||||
             "FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM",
 | 
			
		||||
             "LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA",
 | 
			
		||||
             "TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS",
 | 
			
		||||
             "PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP",
 | 
			
		||||
             "SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR",
 | 
			
		||||
             "DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD",
 | 
			
		||||
             "ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "105954_BIPOR",
 | 
			
		||||
    "RefSeqID" : "XP_007691967",
 | 
			
		||||
    "UniProtID" : "W6Z1H5",
 | 
			
		||||
    "taxonomyID" : "930090",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI",
 | 
			
		||||
             "YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR",
 | 
			
		||||
             "ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE",
 | 
			
		||||
             "TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP",
 | 
			
		||||
             "RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV",
 | 
			
		||||
             "GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "69819_WALME",
 | 
			
		||||
    "RefSeqID" : "XP_006959479",
 | 
			
		||||
    "UniProtID" : "I4Y911",
 | 
			
		||||
    "taxonomyID" : "671144",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI",
 | 
			
		||||
             "ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS",
 | 
			
		||||
             "PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL",
 | 
			
		||||
             "KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ",
 | 
			
		||||
             "NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "02840_CRYNE",
 | 
			
		||||
    "RefSeqID" : "XP_568872",
 | 
			
		||||
    "UniProtID" : "Q5KM59",
 | 
			
		||||
    "taxonomyID" : "214684",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG",
 | 
			
		||||
             "THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV",
 | 
			
		||||
             "QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT",
 | 
			
		||||
             "EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA",
 | 
			
		||||
             "VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "11055_USTMA",
 | 
			
		||||
    "RefSeqID" : "XP_011390537",
 | 
			
		||||
    "UniProtID" : "A0A0D1DZM8",
 | 
			
		||||
    "taxonomyID" : "237631",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK",
 | 
			
		||||
             "LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF",
 | 
			
		||||
             "LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV",
 | 
			
		||||
             "AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR",
 | 
			
		||||
             "VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV",
 | 
			
		||||
             "VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL",
 | 
			
		||||
             "ASILPW"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "XBP1_NEUCR",
 | 
			
		||||
    "RefSeqID" : "XP_962373",
 | 
			
		||||
    "UniProtID" : "Q7S9W7",
 | 
			
		||||
    "taxonomyID" : "367110",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT",
 | 
			
		||||
             "MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR",
 | 
			
		||||
             "GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH",
 | 
			
		||||
             "SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP",
 | 
			
		||||
             "RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ",
 | 
			
		||||
             "DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT",
 | 
			
		||||
             "SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT",
 | 
			
		||||
             "TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD",
 | 
			
		||||
             "DEGDYEHEQQYRRKRRRLLLVGRAKSF"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "XBP1_SACCE",
 | 
			
		||||
    "RefSeqID" : "NP_012165",
 | 
			
		||||
    "UniProtID" : "P40489",
 | 
			
		||||
    "taxonomyID" : "559292",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA",
 | 
			
		||||
             "QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR",
 | 
			
		||||
             "ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE",
 | 
			
		||||
             "FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS",
 | 
			
		||||
             "YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ",
 | 
			
		||||
             "NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND",
 | 
			
		||||
             "KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY",
 | 
			
		||||
             "DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK",
 | 
			
		||||
             "FKTNSKQ"]
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
[
 | 
			
		||||
  { "name" : "68476_WALME",
 | 
			
		||||
    "RefSeqID" : "XP_006957790",
 | 
			
		||||
    "UniProtID" : "I4YDD8",
 | 
			
		||||
    "taxonomyID" : "671144",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MKEEKEKTPPNNITGPPTPAQNILHSTPAAFGTAGTVGQGAGGFGSQLYQSPYVDSQQSVIGSPVTPAPLPKKATLKTPQ",
 | 
			
		||||
             "PRIYSAVYSGVGVYEAMIRGIAVMRRRADGYMNATQILKVAGVDKGRRTKILEREILAGLHEKIQGGYGKYQGTWIPFER",
 | 
			
		||||
             "GRELALQYGCDHLLAPIFDFNPSVMQPSAGRSAKSPSKKRQNSIVLSPTQERHQSSIIALNTARASGIYVGGADDPNDDG",
 | 
			
		||||
             "LSKKEKSPVKKSKYDEVPVNVSKRPYVPPPGTNAHILTRTQQSLTALFQQPTTNSDFIPEAVAILDTTSGALHPDLAIDE",
 | 
			
		||||
             "LGHTALHWAASLGRISNVQQLIKKGADMKRGNIEGETPLERSVLVNDNYDKKTFAYLLQELGSSIRVVDRTGRSILHHIA",
 | 
			
		||||
             "LIAAVNGRSMSAKYYMENVLEYIARYENGEFKSLVDLQDEHGDTALNISARVGNRNLVKMLVDAGANKTVVNKLGLKASD",
 | 
			
		||||
             "FGVEHETLNSVTGDEMLSNLQPPPPLNVDSSASVLENIHNLLNGITQQYTDETSGKNALLFEIQAELKQHSHELADVRKE",
 | 
			
		||||
             "IQYWQNKATQMAEVDQKIKNINEAIENEKVQTWSLLGEANADKMEGIETSSSSNTSEIKIPTGDNEESLKQLRKLSKWLE",
 | 
			
		||||
             "GTQKLTEERVASIDGLSASKEVKYKSIVSVCTGVPVNEVEGMLAQLLEAMESDANADLNKVQEFLAREC"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "00846_COPCI",
 | 
			
		||||
    "RefSeqID" : "XP_001831299",
 | 
			
		||||
    "UniProtID" : "A8N8X1",
 | 
			
		||||
    "taxonomyID" : "240176",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MQASTRPPGSNQPPVKIYNAVYSSVQVYECMVRGIAVMRRRNDSYVNATQILKVAGVDKGRRTKILEKEILPGKHEIVQG",
 | 
			
		||||
             "GYGKYQGTWIPLERGRDIAAQYGVAPLLSPLFDFQPSTNSLGALPVSTPGGTASPRPLSASSSYSSMGVAGQYIPSSIPS",
 | 
			
		||||
             "NLPPAPIMPGSALRLLNQGRAQGLFTPSTTSATLRPAGYHSPGPYGTSYAPSPQPQSSQTPPPGSGLKRNRSEAEVEGYH",
 | 
			
		||||
             "SQPHDVQMADAPPPNTASQPNEDNPSPAKRLRTDGSITTEPASSQGQWQQQQPLPYASQQRSGPGLSQLSGHNGHGSSRP",
 | 
			
		||||
             "PSSLSAPNGNRPAHTNPEDQTRKTRFSSKPSMPRGMDPHMPFKDARRSALIALICHRDDPTSVIDLLREISADHLNPPSF",
 | 
			
		||||
             "DVDTVLDDQGHTALHLAASMARTQTVDMLIQTGADMHRGNHLGETPLIRACLATPNSDQQSFATLVNYLHDSIWTLDTSK",
 | 
			
		||||
             "KSVVHHIVSLAGVKGRAVVARYYLDQIFYWIAQHEGGDFRSLVDLQDEHGDTAINIAARVGNRSLVRTLLDVGANRVLAN",
 | 
			
		||||
             "KLGLRPGDFGVETEELSSGLRAEDLISSLRTGPPAPVQKSQDVIADMTSMIQSLSTEFQAEIKSKQDSLDVTQAHLRAAT",
 | 
			
		||||
             "RELSEQRKQIQTWQARCGDLDQINQRVRNVEKAIAEEDMFDWTGRTELDGKDGKEKGGPAFAYRGSKSTMVGVGGSVDVS",
 | 
			
		||||
             "FSVESEPPLPTTDTAASLVKLRRLKMWHQRMEELVKGRLKGLQGASAEKEYQCKKIVALCTGIPLDKVEEMLDNLVIAVE",
 | 
			
		||||
             "SEAQVVDIGRVSGFMQKVRDGII"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "8533_BIPOR",
 | 
			
		||||
    "RefSeqID" : "XP_007691662",
 | 
			
		||||
    "UniProtID" : "W6ZE71",
 | 
			
		||||
    "taxonomyID" : "930090",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MSTSHSFPAASPSHQQSALYANSPHGHALMAAPAALNRSFSDMSAFHHHAMDKPQIYTAVYSGVSVYEMEVNRVAVMRRR",
 | 
			
		||||
             "SDGWLNATQILKVAGVDKGKRTKVLEKEILTGEHEKVQGGYGKYQGTWINYRRGREFCRQYGVEDVLRPLLDYDITLDGS",
 | 
			
		||||
             "HAPGHAIETPTKEQAMAANRKRFYTQSIDGRTTTQNLTGTFFSNISSTATSALAAMNKVARLNSPAPRPSSSSQRRTSAT",
 | 
			
		||||
             "RPSQSQPPLASQDSFRTSSQQSITSEPSFAGHNGQTDSAYATAVDESQEPPRKRIRASHDDSYSQPTAADMSIHPLSSPT",
 | 
			
		||||
             "EPSESFDQHHPAQPITLADGDVPTALPPLPYPDTKQDEEKQAMLTDLFADQTRSDFTNHPAILHLSGPDLDMPIDNSSNT",
 | 
			
		||||
             "ALHWAATLARVSLIRLLVSKGANMFRGNASGQTALMSAVSVNNSLDHSCFPETLEILAPLIELRDSQGRTILHHIAVTCA",
 | 
			
		||||
             "IKGRAASSKYYLEALLEYLVRSNIGGGQPPPFHDTSNHSKPIGLMRFMQEMVNARDKAGNTALNLAARIGNRNIISQLME",
 | 
			
		||||
             "VQADPTIPNHKGTRPMDFGVGTDLGDGQGIITATSPTKAKAPLSKAEETSREIQPLMSGILQSASLQFTQEARLKQDAID",
 | 
			
		||||
             "QTNELITQLSSQQKQEQQKLQTLRARLRQRQDRAKRISNLKRWLEPQRHMLSVNDGAIDLHDKKRIGYADTQGAGLLIKE",
 | 
			
		||||
             "DDLPYELRQAGDHLDRRASDGPIYLSTSVPLDPSTLSQVSHQPQCQNFLLQQLPAASVLRQRIETYTATNTALLKRSRML",
 | 
			
		||||
             "KEKDGQLEMMYRKVVSLCTKVEENRIEECLEGLVAALDSEEGEGVEVGRVREFLRKVEGVD"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "PGTG_02039",
 | 
			
		||||
    "RefSeqID" : "XP_003320997",
 | 
			
		||||
    "UniProtID" : "E3JX03",
 | 
			
		||||
    "taxonomyID" : "418459",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MAAHKTTNDIPVSSSHHINPESGTGTSSTQAFPIPNIKNNPHVYMAVYSSVPVYEMMVRGIGVMRRRSDSYMNATQILKV",
 | 
			
		||||
             "AGLDKSKRTRILEREIIQGEHEKIQGGYGRYQGTWVPFTRAQELATQLNVAQLLAPLFDYRPEPNSEVNIRSTNTKPSSS",
 | 
			
		||||
             "ASRANSHKTTLARQTSRQSLNEKRERSGDTTPLPHDPPEAGPSKRSRLNTPSRQSNGSANTPSSLIDHSHSAMDPDFIIP",
 | 
			
		||||
             "HSQSQPTAASQCTTSTFAPIHGATVEYPAGPSHLRKSNSSSRSHLEVALKAERNIHTLMALFSNPPDGDELESETHHENP",
 | 
			
		||||
             "NSVAEVNEVLEDPELEIDTPIDEHCHTALHWASSLARLGLVRAFLRSGADVNRGNDVGETPLMRSTLVTNNFERESFNQL",
 | 
			
		||||
             "LELLHPSLWTLDNQDRTVLHHICLTASIKGRGESSRYYLECICEWIVNKHGAQFDSQLFDAVDLNGDTALNIAARVGNKH",
 | 
			
		||||
             "LVRMLLDVGADMTIGNNLGLKPIDFGVGAGETSASYTDDMISAPLRRNPTASAPARSSRDIITSITSSVNSLSEDFENEI",
 | 
			
		||||
             "RSKTDRLESVRAQLMVATRQLTTQRRQLESLKHDLDERALLELRLKKLRMAIAEEDGFDWTGRSDLDGRPAQAGKLFEQN",
 | 
			
		||||
             "GIASTLAGLSASQIQLELEPDPFIPPENNQDSLVYLRRLEKWYVRVLSLLRERIGRMKGSNLEQEAKYLKVIGSFIGNTC",
 | 
			
		||||
             "TNDLSSSGSSMTGRPANQTTSTTQEVPSRATQNVNPADIHDLESMDGHRRKVSTTDAVNKSHEFGRTRSELLKASMIDNK",
 | 
			
		||||
             "LLKQLMAAIESDGPELDLNRVAGFMQRVQSGSL"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBPA_ASPNI",
 | 
			
		||||
    "RefSeqID" : "XP_664319",
 | 
			
		||||
    "UniProtID" : "Q5AYB5",
 | 
			
		||||
    "taxonomyID" : "227321",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MTTSNHHQQRPSLSMSYSQGSIGSANGMSFSQSQMSSLNASQSVASTPRATPPPKSSQQSAMSFNYSNGLPNGARASFSG",
 | 
			
		||||
             "FEDMNGYGTMIYHEEFKPQIYRAVYSNVSVYEMEVNGVAVMKRRSDGWLNATQILKVAGVVKARRTKTLEKEIAAGEHEK",
 | 
			
		||||
             "VQGGYGKYQGTWVNYQRGVELCREYHVEELLRPLLEYDMNPNGTAASGQDSLDTPTKEQAMAAQRKRLYSGMENRSMSQP",
 | 
			
		||||
             "QQGTFFQNISRTAATAVNAMSKARFESPAARGGDSRRLSVIRKPSQQMGSQDAQPPFGSQQSFYSAASDSGFASNIPTNG",
 | 
			
		||||
             "RYAPQDAMSFEQEEPMEPPRKRIRSSQAFSLPIDGTSMSMSEPTPTEPNDSFYQDMEPLHHIDEGRHGLDPLPPATTPER",
 | 
			
		||||
             "FQKMKLIMTLFLDKTTKDFSTHPALIQLSGEDLEVPLDEYRNNALHWAAMLARMPLVYALVKKGVNIARLNGAGETALQK",
 | 
			
		||||
             "AVGTRNNLDYRSFPRLLQVLAPTIDMVDRSGRTILHHIAVMAATGHGGHVSAKHYLEALLEFIVRHGGTSLNQQSNGTAS",
 | 
			
		||||
             "QPGMPLSNEVITLGRFISEIVNLRDDQGDTALNLAGRARSVLVPQLLEVGADPHIPNHTGLRPADYGVGVDMVDGSSQPA",
 | 
			
		||||
             "GSRSDTFLAQLAKTRKEILEATTAQVTAIVQETLGTFDKELAASLTSKQEKFDHWHAKIRESAKARQIEQKQLDELKRRS",
 | 
			
		||||
             "IDRTETSRRLKNLEKSSTDLLEAHKEILTNLGDTSKPVSLGDADQESGFEIAEFEALFPETFDPASGFSEAQIAYLRKLP",
 | 
			
		||||
             "SAEILEQRVSCYRAFNKETLDEIDALRSKNVVLGQNYRRMVMACTGWSAEQVDEAAEGLTQCVKELNDNPVPEDEAIEIL",
 | 
			
		||||
             "MRDRGQDW"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "05520_CRYNE",
 | 
			
		||||
    "RefSeqID" : "XP_570545",
 | 
			
		||||
    "UniProtID" : "Q5KHS0",
 | 
			
		||||
    "taxonomyID" : "214684",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MEPPSNPIQPPVTPSHHSLLSAISPALSEQTPAPIHTLPPHLRPSIPQPHIAPPRPSSVQPTMEEQQRMHHIQQHQQQQH",
 | 
			
		||||
             "FQQQQNDENVFGSVMGAPGHVPGHEAPMSTQPKVYASVYSGVPVFEAMIRGISVMRRASDSWVNATQILKVAGVHKSART",
 | 
			
		||||
             "KILEKEVLNGIHEKIQGGYGKYQGTWVPLDRGRDLAEQYGVGSYLSSVFDFVPSASVIAALPVIRTGTPDRSGQQTPSGL",
 | 
			
		||||
             "PGHPNQRVISPFANHGQTTPHMPPPQFIHQGNEQMMNLPPHPSSLAYPTQPKPYFSMPLQHTVGPQYDERHEGMTMTPTM",
 | 
			
		||||
             "SMDGLAPPADIARMGFPYNPSDIYIDQYGQPHATYQASPYGKESGHPSKRQRSDAEGSYIESGAAVQQHVEQDEEADDGL",
 | 
			
		||||
             "DNDSTASDDARDPPPLPSSMLLPHKPIRPKATPANGRIKSRLVQIFNVEGQVNLRSVFGLAPDQLPNFDIDMVIDDQGHS",
 | 
			
		||||
             "ALHWACALARLSIVQQLIELGADIHRGNYAGETPLIRAVLTSNHAEAGSFTDLLHLLSPSIRTLDHAYRTVLHHIALVAG",
 | 
			
		||||
             "VKGRVPAARTYMASVLEWVAREQQANNTHSITNPPNPADRNELAPINLRTLVDVQDVHGDTALNVAARVGNKGLVGLLLD",
 | 
			
		||||
             "AGADKTRANKLGLRPENFGLEIEALKISNGEAVMANLKSEVSKPERKSRDVQKNIATIFESISSTFSSEMLAKQTKLNAT",
 | 
			
		||||
             "EASVRHATRALADKRQHLHRAQEKLATMQLFEQRSENVRRIMDAIAAGTLLTPAEFTGRTQTMHEKSTGQLPPLAFRHVP",
 | 
			
		||||
             "GLALDASSQSQLNGAPPSTPLSVEDQEDIALPERDDPECLVKLRRMALWEDRIAEVLEDKIRAMEGEGVDRAVKYRKLVS",
 | 
			
		||||
             "VCAKVPVDKVDSMLDGLVAAVESEGQGLDFSRASNFVNRIKATKS"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "RES1_SCHPO",
 | 
			
		||||
    "RefSeqID" : "NP_595496",
 | 
			
		||||
    "UniProtID" : "P33520",
 | 
			
		||||
    "taxonomyID" : "284812",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MYNDQIHKITYSGVEVFEYTINGFPLMKRCHDNWLNATQILKIAELDKPRRTRILEKFAQKGLHEKIQGGCGKYQGTWVP",
 | 
			
		||||
             "SERAVELAHEYNVFDLIQPLIEYSGSAFMPMSTFTPQSNRKPTEAYRRNSPVKKSFSRPSHSLLYPYTSSNNMTSTSRMS",
 | 
			
		||||
             "GIHDALSLQSDFTRSPDMPSDSFTGSLHDIKASPFSSNNYAQSLLDYFLLPNTTQPPDFVYDRPSDWDVNAGIDEDGHTA",
 | 
			
		||||
             "LHWAAAMGNLEMMHALLQAGANVVAVNYLQQTSLMRCVMFTMNYDLQTFEVVSELLQSAICMNDSFGQTVFHHIALLASS",
 | 
			
		||||
             "KSKMEAARYYMDILLQNLTATQSVDVAAQIINLQDDHGDTALLICARNGAKKCARLLLSFYASSSIPNNQGQYPTDFLSS",
 | 
			
		||||
             "KDMSFPENDDSPLNSKIEDNLIDNLKYPQSLDDHLSSKKPISYFSNKLTHQTLPNVFTQLSELSKCHEASLAEKQLTYNL",
 | 
			
		||||
             "AMEALEQTVRETETCQRLWNERTNNDENYLVNQREDLIHQCKKFLHTLKTARYYLETVQLHQLKKYVTYFSQIWSTDELA",
 | 
			
		||||
             "DISETKNLVGHDTKTNRSSLSSKHEVDLFTAENEAAREKLVEQLCSLQAQRKQKINEILNLLSMGMYNTINTDQSGS"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "CDC10_SCHPO",
 | 
			
		||||
    "RefSeqID" : "NP_596132",
 | 
			
		||||
    "UniProtID" : "P01129",
 | 
			
		||||
    "taxonomyID" : "284812",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MASANFIRQFELGNDSFSYQKRPEDEPSQPLSNRNINKLNDSSTLKDSSSRIFINSQVLRDGRPVELYAVECSGMKYMEL",
 | 
			
		||||
             "SCGDNVALRRCPDSYFNISQILRLAGTSSSENAKELDDIIESGDYENVDSKHPQIDGVWVPYDRAISIAKRYGVYEILQP",
 | 
			
		||||
             "LISFNLDLFPKFSKQQQIESSSISKNLNTSSFNTRSPLRNHNFSNPSKSSKNGVHTINNMQSSPSPSSSFLLPLTQIDSQ",
 | 
			
		||||
             "NVKRSNNYLSTSPPILEQRLKRHRIDVSDEDLHPSSQLNDNEASSLFPDTPRLNHSLSFVSLVSSLPPLDQNIMQDYHTS",
 | 
			
		||||
             "KDILTSIFLDVNFADSSALEAKLSDSLDLDVPIDELGHAALHWAAAVAKMPLLQALIHKGANPLRGNLTGETALMRSVLV",
 | 
			
		||||
             "TNHLNQNSFGDLLDLLYASLPCTDRAGRTVVHHICLTAGIKGRGSASRYYLETLLNWAKKHASGNNGYMLKDFINYLNHQ",
 | 
			
		||||
             "DKNGDTALNIAARIGNKNIVEVLMQAGASAYIPNRAGLSVANFGIFVENALKQPEDSKQTKVSLMSENLSSKEKTAVPPR",
 | 
			
		||||
             "QKSRDIIASVTDVISSLDKDFQDEMAAKQSMIDSAYTQLRESTKKLSDLREQLHVSETQRTLFLELRQRCKNLMTSIEEQ",
 | 
			
		||||
             "KSELSNLYESFDPNGIHDSLSLDADAPFTVNENNNKNLSIAELKFQVAAYERNEARLNELANKLWQRNSNIKSKCRRVVS",
 | 
			
		||||
             "LCTGVDESRVDSLLESLLQAVESDGQQGEVDMGRVAGFLRVVKEHQA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "05338_USTMA",
 | 
			
		||||
    "RefSeqID" : "XP_011392041",
 | 
			
		||||
    "UniProtID" : "A0A0D1BWD8",
 | 
			
		||||
    "taxonomyID" : "237631",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MPLNYFANQDQTASDTYAHEASSFPAPSSILTDTSKPLQPVQEVAASSLVDGVSFTSPHASIIHASKQSPRAASSLSFTT",
 | 
			
		||||
             "SALQRAGLLPANPNMSTTATSGTSAASESLQRVITQGTASAAAINGASTPAHSGPLTPAHLKNLTPAQANAALQNPVGNI",
 | 
			
		||||
             "PTVYLATYSNVPVYEITVRGIAVMRRRGDGWLNATQILKIAGIEKTRRTKILEKSILTGEHEKIQGGYGKFQGTWIPLQR",
 | 
			
		||||
             "AQQVAAEYNVSHLLQPILEFDPATADQIPKLYQRKKPAASARNSSASAINDARGSTPSKIYSPAPASLGGPSQQPRFLSL",
 | 
			
		||||
             "RPPKETHEQEISSAIFMPPGTAGLLSNGTFVDDRAASALAYPGPPAIPPGSTPAEQAALRSYNVYGYTPQGVPLPSSAAA",
 | 
			
		||||
             "DGNGTEAAATAASTGAGKREASETDQDGASAAKRSRLTSPQQQRRDDGLLLGPSPVKDLNALGPAGGSLRAASAPRGHRI",
 | 
			
		||||
             "TVGPPDAAGRDGAVPRYADRALPPKPYDEGEKRMRDRLVSLFSDDGVLPGVSEATGAGASQSAADEDDDAYVAKLDSLLA",
 | 
			
		||||
             "DLREKASLGGLGASGTDGPKATVDLITDDHGHTALHWASALCRVKLVRTLVARPPWQGGANIHAGNHAGETALHRSVLVT",
 | 
			
		||||
             "NSYDASSFPTLLNLLSSSLNTRDFKKRTVLHHISLVAALKGRAASARYYLACVLEHISAEKNSKYKGLIDAQDEDGETAL",
 | 
			
		||||
             "GIVARLGNASMVRMLLDVGARKDLANALGIRPSDWGIESSADGASLTPSQNDGTNTVASLPPLTAADLASQNPSDIISAL",
 | 
			
		||||
             "TRPAQVPVMKSSDVRDQLSSTLDDLQSSFERELKEKQDAVSTVQSHLQAATRDLAARRKTVSAAQAKLAEKDEARQRVQN",
 | 
			
		||||
             "LRRAIVAQLGLEEADADLSLEQLVEEAANAASAAPADKSADKMDIDGAEDVKPVRASNLETLIDDILSFDTIQSDLKAVG",
 | 
			
		||||
             "TSAVTQEVVEQDELVRLRWLVSFYQSSCDELSSTISELEDSSAKKESQCQQVVAICANIPQDKVESMLDELLTAMESDGP",
 | 
			
		||||
             "DVDLARVANFMQKVGKTRENGDQPGVGAQLSSSTSLSTAVSSGGTAASSVVPAVERDGEDAKPDA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "SWI4_SACCE",
 | 
			
		||||
    "RefSeqID" : "NP_011036",
 | 
			
		||||
    "UniProtID" : "P25302",
 | 
			
		||||
    "taxonomyID" : "559292",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MPFDVLISNQKDNTNHQNITPISKSVLLAPHSNHPVIEIATYSETDVYECYIRGFETKIVMRRTKDDWINITQVFKIAQF",
 | 
			
		||||
             "SKTKRTKILEKESNDMQHEKVQGGYGRFQGTWIPLDSAKFLVNKYEIIDPVVNSILTFQFDPNNPPPKRSKNSILRKTSP",
 | 
			
		||||
             "GTKITSPSSYNKTPRKKNSSSSTSATTTAANKKGKKNASINQPNPSPLQNLVFQTPQQFQVNSSMNIMNNNDNHTTMNFN",
 | 
			
		||||
             "NDTRHNLINNISNNSNQSTIIQQQKSIHENSFNNNYSATQKPLQFFPIPTNLQNKNVALNNPNNNDSNSYSHNIDNVINS",
 | 
			
		||||
             "SNNNNNGNNNNLIIVPDGPMQSQQQQQHHHEYLTNNFNHSMMDSITNGNSKKRRKKLNQSNEQQFYNQQEKIQRHFKLMK",
 | 
			
		||||
             "QPLLWQSFQNPNDHHNEYCDSNGSNNNNNTVASNGSSIEVFSSNENDNSMNMSSRSMTPFSAGNTSSQNKLENKMTDQEY",
 | 
			
		||||
             "KQTILTILSSERSSDVDQALLATLYPAPKNFNINFEIDDQGHTPLHWATAMANIPLIKMLITLNANALQCNKLGFNCITK",
 | 
			
		||||
             "SIFYNNCYKENAFDEIISILKICLITPDVNGRLPFHYLIELSVNKSKNPMIIKSYMDSIILSLGQQDYNLLKICLNYQDN",
 | 
			
		||||
             "IGNTPLHLSALNLNFEVYNRLVYLGASTDILNLDNESPASIMNKFNTPAGGSNSRNNNTKADRKLARNLPQKNYYQQQQQ",
 | 
			
		||||
             "QQQPQNNVKIPKIIKTQHPDKEDSTADVNIAKTDSEVNESQYLHSNQPNSTNMNTIMEDLSNINSFVTSSVIKDIKSTPS",
 | 
			
		||||
             "KILENSPILYRRRSQSISDEKEKAKDNENQVEKKKDPLNSVKTAMPSLESPSSLLPIQMSPLGKYSKPLSQQINKLNTKV",
 | 
			
		||||
             "SSLQRIMGEEIKNLDNEVVETESSISNNKKRLITIAHQIEDAFDSVSNKTPINSISDLQSRIKETSSKLNSEKQNFIQSL",
 | 
			
		||||
             "EKSQALKLATIVQDEESKVDMNTNSSSHPEKQEDEEPIPKSTSETSSPKNTKADAKFSNTVQESYDVNETLRLATELTIL",
 | 
			
		||||
             "QFKRRMTTLKISEAKSKINSSVKLDKYRNLIGITIENIDSKLDDIEKDLRANA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "SWI6_NEUCR",
 | 
			
		||||
    "RefSeqID" : "XP_962967",
 | 
			
		||||
    "UniProtID" : "Q7SBG9",
 | 
			
		||||
    "taxonomyID" : "367110",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MQPPQLGGASQQSQPSSQQSFSMSQSSQSVYRQYTDPPNRLHNDHAVPTIYSATYSGVGVYEMEVNNVAVMRRQKDGWVN",
 | 
			
		||||
             "ATQILKVANIDKGRRTKILEKEIQIGEHEKVQGGYGKYQGTWIPFERGLEVCRQYGVEELLSKLLTHNRGQEGETGNVDT",
 | 
			
		||||
             "PTKEQAMAAQRKRMYNASSQENRGIGSTGTFFKNISSTASTAVAAISKARFDSPAPRNRSGPSRAPSFNRQSSMQDVADF",
 | 
			
		||||
             "PNSQQSLVSTEYATQTQNADSGFGSQTTQPLAGDGLEQPPRKRQRVLTPARSFGGQTPGHQPLDPFNAGNIANGDSGSPT",
 | 
			
		||||
             "EPSNSFNYDQVTANDGDASYALGPLRPLPYENNADAEAKRGMLMGLFMDANGPEEAIQAALCNVSPQELDSPIDTQSHTA",
 | 
			
		||||
             "LHWAATLSRMPLLRALIHAGANPWRVNACGETALMRACTVTNSMENNTFPELLDLLGCTLDVTDDKGRTVLHHIAVTSAV",
 | 
			
		||||
             "KGRHYASRYYLESLLEWVVRQGSAPSSQENGIGDRKGRRMGIARFMSEIVNAQDNSGDTALNVAARVGNRSIISQLLEVG",
 | 
			
		||||
             "ADPTIPNRANLKPLDFGIGIADAETNDDPAQEKTGATTGSGHKSRETSDEVVRSITHLIGESASIFQNELKKKQESIDTL",
 | 
			
		||||
             "HSQLRVTSSQVGDARRTLESLQEKLKAQQLAKQKIVNFNRACEEEEQILIELEQRHGRLDVASANAWEMELESALEIVKT",
 | 
			
		||||
             "QSPKGLDPDSRPSLPSAAVLRARIKALRARSSKTRQAVAALQAQSKEKELKYRRLVSLCTRRPEIEVEALLDTLTRAVES",
 | 
			
		||||
             "EKPELEIARVRRFLGGVEGVVH"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "15042_USTMA",
 | 
			
		||||
    "RefSeqID" : "XP_011388143",
 | 
			
		||||
    "UniProtID" : "A0A0D1CVS5",
 | 
			
		||||
    "taxonomyID" : "237631",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MSTASPLHHGHGNGSYANSPAPTGVTGRDAGVAAAAVADSAVRSGSVPASASGSAPGSASGSMYGEAHTQHHTGHHHYSA",
 | 
			
		||||
             "HHTHSHGALTSPVNGGHSSSWSPYGYPAAPVYGGSPSPYGHNAYSQYASGYGYANGTAHHVATAPTTPSATSTAYHTGVN",
 | 
			
		||||
             "GMMMHHGQHAGYGYSSHHLGSHTPTHTHTHSSAYFMNGDGAHSHLNSSAHLTSPSYTTAPQYSTQLPLAGRHRVTTTLWE",
 | 
			
		||||
             "DEGTLCFQVDARGVCVARRHDNNMINGTKLLNVCGMSRGKRDGILKNEKERIVVKVGAMHLKGVWISFARAKQLAEQNGI",
 | 
			
		||||
             "ADALYPLFEPNIQSFLYHPDNYPRTAAVIAAAQERQAQRQRAPGGQPSPGANGTSQAPPLMRANTTPSNGDTSTFSSGLS",
 | 
			
		||||
             "SLGSWTGSHDQGHASAPTTAQPSPSSMHNGATQMHMSLSNHGTASPTYAQSQQQQQQQQQQQQQQQQQQQQQQQQAYPMT",
 | 
			
		||||
             "AAQQLARPSVGDRRQSAPISLNNSVGHAENPYGATNLGGAANGGLVNGARKVSGLKRSWNDADDLNGSAAASPTERDMQR",
 | 
			
		||||
             "SGSGGSNGLKLDGDDLHSPDSSDDRLAKKTRGMPQRGGGATTAMPSMSTNMLMGVGNGSGIHHE"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "04778_USTMA",
 | 
			
		||||
    "RefSeqID" : "XP_011391646",
 | 
			
		||||
    "UniProtID" : "A0A0D1DQM4",
 | 
			
		||||
    "taxonomyID" : "237631",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MNQAPLSATGVNFYISGPRPARLFPTPIHEFRKGKYATAGGESGFMTVFEYDVRGHTMMIDVDTSFVRFTSITQALGKNK",
 | 
			
		||||
             "VNFGRLVKTCPALDPHITKLKGGYLSIQGTWLPFDLAKELSRRIAWEIRDHLVPLFGYDFPSTCLRPDSEGFGQLAIGMS",
 | 
			
		||||
             "QKRARKRHNNGGPHQTSCYGPSLPISIELWQHSTDPLRDLGESSVVGGQAIEHVSAKNSAVQPCYGSSQPATFHYSKGYG",
 | 
			
		||||
             "LESRPWYGQDYLESNSLESMWNSAQAGGGSVGLQVPISTCGATASPCLAAIGANGGSPILSSPPSSNASSSSNQSYTAAG",
 | 
			
		||||
             "YGLMVPPTVPSHSVNSEAGANQAEGPTPIDGSRSYASLTAHGYATGYGDANASLSTWNDATHASTFTLHVHAHVHFQPPD",
 | 
			
		||||
             "PESAQLFTIHDFGSDPFYAEQVERG"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "STUA_ASPNI",
 | 
			
		||||
    "RefSeqID" : "XP_663440",
 | 
			
		||||
    "UniProtID" : "P36011",
 | 
			
		||||
    "taxonomyID" : "227321",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MASMNQPQPYMDVHSHLSSGQTYASHPATAGALTHYQYPQQPPVLQPTSTYGPASSYSQYPYPNSVASSQSVPPPTTSIS",
 | 
			
		||||
             "SQVPAQLLPLPVTNHPVPTHGYGNNSGTPMQGYVYDPTGQMAPPGAKPRVTATLWEDEGSLCYQVEAKGVCVARREDNGM",
 | 
			
		||||
             "INGTKLLNVAGMTRGRRDGILKSEKVRNVVKIGPMHLKGVWIPFDRALEFANKEKITDLLYPLFVQHISNLLYHPANQNQ",
 | 
			
		||||
             "RNMTVPDSRRLEGPQPVVRTPQAQQPPSLHHHSLQTPVPSHMSQPGGRPSLDRAHTFPTPPARMNSSVPNTQPLSIDTSL",
 | 
			
		||||
             "SNARSMPTTPATTPPGNNLQGMQSYQPQSGYDSKPYYSAAPSTHPQYAPQQPLPQQSMAQYGHSMPTSSYRDMAPPSSQR",
 | 
			
		||||
             "GSVTEIESDVKTERYGQGTVAKTEPEQEQEYAQPDSGYNTGRGSYYTTNPSVGGLAHDHSQLTPDMTGSPQQNGSGRMTP",
 | 
			
		||||
             "RTSNTAPQWAPGYTTPPRPAAASSLYNIVSDTRGTSGANGSTSDNYSVASNSGYSTGMNGSMGSNKRMRDDDDDRIVPPD",
 | 
			
		||||
             "SRGEFDTKRRKTLTETPVGGPVGGVPLGLQPMKAGGSLISARR"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "STUA_NEUCR",
 | 
			
		||||
    "RefSeqID" : "XP_960837",
 | 
			
		||||
    "UniProtID" : "Q1K6U0",
 | 
			
		||||
    "taxonomyID" : "367110",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MNPNTPADVYYGQMSQGSSMPVTTVPSHSHYASQQPPPLLQPGSTYAHQYGTPQYGYANALSSPASIPPSLPPSMNSMAG",
 | 
			
		||||
             "QSVLPLPGSGSMNPAVYASGGFDTTGQVAPPGMKPRVTATLWEDEGSLCFQVEARGICVARREDNAMINGTKLLNVAGMT",
 | 
			
		||||
             "RGRRDGILKSEKVRHVVKIGPMHLKGVWIPFERALDFANKEKITELLYPLFVHNIGALLYHPTNQSRTSQVMAAAEQRRK",
 | 
			
		||||
             "DSHGQLRGPPGLPSLQQHHHHHSMLPGPPSLPSHPSMGRPALDRAHTFPTPPTSASSVMGPMGNSDGYQWSQQSMSGTQG",
 | 
			
		||||
             "NSSLSLDTSLGSNARSMPSTPATTPPGSTIQSMQNYPPVSQSYESSRQMYQGQSAQQAQYQSQQHYSSQPQHQERPVYSQ",
 | 
			
		||||
             "SSYIKNDMGPPSGRPTGQSNDASDSKPPTGMIHQGQGQSDPGTHAGSEEDDDANNEAEYTHDSGGYDANRGSYNYNTQAV",
 | 
			
		||||
             "NSLPHDHGLAPEIGGSPHQAGSGRATPRTAAAPSSYYSAQGYHTPPRGQPSSSLYNVMSNERTGSNGTQGNEMYAGQADM",
 | 
			
		||||
             "PSSLPNGYSAQPSVMNGSSGGLKRGRDDDDDGGRPTTSAPNLGPGMDMKRRKTMMDGGSLPSPTYTATIAQAAPSAIAAH",
 | 
			
		||||
             "RRR"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "PHD1_SACCE",
 | 
			
		||||
    "RefSeqID" : "NP_012881",
 | 
			
		||||
    "UniProtID" : "P36093",
 | 
			
		||||
    "taxonomyID" : "559292",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MYHVPEMRLHYPLVNTQSNAAITPTRSYDNTLPSFNELSHQSTINLPFVQRETPNAYANVAQLATSPTQAKSGYYCRYYA",
 | 
			
		||||
             "VPFPTYPQQPQSPYQQAVLPYATIPNSNFQPSSFPVMAVMPPEVQFDGSFLNTLHPHTELPPIIQNTNDTSVARPNNLKS",
 | 
			
		||||
             "IAAASPTVTATTRTPGVSSTSVLKPRVITTMWEDENTICYQVEANGISVVRRADNNMINGTKLLNVTKMTRGRRDGILRS",
 | 
			
		||||
             "EKVREVVKIGSMHLKGVWIPFERAYILAQREQILDHLYPLFVKDIESIVDARKPSNKASLTPKSSPAPIKQEPSDNKHEI",
 | 
			
		||||
             "ATEIKPKSIDALSNGASTQGAGELPHLKINHIDTEAQTSRAKNELS"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "08099_COPCI",
 | 
			
		||||
    "RefSeqID" : "XP_001836714",
 | 
			
		||||
    "UniProtID" : "A8NVH3",
 | 
			
		||||
    "taxonomyID" : "240176",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MSTGMLQETLQTTSASTSGTRFRPYASPNHQVTKGRYITSNDPRGYIPVYEYPLNGQWIMMDIDDGYILWTGIWKALGNS",
 | 
			
		||||
             "KADIVKMIDSQPDLAPLIRRVRGGYLKIQGTWMPYEVALKLSRRVAWPIRHDLVPLFGPTFPSTCLSPDQPGYGQVVASS",
 | 
			
		||||
             "NVRRRARRNTQATAQPPREAHSNWTVMTPGPMVGLSFPHSQFSRPPLPPLAPTPARSPSDYAPSSHYGNQLDPQDARRYS",
 | 
			
		||||
             "HSPYSPLASPPERKSSISSKALSLEIPPVRPSSSKAREDISLPPLKQPDGADPEMSPYALPPISALEDLRGVDTQDSAAV",
 | 
			
		||||
             "LRRLRLDDDYPSSSRSSTSQDSIWGRRHSLSAHSPHPRSSDNSRFQPYLSSRSYQDSTLKRSRSPAESYADRRRASDFSQ",
 | 
			
		||||
             "EDSTSAYSPISPATPNSSILSHSSFSDLKKLASSTDTRYNFPRISGRDWAPLKGDTDHIRSSYRSGPSPLELDSDSESSA",
 | 
			
		||||
             "PHRPW"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "68479_WALME",
 | 
			
		||||
    "RefSeqID" : "XP_006957792",
 | 
			
		||||
    "UniProtID" : "I4YDE0",
 | 
			
		||||
    "taxonomyID" : "671144",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MTNKVQELWWEENKTRVWQVEVDNGNYVARRQDNDQINGTKLLNITKITRGKRDGILKNEKSRQVVKTGTITLKGVWIPF",
 | 
			
		||||
             "ERAIILARQFNIEQQLYPLFETNLGDYVENSIGSHQIKRKSLNNLMDSLTTNRELVSKRRSTVSTYNPATSAYVSPYGFS",
 | 
			
		||||
             "PQHCYQTEFEDMNQHSGEIQSGRPRNTSSASDWMTNWSTSSSSPVIPATPNTFSPVMNTFQSLALHSPPIPIPNYYYDSS",
 | 
			
		||||
             "SSYFPSYHQKQQQQQVQMQMQMHTTASIGGDRQSNEYIQR"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "11943_PUCGR",
 | 
			
		||||
    "RefSeqID" : "XP_003330006",
 | 
			
		||||
    "UniProtID" : "E3KMR2",
 | 
			
		||||
    "taxonomyID" : "418459",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MAAAPTSSFLTSMSAQPPRTVQALVNEEVRAPPPVRLYPSQHRVSMTRYATSTDPRGYIPVFEYPLNGQYIMIDCETGMV",
 | 
			
		||||
             "HFTGIWKALGHTKADVVKLVESDPTIAPYLRKVRGGYLKIQGTWLPFDTAQTLARRVAWQVRYDLVPLFGPDFPDTCLGP",
 | 
			
		||||
             "GEPGFGQLLLSAPKPRGRRGAKKAAAAPTVAHERTASPQDNRSQSRPGPYPSQESFGNRCSGRVEAVGAMNGYSPMLSQA",
 | 
			
		||||
             "RYSPYTRAPVHRITQLEPLPSLIQPNQSCPHPTADSMYSSHYHQSPRQSMMTSHGAGPYGQQHLTGSTASGMQSTAPLPS",
 | 
			
		||||
             "MRPHQAHQSENNFFETYRGPDSFEALSNKWLAPEVANPSLNDSGLLHGEGGCLPPLQYSNNPVLRNGPSGSPTNQYNFPN",
 | 
			
		||||
             "QIDSAHSSHHIDSNQTQHVHRHAGFPYESQHQSNFRHDLSTEEAAHHPASPSQQPPPSVTYDKAHNSEPQAGSQAANVTA",
 | 
			
		||||
             "GCYAASGSNSTGNPAGSPGSHSSHVPKSPTPSSASTSTHMQNSHNPNSHRSPSNTLTNMSNNGGFNSNTQGEEAIQFSVL",
 | 
			
		||||
             "TSPAHLETSGPSENSIPPAQSSDSDWNPAQNTTGLSPSQAPRQ"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "03082_PUCGR",
 | 
			
		||||
    "RefSeqID" : "XP_003321545",
 | 
			
		||||
    "UniProtID" : "E3JYK1",
 | 
			
		||||
    "taxonomyID" : "418459",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MILISPTRTLPSPRPIDTDPILNYRHIQPAAAAAAVGPWLGQNQHHHHHHDTLAKSPNITTAPATHSPSELSASPAPSAV",
 | 
			
		||||
             "STGSSLLDPQSVPHIKIPHSSSPPAIMLPQPSSDDDSSTAEEEQPSAQSSNATLNTPTPHTNAPHQLDSHASSVGLYDLP",
 | 
			
		||||
             "PTSSSAPTTSSSSSPFPSNVPSHQQPSPYSSSPHPNQEHHPHHPHHGNQFYQQSPPALHSPLQSAHHPQQSFDARPHSSL",
 | 
			
		||||
             "FAHQHYHSRPQSAPHSTSQFSLDPHVLAAAAANVEVKKWDEENTYYYQVAHKGVTVGRLKGSGLVNGTKLLNLAGISRGK",
 | 
			
		||||
             "RDGILKNEKIRKVVKHGTMHLKGVWIAFDRAVFLAEQHSIADKIFPLLVVNLEHYVPIEPPLMAGGTKLGPGSLFHHHHP",
 | 
			
		||||
             "RHPRLLPQPIKFPPSTISLAPASANSFSSTGGWPSGPSSALPSIGYNEPFSAPPIPRSAATADTSPSIYEQAQFQYLNSA",
 | 
			
		||||
             "QANNPDLLERRHTLPNNSFHGYNSVPSFGSSQPPPPVSYSFHYNSTHVPGYPPRSSTAESATPNQFEYQSKNHNGNGNGD",
 | 
			
		||||
             "AAGSYPATLYHSQPAARPVSSTTAQPSPALNSAPLLLGDLSPGSSTQIVDHGAGDFRLSTGTSNGQVKQEGDDESCNEKR",
 | 
			
		||||
             "LIMEWNPSC"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "SOK2_SACCE",
 | 
			
		||||
    "RefSeqID" : "NP_013729",
 | 
			
		||||
    "UniProtID" : "P53438",
 | 
			
		||||
    "taxonomyID" : "559292",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MPIGNPINTNDIKSNRMRQESNMSAVSNSESTIGQSTQQQQQQQQYLGQSVQPLMPVSYQYVVPEQWPYPQYYQQPQSQS",
 | 
			
		||||
             "QQQLQSQPQMYQVQESFQSSGSDSNASNPPSTSVGVPSNATATALPNGSAITTKKSNNSTNISNNVPYYYYFPQMQAQQS",
 | 
			
		||||
             "MAYSYPQAYYYYPANGDGTTNGATPSVTSNQVQNPNLEKTYSTFEQQQQHQQQQQLQAQTYPAQPPKIGNAFSKFSKSGP",
 | 
			
		||||
             "PSDSSSGSMSPNSNRTSRNSNSISSLAQQPPMSNYPQPSTYQYPGFHKTSSIPNSHSPIPPRSLTTPTQGPTSQNGPLSY",
 | 
			
		||||
             "NLPQVGLLPPQQQQQVSPLYDGNSITPPVKPSTDQETYLTANRHGVSDQQYDSMAKTMNSFQTTTIRHPMPLIATTNATG",
 | 
			
		||||
             "SNTSGTSASIIRPRVTTTMWEDEKTLCYQVEANGISVVRRADNDMVNGTKLLNVTKMTRGRRDGILKAEKIRHVVKIGSM",
 | 
			
		||||
             "HLKGVWIPFERALAIAQREKIADYLYPLFIRDIQSVLKQNNPSNDSSSSSSSTGIKSISPRTYYQPINNYQNPNGPSNIS",
 | 
			
		||||
             "AAQLTYSSMNLNNKIIPNNSIPAVSTIAAGEKPLKKCTMPNSNQLEGHTITNLQTLSATMPMKQQLMGNIASPLSYPRNA",
 | 
			
		||||
             "TMNSASTLGITPADSKPLTPSPTTTNTNQSSESNVGSIHTGITLPRVESESASHSKWSKEADSGNTVPDNQTLKEPRSSQ",
 | 
			
		||||
             "LPISALTSTDTDKIKTSTSDEATQPNEPSEAEPVKESESSKSQVDGAGDVSNEEIAADDTKKQEK"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "14426_COPCI",
 | 
			
		||||
    "RefSeqID" : "XP_002911429",
 | 
			
		||||
    "UniProtID" : "D6RMB0",
 | 
			
		||||
    "taxonomyID" : "240176",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MTARPPLPLRHANPSLRDGNATIPPVKYQILSCQGKDILVGRLKIDTTDGGHAFILRRFDTQAISLTTMFRAAFPTASEA",
 | 
			
		||||
             "EEKDEINYVKANFDLFGNNGSSKEPHITRLAGTWVNRDTAGQLAHDYNMVDLINTMVEAEPDPNGQYRRSNKSAQNNNPP",
 | 
			
		||||
             "TNAPEPTPATNVHATRSPAKQSPKPPSKTLPTPSPGSGDAQPPAPKRRREGSPATFTSGIPVASSPAVPKTPGPRRSTRT",
 | 
			
		||||
             "KSPAPSRVPQPLTATKPRSRASVAPPSPKKRPVDLPKSSPIKAEEDTAVEDNVAGNELYAQDISEQKKLIADLKAAASSK",
 | 
			
		||||
             "KPADTVKEDDDQQMEEEGQGPSKLKRIRQDEEKPLQFEFKEPEREERQIATNRRVGRFDMQPERKSLAWGIAAFAFGMTA",
 | 
			
		||||
             "ITYLPNFL"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "BQT4_SCHPO",
 | 
			
		||||
    "RefSeqID" : "NP_596166",
 | 
			
		||||
    "UniProtID" : "O60158",
 | 
			
		||||
    "taxonomyID" : "284812",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MTENEKSRSLPAERNPLYKDDTLDHTPLIPKCRAQVIEFPDGPATFVRLKCTNPESKVPHFLMRMAKDSSISATSMFRSA",
 | 
			
		||||
             "FPKATQEEEDLEMRWIRDNLNPIEDKRVAGLWVPPADALALAKDYSMTPFINALLEASSTPSTYATPSRPTAQKSETSEG",
 | 
			
		||||
             "EPESSTSATTTSVARRTRQRLAEHLENSKKTILQHDNKEEDKEIHSEENETKDEIKSEKKEPEIKKQEGGSSTEKVGQPS",
 | 
			
		||||
             "SSDDKAKGSTSKDQPSEEEEKTSDIQDRKIKTPIKPSLLGKIRSSVNKGMTDVASQVNRGMTDVASQVNKGVNGVASQVN",
 | 
			
		||||
             "KGMNGVANQVNKGVTGVASQVRKPVGKLEKKFENLEKSIGDTLKSSIRSSPKSKKRSREDFEENEDYNAMVPVKRSRITK",
 | 
			
		||||
             "LESEVYYEKRKVRALGGIAIGLGVGAILPFLF"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "PGTG_05590",
 | 
			
		||||
    "RefSeqID" : "XP_003323688",
 | 
			
		||||
    "UniProtID" : "E3K4V4",
 | 
			
		||||
    "taxonomyID" : "418459",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MPKSSSCCEPEQKQSIPTNANPISAGGAGLDIRLAGMRSAHATLRGCSFSPYMVTQHPPLRDSVNRNKQQPTNNSTNPYT",
 | 
			
		||||
             "KKASRMSQTNLYKSNNPPNLPQDEFNQTLVNYQGKLRSIRIQDININGHTITIARIKIPSPEKLSSHLIKRFDTNAISAS",
 | 
			
		||||
             "SFFRSAFPHSTEEEEAIQMRYLHQIYDTHTAGAVEFGSARKLTGVWVPIENAAELAEVYGLTRFAEPLLAFPNPKENPRS",
 | 
			
		||||
             "PTGTKIGGEDESSTTQTPKASQQSKLTGQISVTRSSKRSRAGPLSFGNTSPSSFSLNSFNKPPTETNKSGTHDDSKSTND",
 | 
			
		||||
             "ENDEKPASPTDRVAGRGARNSPSKKPTTVDENHEHTEHEDHQLIGTDELAQRAKQEALKLVSELKNSQPCTQSSLESPTN",
 | 
			
		||||
             "TLETELTRTTSPAKSNKVTRKRSSDEVSFEGEEQGEDEDEERTADETATHRSFLPKLLWRKSAAQAHPNSKKHKRTQLGG",
 | 
			
		||||
             "GGSSSSSSKSFVPLLTNSATPSVDDSSSTHNPNKRNLAIAGIVIAGAAA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "06560_NEUCR",
 | 
			
		||||
    "RefSeqID" : "XP_962267",
 | 
			
		||||
    "UniProtID" : "Q7S9H5",
 | 
			
		||||
    "taxonomyID" : "367110",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MAQVARHLPARRNPLMLEDVPSHTDLASRRRLGQTQLTPRMVTAVPGAEVDPSSLLAFDYAHLRAPLPKGIVSGIFKSSP",
 | 
			
		||||
             "PSYFLMRRSQDGYISATGMFKATFPYASQEEEEAERKYIKSIPTTSSEETAGNVWIPPEQALILAEEYQITPWIRALLDP",
 | 
			
		||||
             "SDIAVTATDSSAPKQIAPPPKFFGAQPPLVAPTPPTTRSTRSRPSSRRSSSPAKSTTTSKRGTTPRNTKRTVTTEASATT",
 | 
			
		||||
             "VTTTATATAVPSAETPATSFADSQAPTLINGEIPTSTPINTVPVTKIQTTEAELKVESIEKEPVVVLEPIEEEPKIKVRV",
 | 
			
		||||
             "DEDVKLDKDGEEVKHTKVELEVPLMAGEPPSKEEARKMIEEAKAMVEAAVKADAEAAAALVEASKAGAEDEKAEDEAKAE",
 | 
			
		||||
             "TEATKEEEADSKGKRKAEKISVDEDEKAADEAEQPRQAKRVKTEAELRKDRIRKRAYLGLTATFAVGALGALLPIITPYV",
 | 
			
		||||
             "ANVL"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "81480_BIPOR",
 | 
			
		||||
    "RefSeqID" : "XP_007682909",
 | 
			
		||||
    "UniProtID" : "W6ZKJ4",
 | 
			
		||||
    "taxonomyID" : "930090",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MVVDRVLPERKNPLLEPTDSTSIEILIERRRLGQTNLGVKAGVSGIANATKPENMGTFDYAHLRVPLPKDLTGSGIFSRN",
 | 
			
		||||
             "RMSAFPESYFLMRRSSDGYISATGMFKAAFPWASLQEEDLERKYQKTFPSAGDEEVAGSVWIAPEEALALSEEYSMRHWI",
 | 
			
		||||
             "EALLDPAPIEKGGKDKSNAAIQMPPRFDVANAQPATLPTFGFRQTRARSARSVSPSKAMTPGRKYATPRKGRSTRSAMKP",
 | 
			
		||||
             "DATHADDMFRPIEAVTPSTALQNSIARRIAPAETIASSIEGEVKEVEQEVKAALDAEKKPEPELEVQEGTVHIEVKQTVE",
 | 
			
		||||
             "TNGDTEKTSTSVTVDVPHDHAALPEPEDPTAMIEEAKRMVAEAQKLEGGSPSVTRSSKRGIEEVLDEEDLADERLNKLAK",
 | 
			
		||||
             "KAYTTEQKMTKEKVTRRALVGLGVMAAIGTAFQYFV"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "01622_ASPNI",
 | 
			
		||||
    "RefSeqID" : "XP_657766",
 | 
			
		||||
    "UniProtID" : "Q5BH18",
 | 
			
		||||
    "taxonomyID" : "227321",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MVRSLPKKNNPFVTPDAAPPYEELLMRRRLGKTNLAVKPTQVGTSNATKPENLGPFEYAHLRAPLPKDLKGSEIFPSHSP",
 | 
			
		||||
             "QQHPETYFLMRRSKDGYVSATGMFKIAFPWAKLEEERSEREYLKTRPETSEDEIAGNVWISPVLALELAAEYKMYDWVRA",
 | 
			
		||||
             "LLDPTEIIQSPSSAKKQITPPPKFELPPIQAPEALVPSSRTRSRRSASPSKKAGTPRKPRQTKAQKEAAVAATNEANATL",
 | 
			
		||||
             "QSALDDTVSNADGEINGDVLPSVEDKREPETSPVKGKKAAAKAKKQAVSEEDQEDKVKIEIKSDAAEGSDVQAAQTTISV",
 | 
			
		||||
             "EMPISLPEAPSAEDTQEMIAKAKEMVKEAVKLQQEPAESSATAKKRGAEEAELGEEEEDEETKTLRTKRAKVLEEKLKRE",
 | 
			
		||||
             "RVRNRALMGVTAAFALAKPALVLLEA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "05405_ASPNI",
 | 
			
		||||
    "RefSeqID" : "XP_663009",
 | 
			
		||||
    "UniProtID" : "Q5B225",
 | 
			
		||||
    "taxonomyID" : "227321",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MASIQFLLNPLPSLPSSDRCPLPTPSPTISSSTAMLRSPRQKKQKMAKDAPIFQRGKPRGEVRYPPYEDRDGKFSCQHQD",
 | 
			
		||||
             "FRIHPLGNIADYPRHIPYNSDKKSFQERTGRESFEVFQYTFQLPGEEKQWTVMWDYNIGLVRTTHLFKCNDYSKTTPAKM",
 | 
			
		||||
             "LNQNPGLRDICHSITGGALAAQGYWMPYEAAKAIAATFCWKIRFALTPLFGDNFPDLCIHPDDRARFGRMVIDPGIVRIA",
 | 
			
		||||
             "TEKANLYRMLELRCSTTNSLRADYVLRPSSAPDIDRTDPNLERDRVALGRHILPKSHRHHHHRSKTSPSTNTSLVGYGSS",
 | 
			
		||||
             "PEVEYYSCGTEPYCVSPESPIRSSFTPVNTPRSTDIYPSSSSTNFLRSPHELLASLSSSASIARARIERASKISGARVIP",
 | 
			
		||||
             "SSVPSNVTSITTKGRDNTGHSALMEESDIDADAETDSGHEHDLDFELSSSDESSTSSTVSSSTSSASLGFAANSRNRPYR",
 | 
			
		||||
             "DDDEPHRDTDEEMVDYRAPKRIATAGARDRRWGRGRRVIHQEHSDIETSRRARKHAQRSSNARLVCEMTAAHALISLLHD",
 | 
			
		||||
             "ATGSDVDVDTHNRLECGRSPDGGVKNNLKGSYFGIRLNHNPSTESGQKRRRASA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "105954_BIPOR",
 | 
			
		||||
    "RefSeqID" : "XP_007691967",
 | 
			
		||||
    "UniProtID" : "W6Z1H5",
 | 
			
		||||
    "taxonomyID" : "930090",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MNIQDLLNPSCGDRHDHRRSESATPPSRPVAILPALRRQKIPKDAPIFSEGNRTVGIVNFAPHEAGNDEELLAQHCRFQI",
 | 
			
		||||
             "YPLGEISRKGVRHIPYNSDKKDFLEKTGRDAFEMFQYTYKLPGEDKPYVVVWDYNVGLVRMTPFFKSCKYSKTIPAKTLR",
 | 
			
		||||
             "ENPGLKDISYSITGGALVCQGYWIPYQAARAIAATFCYDIRWALTPVFGNDFPSICLTPDDPSFAKFVIDPAIVRYCTEE",
 | 
			
		||||
             "TTKFRELGSAYEVHRPVAPTQVEAPTSRSDQPLSTSIVRQRRARPIDIESGYGTDTERNDRCLFSPEVSPRTRFTPINRP",
 | 
			
		||||
             "RSPYSPRTAESSFVSSPVSIRAPPGLHTPTSTPYEHSGEVFRAKRSHSKVAFCEHPADEAVIRPPTAATVDSAHGCEMCV",
 | 
			
		||||
             "GDDNHSHLDMDAAEMLLSLRTADSAMPPSKRTRRGS"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "69819_WALME",
 | 
			
		||||
    "RefSeqID" : "XP_006959479",
 | 
			
		||||
    "UniProtID" : "I4Y911",
 | 
			
		||||
    "taxonomyID" : "671144",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MTSPGLPKDFNELLDKSEIPSPKWQQITRDDRPITIARLKLPHPREKHTFILRRYDCNGISFGSLFKAAYPYATDEEEKI",
 | 
			
		||||
             "ESGFVKKNYDVTLVPTEEYQERKLAKLAGFWIPIAIAEELGQRYAMAEYVDALAKADTPDLTDFKKRSSNRQTSEDIKSS",
 | 
			
		||||
             "PAKAQASLESPAKSASKIPTPTKNPAPRRSARHQSRSPSPSPLTHNLTPGKKKAKKAPKEAVIEESVEETIVVDKKESPL",
 | 
			
		||||
             "KKALNDDQVLADIERAKDLVDDIKQSKNLSQSSPVKVVKEEVLETIQPSVSTESLEGEGKRKRELEDETGNEIKVVSFGQ",
 | 
			
		||||
             "NPPANPEEIQQRPVVQRRGVAAAVGAFALGVGFAASNILPRFLF"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "02840_CRYNE",
 | 
			
		||||
    "RefSeqID" : "XP_568872",
 | 
			
		||||
    "UniProtID" : "Q5KM59",
 | 
			
		||||
    "taxonomyID" : "214684",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MSHPAADAPPPYPGTTDDAQYDLTPLPHTANRPRLPEDKRNPHLNNLPEDTKIVKFQTIVRENKEIVVGRIKVPTENANG",
 | 
			
		||||
             "THHAFILRRYDTNAISLTTMYKVAFPSATEEEEKREMDWVKSSFDTRGTNGGRDSEVVRLAGQWVSRNLAIHIAPAYNLV",
 | 
			
		||||
             "QLVAALSRAVPDPNVAYRKSQRSQAAADELARTKAKQSQAPSSVPAISNVPVRKPQAAIPSMATEISSPASKRQRKDSVT",
 | 
			
		||||
             "EASGSATQTITEAQPSADTSETDDTRHITIEATTTITSPSGANVDMDAEIEQAKQLVKDLRQEIQLRNEAGDSLEDQGVA",
 | 
			
		||||
             "VADDVRGVKRGKHEDEAVVISGGAGGKDRVVRTNKRIPQTAGGDVGQRFGWGAFVFSIGLGASLTLFSQYASSLL"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "11055_USTMA",
 | 
			
		||||
    "RefSeqID" : "XP_011390537",
 | 
			
		||||
    "UniProtID" : "A0A0D1DZM8",
 | 
			
		||||
    "taxonomyID" : "237631",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MPAAASARKSTPTRKSTPRRARSSSVTSNASTGVPASPSASPRKTKKQKEAAAAAAAAVAAAAATAEQVNDDESDLLRPK",
 | 
			
		||||
             "LPTKRNPRLKEVDEAVVKLQIIKREGHNIIIGRVKLPTVNGQDHAFLLKRFDTNAMAASSMFRLAFPFADGTAEAAEMRF",
 | 
			
		||||
             "LDTKYDTNRANGGYIVEEVKVPETPKKRGRTRKTAENSKKESTPDTESVSADKQIRVLPEGSTGVRLQGTWIPAEDAIEV",
 | 
			
		||||
             "AEDYGIAKYALALIHATAEHAEDGGAPILTSEPVAEVKTPRKRQRVSAAAATASDTPDSPQLVQRVTRLENADGSISKVR",
 | 
			
		||||
             "VESTLEAPSSNGVPVALSQAEIEEQIAQAKALAAGIQQSITAGSGSASTRGQKRRAVNDRPTAEIDPLADDEDYSESGRV",
 | 
			
		||||
             "VRAFRRGTRVARRRPIATTAGAVAAAGAVGAGALAWVSGGNPEVAIQTLQASMQSIGLQNLQNLGLQNLQQIGTQLGAHL",
 | 
			
		||||
             "ASILPW"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "XBP1_NEUCR",
 | 
			
		||||
    "RefSeqID" : "XP_962373",
 | 
			
		||||
    "UniProtID" : "Q7S9W7",
 | 
			
		||||
    "taxonomyID" : "367110",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MLNQNPGLKDIAYSITGGAIKAQGYWMPYACAKAVCATFCYQIAGALIPLFGPDFPSECISPGEPRYGIMIIKPELISDT",
 | 
			
		||||
             "MRKAQELYRRYGNWGGGCTSSSPARRPLRTASSGSQERHHHHPYPNQEHLDHQQQQQRTVCSRRCPAEENSCVDARPQLR",
 | 
			
		||||
             "GISAPMPPAGEWTPPLLRSSAGRPRPVMPTSTHSSISYPERAPHRSAWTAVNHQPPNNSLDRYSLKRPLPSNEPDESVSH",
 | 
			
		||||
             "SNWPSRSQAPNPWLTAIPRSPRKTSSSPWASQPGSASRSRAGSIDSMASQHPQGLPSPSLILSSPSSSMVSLSSSNSPSP",
 | 
			
		||||
             "RPQLPPISQLCSLPVPSGRRRLPNGRPSRVGGDATSSHSRQDHSTCGAYQFSAGYQRALTPPSSTSAPMHWRSQRRPSLQ",
 | 
			
		||||
             "DQHEHEHIEDTQPRRIAVEANMECGDDNESHLHLPLPLPRTSSSASIVADKNANDTTSDNSSSRNFNSASIGSGRDDGQT",
 | 
			
		||||
             "SLAARKTAALTLLHLRQQEEEKEAAAAAAAAAAAAYSSTKRPESPSSSLSSPVSPPPTSGQPSPTLSAVVTATNLRRGTT",
 | 
			
		||||
             "TATATAVIDTTEPLAPPPSPSSNYLGSPISTSIASSSSSFSPSTSCNGTRENSVVANEMTRYAGQEADAGGPRHCNGDAD",
 | 
			
		||||
             "DEGDYEHEQQYRRKRRRLLLVGRAKSF"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "XBP1_SACCE",
 | 
			
		||||
    "RefSeqID" : "NP_012165",
 | 
			
		||||
    "UniProtID" : "P40489",
 | 
			
		||||
    "taxonomyID" : "559292",
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
             "MKYPAFSINSDTVHLTDNPLDDYQRLYLVSVLDRDSPPASFSAGLNIRKVNYKSSIAAQFTHPNFIISARDAGNGEEAAA",
 | 
			
		||||
             "QNVLNCFEYQFPNLQTIQSLVHEQTLLSQLASSATPHSALHLHDKNILMGKIILPSRSNKTPVSASPTKQEKKALSTASR",
 | 
			
		||||
             "ENATSSLTKNQQFKLTKMDHNLINDKLINPNNCVIWSHDSGYVFMTGIWRLYQDVMKGLINLPRGDSVSTSQQQFFCKAE",
 | 
			
		||||
             "FEKILSFCFYNHSSFTSEESSSVLLSSSTSSPPKRRTSTGSTFLDANASSSSTSSTQANNYIDFHWNNIKPELRDLICQS",
 | 
			
		||||
             "YKDFLINELGPDQIDLPNLNPANFTKRIRGGYIKIQGTWLPMEISRLLCLRFCFPIRYFLVPIFGPDFPKDCESWYLAHQ",
 | 
			
		||||
             "NVTFASSTTGAGAATAATAAANTSTNFTSTAVARPRQKPRPRPRQRSTSMSHSKAQKLVIEDALPSFDSFVENLGLSSND",
 | 
			
		||||
             "KNFIKKNSKRQKSSTYTSQTSSPIGPRDPTVQILSNLASFYNTHGHRYSYPGNIYIPQQRYSLPPPNQLSSPQRQLNYTY",
 | 
			
		||||
             "DHIHPVPSQYQSPRHYNVPSSPIAPAPPTFPQPYGDDHYHFLKYASEVYKQQNQRPAHNTNTNMDTSFSPRANNSLNNFK",
 | 
			
		||||
             "FKTNSKQ"]
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,116 +1,116 @@
 | 
			
		||||
[
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"}
 | 
			
		||||
]
 | 
			
		||||
[
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "APSES fold", "start" : "4", "end" : "102"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "KilA-N", "start" : "22", "end" : "105"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "108", "end" : "122"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "236", "end" : "241"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "279", "end" : "307"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "low complexity", "start" : "700", "end" : "717"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "394", "end" : "423"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "427", "end" : "463"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "Ankyrin fold", "start" : "512", "end" : "541"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "Swi6 fold", "start" : "381", "end" : "547"},
 | 
			
		||||
  {"pName" : "MBP1_SACCE", "fName" : "coiled coil", "start" : "633", "end" : "655"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "APSES fold", "start" : "9", "end" : "106"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "KilA-N", "start" : "26", "end" : "109"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "low complexity", "start" : "529", "end" : "534"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "260", "end" : "289"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "Ankyrin fold", "start" : "381", "end" : "413"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "Swi6 fold", "start" : "193", "end" : "402"},
 | 
			
		||||
  {"pName" : "MBP1_ASPNI", "fName" : "coiled coil", "start" : "509", "end" : "572"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "APSES fold", "start" : "8", "end" : "106"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "KilA-N", "start" : "26", "end" : "109"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "134", "end" : "152"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "267", "end" : "278"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "low complexity", "start" : "670", "end" : "685"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "266", "end" : "295"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "Ankyrin fold", "start" : "387", "end" : "416"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "Swi6 fold", "start" : "253", "end" : "421"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "659", "end" : "681"},
 | 
			
		||||
  {"pName" : "MBP1_BIPOR", "fName" : "coiled coil", "start" : "500", "end" : "590"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "APSES fold", "start" : "14", "end" : "114"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "KilA-N", "start" : "34", "end" : "117"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "130", "end" : "141"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "253", "end" : "266"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "514", "end" : "525"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "554", "end" : "564"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "601", "end" : "618"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "620", "end" : "629"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "636", "end" : "652"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "658", "end" : "672"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "725", "end" : "735"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "low complexity", "start" : "752", "end" : "771"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "268", "end" : "297"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "Ankyrin fold", "start" : "390", "end" : "419"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "Swi6 fold", "start" : "270", "end" : "426"},
 | 
			
		||||
  {"pName" : "MBP1_NEUCR", "fName" : "coiled coil", "start" : "500", "end" : "550"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "APSES fold", "start" : "8", "end" : "104"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "KilA-N", "start" : "25", "end" : "113"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "111", "end" : "125"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "136", "end" : "145"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "176", "end" : "191"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "low complexity", "start" : "422", "end" : "447"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "247", "end" : "276"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "Ankyrin fold", "start" : "368", "end" : "397"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "Swi6 fold", "start" : "234", "end" : "400"},
 | 
			
		||||
  {"pName" : "MBP1_SCHPO", "fName" : "coiled coil", "start" : "457", "end" : "538"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "APSES fold", "start" : "5", "end" : "103"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "KilA-N", "start" : "23", "end" : "106"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "170", "end" : "191"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "435", "end" : "450"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "low complexity", "start" : "611", "end" : "626"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "270", "end" : "299"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "389", "end" : "418"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "Ankyrin fold", "start" : "474", "end" : "509"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "Swi6 fold", "start" : "257", "end" : "429"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "500", "end" : "570"},
 | 
			
		||||
  {"pName" : "MBP1_COPCI", "fName" : "coiled coil", "start" : "651", "end" : "678"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "APSES fold", "start" : "16", "end" : "114"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "KilA-N", "start" : "34", "end" : "117"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "66", "end" : "85"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "413", "end" : "423"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "633", "end" : "644"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "low complexity", "start" : "697", "end" : "709"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "477", "end" : "506"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "Ankyrin fold", "start" : "618", "end" : "647"},
 | 
			
		||||
  {"pName" : "MBP1_CRYNE", "fName" : "Swi6 fold", "start" : "452", "end" : "663"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "APSES fold", "start" : "90", "end" : "187"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "KilA-N", "start" : "107", "end" : "190"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "208", "end" : "227"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "low complexity", "start" : "273", "end" : "291"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "442", "end" : "271"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "475", "end" : "509"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "Ankyrin fold", "start" : "561", "end" : "590"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "Swi6 fold", "start" : "429", "end" : "601"},
 | 
			
		||||
  {"pName" : "MBP1_PUCGR", "fName" : "coiled coil", "start" : "827", "end" : "863"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "APSES fold", "start" : "7", "end" : "104"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "KilA-N", "start" : "24", "end" : "107"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "106", "end" : "116"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "161", "end" : "183"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "666", "end" : "681"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "low complexity", "start" : "688", "end" : "700"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "AT hook", "start" : "134", "end" : "146"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "245", "end" : "274"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "278", "end" : "314"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "Ankyrin fold", "start" : "364", "end" : "393"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "Swi6 fold", "start" : "232", "end" : "404"},
 | 
			
		||||
  {"pName" : "MBP1_USTMA", "fName" : "coiled coil", "start" : "590", "end" : "618"},
 | 
			
		||||
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "APSES fold", "start" : "6", "end" : "103"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "KilA-N", "start" : "23", "end" : "106"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "149", "end" : "162"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "171", "end" : "188"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "618", "end" : "628"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "low complexity", "start" : "634", "end" : "660"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "250", "end" : "279"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "Ankyrin fold", "start" : "369", "end" : "398"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "Swi6 fold", "start" : "237", "end" : "409"},
 | 
			
		||||
  {"pName" : "MBP1_WALME", "fName" : "coiled coil", "start" : "461", "end" : "585"}
 | 
			
		||||
]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,47 +1,47 @@
 | 
			
		||||
[
 | 
			
		||||
  { "name" : "APSES fold",
 | 
			
		||||
    "description " : "DNA binding domain by similarity to structure",
 | 
			
		||||
    "sourceDB" : "PDB",
 | 
			
		||||
    "accession" : "1BM8_A_1_99"},
 | 
			
		||||
 | 
			
		||||
  { "name" : "KilA-N",
 | 
			
		||||
    "description " : "DNA binding domain by Pfam annotation",
 | 
			
		||||
    "sourceDB" : "Pfam",
 | 
			
		||||
    "accession" : "PF04383"},
 | 
			
		||||
 | 
			
		||||
  { "name" : "AT hook",
 | 
			
		||||
    "description " : "DNA interaction motif by SMART annotation",
 | 
			
		||||
    "sourceDB" : "SMART",
 | 
			
		||||
    "accession" : null},
 | 
			
		||||
 | 
			
		||||
  { "name" : "low complexity",
 | 
			
		||||
    "description " : "SEG annotation by SMART",
 | 
			
		||||
    "sourceDB" : "SMART",
 | 
			
		||||
    "accession" : null},
 | 
			
		||||
 | 
			
		||||
  { "name" : "Ankyrin fold",
 | 
			
		||||
    "description " : "Ankyrin domain by SMART annotation",
 | 
			
		||||
    "sourceDB" : "SMART",
 | 
			
		||||
    "accession" : "SM00248"},
 | 
			
		||||
 | 
			
		||||
  { "name" : "Swi6 fold",
 | 
			
		||||
    "description " : "Swi6 fold by similarity to structure",
 | 
			
		||||
    "sourceDB" : "PDB",
 | 
			
		||||
    "accession" : "1SW6_B"},
 | 
			
		||||
 | 
			
		||||
  { "name" : "coiled coil",
 | 
			
		||||
    "description " : "Coiled coil by SMART annotation",
 | 
			
		||||
    "sourceDB" : "SMART",
 | 
			
		||||
    "accession" : null},
 | 
			
		||||
 | 
			
		||||
  { "name" : "McInerny 2011",
 | 
			
		||||
    "description " : "Yeast cell cycle review",
 | 
			
		||||
    "sourceDB" : "PubMed",
 | 
			
		||||
    "accession" : "21310294"}
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
[
 | 
			
		||||
  { "name" : "APSES fold",
 | 
			
		||||
    "description " : "DNA binding domain by similarity to structure",
 | 
			
		||||
    "sourceDB" : "PDB",
 | 
			
		||||
    "accession" : "1BM8_A_1_99"},
 | 
			
		||||
 | 
			
		||||
  { "name" : "KilA-N",
 | 
			
		||||
    "description " : "DNA binding domain by Pfam annotation",
 | 
			
		||||
    "sourceDB" : "Pfam",
 | 
			
		||||
    "accession" : "PF04383"},
 | 
			
		||||
 | 
			
		||||
  { "name" : "AT hook",
 | 
			
		||||
    "description " : "DNA interaction motif by SMART annotation",
 | 
			
		||||
    "sourceDB" : "SMART",
 | 
			
		||||
    "accession" : null},
 | 
			
		||||
 | 
			
		||||
  { "name" : "low complexity",
 | 
			
		||||
    "description " : "SEG annotation by SMART",
 | 
			
		||||
    "sourceDB" : "SMART",
 | 
			
		||||
    "accession" : null},
 | 
			
		||||
 | 
			
		||||
  { "name" : "Ankyrin fold",
 | 
			
		||||
    "description " : "Ankyrin domain by SMART annotation",
 | 
			
		||||
    "sourceDB" : "SMART",
 | 
			
		||||
    "accession" : "SM00248"},
 | 
			
		||||
 | 
			
		||||
  { "name" : "Swi6 fold",
 | 
			
		||||
    "description " : "Swi6 fold by similarity to structure",
 | 
			
		||||
    "sourceDB" : "PDB",
 | 
			
		||||
    "accession" : "1SW6_B"},
 | 
			
		||||
 | 
			
		||||
  { "name" : "coiled coil",
 | 
			
		||||
    "description " : "Coiled coil by SMART annotation",
 | 
			
		||||
    "sourceDB" : "SMART",
 | 
			
		||||
    "accession" : null},
 | 
			
		||||
 | 
			
		||||
  { "name" : "McInerny 2011",
 | 
			
		||||
    "description " : "Yeast cell cycle review",
 | 
			
		||||
    "sourceDB" : "PubMed",
 | 
			
		||||
    "accession" : "21310294"}
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,155 +1,155 @@
 | 
			
		||||
[
 | 
			
		||||
  { "name" : "MBP1_SCHPO",
 | 
			
		||||
    "RefSeqID" : "NP_593032",
 | 
			
		||||
    "UniProtID" : "P41412",
 | 
			
		||||
    "taxonomyID" : 284812,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ",
 | 
			
		||||
       "GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS",
 | 
			
		||||
       "STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK",
 | 
			
		||||
       "YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL",
 | 
			
		||||
       "SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI",
 | 
			
		||||
       "SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL",
 | 
			
		||||
       "QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL",
 | 
			
		||||
       "NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS",
 | 
			
		||||
       "DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI",
 | 
			
		||||
       "AMSCGINPEDLSLEILDAVEEALTREK"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_ASPNI",
 | 
			
		||||
    "RefSeqID" : "XP_660758",
 | 
			
		||||
    "UniProtID" : "Q5B8H6",
 | 
			
		||||
    "taxonomyID" : 227321,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV",
 | 
			
		||||
       "QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV",
 | 
			
		||||
       "FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA",
 | 
			
		||||
       "AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL",
 | 
			
		||||
       "LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS",
 | 
			
		||||
       "SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS",
 | 
			
		||||
       "SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL",
 | 
			
		||||
       "TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE",
 | 
			
		||||
       "QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF",
 | 
			
		||||
       "DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_BIPOR",
 | 
			
		||||
    "RefSeqID" : "XP_007682304",
 | 
			
		||||
    "UniProtID" : "W6ZM86",
 | 
			
		||||
    "taxonomyID" : 930090,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV",
 | 
			
		||||
       "QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA",
 | 
			
		||||
       "AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG",
 | 
			
		||||
       "NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV",
 | 
			
		||||
       "GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS",
 | 
			
		||||
       "SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK",
 | 
			
		||||
       "GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA",
 | 
			
		||||
       "SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND",
 | 
			
		||||
       "EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV",
 | 
			
		||||
       "REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_NEUCR",
 | 
			
		||||
    "RefSeqID" : "XP_955821",
 | 
			
		||||
    "UniProtID" : "Q7RW59",
 | 
			
		||||
    "taxonomyID" : 367110,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV",
 | 
			
		||||
       "QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV",
 | 
			
		||||
       "PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS",
 | 
			
		||||
       "TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG",
 | 
			
		||||
       "DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK",
 | 
			
		||||
       "IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP",
 | 
			
		||||
       "NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV",
 | 
			
		||||
       "QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE",
 | 
			
		||||
       "MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE",
 | 
			
		||||
       "NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA",
 | 
			
		||||
       "RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG",
 | 
			
		||||
       "GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_COPCI",
 | 
			
		||||
    "RefSeqID" : "XP_001837394",
 | 
			
		||||
    "UniProtID" : "A8NYC6",
 | 
			
		||||
    "taxonomyID" : 240176,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG",
 | 
			
		||||
       "YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN",
 | 
			
		||||
       "TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY",
 | 
			
		||||
       "RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA",
 | 
			
		||||
       "MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD",
 | 
			
		||||
       "VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN",
 | 
			
		||||
       "HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST",
 | 
			
		||||
       "RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE",
 | 
			
		||||
       "NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ",
 | 
			
		||||
       "VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA",
 | 
			
		||||
       "GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_CRYNE",
 | 
			
		||||
    "RefSeqID" : "XP_569090",
 | 
			
		||||
    "UniProtID" : "Q5KMQ9",
 | 
			
		||||
    "taxonomyID" : 214684,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV",
 | 
			
		||||
       "QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE",
 | 
			
		||||
       "KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM",
 | 
			
		||||
       "TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG",
 | 
			
		||||
       "IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT",
 | 
			
		||||
       "AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK",
 | 
			
		||||
       "YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE",
 | 
			
		||||
       "DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE",
 | 
			
		||||
       "RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR",
 | 
			
		||||
       "EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ",
 | 
			
		||||
       "IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_PUCGR",
 | 
			
		||||
    "RefSeqID" : "XP_003327086",
 | 
			
		||||
    "UniProtID" : "E3KED4",
 | 
			
		||||
    "taxonomyID" : 418459,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY",
 | 
			
		||||
       "HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE",
 | 
			
		||||
       "REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK",
 | 
			
		||||
       "VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN",
 | 
			
		||||
       "LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS",
 | 
			
		||||
       "QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS",
 | 
			
		||||
       "QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH",
 | 
			
		||||
       "DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE",
 | 
			
		||||
       "DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG",
 | 
			
		||||
       "GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD",
 | 
			
		||||
       "LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR",
 | 
			
		||||
       "RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE",
 | 
			
		||||
       "SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL",
 | 
			
		||||
       "NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_USTMA",
 | 
			
		||||
    "RefSeqID" : "XP_011392621",
 | 
			
		||||
    "UniProtID" : "A0A0D1DP35",
 | 
			
		||||
    "taxonomyID" : 237631,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG",
 | 
			
		||||
       "GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS",
 | 
			
		||||
       "RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY",
 | 
			
		||||
       "ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ",
 | 
			
		||||
       "TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY",
 | 
			
		||||
       "GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP",
 | 
			
		||||
       "AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE",
 | 
			
		||||
       "IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS",
 | 
			
		||||
       "TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM",
 | 
			
		||||
       "AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA",
 | 
			
		||||
       "P"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_WALME",
 | 
			
		||||
    "RefSeqID" : "XP_006957051",
 | 
			
		||||
    "UniProtID" : "I4YGC0",
 | 
			
		||||
    "taxonomyID" : 671144,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG",
 | 
			
		||||
       "YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK",
 | 
			
		||||
       "VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID",
 | 
			
		||||
       "GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV",
 | 
			
		||||
       "NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS",
 | 
			
		||||
       "KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP",
 | 
			
		||||
       "LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR",
 | 
			
		||||
       "NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE",
 | 
			
		||||
       "ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA",
 | 
			
		||||
       "NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"]
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
[
 | 
			
		||||
  { "name" : "MBP1_SCHPO",
 | 
			
		||||
    "RefSeqID" : "NP_593032",
 | 
			
		||||
    "UniProtID" : "P41412",
 | 
			
		||||
    "taxonomyID" : 284812,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MAPRSSAVHVAVYSGVEVYECFIKGVSVMRRRRDSWLNATQILKVADFDKPQRTRVLERQVQIGAHEKVQ",
 | 
			
		||||
       "GGYGKYQGTWVPFQRGVDLATKYKVDGIMSPILSLDIDEGKAIAPKKKQTKQKKPSVRGRRGRKPSSLSS",
 | 
			
		||||
       "STLHSVNEKQPNSSISPTIESSMNKVNLPGAEEQVSATPLPASPNALLSPNDNTIKPVEELGMLEAPLDK",
 | 
			
		||||
       "YEESLLDFFLHPEEGRIPSFLYSPPPDFQVNSVIDDDGHTSLHWACSMGHIEMIKLLLRANADIGVCNRL",
 | 
			
		||||
       "SQTPLMRSVIFTNNYDCQTFGQVLELLQSTIYAVDTNGQSIFHHIVQSTSTPSKVAAAKYYLDCILEKLI",
 | 
			
		||||
       "SIQPFENVVRLVNLQDSNGDTSLLIAARNGAMDCVNSLLSYNANPSIPNRQRRTASEYLLEADKKPHSLL",
 | 
			
		||||
       "QSNSNASHSAFSFSGISPAIISPSCSSHAFVKAIPSISSKFSQLAEEYESQLREKEEDLIRANRLKQDTL",
 | 
			
		||||
       "NEISRTYQELTFLQKNNPTYSQSMENLIREAQETYQQLSKRLLIWLEARQIFDLERSLKPHTSLSISFPS",
 | 
			
		||||
       "DFLKKEDGLSLNNDFKKPACNNVTNSDEYEQLINKLTSLQASRKKDTLYIRKLYEELGIDDTVNSYRRLI",
 | 
			
		||||
       "AMSCGINPEDLSLEILDAVEEALTREK"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_ASPNI",
 | 
			
		||||
    "RefSeqID" : "XP_660758",
 | 
			
		||||
    "UniProtID" : "Q5B8H6",
 | 
			
		||||
    "taxonomyID" : 227321,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MAAVDFSNVYSATYSSVPVYEFKIGTDSVMRRRSDDWINATHILKVAGFDKPARTRILEREVQKGVHEKV",
 | 
			
		||||
       "QGGYGKYQGTWIPLQEGRQLAERNNILDKLLPIFDYVAGDRSPPPAPKHTSAASKPRAPKINKRVVKEDV",
 | 
			
		||||
       "FSAVNHHRSMGPPSFHHEHYDVNTGLDEDESIEQATLESSSMIADEDMISMSQNGPYSSRKRKRGINEVA",
 | 
			
		||||
       "AMSLSEQEHILYGDQLLDYFMTVGDAPEATRIPPPQPPANFQVDRPIDDSGNTALHWACAMGDLEIVKDL",
 | 
			
		||||
       "LRRGADMKALSIHEETPLVRAVLFTNNYEKRTFPALLDLLLDTISFRDWFGATLFHHIAQTTKSKGKWKS",
 | 
			
		||||
       "SRYYCEVALEKLRTTFSPEEVDLLLSCQDSVGDTAVLVAARNGVFRLVDLLLSRCPRAGDLVNKRGETAS",
 | 
			
		||||
       "SIMQRAHLAERDIPPPPSSITMGNDHIDGEVGAPTSLEPQSVTLHHESSPATAQLLSQIGAIMAEASRKL",
 | 
			
		||||
       "TSSYGAAKPSQKDSDDVANPEALYEQLEQDRQKIRRQYDALAAKEAAEESSDAQLGRYEQMRDNYESLLE",
 | 
			
		||||
       "QIQRARLKERLASTPVPTQTAVIGSSSPEQDRLLTTFQLSRALCSEQKIRRAAVKELAQQRADAGVSTKF",
 | 
			
		||||
       "DVHRKLVALATGLKEEELDPMAAELAETLEFDRMNGKGVGPESPEADHKDSASLPFPGPVVSVDA"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_BIPOR",
 | 
			
		||||
    "RefSeqID" : "XP_007682304",
 | 
			
		||||
    "UniProtID" : "W6ZM86",
 | 
			
		||||
    "taxonomyID" : 930090,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MPPAPDGKIYSATYSNVPVYECNVNGHHVMRRRADDWINATHILKVADYDKPARTRILEREVQKGVHEKV",
 | 
			
		||||
       "QGGYGKYQGTWIPLEEGRGLAERNGVLDKMRAIFDYVPGDRSPPPAPKHATAASNRMKPPRQTAAAVAAA",
 | 
			
		||||
       "AVAAAAAAAAVANHNALMSNSRSQASEDPYENSQRSQIYREDTPDNETVISESMLGDADLMDMSQYSADG",
 | 
			
		||||
       "NRKRKRGMDQMSLLDQQHQIWADQLLDYFMLLDHEAAVSWPEPPPSINLDRPIDEKGHAAMHWAAAMGDV",
 | 
			
		||||
       "GVVKELIHRGARLDCLSNNLETPLMRAVMFTNNFDKETMPSMVKIFQQTVHRTDWFGSTVFHHIAATTSS",
 | 
			
		||||
       "SNKYVCARWYLDCIINKLSETWIPEEVTRLLNAADQNGDTAIMIAARNGARKCVRSLLGRNVAVDIPNKK",
 | 
			
		||||
       "GETADDLIRELNQRRRMHGRTRQASSSPFAPAPEHRLNGHVPHFDGGPLMSVPVPSMAVRESVQYRSQTA",
 | 
			
		||||
       "SHLMTKVAPTLLEKCEELATAYEAELQEKEAEFFDAERVVKRRQAELEAVRKQVAELQSMSKGLHIDLND",
 | 
			
		||||
       "EEAERQQEDELRLLVEEAESLLEIEQKAELRRLCSSMPQQNSDSSPVDITEKMRLALLLHRAQLERRELV",
 | 
			
		||||
       "REVVGNLSVAGMSEKQGTYKKLIAKALGEREEDVESMLPEILQELEEAETQERAEGLDGSPV"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_NEUCR",
 | 
			
		||||
    "RefSeqID" : "XP_955821",
 | 
			
		||||
    "UniProtID" : "Q7RW59",
 | 
			
		||||
    "taxonomyID" : 367110,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MVKENVGGNPEPGIYSATYSGIPVWEYQFGVDLKEHVMRRRHDDWVNATHILKAAGFDKPARTRILEREV",
 | 
			
		||||
       "QKDTHEKIQGGYGRYQGTWIPLEQAEALARRNNIYERLKPIFEFQPGNESPPPAPRHASKPKAPKVKPAV",
 | 
			
		||||
       "PTWGSKSAKNANPPQPGTFLPPGRKGLPAQAPDYNDADTHMHDDDTPDNLTVASASYMAEDDRYDHSHFS",
 | 
			
		||||
       "TGHRKRKRDELIEDMTEQQHAVYGDELLDYFLLSRNEQPAVRPDPPPNFKPDWPIDNERHTCLHWASAMG",
 | 
			
		||||
       "DVDVMRQLKKFGASLDAQNVRGETPFMRAVNFTNCFEKQTFPQVMKELFSTIDCRDLSGCTVIHHAAVMK",
 | 
			
		||||
       "IGRVNSQSCSRYYLDIILNRLQETHHPEFVQQLLDAQDNDGNTAVHLAAMRDARKCIRALLGRGASTDIP",
 | 
			
		||||
       "NKQGIRAEELIKELNASISKSRSNLPQRSSSPFAPDTQRHDAFHEAISESMVTSRKNSQPNYSSDAANTV",
 | 
			
		||||
       "QNRITPLVLQKLKDLTATYDSEFKEKDDAEKEARRILNKTQSELKALTASIDDYNSRLDTDDVAAKTAAE",
 | 
			
		||||
       "MATARHKVLAFVTHQNRISVQEAVKQELAALDRANAVTNGTSTKSKSSSPSKKPKLSPIPDQKDKPPKDE",
 | 
			
		||||
       "NETESEAEHPDPPAAQAHQQQPGPSSQDTEVEDQDREEEEDDYTHRLSLAAELRSILQEQRSAENDYVEA",
 | 
			
		||||
       "RGMLGTGERIDKYKHLLMSCLPPDEQENLEENLEEMIKLMEQEDESVTDLPAGAVGGGGGGNAADGSGGG",
 | 
			
		||||
       "GQPSNGRRESVLPALRGGNGDGEMSRRGSRTAAAAAAQVDGEREINGRAGAERTERIQEIAAV"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_COPCI",
 | 
			
		||||
    "RefSeqID" : "XP_001837394",
 | 
			
		||||
    "UniProtID" : "A8NYC6",
 | 
			
		||||
    "taxonomyID" : 240176,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MPEAQIFKATYSGIPVYEMMCKGVAVMRRRSDSWLNATQILKVAGFDKPQRTRVLEREVQKGEHEKVQGG",
 | 
			
		||||
       "YGKYQGTWIPLERGMQLAKQYNCEHLLRPIIEFTPAAKSPPLAPKHLVATAGNRPVRKPLTTDLSAAVIN",
 | 
			
		||||
       "TRSTRKQVADGVGEESDHDTHSLRGSEDGSMTPSPSEASSSSRTPSPIHSPGTYHSNGLDGPSSGGRNRY",
 | 
			
		||||
       "RQSNDRYDEDDDASRHNGMGDPRSYGDQILEYFISDTNQIPPILITPPPDFDPNMAIDDDGHTSLHWACA",
 | 
			
		||||
       "MGRIRIVKLLLSAGADIFKVNKAGQTALMRSVMFANNYDVRKFPELYELLHRSTLNIDNSNRTVFHHVVD",
 | 
			
		||||
       "VAMSKGKTHAARYYMETILTRLADYPKELADVINFQDEDGETALTMAARCRSKRLVKLLIDHGADPKINN",
 | 
			
		||||
       "HDGKNAEDYILEDERFRSSPAPSSRVAAMSYRNAQVAYPPPGAPSTYSFAPANHDRPPLHYSAAAQKAST",
 | 
			
		||||
       "RCVNDMASMLDSLAASFDQELRDKERDMAQAQALLTNIQAEILESQRTVLQLRQQAEGLSQAKQRLADLE",
 | 
			
		||||
       "NALQDKMGRRYRLGFEKWIKDEETREKVIRDAANGDLVLTPATTSYTVDEDGDSDSGSNGDKNKGKRKAQ",
 | 
			
		||||
       "VQQEEVSDLVELYSNIPTDPEELRKQCEALREEVSQSRKRRKAMFDELVTFQAEAGTSGRMSDYRRLIAA",
 | 
			
		||||
       "GCGGLEPLEIDSVLGMLLETLEAEDPSSTSATWSGSKGQQTG"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_CRYNE",
 | 
			
		||||
    "RefSeqID" : "XP_569090",
 | 
			
		||||
    "UniProtID" : "Q5KMQ9",
 | 
			
		||||
    "taxonomyID" : 214684,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MGKKVIASGGDNGPNTIYKATYSGVPVYEMVCRDVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREV",
 | 
			
		||||
       "QKGEHEKVQGGYGKYQGTWIPIERGLALAKQYGVEDILRPIIDYVPTSVSPPPAPKHSVAPPSKARRDKE",
 | 
			
		||||
       "KETGRTKATPSRTGPTSAAALQAQAQLNRAKMHDSTPDADASFRSFEERVSLTPEDDSSSDTPSPVASVM",
 | 
			
		||||
       "TDQDMEVDKMGMHMSMPNVTLSQNMEELGAGSRKRSAAMMMEDEDQFGQLRSIRGNSAVHTPHGTPRHLG",
 | 
			
		||||
       "IGMPPEPIGPEQYTDIILNYFVSETSQIPSILVSPPHDFDPNAPIDDDGHTALHWACAMGRVRVVKLLLT",
 | 
			
		||||
       "AGASIFAGNNAEQTPLMRSVMFSNNYDMRKFPELYELLHRSTLNIDKQNRTVFHHIANLALTKGKTHAAK",
 | 
			
		||||
       "YYMETILARLADYPQELADVINFQDEEGETALTIAARARSRRLVKALLDHGANPKIKNRDSRSAEDYILE",
 | 
			
		||||
       "DERFRSSPVPAPNGGIGKASTSAAAEKPLFAPQLYFSEAARLCGGQALTDITSHMQSLARSFDAELQGKE",
 | 
			
		||||
       "RDILQAKALLTNIHTEVTENGRSITAITNQAAPLEEKRRELEALQASLKTRVKDALKKGYIGWLEGELVR",
 | 
			
		||||
       "EQRWENGELEGNEEEKAAVQALRDVPTGGQEVVQAEEEKLRWEIEEKRKRRAMFVEKFVRAQTEAGTSEQ",
 | 
			
		||||
       "IAKYRKLVSAGLGGVSTNEVDELMNQLLEGLEEENDNQVYNTTAGESGPSSWVQ"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_PUCGR",
 | 
			
		||||
    "RefSeqID" : "XP_003327086",
 | 
			
		||||
    "UniProtID" : "E3KED4",
 | 
			
		||||
    "taxonomyID" : 418459,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MAYGGSIQPLRPPSRESATLHLHQPDLTVTSPPLSLTHCPPCVYSHFTHTPTSLIVIQVSLHSLLDQETY",
 | 
			
		||||
       "HLLPSRSPPTVSVRMGTTTIYKATYSGVPVLEMPCEGIAVMRRRSDSWLNATQILKVAGFDKPQRTRVLE",
 | 
			
		||||
       "REIQKGTHEKIQGGYGKYQGTWVPLDRGIDLAKQYGVDHLLSALFNFQPSSNESPPLAPKHVTALSTRVK",
 | 
			
		||||
       "VSKVSAASAARAARAVVPSLPSTSGLGGRNTNNSWSNFDSDNEPGLPPAASSRESNGNWATQSKLARSSN",
 | 
			
		||||
       "LARARANINNSHPEDLPVPAPDQLQASPLPSMQTADPENDNSLTPSELSLPSRTPSPIEDLPLTVNTASS",
 | 
			
		||||
       "QSTRNKGKSRDLPDDEDLSRGQKRKYDTSLVEDTSYSDGADDQYINGNPSNAASAKYAKLILDYFVSESS",
 | 
			
		||||
       "QIPNFLNDPPSDFDPNVVIDDDGHTALHWACAMGRIKIIKLLLTCGADIFRANNAGQTALMRAVMFTNNH",
 | 
			
		||||
       "DLRTFPELFESFSGSVINIDRTDRTVFHYVIDIALTKGKVPAARYYLETILSQLSEYPKELIDILNFQDE",
 | 
			
		||||
       "DGETALTLAARCRSKKLVKILLDHGANPKTANRDGKSAEDYILEDDKFRALSPTPCSSGPIRQLDQNSPG",
 | 
			
		||||
       "GTSNRSDFVDLVDPVPIDSNLIPQRSPNASPPHYSETGQRVTKQLLPEVTSMIELLATTFDTELQDKERD",
 | 
			
		||||
       "LDHAVGLLSNIEKEYLEGQRKILNYERMLSDFGEKKLALGDLEKELNDKLGKRYRFGWEKYVRDEEERAR",
 | 
			
		||||
       "RITEQRSKYLQELSIEDRKLLDSSNLRFADPSKQEVLMKLQADERENSDLLNLIRTNSTDVESECDLLRE",
 | 
			
		||||
       "SVQKLSEERERLFKEFINLSSENTGGENEEDDGANHTSANTSRLNNYRKLISLGCGGIGLDEVDEVIESL",
 | 
			
		||||
       "NEGIDVNELNDNGFLTEQDEELGNHQNYHNIHTQGR"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_USTMA",
 | 
			
		||||
    "RefSeqID" : "XP_011392621",
 | 
			
		||||
    "UniProtID" : "A0A0D1DP35",
 | 
			
		||||
    "taxonomyID" : 237631,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MSGDKTIFKATYSGVPVYECIINNVAVMRRRSDDWLNATQILKVVGLDKPQRTRVLEREIQKGIHEKVQG",
 | 
			
		||||
       "GYGKYQGTWIPLDVAIELAERYNIQGLLQPITSYVPSAADSPPPAPKHTISTSNRSKKIIPADPGALGRS",
 | 
			
		||||
       "RRATSIETESEVIGAAPNNVSEGSMSPSPSDISSSSRTPSPLPADRAHPLHANHALAGYNGRDANNHARY",
 | 
			
		||||
       "ADIILDYFVTENTTVPSLLINPPPDFNPDMSIDDDEHTALHWACAMGRIRVVKLLLSAGADIFRVNSNQQ",
 | 
			
		||||
       "TALMRATMFSNNYDLRKFPELFELLHRSILNIDRNDRTVFHHVVDLALSRGKPHAARYYMETMINRLADY",
 | 
			
		||||
       "GDQLADILNFQDDEGETPLTMAARARSKRLVRLLLEHGADPKIRNKEGKNAEDYIIEDERFRSSPSRTGP",
 | 
			
		||||
       "AGIELGADGLPVLPTSSLHTSEAGQRTAGRAVTLMSNLLHSLADSYDSEINTAEKKLTQAHGLLKQIQTE",
 | 
			
		||||
       "IEDSAKVAEALHHEAQGVDEERKRVDSLQLALKHAINKRARDDLERRWSEGKQAIKRARLQAGLEPGALS",
 | 
			
		||||
       "TSNATNAPATGDQKSKDDAKSLIEALPAGTNVKTAIAELRKQLSQVQANKTELVDKFVARAREQGTGRTM",
 | 
			
		||||
       "AAYRRLIAAGCGGIAPDEVDAVVGVLCELLQESHTGARAGAGGERDDRARDVAMMLKGAGAAALAANAGA",
 | 
			
		||||
       "P"]
 | 
			
		||||
  },
 | 
			
		||||
  { "name" : "MBP1_WALME",
 | 
			
		||||
    "RefSeqID" : "XP_006957051",
 | 
			
		||||
    "UniProtID" : "I4YGC0",
 | 
			
		||||
    "taxonomyID" : 671144,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MSAPPIYKACYSGVPVYEFNCKNVAVMKRRSDSWMNATQILKVANFDKPQRTRILEREVQKGTHEKVQGG",
 | 
			
		||||
       "YGKYQGTWIPMERSVELARQYRIELLLDPIINYLPGPQSPPLAPKHATNVGSRARKSTAPAAQTLPSTSK",
 | 
			
		||||
       "VFHPLSSTKHPAKLAAATNAKAEISDGEDASIPSSPSFKSNSSRTPSPIRINARKRKLEDEATIPSSAID",
 | 
			
		||||
       "GSISYEDIILDYFISESTQIPALLIHPPSDFNPNMSIDDEGHTAMHWACAMGKVRVVKLLLSAGADIFRV",
 | 
			
		||||
       "NHSEQTALMRSVMFSNNYDIRKFPQLYELLHRSTLNLDKHDRTVLHHIVDLALTKSKTHAARYYMECVLS",
 | 
			
		||||
       "KLANYPDELADVINFQDDEGESALTLAARARSKRLVKLLLEHGADSKLPNKDGKTAEDYILEDERFRQSP",
 | 
			
		||||
       "LLNSNHLRLHPPDTSIYAPPAHLFNSETSQNIANTSMSSVANLLESLAQSYDKEITQKERDYQQAQVILR",
 | 
			
		||||
       "NIKTDIVEAKSNIEKMTIDSSEFEHLKHKLRELEMKLEEHSNDVYNKGWEEYSRNVDDPAIDAPSDNVQE",
 | 
			
		||||
       "ECASLRNKIKDLQEKRISSMQELIKRQKEVGTGKKMSEYRKLISVGCGIPTTEIDAVLEMLLESLESENA",
 | 
			
		||||
       "NKKAALASGISGALSSTSSAPSQATTSAPTGVATPGAPVPASSEKAGLLPPAPVMQ"]
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,22 +1,22 @@
 | 
			
		||||
[
 | 
			
		||||
  { "ID" : 227321,
 | 
			
		||||
    "species" : "Aspergillus nidulans FGSC A4"},
 | 
			
		||||
  { "ID" : 930090,
 | 
			
		||||
    "species" : "Bipolaris oryzae ATCC 44560"},
 | 
			
		||||
  { "ID" : 240176,
 | 
			
		||||
    "species" : "Coprinopsis cinerea okayama7#130"},
 | 
			
		||||
  { "ID" : 214684,
 | 
			
		||||
    "species" : "Cryptococcus neoformans var. neoformans JEC21"},
 | 
			
		||||
  { "ID" : 367110,
 | 
			
		||||
    "species" : "Neurospora crassa OR74A"},
 | 
			
		||||
  { "ID" : 418459,
 | 
			
		||||
    "species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"},
 | 
			
		||||
  { "ID" : 559292,
 | 
			
		||||
    "species" : "Saccharomyces cerevisiae S288C"},
 | 
			
		||||
  { "ID" : 284812,
 | 
			
		||||
    "species" : "Schizosaccharomyces pombe 972h-"},
 | 
			
		||||
  { "ID" : 237631,
 | 
			
		||||
    "species" : "Ustilago maydis 521"},
 | 
			
		||||
  { "ID" : 671144,
 | 
			
		||||
    "species" : "Wallemia mellicola CBS 633.66"}
 | 
			
		||||
]
 | 
			
		||||
[
 | 
			
		||||
  { "ID" : 227321,
 | 
			
		||||
    "species" : "Aspergillus nidulans FGSC A4"},
 | 
			
		||||
  { "ID" : 930090,
 | 
			
		||||
    "species" : "Bipolaris oryzae ATCC 44560"},
 | 
			
		||||
  { "ID" : 240176,
 | 
			
		||||
    "species" : "Coprinopsis cinerea okayama7#130"},
 | 
			
		||||
  { "ID" : 214684,
 | 
			
		||||
    "species" : "Cryptococcus neoformans var. neoformans JEC21"},
 | 
			
		||||
  { "ID" : 367110,
 | 
			
		||||
    "species" : "Neurospora crassa OR74A"},
 | 
			
		||||
  { "ID" : 418459,
 | 
			
		||||
    "species" : "Puccinia graminis f. sp. tritici CRL 75-36-700-3"},
 | 
			
		||||
  { "ID" : 559292,
 | 
			
		||||
    "species" : "Saccharomyces cerevisiae S288C"},
 | 
			
		||||
  { "ID" : 284812,
 | 
			
		||||
    "species" : "Schizosaccharomyces pombe 972h-"},
 | 
			
		||||
  { "ID" : 237631,
 | 
			
		||||
    "species" : "Ustilago maydis 521"},
 | 
			
		||||
  { "ID" : 671144,
 | 
			
		||||
    "species" : "Wallemia mellicola CBS 633.66"}
 | 
			
		||||
]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,115 +1,115 @@
 | 
			
		||||
ID	protein.ID	feature.ID	start	end	note
 | 
			
		||||
# MBP1_SACCE
 | 
			
		||||
NA	ref_pro_4	ref_ftr_1	4	102	APSES fold
 | 
			
		||||
NA	ref_pro_4	ref_ftr_2	22	105	KilA-N
 | 
			
		||||
NA	ref_pro_4	ref_ftr_4	108	122	low complexity
 | 
			
		||||
NA	ref_pro_4	ref_ftr_4	236	241	low complexity
 | 
			
		||||
NA	ref_pro_4	ref_ftr_4	279	307	low complexity
 | 
			
		||||
NA	ref_pro_4	ref_ftr_4	700	717	low complexity
 | 
			
		||||
NA	ref_pro_4	ref_ftr_4	700	717	low complexity
 | 
			
		||||
NA	ref_pro_4	ref_ftr_5	394	423	Ankyrin
 | 
			
		||||
NA	ref_pro_4	ref_ftr_5	427	463	Ankyrin
 | 
			
		||||
NA	ref_pro_4	ref_ftr_5	512	541	Ankyrin
 | 
			
		||||
NA	ref_pro_4	ref_ftr_6	381	547	Swi6 fold
 | 
			
		||||
NA	ref_pro_4	ref_ftr_7	633	655	coiled coil
 | 
			
		||||
# MBP1_ASPNI
 | 
			
		||||
NA	ref_pro_1	ref_ftr_1	9	106	APSES fold
 | 
			
		||||
NA	ref_pro_1	ref_ftr_2	26	109	KilA-N
 | 
			
		||||
NA	ref_pro_1	ref_ftr_4	529	534	low complexity
 | 
			
		||||
NA	ref_pro_1	ref_ftr_5	260	289	Ankyrin
 | 
			
		||||
NA	ref_pro_1	ref_ftr_5	381	413	Ankyrin
 | 
			
		||||
NA	ref_pro_1	ref_ftr_6	193	402	Swi6 fold
 | 
			
		||||
NA	ref_pro_1	ref_ftr_7	509	572	coiled coil
 | 
			
		||||
# MBP1_BIPOR
 | 
			
		||||
NA	ref_pro_2	ref_ftr_1	8	106	APSES fold
 | 
			
		||||
NA	ref_pro_2	ref_ftr_2	26	109	KilA-N
 | 
			
		||||
NA	ref_pro_2	ref_ftr_4	134	152	low complexity
 | 
			
		||||
NA	ref_pro_2	ref_ftr_4	267	278	low complexity
 | 
			
		||||
NA	ref_pro_2	ref_ftr_4	670	685	low complexity
 | 
			
		||||
NA	ref_pro_2	ref_ftr_5	266	295	Ankyrin
 | 
			
		||||
NA	ref_pro_2	ref_ftr_5	387	416	Ankyrin
 | 
			
		||||
NA	ref_pro_2	ref_ftr_6	253	421	Swi6 fold
 | 
			
		||||
NA	ref_pro_2	ref_ftr_7	659	681	coiled coil
 | 
			
		||||
NA	ref_pro_2	ref_ftr_7	500	590	coiled coil
 | 
			
		||||
# MBP1_NEUCR
 | 
			
		||||
NA	ref_pro_3	ref_ftr_1	14	114	APSES fold
 | 
			
		||||
NA	ref_pro_3	ref_ftr_2	34	117	KilA-N
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	130	141	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	253	266	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	514	525	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	554	564	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	601	618	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	620	629	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	636	652	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	658	672	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	725	735	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	752	771	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_5	268	297	Ankyrin
 | 
			
		||||
NA	ref_pro_3	ref_ftr_5	390	419	Ankyrin
 | 
			
		||||
NA	ref_pro_3	ref_ftr_6	270	426	Swi6 fold
 | 
			
		||||
NA	ref_pro_3	ref_ftr_7	500	550	coiled coil
 | 
			
		||||
# MBP1_SCHPO
 | 
			
		||||
NA	ref_pro_5	ref_ftr_1	8	104	APSES fold
 | 
			
		||||
NA	ref_pro_5	ref_ftr_2	25	113	KilA-N
 | 
			
		||||
NA	ref_pro_5	ref_ftr_4	111	125	low complexity
 | 
			
		||||
NA	ref_pro_5	ref_ftr_4	136	145	low complexity
 | 
			
		||||
NA	ref_pro_5	ref_ftr_4	176	191	low complexity
 | 
			
		||||
NA	ref_pro_5	ref_ftr_4	422	447	low complexity
 | 
			
		||||
NA	ref_pro_5	ref_ftr_5	247	276	Ankyrin
 | 
			
		||||
NA	ref_pro_5	ref_ftr_5	368	397	Ankyrin
 | 
			
		||||
NA	ref_pro_5	ref_ftr_6	234	400	Swi6 fold
 | 
			
		||||
NA	ref_pro_5	ref_ftr_7	457	538	coiled coil
 | 
			
		||||
# MBP1_COPCI
 | 
			
		||||
NA	ref_pro_6	ref_ftr_1	5	103	APSES fold
 | 
			
		||||
NA	ref_pro_6	ref_ftr_2	23	106	KilA-N
 | 
			
		||||
NA	ref_pro_6	ref_ftr_4	170	191	low complexity
 | 
			
		||||
NA	ref_pro_6	ref_ftr_4	435	450	low complexity
 | 
			
		||||
NA	ref_pro_6	ref_ftr_4	611	626	low complexity
 | 
			
		||||
NA	ref_pro_6	ref_ftr_5	270	299	Ankyrin
 | 
			
		||||
NA	ref_pro_6	ref_ftr_5	389	418	Ankyrin
 | 
			
		||||
NA	ref_pro_6	ref_ftr_5	474	509	Ankyrin
 | 
			
		||||
NA	ref_pro_6	ref_ftr_6	257	429	Swi6 fold
 | 
			
		||||
NA	ref_pro_6	ref_ftr_7	500	570	coiled coil
 | 
			
		||||
NA	ref_pro_6	ref_ftr_7	651	678	coiled coil
 | 
			
		||||
# MBP1_CRYNE
 | 
			
		||||
NA	ref_pro_7	ref_ftr_1	113	211	APSES fold
 | 
			
		||||
NA	ref_pro_7	ref_ftr_2	131	215	KilA-N
 | 
			
		||||
NA	ref_pro_7	ref_ftr_4	66	85	low complexity
 | 
			
		||||
NA	ref_pro_7	ref_ftr_4	413	423	low complexity
 | 
			
		||||
NA	ref_pro_7	ref_ftr_4	633	644	low complexity
 | 
			
		||||
NA	ref_pro_7	ref_ftr_4	697	709	low complexity
 | 
			
		||||
NA	ref_pro_7	ref_ftr_5	477	506	Ankyrin
 | 
			
		||||
NA	ref_pro_7	ref_ftr_5	618	647	Ankyrin
 | 
			
		||||
NA	ref_pro_7	ref_ftr_6	452	663	Swi6 fold
 | 
			
		||||
# MBP1_PUCGR
 | 
			
		||||
NA	ref_pro_8	ref_ftr_1	90	187	APSES fold
 | 
			
		||||
NA	ref_pro_8	ref_ftr_2	107	190	KilA-N
 | 
			
		||||
NA	ref_pro_8	ref_ftr_4	208	227	low complexity
 | 
			
		||||
NA	ref_pro_8	ref_ftr_4	273	291	low complexity
 | 
			
		||||
NA	ref_pro_8	ref_ftr_5	442	271	Ankyrin
 | 
			
		||||
NA	ref_pro_8	ref_ftr_5	475	509	Ankyrin
 | 
			
		||||
NA	ref_pro_8	ref_ftr_5	561	590	Ankyrin
 | 
			
		||||
NA	ref_pro_8	ref_ftr_6	429	601	Swi6 fold
 | 
			
		||||
NA	ref_pro_8	ref_ftr_7	827	863	coiled coil
 | 
			
		||||
# MBP1_USTMA
 | 
			
		||||
NA	ref_pro_9	ref_ftr_1	7	104	APSES fold
 | 
			
		||||
NA	ref_pro_9	ref_ftr_2	24	107	KilA-N
 | 
			
		||||
NA	ref_pro_9	ref_ftr_4	106	116	low complexity
 | 
			
		||||
NA	ref_pro_9	ref_ftr_4	161	183	low complexity
 | 
			
		||||
NA	ref_pro_9	ref_ftr_4	657	672	low complexity
 | 
			
		||||
NA	ref_pro_9	ref_ftr_4	776	796	low complexity
 | 
			
		||||
NA	ref_pro_9	ref_ftr_5	245	274	Ankyrin
 | 
			
		||||
NA	ref_pro_9	ref_ftr_5	355	384	Ankyrin
 | 
			
		||||
NA	ref_pro_9	ref_ftr_6	232	395	Swi6 fold
 | 
			
		||||
NA	ref_pro_9	ref_ftr_7	581	609	coiled coil
 | 
			
		||||
# MBP1_WALME
 | 
			
		||||
NA	ref_pro_10	ref_ftr_1	6	103	APSES fold
 | 
			
		||||
NA	ref_pro_10	ref_ftr_2	23	106	KilA-N
 | 
			
		||||
NA	ref_pro_10	ref_ftr_4	149	162	low complexity
 | 
			
		||||
NA	ref_pro_10	ref_ftr_4	171	188	low complexity
 | 
			
		||||
NA	ref_pro_10	ref_ftr_4	618	628	low complexity
 | 
			
		||||
NA	ref_pro_10	ref_ftr_4	634	660	low complexity
 | 
			
		||||
NA	ref_pro_10	ref_ftr_5	250	279	Ankyrin
 | 
			
		||||
NA	ref_pro_10	ref_ftr_5	369	398	Ankyrin
 | 
			
		||||
NA	ref_pro_10	ref_ftr_6	237	409	Swi6 fold
 | 
			
		||||
NA	ref_pro_10	ref_ftr_7	461	585	coiled coil
 | 
			
		||||
ID	protein.ID	feature.ID	start	end	note
 | 
			
		||||
# MBP1_SACCE
 | 
			
		||||
NA	ref_pro_4	ref_ftr_1	4	102	APSES fold
 | 
			
		||||
NA	ref_pro_4	ref_ftr_2	22	105	KilA-N
 | 
			
		||||
NA	ref_pro_4	ref_ftr_4	108	122	low complexity
 | 
			
		||||
NA	ref_pro_4	ref_ftr_4	236	241	low complexity
 | 
			
		||||
NA	ref_pro_4	ref_ftr_4	279	307	low complexity
 | 
			
		||||
NA	ref_pro_4	ref_ftr_4	700	717	low complexity
 | 
			
		||||
NA	ref_pro_4	ref_ftr_4	700	717	low complexity
 | 
			
		||||
NA	ref_pro_4	ref_ftr_5	394	423	Ankyrin
 | 
			
		||||
NA	ref_pro_4	ref_ftr_5	427	463	Ankyrin
 | 
			
		||||
NA	ref_pro_4	ref_ftr_5	512	541	Ankyrin
 | 
			
		||||
NA	ref_pro_4	ref_ftr_6	381	547	Swi6 fold
 | 
			
		||||
NA	ref_pro_4	ref_ftr_7	633	655	coiled coil
 | 
			
		||||
# MBP1_ASPNI
 | 
			
		||||
NA	ref_pro_1	ref_ftr_1	9	106	APSES fold
 | 
			
		||||
NA	ref_pro_1	ref_ftr_2	26	109	KilA-N
 | 
			
		||||
NA	ref_pro_1	ref_ftr_4	529	534	low complexity
 | 
			
		||||
NA	ref_pro_1	ref_ftr_5	260	289	Ankyrin
 | 
			
		||||
NA	ref_pro_1	ref_ftr_5	381	413	Ankyrin
 | 
			
		||||
NA	ref_pro_1	ref_ftr_6	193	402	Swi6 fold
 | 
			
		||||
NA	ref_pro_1	ref_ftr_7	509	572	coiled coil
 | 
			
		||||
# MBP1_BIPOR
 | 
			
		||||
NA	ref_pro_2	ref_ftr_1	8	106	APSES fold
 | 
			
		||||
NA	ref_pro_2	ref_ftr_2	26	109	KilA-N
 | 
			
		||||
NA	ref_pro_2	ref_ftr_4	134	152	low complexity
 | 
			
		||||
NA	ref_pro_2	ref_ftr_4	267	278	low complexity
 | 
			
		||||
NA	ref_pro_2	ref_ftr_4	670	685	low complexity
 | 
			
		||||
NA	ref_pro_2	ref_ftr_5	266	295	Ankyrin
 | 
			
		||||
NA	ref_pro_2	ref_ftr_5	387	416	Ankyrin
 | 
			
		||||
NA	ref_pro_2	ref_ftr_6	253	421	Swi6 fold
 | 
			
		||||
NA	ref_pro_2	ref_ftr_7	659	681	coiled coil
 | 
			
		||||
NA	ref_pro_2	ref_ftr_7	500	590	coiled coil
 | 
			
		||||
# MBP1_NEUCR
 | 
			
		||||
NA	ref_pro_3	ref_ftr_1	14	114	APSES fold
 | 
			
		||||
NA	ref_pro_3	ref_ftr_2	34	117	KilA-N
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	130	141	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	253	266	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	514	525	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	554	564	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	601	618	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	620	629	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	636	652	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	658	672	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	725	735	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_4	752	771	low complexity
 | 
			
		||||
NA	ref_pro_3	ref_ftr_5	268	297	Ankyrin
 | 
			
		||||
NA	ref_pro_3	ref_ftr_5	390	419	Ankyrin
 | 
			
		||||
NA	ref_pro_3	ref_ftr_6	270	426	Swi6 fold
 | 
			
		||||
NA	ref_pro_3	ref_ftr_7	500	550	coiled coil
 | 
			
		||||
# MBP1_SCHPO
 | 
			
		||||
NA	ref_pro_5	ref_ftr_1	8	104	APSES fold
 | 
			
		||||
NA	ref_pro_5	ref_ftr_2	25	113	KilA-N
 | 
			
		||||
NA	ref_pro_5	ref_ftr_4	111	125	low complexity
 | 
			
		||||
NA	ref_pro_5	ref_ftr_4	136	145	low complexity
 | 
			
		||||
NA	ref_pro_5	ref_ftr_4	176	191	low complexity
 | 
			
		||||
NA	ref_pro_5	ref_ftr_4	422	447	low complexity
 | 
			
		||||
NA	ref_pro_5	ref_ftr_5	247	276	Ankyrin
 | 
			
		||||
NA	ref_pro_5	ref_ftr_5	368	397	Ankyrin
 | 
			
		||||
NA	ref_pro_5	ref_ftr_6	234	400	Swi6 fold
 | 
			
		||||
NA	ref_pro_5	ref_ftr_7	457	538	coiled coil
 | 
			
		||||
# MBP1_COPCI
 | 
			
		||||
NA	ref_pro_6	ref_ftr_1	5	103	APSES fold
 | 
			
		||||
NA	ref_pro_6	ref_ftr_2	23	106	KilA-N
 | 
			
		||||
NA	ref_pro_6	ref_ftr_4	170	191	low complexity
 | 
			
		||||
NA	ref_pro_6	ref_ftr_4	435	450	low complexity
 | 
			
		||||
NA	ref_pro_6	ref_ftr_4	611	626	low complexity
 | 
			
		||||
NA	ref_pro_6	ref_ftr_5	270	299	Ankyrin
 | 
			
		||||
NA	ref_pro_6	ref_ftr_5	389	418	Ankyrin
 | 
			
		||||
NA	ref_pro_6	ref_ftr_5	474	509	Ankyrin
 | 
			
		||||
NA	ref_pro_6	ref_ftr_6	257	429	Swi6 fold
 | 
			
		||||
NA	ref_pro_6	ref_ftr_7	500	570	coiled coil
 | 
			
		||||
NA	ref_pro_6	ref_ftr_7	651	678	coiled coil
 | 
			
		||||
# MBP1_CRYNE
 | 
			
		||||
NA	ref_pro_7	ref_ftr_1	113	211	APSES fold
 | 
			
		||||
NA	ref_pro_7	ref_ftr_2	131	215	KilA-N
 | 
			
		||||
NA	ref_pro_7	ref_ftr_4	66	85	low complexity
 | 
			
		||||
NA	ref_pro_7	ref_ftr_4	413	423	low complexity
 | 
			
		||||
NA	ref_pro_7	ref_ftr_4	633	644	low complexity
 | 
			
		||||
NA	ref_pro_7	ref_ftr_4	697	709	low complexity
 | 
			
		||||
NA	ref_pro_7	ref_ftr_5	477	506	Ankyrin
 | 
			
		||||
NA	ref_pro_7	ref_ftr_5	618	647	Ankyrin
 | 
			
		||||
NA	ref_pro_7	ref_ftr_6	452	663	Swi6 fold
 | 
			
		||||
# MBP1_PUCGR
 | 
			
		||||
NA	ref_pro_8	ref_ftr_1	90	187	APSES fold
 | 
			
		||||
NA	ref_pro_8	ref_ftr_2	107	190	KilA-N
 | 
			
		||||
NA	ref_pro_8	ref_ftr_4	208	227	low complexity
 | 
			
		||||
NA	ref_pro_8	ref_ftr_4	273	291	low complexity
 | 
			
		||||
NA	ref_pro_8	ref_ftr_5	442	271	Ankyrin
 | 
			
		||||
NA	ref_pro_8	ref_ftr_5	475	509	Ankyrin
 | 
			
		||||
NA	ref_pro_8	ref_ftr_5	561	590	Ankyrin
 | 
			
		||||
NA	ref_pro_8	ref_ftr_6	429	601	Swi6 fold
 | 
			
		||||
NA	ref_pro_8	ref_ftr_7	827	863	coiled coil
 | 
			
		||||
# MBP1_USTMA
 | 
			
		||||
NA	ref_pro_9	ref_ftr_1	7	104	APSES fold
 | 
			
		||||
NA	ref_pro_9	ref_ftr_2	24	107	KilA-N
 | 
			
		||||
NA	ref_pro_9	ref_ftr_4	106	116	low complexity
 | 
			
		||||
NA	ref_pro_9	ref_ftr_4	161	183	low complexity
 | 
			
		||||
NA	ref_pro_9	ref_ftr_4	657	672	low complexity
 | 
			
		||||
NA	ref_pro_9	ref_ftr_4	776	796	low complexity
 | 
			
		||||
NA	ref_pro_9	ref_ftr_5	245	274	Ankyrin
 | 
			
		||||
NA	ref_pro_9	ref_ftr_5	355	384	Ankyrin
 | 
			
		||||
NA	ref_pro_9	ref_ftr_6	232	395	Swi6 fold
 | 
			
		||||
NA	ref_pro_9	ref_ftr_7	581	609	coiled coil
 | 
			
		||||
# MBP1_WALME
 | 
			
		||||
NA	ref_pro_10	ref_ftr_1	6	103	APSES fold
 | 
			
		||||
NA	ref_pro_10	ref_ftr_2	23	106	KilA-N
 | 
			
		||||
NA	ref_pro_10	ref_ftr_4	149	162	low complexity
 | 
			
		||||
NA	ref_pro_10	ref_ftr_4	171	188	low complexity
 | 
			
		||||
NA	ref_pro_10	ref_ftr_4	618	628	low complexity
 | 
			
		||||
NA	ref_pro_10	ref_ftr_4	634	660	low complexity
 | 
			
		||||
NA	ref_pro_10	ref_ftr_5	250	279	Ankyrin
 | 
			
		||||
NA	ref_pro_10	ref_ftr_5	369	398	Ankyrin
 | 
			
		||||
NA	ref_pro_10	ref_ftr_6	237	409	Swi6 fold
 | 
			
		||||
NA	ref_pro_10	ref_ftr_7	461	585	coiled coil
 | 
			
		||||
 
 | 
			
		||||
@@ -1,37 +1,37 @@
 | 
			
		||||
# functionTemplate.R
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  (General)
 | 
			
		||||
#
 | 
			
		||||
# ToDo:
 | 
			
		||||
# Notes:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
myFunction <- function(a, b=1) {
 | 
			
		||||
	# Purpose:
 | 
			
		||||
	#     Describe ...
 | 
			
		||||
    # Version:
 | 
			
		||||
    # Date:
 | 
			
		||||
    # Author:
 | 
			
		||||
    #
 | 
			
		||||
    # Parameters:
 | 
			
		||||
	#     a: ...
 | 
			
		||||
	#     b: ...
 | 
			
		||||
	# Value:
 | 
			
		||||
	#     result: ...
 | 
			
		||||
	# Example: <example invocation>
 | 
			
		||||
 | 
			
		||||
	# code ...
 | 
			
		||||
 | 
			
		||||
	return(result)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ====  TESTS  =================================================================
 | 
			
		||||
# Enter your function tests here...
 | 
			
		||||
 | 
			
		||||
if (FALSE) {
 | 
			
		||||
  # test ...
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# functionTemplate.R
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  (General)
 | 
			
		||||
#
 | 
			
		||||
# ToDo:
 | 
			
		||||
# Notes:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
myFunction <- function(a, b=1) {
 | 
			
		||||
	# Purpose:
 | 
			
		||||
	#     Describe ...
 | 
			
		||||
    # Version:
 | 
			
		||||
    # Date:
 | 
			
		||||
    # Author:
 | 
			
		||||
    #
 | 
			
		||||
    # Parameters:
 | 
			
		||||
	#     a: ...
 | 
			
		||||
	#     b: ...
 | 
			
		||||
	# Value:
 | 
			
		||||
	#     result: ...
 | 
			
		||||
	# Example: <example invocation>
 | 
			
		||||
 | 
			
		||||
	# code ...
 | 
			
		||||
 | 
			
		||||
	return(result)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ====  TESTS  =================================================================
 | 
			
		||||
# Enter your function tests here...
 | 
			
		||||
 | 
			
		||||
if (FALSE) {
 | 
			
		||||
  # test ...
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,21 +1,21 @@
 | 
			
		||||
# .myProfile.R
 | 
			
		||||
# This contains information which the course framework needs from time to time
 | 
			
		||||
# to personalize assignments, validate submissions etc. Make sure that
 | 
			
		||||
# the information correctly matches our official records.
 | 
			
		||||
# myEMail          char      A string with your eMail address. Use your official
 | 
			
		||||
#                            UofT eMail address.
 | 
			
		||||
# myStudentNumber  numeric   Your UofT student number. Take care to have this
 | 
			
		||||
#                            correct.
 | 
			
		||||
#
 | 
			
		||||
# NOTE:
 | 
			
		||||
# After you have updated this script, move the file to your "myScripts" folder.
 | 
			
		||||
# Utility scripts will look for it on the path: "./myScripts/.myProfile.R"
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
# options(stringsAsFactors = FALSE)
 | 
			
		||||
 | 
			
		||||
myEMail <- "yh.deng@mail.utoronto.ca"        # e.g. "u.franklin@utoronto.ca"
 | 
			
		||||
myStudentNumber <- 1005845285  # e.g. 1003141592
 | 
			
		||||
MYSPE <- "Cutaneotrichosporon oleaginosum" 
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# .myProfile.R
 | 
			
		||||
# This contains information which the course framework needs from time to time
 | 
			
		||||
# to personalize assignments, validate submissions etc. Make sure that
 | 
			
		||||
# the information correctly matches our official records.
 | 
			
		||||
# myEMail          char      A string with your eMail address. Use your official
 | 
			
		||||
#                            UofT eMail address.
 | 
			
		||||
# myStudentNumber  numeric   Your UofT student number. Take care to have this
 | 
			
		||||
#                            correct.
 | 
			
		||||
#
 | 
			
		||||
# NOTE:
 | 
			
		||||
# After you have updated this script, move the file to your "myScripts" folder.
 | 
			
		||||
# Utility scripts will look for it on the path: "./myScripts/.myProfile.R"
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
# options(stringsAsFactors = FALSE)
 | 
			
		||||
 | 
			
		||||
myEMail <- "yh.deng@mail.utoronto.ca"        # e.g. "u.franklin@utoronto.ca"
 | 
			
		||||
myStudentNumber <- 1005845285  # e.g. 1003141592
 | 
			
		||||
MYSPE <- "Cutaneotrichosporon oleaginosum" 
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,54 +1,51 @@
 | 
			
		||||
myFA <-             readFASTA("data/RAB39B_HSa_coding.fa")
 | 
			
		||||
myFA <- rbind(myFA, readFASTA("data/PTPN5_HSa_coding.fa"))
 | 
			
		||||
myFA <- rbind(myFA, readFASTA("data/PTPN11_HSa_coding.fa"))
 | 
			
		||||
myFA <- rbind(myFA, readFASTA("data/KRAS_HSa_coding.fa"))
 | 
			
		||||
rownames(myFA)<-c("RAB39B", "PTPN5", "PTPN11", "KRAS") # Assign row names
 | 
			
		||||
 | 
			
		||||
gen_mutations <- function(seq, N) {
  # Purpose:
  #     Apply N independent, randomly chosen single-nucleotide mutations
  #     (insertion, deletion or substitution) to a DNA sequence and count how
  #     often the translated product is unchanged ("silent"), changed
  #     ("missense"), or gains an earlier stop codon ("nonsense").
  # Parameters:
  #     seq: character string with the DNA sequence to mutate
  #     N:   number of mutation trials to run
  # Value:
  #     A 3 x 1 numeric matrix with rows "silent", "missense", "nonsense"
  #     and the single column "occurrences".

  stats <- matrix(0, nrow = 3, ncol = 1,
                  dimnames = list(c("silent", "missense", "nonsense"),
                                  "occurrences"))

  # The unmutated sequence, its translation and its first stop position do
  # not change between trials, so compute them once.
  working_seq <- Biostrings::DNAString(seq)
  n_nt <- length(working_seq)
  aa_seq <- Biostrings::translate(working_seq, no.init.codon = TRUE)
  aa_seq_stop <- match("*", Biostrings::as.matrix(aa_seq))

  # Honour N (the loop bound used to be a hard-coded constant); seq_len()
  # also yields zero iterations for N == 0.
  for (i in seq_len(N)) {
    mut_action <- sample(c("ins", "del", "sub"), 1)

    if (mut_action == "sub") {
      # Replace one base with one of the other bases.
      mut_index <- sample.int(n_nt, 1)
      alternatives <- setdiff(Biostrings::DNA_BASES,
                              as.character(working_seq[mut_index]))
      mut_change <- sample(alternatives, 1)
      mut_seq <- Biostrings::replaceLetterAt(working_seq, mut_index,
                                             mut_change)
    } else if (mut_action == "ins") {
      # Insert a random base in front of position mut_index.
      # Note: `1:n - 2` parses as `(1:n) - 2`; the parenthesised range
      # 1:(n - 2) is what is sampled here (at least one position).
      mut_index <- sample.int(max(n_nt - 2L, 1L), 1)
      mut_seq <- Biostrings::DNAString(paste0(
        as.character(substring(working_seq, 1, mut_index - 1)),
        sample(Biostrings::DNA_BASES, 1),
        as.character(substring(working_seq, mut_index))
      ))
    } else {
      # Delete one base.
      mut_index <- sample.int(n_nt, 1)
      mut_seq <- working_seq[-mut_index]
    }

    # Trim trailing bases that no longer form a complete codon.
    usable <- length(mut_seq) - (length(mut_seq) %% 3)
    mut_seq <- Biostrings::DNAString(substring(mut_seq, 1, usable))
    mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE)

    # Nonsense: the mutant has a stop codon and the original has none, or
    # the mutant's first stop comes earlier than the original's.
    mut_aa_stop <- match("*", Biostrings::as.matrix(mut_aa))
    if (!is.na(mut_aa_stop) &&
        (is.na(aa_seq_stop) || mut_aa_stop < aa_seq_stop)) {
      stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"]
    } else if (mut_aa == aa_seq) {
      stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"]
    } else {
      stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"]
    }
  }

  return(stats)
}
 | 
			
		||||
N_test <- 1200
 | 
			
		||||
gen_mutations("ATGATGATGATGATGATG", N_test)
 | 
			
		||||
gen_mutations("CCCCCCCCCCCCCCCCCC", N_test)
 | 
			
		||||
gen_mutations("TATTACTATTACTATTAC", N_test)
 | 
			
		||||
gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", N_test)
 | 
			
		||||
gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", N_test)
 | 
			
		||||
gen_mutations <- function(seq, N) {
  # Purpose:
  #     Apply N independent random point substitutions to a DNA sequence and
  #     count how often the translated product is unchanged ("silent"),
  #     changed ("missense"), or gains an earlier stop codon ("nonsense").
  # Parameters:
  #     seq: character string with the DNA sequence to mutate
  #     N:   number of substitution trials to run
  # Value:
  #     A 3 x 1 numeric matrix with rows "silent", "missense", "nonsense"
  #     and the single column "occurrences".

  sealKey() # See: http://steipe.biochemistry.utoronto.ca/abc/index.php/BCH441_Code_submisson_instructions

  stats <- matrix(0, nrow = 3, ncol = 1,
                  dimnames = list(c("silent", "missense", "nonsense"),
                                  "occurrences"))

  # The reference sequence, its translation and the position of its first
  # stop codon are the same for every trial: compute them once.
  original_seq <- Biostrings::DNAString(seq)
  n_nt <- length(original_seq)
  aa_seq <- Biostrings::translate(original_seq, no.init.codon = TRUE)
  term_aa <- as.integer(regexpr("\\*", as.character(aa_seq)))  # -1: no stop

  # seq_len() gives zero iterations for N == 0 (1:0 would run twice).
  for (i in seq_len(N)) {
    # Pick a position and replace its base with one of the other bases.
    mut_index <- sample.int(n_nt, 1)
    alternatives <- setdiff(Biostrings::DNA_BASES,
                            as.character(original_seq[mut_index]))
    mut_seq <- Biostrings::replaceLetterAt(original_seq, mut_index,
                                           sample(alternatives, 1))
    mut_aa <- Biostrings::translate(mut_seq, no.init.codon = TRUE)
    term_mut_aa <- as.integer(regexpr("\\*", as.character(mut_aa)))

    # Nonsense: the mutant has a stop codon and the original has none, or
    # the mutant's first stop comes earlier than the original's.
    if (term_mut_aa != -1 && (term_aa == -1 || term_mut_aa < term_aa)) {
      stats["nonsense", "occurrences"] <- 1 + stats["nonsense", "occurrences"]
    } else if (mut_aa == aa_seq) {
      stats["silent", "occurrences"] <- 1 + stats["silent", "occurrences"]
    } else {
      stats["missense", "occurrences"] <- 1 + stats["missense", "occurrences"]
    }
  }

  sealKey()
  return(stats)
}
 | 
			
		||||
 | 
			
		||||
gen_mutations("ATGATGATGATGATGATG", 1000)
 | 
			
		||||
gen_mutations("CCCCCCCCCCCCCCCCCC", 500)
 | 
			
		||||
gen_mutations("TATTACTATTACTATTAC", 500)
 | 
			
		||||
gen_mutations("TGGTGGTGGTGGTGGTGGTGGTGG", 500)
 | 
			
		||||
gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGT", 500)
 | 
			
		||||
gen_mutations("TGTTGTTGTTGTTGTTGTTGTTGA", 500)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
myFA <-             readFASTA("data/RAB39B_HSa_coding.fa")
 | 
			
		||||
myFA <- rbind(myFA, readFASTA("data/PTPN5_HSa_coding.fa"))
 | 
			
		||||
myFA <- rbind(myFA, readFASTA("data/PTPN11_HSa_coding.fa"))
 | 
			
		||||
myFA <- rbind(myFA, readFASTA("data/KRAS_HSa_coding.fa"))
 | 
			
		||||
rownames(myFA)<-c("RAB39B", "PTPN5", "PTPN11", "KRAS") # Assign row names
 | 
			
		||||
 | 
			
		||||
gen_mutations(myFA["RAB39B", 2], 10000)
 | 
			
		||||
gen_mutations(myFA["PTPN5", 2], 10000)
 | 
			
		||||
gen_mutations(myFA["PTPN11", 2], 10000)
 | 
			
		||||
gen_mutations(myFA["KRAS", 2], 10000)
 | 
			
		||||
 
 | 
			
		||||
@@ -1,41 +1,41 @@
 | 
			
		||||
# ==   1.3  Task: submit for credit (part 1/2)  ================================
 | 
			
		||||
# == Submission - Code to add another philosopher to the datamodel:
 | 
			
		||||
 | 
			
		||||
pID <- autoincrement(philDB$person)
 | 
			
		||||
immanuelKant <- data.frame(id = pID,
 | 
			
		||||
                           name = "Immanuel Kant",
 | 
			
		||||
                           born = "1724",
 | 
			
		||||
                           died = "1804",
 | 
			
		||||
                           school = "Enlightenment Philosophy")
 | 
			
		||||
philDB$person <- rbind(philDB$person, immanuelKant)
 | 
			
		||||
 | 
			
		||||
bID = autoincrement(philDB$books)
 | 
			
		||||
immanuelKantWork <- data.frame(id = bID,
 | 
			
		||||
                               title = "Critique of Pure Reason",
 | 
			
		||||
                               published = "1781")
 | 
			
		||||
philDB$books <- rbind(philDB$books, immanuelKantWork)
 | 
			
		||||
philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
 | 
			
		||||
 | 
			
		||||
bID = autoincrement(philDB$books)
 | 
			
		||||
immanuelKantWork <- data.frame(id = bID,
 | 
			
		||||
                               title = "Critique of Judgement",
 | 
			
		||||
                               published = "1790")
 | 
			
		||||
philDB$books <- rbind(philDB$books, immanuelKantWork)
 | 
			
		||||
philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
 | 
			
		||||
 | 
			
		||||
# == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order.
 | 
			
		||||
 | 
			
		||||
schools <- unique(philDB$person$school)
 | 
			
		||||
schools <- sort(schools)
 | 
			
		||||
 | 
			
		||||
for (s in schools) {
 | 
			
		||||
  cat(sprintf("%s\n", s))
 | 
			
		||||
  authors = which(philDB$person$school == s)
 | 
			
		||||
  for (author in authors) {
 | 
			
		||||
    works = which(philDB$works$personID == author)
 | 
			
		||||
    for (work in works) {
 | 
			
		||||
      bookId = which(philDB$books$id == philDB$works$bookID[work])
 | 
			
		||||
      cat(sprintf("\t%s - (%s)\n", philDB$books$title[bookId], philDB$books$published[bookId]))
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
# ==   1.3  Task: submit for credit (part 1/2)  ================================
 | 
			
		||||
# == Submission - Code to add another philosopher to the datamodel:
 | 
			
		||||
 | 
			
		||||
pID <- autoincrement(philDB$person)
 | 
			
		||||
immanuelKant <- data.frame(id = pID,
 | 
			
		||||
                           name = "Immanuel Kant",
 | 
			
		||||
                           born = "1724",
 | 
			
		||||
                           died = "1804",
 | 
			
		||||
                           school = "Enlightenment Philosophy")
 | 
			
		||||
philDB$person <- rbind(philDB$person, immanuelKant)
 | 
			
		||||
 | 
			
		||||
bID = autoincrement(philDB$books)
 | 
			
		||||
immanuelKantWork <- data.frame(id = bID,
 | 
			
		||||
                               title = "Critique of Pure Reason",
 | 
			
		||||
                               published = "1781")
 | 
			
		||||
philDB$books <- rbind(philDB$books, immanuelKantWork)
 | 
			
		||||
philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
 | 
			
		||||
 | 
			
		||||
bID = autoincrement(philDB$books)
 | 
			
		||||
immanuelKantWork <- data.frame(id = bID,
 | 
			
		||||
                               title = "Critique of Judgement",
 | 
			
		||||
                               published = "1790")
 | 
			
		||||
philDB$books <- rbind(philDB$books, immanuelKantWork)
 | 
			
		||||
philDB$works <- rbind(philDB$works, data.frame(id = autoincrement(philDB$works), personID = pID, bookID = bID))
 | 
			
		||||
 | 
			
		||||
# == Submission: Code to list the philosophical schools in alphabetical order as well as their respective books in alphabetical order.
 | 
			
		||||
 | 
			
		||||
schools <- unique(philDB$person$school)
 | 
			
		||||
schools <- sort(schools)
 | 
			
		||||
 | 
			
		||||
# Print every school followed by the books written by its members, with the
# books sorted alphabetically by title.
for (s in schools) {
  cat(sprintf("%s\n", s))

  # Collect the person IDs (not the row indices) of this school's members,
  # so the lookup in the works table stays correct even when IDs do not
  # coincide with row positions.
  authorIDs <- philDB$person$id[which(philDB$person$school == s)]

  # Join table: all books attributed to any of these authors.
  bookIDs <- philDB$works$bookID[philDB$works$personID %in% authorIDs]
  books <- philDB$books[philDB$books$id %in% bookIDs, , drop = FALSE]

  # Alphabetical order by title, as the task statement requires.
  books <- books[order(as.character(books$title)), , drop = FALSE]

  # sprintf() is vectorized; with no books it yields character(0) and
  # nothing is printed.
  cat(sprintf("\t%s - (%s)\n", books$title, books$published), sep = "")
}
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
[{
 | 
			
		||||
	"ID": 879819,
 | 
			
		||||
	"species": "Cutaneotrichosporon oleaginosum"}
 | 
			
		||||
]
 | 
			
		||||
[{
 | 
			
		||||
	"ID": 879819,
 | 
			
		||||
	"species": "Cutaneotrichosporon oleaginosum"}
 | 
			
		||||
]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,19 +1,19 @@
 | 
			
		||||
[
 | 
			
		||||
  { "name" : "MBP1_CUTOL",
 | 
			
		||||
    "RefSeqID" : "XP_018278493.1",
 | 
			
		||||
    "UniProtID" : "A0A0J0XLN0",
 | 
			
		||||
    "taxonomyID" : 879819,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ",
 | 
			
		||||
       "KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK",
 | 
			
		||||
       "PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS",
 | 
			
		||||
       "MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA",
 | 
			
		||||
       "TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN",
 | 
			
		||||
       "YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD",
 | 
			
		||||
       "EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT",
 | 
			
		||||
       "SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND",
 | 
			
		||||
       "AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD",
 | 
			
		||||
       "LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID",
 | 
			
		||||
       "DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"]
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
[
 | 
			
		||||
  { "name" : "MBP1_CUTOL",
 | 
			
		||||
    "RefSeqID" : "XP_018278493.1",
 | 
			
		||||
    "UniProtID" : "A0A0J0XLN0",
 | 
			
		||||
    "taxonomyID" : 879819,
 | 
			
		||||
    "sequence" : [
 | 
			
		||||
       "MGKKAAAAGDGGPNTIYKATYSGVPVFEFICRNVAVMRRRSDAYLNATQILKVAGFDKPQRTRVLEREVQ",
 | 
			
		||||
       "KGEHEKVQGGYGKYQGTWVPIERGLALAKQYNVEDLLRPIIDFVPRESVSPPPAPKHAVAPPTKRNKEPK",
 | 
			
		||||
       "PKEGLVPIKSAGVLSGTGRHQTPDSVGEDVESEVMDDMSESQTPSPLNGTSLLPAVDERSIDGMDIDGFS",
 | 
			
		||||
       "MMNGGGHARKRSAAMMDDEDEYEQLKRARGNSAVHTPPPPGQSPRYGGMQHPLTQDEYNDIVLNYFVSEA",
 | 
			
		||||
       "TQIPAVMTNPPYNWDPNGIIDDDHHTALHWAAAMGRTRVIKLLLSAGARIFDKNNLDQTPLMRSVMFTNN",
 | 
			
		||||
       "YDLRKFPEVFELLHRSTLNIDKNNRTVFHHIANLALYKGKTHAARYYMEVILSRLADYPQELADVINFAD",
 | 
			
		||||
       "EDGETALTLAARARSKRIVKALLDHGADPKLRNRDHKSAEDYILEDERFRSSPDVMLNRTQPSAAPRNPT",
 | 
			
		||||
       "SLGAAVFSQGLPPQLYNSEAARLASGPHSSDILQQMQALARSFEAEKLNKERDVLEAKAMLTSIHTEVND",
 | 
			
		||||
       "AGRTLHNLGEQMKPLEAKQGELDGLVERLQSKLQKDLARGARKWKAADEGRENRWKNGDDPSQAGEDYSD",
 | 
			
		||||
       "LPELTAIPDNAEAEEERLRGEIEKMRARRGELVTRLVKAQTQTGTTDKMAQYRRLITAGCGGDINPGEID",
 | 
			
		||||
       "DIVGQLLDMLENEAQSGRPAPPPQAAPSWVTS"]
 | 
			
		||||
  }
 | 
			
		||||
]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,8 +1,8 @@
 | 
			
		||||
README - myScripts folder:
 | 
			
		||||
==========================
 | 
			
		||||
 | 
			
		||||
The "myScripts" folder is a place to keep your personal files
 | 
			
		||||
safe. No files will be submitted into this folder on the GitHub master
 | 
			
		||||
copy. Therefore everything you put into this folder is safe from being
 | 
			
		||||
inadvertently overwritten by a file with the same name that would be
 | 
			
		||||
downloaded in a GitHub "pull" request.
 | 
			
		||||
README - myScripts folder:
 | 
			
		||||
==========================
 | 
			
		||||
 | 
			
		||||
The "myScripts" folder is a place to keep your personal files
 | 
			
		||||
safe. No files will be submitted into this folder on the GitHub master
 | 
			
		||||
copy. Therefore everything you put into this folder is safe from being
 | 
			
		||||
inadvertently overwritten by a file with the same name that would be
 | 
			
		||||
downloaded in a GitHub "pull" request.
 | 
			
		||||
 
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
source("./scripts/ABC-createRefDB.R")
 | 
			
		||||
 | 
			
		||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json"))
 | 
			
		||||
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json"))
 | 
			
		||||
source("./scripts/ABC-createRefDB.R")
 | 
			
		||||
 | 
			
		||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./myScripts/MBP1_CUTOL.json"))
 | 
			
		||||
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./myScripts/CUTOLtaxonomy.json"))
 | 
			
		||||
 
 | 
			
		||||
@@ -1,38 +1,38 @@
 | 
			
		||||
# myScript.R
 | 
			
		||||
#
 | 
			
		||||
# --- As you work with this file, you can delete the instructions below --------
 | 
			
		||||
# Write your notes and code experiments into this document. Save it
 | 
			
		||||
# from time to time - however I recommend that you do not _commit_
 | 
			
		||||
# your saved version.
 | 
			
		||||
#
 | 
			
		||||
# As long as you do not _commit_ this script to version control,
 | 
			
		||||
# you can _pull_ updated versions of the entire project from GitHub
 | 
			
		||||
# by using the RStudio version control interface. However, once
 | 
			
		||||
# you _commit_ any file in your local version, RStudio will require
 | 
			
		||||
# you to resolve conflicts before you can _pull_ updates.
 | 
			
		||||
# --- As you work with this file, you can delete the instructions above --------
 | 
			
		||||
#
 | 
			
		||||
## Purpose: <...>
 | 
			
		||||
#
 | 
			
		||||
# Version: <...>
 | 
			
		||||
#
 | 
			
		||||
# Date:    <...>
 | 
			
		||||
# Author:  <Name> (<name@mail.utoronto.ca>)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#
 | 
			
		||||
#   <number>    <Features>
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#   <...>
 | 
			
		||||
#
 | 
			
		||||
# ====================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 | 
			
		||||
# myScript.R
 | 
			
		||||
#
 | 
			
		||||
# --- As you work with this file, you can delete the instructions below --------
 | 
			
		||||
# Write your notes and code experiments into this document. Save it
 | 
			
		||||
# from time to time - however I recommend that you do not _commit_
 | 
			
		||||
# your saved version.
 | 
			
		||||
#
 | 
			
		||||
# As long as you do not _commit_ this script to version control,
 | 
			
		||||
# you can _pull_ updated versions of the entire project from GitHub
 | 
			
		||||
# by using the RStudio version control interface. However, once
 | 
			
		||||
# you _commit_ any file in your local version, RStudio will require
 | 
			
		||||
# you to resolve conflicts before you can _pull_ updates.
 | 
			
		||||
# --- As you work with this file, you can delete the instructions above --------
 | 
			
		||||
#
 | 
			
		||||
## Purpose: <...>
 | 
			
		||||
#
 | 
			
		||||
# Version: <...>
 | 
			
		||||
#
 | 
			
		||||
# Date:    <...>
 | 
			
		||||
# Author:  <Name> (<name@mail.utoronto.ca>)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#
 | 
			
		||||
#   <number>    <Features>
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#   <...>
 | 
			
		||||
#
 | 
			
		||||
# ====================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										2868
									
								
								plottingReference.R
									
									
									
									
									
								
							
							
						
						
									
										2868
									
								
								plottingReference.R
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										150
									
								
								scriptTemplate.R
									
									
									
									
									
								
							
							
						
						
									
										150
									
								
								scriptTemplate.R
									
									
									
									
									
								
							@@ -1,75 +1,75 @@
 | 
			
		||||
# scriptTemplate.R
 | 
			
		||||
#
 | 
			
		||||
# Purpose:
 | 
			
		||||
# Version:
 | 
			
		||||
# Date:
 | 
			
		||||
# Author:
 | 
			
		||||
#
 | 
			
		||||
# Input:
 | 
			
		||||
# Output:
 | 
			
		||||
# Dependencies:
 | 
			
		||||
#
 | 
			
		||||
# ToDo:
 | 
			
		||||
# Notes:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
setwd("<your/project/directory>")
 | 
			
		||||
 | 
			
		||||
# ====  PARAMETERS  ============================================================
 | 
			
		||||
# Define and explain all parameters. No "magic numbers" in your code below.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ====  PACKAGES  ==============================================================
 | 
			
		||||
# Check that required packages have been installed. Install if needed.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
 | 
			
		||||
  install.packages("seqinr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = seqinr)       # basic information
 | 
			
		||||
#  browseVignettes("seqinr")    # available vignettes
 | 
			
		||||
#  data(package = "seqinr")     # available datasets
 | 
			
		||||
 | 
			
		||||
# Note: use package functions with the :: operator - eg.
 | 
			
		||||
# seqinr::aaa("K")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ====  FUNCTIONS  =============================================================
 | 
			
		||||
 | 
			
		||||
# Define functions or source external files
 | 
			
		||||
source("<myUtilityFunctionsScript.R>")
 | 
			
		||||
 | 
			
		||||
myFunction <- function(a, b=1) {
 | 
			
		||||
	# Purpose:
 | 
			
		||||
	#     Describe ...
 | 
			
		||||
	# Parameters:
 | 
			
		||||
	#     a: ...
 | 
			
		||||
	#     b: ...
 | 
			
		||||
	# Value:
 | 
			
		||||
	#     result: ...
 | 
			
		||||
 | 
			
		||||
	# code ...
 | 
			
		||||
 | 
			
		||||
	return(result)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ====  PROCESS  ===============================================================
 | 
			
		||||
# Enter the step-by-step process of your project here. Strive to write your
 | 
			
		||||
# code so that you can simply run this entire file and re-create all
 | 
			
		||||
# intermediate results.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ====  TESTS  =================================================================
 | 
			
		||||
# Enter your function tests here...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# scriptTemplate.R
 | 
			
		||||
#
 | 
			
		||||
# Purpose:
 | 
			
		||||
# Version:
 | 
			
		||||
# Date:
 | 
			
		||||
# Author:
 | 
			
		||||
#
 | 
			
		||||
# Input:
 | 
			
		||||
# Output:
 | 
			
		||||
# Dependencies:
 | 
			
		||||
#
 | 
			
		||||
# ToDo:
 | 
			
		||||
# Notes:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
setwd("<your/project/directory>")
 | 
			
		||||
 | 
			
		||||
# ====  PARAMETERS  ============================================================
 | 
			
		||||
# Define and explain all parameters. No "magic numbers" in your code below.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ====  PACKAGES  ==============================================================
 | 
			
		||||
# Check that required packages have been installed. Install if needed.
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
 | 
			
		||||
  install.packages("seqinr")
 | 
			
		||||
}
 | 
			
		||||
# Package information:
 | 
			
		||||
#  library(help = seqinr)       # basic information
 | 
			
		||||
#  browseVignettes("seqinr")    # available vignettes
 | 
			
		||||
#  data(package = "seqinr")     # available datasets
 | 
			
		||||
 | 
			
		||||
# Note: use package functions with the :: operator - eg.
 | 
			
		||||
# seqinr::aaa("K")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ====  FUNCTIONS  =============================================================
 | 
			
		||||
 | 
			
		||||
# Define functions or source external files
 | 
			
		||||
source("<myUtilityFunctionsScript.R>")
 | 
			
		||||
 | 
			
		||||
myFunction <- function(a, b=1) {
 | 
			
		||||
	# Purpose:
 | 
			
		||||
	#     Describe ...
 | 
			
		||||
	# Parameters:
 | 
			
		||||
	#     a: ...
 | 
			
		||||
	#     b: ...
 | 
			
		||||
	# Value:
 | 
			
		||||
	#     result: ...
 | 
			
		||||
 | 
			
		||||
	# code ...
 | 
			
		||||
 | 
			
		||||
	return(result)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ====  PROCESS  ===============================================================
 | 
			
		||||
# Enter the step-by-step process of your project here. Strive to write your
 | 
			
		||||
# code so that you can simply run this entire file and re-create all
 | 
			
		||||
# intermediate results.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ====  TESTS  =================================================================
 | 
			
		||||
# Enter your function tests here...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,30 +1,30 @@
 | 
			
		||||
# ABC-createRefDB.R
 | 
			
		||||
#
 | 
			
		||||
# Create a reference protein database for Mbp1-like proteins
 | 
			
		||||
#
 | 
			
		||||
# Boris Steipe for ABC learning units
 | 
			
		||||
#
 | 
			
		||||
# For the species, see:
 | 
			
		||||
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
 | 
			
		||||
#
 | 
			
		||||
# For the data model, see
 | 
			
		||||
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
 | 
			
		||||
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
myDB <- dbInit()
 | 
			
		||||
 | 
			
		||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
 | 
			
		||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
 | 
			
		||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
 | 
			
		||||
 | 
			
		||||
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
 | 
			
		||||
 | 
			
		||||
myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
 | 
			
		||||
 | 
			
		||||
myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# ABC-createRefDB.R
 | 
			
		||||
#
 | 
			
		||||
# Create a reference protein database for Mbp1-like proteins
 | 
			
		||||
#
 | 
			
		||||
# Boris Steipe for ABC learning units
 | 
			
		||||
#
 | 
			
		||||
# For the species, see:
 | 
			
		||||
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
 | 
			
		||||
#
 | 
			
		||||
# For the data model, see
 | 
			
		||||
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
 | 
			
		||||
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
myDB <- dbInit()
 | 
			
		||||
 | 
			
		||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
 | 
			
		||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
 | 
			
		||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
 | 
			
		||||
 | 
			
		||||
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
 | 
			
		||||
 | 
			
		||||
myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
 | 
			
		||||
 | 
			
		||||
myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,443 +1,443 @@
 | 
			
		||||
# tocID <- "scripts/ABC-makeMYSPElist.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  Create a list of genome sequenced fungi with protein annotations and
 | 
			
		||||
#               Mbp1 homologues.
 | 
			
		||||
#
 | 
			
		||||
# Version: 1.4
 | 
			
		||||
#
 | 
			
		||||
# Date:    2016  09  -  2021  09
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions
 | 
			
		||||
#          1.4    New retrieval logic
 | 
			
		||||
#          1.3    Rewrite to change datasource. NCBI has not been updated
 | 
			
		||||
#                   since 2012. Use ensembl fungi as initial source.
 | 
			
		||||
#          1.2    Change from require() to requireNamespace()
 | 
			
		||||
#          1.1.2  Moved BLAST.R to ./scripts directory
 | 
			
		||||
#          1.1    Update 2017
 | 
			
		||||
#          1.0    First code 2016
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
#
 | 
			
		||||
# DO NOT  source()  THIS FILE!
 | 
			
		||||
#
 | 
			
		||||
# This file is code I provide for your deeper understanding of a process and
 | 
			
		||||
# to provide you with useful sample code. It is not actually necessary for
 | 
			
		||||
# you to run this code, but I encourage you to read it carefully and discuss
 | 
			
		||||
# if there are parts you don't understand.
 | 
			
		||||
#
 | 
			
		||||
# Run the commands that interact with the NCBI servers only if you want to
 | 
			
		||||
# experiment specifically with the code and/or parameters. I have commented out
 | 
			
		||||
# those parts. If you only want to study the general workflow, just load()
 | 
			
		||||
# the respective intermediate results.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                    Line
 | 
			
		||||
#TOC> --------------------------------------------------------
 | 
			
		||||
#TOC>   1        The strategy                               55
 | 
			
		||||
#TOC>   2        PACKAGES AND INITIALIZATIONS               67
 | 
			
		||||
#TOC>   3        ENSEMBL FUNGI                              75
 | 
			
		||||
#TOC>   3.1        Import                                   78
 | 
			
		||||
#TOC>   4        BLAST SEARCH                              155
 | 
			
		||||
#TOC>   4.1        find homologous proteins                161
 | 
			
		||||
#TOC>   4.2        Identify species in "hits"              192
 | 
			
		||||
#TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282
 | 
			
		||||
#TOC>   6        STUDENT NUMBERS                           375
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  The strategy  ========================================================
 | 
			
		||||
 | 
			
		||||
# This script will create a list of "MYSPE" species and save it in an R object
 | 
			
		||||
# MYSPEspecies that is stored in the data subdirectory of this project from
 | 
			
		||||
# where it can be loaded. The strategy is as follows: we download a list of
 | 
			
		||||
# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
 | 
			
		||||
# species that have been annotated.
 | 
			
		||||
# Next we perform a BLAST search, to identify fungal species that have
 | 
			
		||||
# genes that are homologous to yeast MBP1.
 | 
			
		||||
#
 | 
			
		||||
# ...
 | 
			
		||||
 | 
			
		||||
# =    2  PACKAGES AND INITIALIZATIONS  ========================================
 | 
			
		||||
 | 
			
		||||
# httr provides interfaces to Webservers on the Internet
 | 
			
		||||
if (! requireNamespace("httr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  ENSEMBL FUNGI  =======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Import  ============================================================
 | 
			
		||||
 | 
			
		||||
# Navigate to https://fungi.ensembl.org and click on the link to the full
 | 
			
		||||
# list of all species: https://fungi.ensembl.org/species.html
 | 
			
		||||
# On the page, click on the spreadsheet symbol top right and choose
 | 
			
		||||
# "download whole table". The file will be named  "Species.csv", in your
 | 
			
		||||
# usual downloads folder. Move it to the data folder, and read it.
 | 
			
		||||
 | 
			
		||||
sDat <- read.csv("./data/Species.csv")
 | 
			
		||||
str(sDat)
 | 
			
		||||
 | 
			
		||||
# The most obvious way to partition these is according to Classification ...
 | 
			
		||||
# (poking around a bit in the UniProt taxonomy database shows that the
 | 
			
		||||
#  classification used here is the taxonomic rank of "order").
 | 
			
		||||
# how many classifications do we have?
 | 
			
		||||
length(unique(sDat$Classification))  # 66
 | 
			
		||||
 | 
			
		||||
# To have a good set for the class, we should have about 100.
 | 
			
		||||
# Let's see for which of these we can find Mbp1 homologues.
 | 
			
		||||
# First, we'll keep only the columns for name, classification, and taxID, and
 | 
			
		||||
# drop the rest ...
 | 
			
		||||
sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
 | 
			
		||||
colnames(sDat) <- c("name", "order", "taxID")
 | 
			
		||||
 | 
			
		||||
# Next, we make an extra column: genus - the first part of the binomial name.
 | 
			
		||||
# We'll use the gsub() function, and for that we need a "regular expression"
 | 
			
		||||
# that matches to all characters from the first blank to the end of the string:
 | 
			
		||||
myPatt <- "\\s.*$"  # one whitespace (\\s) ...
 | 
			
		||||
                    # followed by any character (.) 0..n times (*) ...
 | 
			
		||||
                    # until the end of the string
 | 
			
		||||
 | 
			
		||||
# using gsub() we substitute all matching characters with the empty string "" -
 | 
			
		||||
# this deletes the matching characters
 | 
			
		||||
# Test this:
 | 
			
		||||
gsub(myPatt, "", "Genus")                      # one word: unchanged
 | 
			
		||||
gsub(myPatt, "", "gEnus species")              # two words: return only first
 | 
			
		||||
gsub(myPatt, "", "geNus species strain 123")   # many words: return only first
 | 
			
		||||
 | 
			
		||||
# apply this to the "name" column and add the result as a separate column
 | 
			
		||||
# called "genus"
 | 
			
		||||
sDat$genus <- gsub(myPatt, "", sDat$name)
 | 
			
		||||
 | 
			
		||||
# what do we get?
 | 
			
		||||
c(head(unique(sDat$genus)),
 | 
			
		||||
  tail(unique(sDat$genus)))  # inspect the first and last few. Note that there
 | 
			
		||||
                             # is a problem that we have to keep in mind.
 | 
			
		||||
                             # (Always inspect your results!)
 | 
			
		||||
# Drop all rows for which the genus contains special characters -
 | 
			
		||||
# like "[Candida]"
 | 
			
		||||
sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
 | 
			
		||||
 | 
			
		||||
length(table(sDat$genus))    # how many genus?
 | 
			
		||||
hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ...
 | 
			
		||||
                                              # most genus have very few, but
 | 
			
		||||
                                              # some have very many species.
 | 
			
		||||
sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten...
 | 
			
		||||
 | 
			
		||||
# We should have at least one species from each taxonomic order, but we can
 | 
			
		||||
# add a few genus until we have about 100 validated species.
 | 
			
		||||
 | 
			
		||||
# Let's add a column for species, by changing our regular expression a bit,
 | 
			
		||||
# using ^ (start of string), \\S (NOT a whitespace),
 | 
			
		||||
# and + (one or more matches), capturing the match (...), and returning
 | 
			
		||||
# it as the substitution (\\1) ...
 | 
			
		||||
 | 
			
		||||
myPatt <- "^(\\S+\\s\\S+)\\s.*$"
 | 
			
		||||
sDat$species <- gsub(myPatt, "\\1", sDat$name)
 | 
			
		||||
 | 
			
		||||
# And we reorder the columns, just for aesthetics:
 | 
			
		||||
sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
 | 
			
		||||
 | 
			
		||||
# Final check:
 | 
			
		||||
any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Now we check which of these have Mbp1 homologues ...
 | 
			
		||||
 | 
			
		||||
# =    4  BLAST SEARCH  ========================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We run a BLAST search to find all proteins related to yeast Mbp1 in any
 | 
			
		||||
# fungus. With the results, we'll annotate our sDat table.
 | 
			
		||||
 | 
			
		||||
# ==   4.1  find homologous proteins  ==========================================
 | 
			
		||||
#
 | 
			
		||||
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
 | 
			
		||||
# contain them.
 | 
			
		||||
 | 
			
		||||
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
 | 
			
		||||
# amount of error handling involved that is not supported by the API in a
 | 
			
		||||
# principled way but requires rather ad hoc solutions. The code I threw together
 | 
			
		||||
# to make a BLAST interface (demo-quality, not research-quality) is in the file
 | 
			
		||||
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
 | 
			
		||||
# standard task of communicating with servers and parsing responses - everyday
 | 
			
		||||
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
 | 
			
		||||
# parser in currently available packages.
 | 
			
		||||
#
 | 
			
		||||
# DON'T use this for BLAST searches unless you have read the NCBI policy
 | 
			
		||||
# for automated tasks. If you indiscriminately pound on the NCBI's BLAST
 | 
			
		||||
# server, they will blacklist your IP-address. See:
 | 
			
		||||
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
 | 
			
		||||
#
 | 
			
		||||
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
 | 
			
		||||
# BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
 | 
			
		||||
#                    db = "refseq_protein",        # database to search in
 | 
			
		||||
#                    nHits = 3000,                 # 945 hits in 2020
 | 
			
		||||
#                    E = 0.01,                     #
 | 
			
		||||
#                    limits = "txid4751[ORGN]")    # = fungi
 | 
			
		||||
# saveRDS(BLASThits, file="data/BLASThits.rds")
 | 
			
		||||
#
 | 
			
		||||
# NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory
 | 
			
		||||
#
 | 
			
		||||
BLASThits <- readRDS(file = "data/BLASThits.rds")
 | 
			
		||||
 | 
			
		||||
# ==   4.2  Identify species in "hits"  ========================================
 | 
			
		||||
 | 
			
		||||
# This is a very big list that can't be usefully analyzed manually. Here
 | 
			
		||||
# we are only interested in the species names that it contains.
 | 
			
		||||
 | 
			
		||||
# How many hits in the list?
 | 
			
		||||
length(BLASThits$hits)      # 1,134
 | 
			
		||||
 | 
			
		||||
# Let's look at a hit somewhere down the list
 | 
			
		||||
# Inspect the structure of one hit. Spell out the element name "hits" in full:
# "$hit" only resolves through partial matching of list names.
str(BLASThits$hits[[277]])
 | 
			
		||||
 | 
			
		||||
# A fair amount of parsing has gone into the BLAST.R code to prepare the results
 | 
			
		||||
# in a useful way. The species information is in the $species element of every
 | 
			
		||||
# hit.
 | 
			
		||||
 | 
			
		||||
# Run a loop to extract all the species names into a vector. We subset ...
 | 
			
		||||
# BLASThits$hits                 ... the list of hits, from which we choose ...
 | 
			
		||||
# BLASThits$hits[[i]]            ... the i-th hit, and get ...
 | 
			
		||||
# BLASThits$hits[[i]]$species    ... the species element from that.
 | 
			
		||||
# Subsetting FTW.
 | 
			
		||||
 | 
			
		||||
BLASTspecies <- character()
 | 
			
		||||
for (i in seq_along(BLASThits$hits)) {
 | 
			
		||||
    BLASTspecies[i] <- BLASThits$hits[[i]]$species
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# You can confirm that BLASTspecies has the expected size.
 | 
			
		||||
length(BLASTspecies)
 | 
			
		||||
 | 
			
		||||
# if we delete some of these later on, we still want to remember which hit
 | 
			
		||||
# they came from. Thus we name() the elements with their index, which is the
 | 
			
		||||
# same as the index of the hit in BLASThits
 | 
			
		||||
# Name each element with its own index. seq_along() is used instead of
# 1:length() so that an empty vector yields no names rather than c(1, 0).
names(BLASTspecies) <- seq_along(BLASTspecies)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# let's plot the distribution of E-values
 | 
			
		||||
eVals <- numeric()
 | 
			
		||||
for (i in seq_along(BLASThits$hits)) {
 | 
			
		||||
  eVals[i] <- BLASThits$hits[[i]]$E
 | 
			
		||||
}
 | 
			
		||||
range(eVals)
 | 
			
		||||
sum(eVals == 0)
 | 
			
		||||
 | 
			
		||||
# let's plot the log of all values > 0 to see how they are distributed
 | 
			
		||||
# plotting only one vector of numbers plots their index as x, and
 | 
			
		||||
# their value as y ...
 | 
			
		||||
plot(log(eVals[eVals > 0]), col = "#CC0000")
 | 
			
		||||
 | 
			
		||||
# This is very informative: I would suspect that the first ten or so are
 | 
			
		||||
# virtually identical to the yeast protein, then we have about 800 hits with
 | 
			
		||||
# decreasing similarity, and then about 200 more that may actually be false
 | 
			
		||||
# positives. Also - we plotted them by index, that means the table is SORTED:
 | 
			
		||||
# Lower E-values strictly come before higher E-values.
 | 
			
		||||
 | 
			
		||||
# Again, some species appear more than once, e.g. ...
 | 
			
		||||
sum(BLASTspecies == "Saccharomyces cerevisiae")
 | 
			
		||||
 | 
			
		||||
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
 | 
			
		||||
 | 
			
		||||
# Therefore we remove duplicates. Removing duplicates will leave the FIRST
 | 
			
		||||
# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
 | 
			
		||||
# species, we will retain only the protein that has the highest similarity
 | 
			
		||||
# to yeast Mbp1, not any of its more distant paralogues.
 | 
			
		||||
sel <- ! duplicated(BLASTspecies)
 | 
			
		||||
BLASTspecies <- BLASTspecies[sel]
 | 
			
		||||
 | 
			
		||||
length(BLASTspecies)
 | 
			
		||||
# i.e. we got rid of about two thirds of the hits.
 | 
			
		||||
tail(BLASTspecies)  # see how the names are useful!
 | 
			
		||||
                    # again - there are some special characters ...
 | 
			
		||||
                    # what are they?
 | 
			
		||||
BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
 | 
			
		||||
 | 
			
		||||
# remove the brackets ...
 | 
			
		||||
BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
 | 
			
		||||
# drop any new duplicates ...
 | 
			
		||||
BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
 | 
			
		||||
 | 
			
		||||
# check the number again:
 | 
			
		||||
length(BLASTspecies)
 | 
			
		||||
# Think a bit about this: what may be the biological reason to find that
 | 
			
		||||
# on average, in 388 fungi across the entire phylogenetic tree, we have
 | 
			
		||||
# three sequences that are homologous to yeast Mbp1?
 | 
			
		||||
 | 
			
		||||
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
 | 
			
		||||
# we plot all values that are TRUE in the vector "sel" that we created above,
 | 
			
		||||
# AND greater than 0
 | 
			
		||||
plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  MERGE ENSEMBL AND BLAST RESULTS  =====================================
 | 
			
		||||
 | 
			
		||||
# Next we add the blast result to our sDat dataframe. We'll store the index,
 | 
			
		||||
# the E-value, and the Query-bounds from which we can estimate which domains
 | 
			
		||||
# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
 | 
			
		||||
# Mbp1's N-terminal APSES domain.)
 | 
			
		||||
#
 | 
			
		||||
# First we pull the hits we wanted from the BLASTspecies:
 | 
			
		||||
iHits <- as.numeric(names(BLASTspecies))
 | 
			
		||||
length(iHits)     # one index for each TRUE in sel
 | 
			
		||||
 | 
			
		||||
# add columns to sDat
 | 
			
		||||
l <- nrow(sDat)
 | 
			
		||||
sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results
 | 
			
		||||
sDat$eVal   <- numeric(l)  # E-value of the hit
 | 
			
		||||
sDat$lAli   <- numeric(l)  # length of the aligned region
 | 
			
		||||
 | 
			
		||||
# extract and merge
 | 
			
		||||
for (iHit in iHits) {
 | 
			
		||||
  thisSp <- BLASThits$hits[[iHit]]$species
 | 
			
		||||
  sel <- sDat$species == thisSp
 | 
			
		||||
 | 
			
		||||
  sDat$iHit[sel]   <- iHit
 | 
			
		||||
  sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E
 | 
			
		||||
  sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Are all reference species accounted for?
 | 
			
		||||
selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit
 | 
			
		||||
REFspecies %in% sDat$species[selA]     # yes, all there
 | 
			
		||||
 | 
			
		||||
selB <- sDat$species %in% REFspecies   # all rows which have one of REF species
 | 
			
		||||
 | 
			
		||||
sum(selA & selB)   # How many rows?
 | 
			
		||||
 | 
			
		||||
# sDat of course includes all duplicates. Some may be multiply sequenced, some
 | 
			
		||||
# may be different strains. We'll use the same strategy as before and keep
 | 
			
		||||
# only the best hit: order the rows by E-value, then drop all rows which
 | 
			
		||||
# are duplicated.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# drop all rows without BLAST hits ...
 | 
			
		||||
sDat <- sDat[ ! (sDat$iHit == 0) , ]
 | 
			
		||||
 | 
			
		||||
# order sDat by E-value ...
 | 
			
		||||
sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
 | 
			
		||||
 | 
			
		||||
# drop all rows with duplicated species ...
 | 
			
		||||
sDat <- sDat[ ! duplicated(sDat$species) , ]
 | 
			
		||||
 | 
			
		||||
# Let's look at the E-values ...
 | 
			
		||||
plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
 | 
			
		||||
 | 
			
		||||
# and alignment lengths ...
 | 
			
		||||
plot(sDat$lAli, col = "#00DDAA")
 | 
			
		||||
 | 
			
		||||
# How many ...
 | 
			
		||||
length(unique(sDat$name))
 | 
			
		||||
length(unique(sDat$species))
 | 
			
		||||
length(unique(sDat$genus))
 | 
			
		||||
length(unique(sDat$order))
 | 
			
		||||
 | 
			
		||||
# I need an extra species for admin purposes later on ...
 | 
			
		||||
sel <- grep("Sporothrix schenckii", sDat$species)
 | 
			
		||||
SPOSCdat <- sDat[sel, ]
 | 
			
		||||
# Remove the selected rows by complement rather than by negative index: if
# grep() found no match, sDat[-integer(0), ] would silently drop every row.
sDat <- sDat[setdiff(seq_len(nrow(sDat)), sel), ]
 | 
			
		||||
 | 
			
		||||
# To get the final dataset, we remove the reference species with their
 | 
			
		||||
# entire orders ...
 | 
			
		||||
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
 | 
			
		||||
sel <- sDat$order %in% REForders
 | 
			
		||||
REFdat <- sDat[sel , ]
 | 
			
		||||
sDat   <- sDat[ ! sel , ]
 | 
			
		||||
 | 
			
		||||
# REFdat should now contain only the REFspecies ...
 | 
			
		||||
( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
 | 
			
		||||
 | 
			
		||||
# ... but all of them
 | 
			
		||||
sum(REFspecies %in% REFdat$species)
 | 
			
		||||
 | 
			
		||||
# ... and we have enough left in sDat to prune sDat to unique genus
 | 
			
		||||
sDat <- sDat[ ! duplicated(sDat$genus) , ]
 | 
			
		||||
nrow(sDat)   # 84
 | 
			
		||||
 | 
			
		||||
# I add back "Sporothrix schenckii" ...
 | 
			
		||||
sDat <- rbind(SPOSCdat, sDat)
 | 
			
		||||
 | 
			
		||||
# ... and save for future use.
 | 
			
		||||
# saveRDS(sDat, file = "data/sDat.rds")
 | 
			
		||||
# saveRDS(REFdat, file = "data/REFdat.rds")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    6  STUDENT NUMBERS  =====================================================
 | 
			
		||||
#
 | 
			
		||||
# An asymmetric function to retrieve a MYSPE species
 | 
			
		||||
#
 | 
			
		||||
sDat <- readRDS(file = "data/sDat.rds")
 | 
			
		||||
 | 
			
		||||
students <- read.csv("../BCH441-2021-students.csv")
 | 
			
		||||
sN <- students$Integration.ID
 | 
			
		||||
sN <- sN[! is.na(sN)]
 | 
			
		||||
sN <- as.character(sN)
 | 
			
		||||
sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii"
 | 
			
		||||
 | 
			
		||||
set.seed(112358)
 | 
			
		||||
# Shuffle the rows of sDat. seq_len() gives the same permutation as 1:nrow()
# for a non-empty table and avoids the c(1, 0) sequence when it has no rows.
theseSpecies <- sDat[sample(seq_len(nrow(sDat))), ]
 | 
			
		||||
all(sort(theseSpecies$name) == sort(sDat$name))
 | 
			
		||||
nrow((theseSpecies))
 | 
			
		||||
(iX <- grep("Sporothrix schenckii", theseSpecies$name))
 | 
			
		||||
# Move the matching row(s) to the top. The remainder is selected by complement
# because theseSpecies[-integer(0), ] would return zero rows if iX were empty.
theseSpecies <- rbind(theseSpecies[iX, ],
                      theseSpecies[setdiff(seq_len(nrow(theseSpecies)), iX), ])
 | 
			
		||||
rndMin <-  992000000
 | 
			
		||||
rndMax <- 1020000000
 | 
			
		||||
N <- 10000
 | 
			
		||||
keys <- as.character(sample(rndMin:rndMax, N + 1000))
 | 
			
		||||
keys <- keys[! (keys %in% sN)]
 | 
			
		||||
keys <- keys[1:N]
 | 
			
		||||
# Overwrite the leading keys with the actual student numbers, position by
# position. seq_along() avoids the c(1, 0) index that 1:length() gives for an
# empty vector.
keys[seq_along(sN)] <- sN
 | 
			
		||||
 | 
			
		||||
nRep <- floor(N/nrow(theseSpecies))
 | 
			
		||||
MYSPEdat <- theseSpecies
 | 
			
		||||
for(i in 1:nRep) {
 | 
			
		||||
  MYSPEdat <- rbind(MYSPEdat, theseSpecies)
 | 
			
		||||
}
 | 
			
		||||
MYSPEdat <- MYSPEdat[1:N, ]
 | 
			
		||||
for (i in 1:N) {
 | 
			
		||||
  rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5")
 | 
			
		||||
}
 | 
			
		||||
set.seed(NULL)
 | 
			
		||||
MYSPEdat <- MYSPEdat[sample(1:N), ]
 | 
			
		||||
 | 
			
		||||
# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
 | 
			
		||||
 | 
			
		||||
# === validate
 | 
			
		||||
x <- character()
 | 
			
		||||
for (n in sN) {
 | 
			
		||||
  sp <- getMYSPE(n)
 | 
			
		||||
  if (length(sp) != 1) {
 | 
			
		||||
    stop(print(as.character(n)))
 | 
			
		||||
  } else {
 | 
			
		||||
    x <- c(x, sp)
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# === species for late-comers
 | 
			
		||||
y <- unique(MYSPEdat$species)
 | 
			
		||||
print(y[!(y %in% x)])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# === validate
 | 
			
		||||
l <- length(sN)
 | 
			
		||||
sp <- character(l)
 | 
			
		||||
for(i in 1:l) {
 | 
			
		||||
  sp[i] <- getMYSPE(sN[i])
 | 
			
		||||
}
 | 
			
		||||
any(duplicated(sp))
 | 
			
		||||
length(unique(sp))
 | 
			
		||||
which(! sDat$species %in% sp)  # these can be assigned to late-comers
 | 
			
		||||
 | 
			
		||||
# Done.
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "scripts/ABC-makeMYSPElist.R"
 | 
			
		||||
#
 | 
			
		||||
# Purpose:  Create a list of genome sequenced fungi with protein annotations and
 | 
			
		||||
#               Mbp1 homologues.
 | 
			
		||||
#
 | 
			
		||||
# Version: 1.4
 | 
			
		||||
#
 | 
			
		||||
# Date:    2016  09  -  2021  09
 | 
			
		||||
# Author:  Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions
 | 
			
		||||
#          1.4    New retrieval logic
 | 
			
		||||
#          1.3    Rewrite to change datasource. NCBI has not been updated
 | 
			
		||||
#                   since 2012. Use ensembl fungi as initial source.
 | 
			
		||||
#          1.2    Change from require() to requireNamespace()
 | 
			
		||||
#          1.1.2  Moved BLAST.R to ./scripts directory
 | 
			
		||||
#          1.1    Update 2017
 | 
			
		||||
#          1.0    First code 2016
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
#
 | 
			
		||||
# DO NOT  source()  THIS FILE!
 | 
			
		||||
#
 | 
			
		||||
# This file is code I provide for your deeper understanding of a process and
 | 
			
		||||
# to provide you with useful sample code. It is not actually necessary for
 | 
			
		||||
# you to run this code, but I encourage you to read it carefully and discuss
 | 
			
		||||
# if there are parts you don't understand.
 | 
			
		||||
#
 | 
			
		||||
# Run the commands that interact with the NCBI servers only if you want to
 | 
			
		||||
# experiment specifically with the code and/or parameters. I have commented out
 | 
			
		||||
# those parts. If you only want to study the general workflow, just load()
 | 
			
		||||
# the respective intermediate results.
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                    Line
 | 
			
		||||
#TOC> --------------------------------------------------------
 | 
			
		||||
#TOC>   1        The strategy                               55
 | 
			
		||||
#TOC>   2        PACKAGES AND INITIALIZATIONS               67
 | 
			
		||||
#TOC>   3        ENSEMBL FUNGI                              75
 | 
			
		||||
#TOC>   3.1        Import                                   78
 | 
			
		||||
#TOC>   4        BLAST SEARCH                              155
 | 
			
		||||
#TOC>   4.1        find homologous proteins                161
 | 
			
		||||
#TOC>   4.2        Identify species in "hits"              192
 | 
			
		||||
#TOC>   5        MERGE ENSEMBL AND BLAST RESULTS           282
 | 
			
		||||
#TOC>   6        STUDENT NUMBERS                           375
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  The strategy  ========================================================
 | 
			
		||||
 | 
			
		||||
# This script will create a list of "MYSPE" species and save it in an R object
 | 
			
		||||
# MYSPEspecies that is stored in the data subdirectory of this project from
 | 
			
		||||
# where it can be loaded. The strategy is as follows: we download a list of
 | 
			
		||||
# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
 | 
			
		||||
# species that have been annotated.
 | 
			
		||||
# Next we perform a BLAST search, to identify fungal species that have
 | 
			
		||||
# genes that are homologous to yeast MBP1.
 | 
			
		||||
#
 | 
			
		||||
# ...
 | 
			
		||||
 | 
			
		||||
# =    2  PACKAGES AND INITIALIZATIONS  ========================================
 | 
			
		||||
 | 
			
		||||
# httr provides interfaces to Webservers on the Internet
 | 
			
		||||
if (! requireNamespace("httr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  ENSEMBL FUNGI  =======================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Import  ============================================================
 | 
			
		||||
 | 
			
		||||
# Navigate to https://fungi.ensembl.org and click on the link to the full
 | 
			
		||||
# list of all species: https://fungi.ensembl.org/species.html
 | 
			
		||||
# On the page, click on the spreadsheet symbol top right and choose
 | 
			
		||||
# "download whole table". The file will be named  "Species.csv", in your
 | 
			
		||||
# usual downloads folder. Move it to the data folder, and read it.
 | 
			
		||||
 | 
			
		||||
sDat <- read.csv("./data/Species.csv")
 | 
			
		||||
str(sDat)
 | 
			
		||||
 | 
			
		||||
# The most obvious way to partition these is according to Classification ...
 | 
			
		||||
# (poking around a bit in the UniProt taxonomy database shows that the
 | 
			
		||||
#  classification used here is the taxonomic rank of "order").
 | 
			
		||||
# how many classifications do we have?
 | 
			
		||||
length(unique(sDat$Classification))  # 66
 | 
			
		||||
 | 
			
		||||
# To have a good set for the class, we should have about 100.
 | 
			
		||||
# Let's see for which of these we can find Mbp1 homologues.
 | 
			
		||||
# First, we'll keep only the columns for name, classification, and taxID, and
 | 
			
		||||
# drop the rest ...
 | 
			
		||||
sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
 | 
			
		||||
colnames(sDat) <- c("name", "order", "taxID")
 | 
			
		||||
 | 
			
		||||
# Next, we make an extra column: genus - the first part of the binomial name.
 | 
			
		||||
# We'll use the gsub() function, and for that we need a "regular expression"
 | 
			
		||||
# that matches to all characters from the first blank to the end of the string:
 | 
			
		||||
myPatt <- "\\s.*$"  # one whitespace (\\s) ...
 | 
			
		||||
                    # followed by any character (.) 0..n times (*) ...
 | 
			
		||||
                    # until the end of the string
 | 
			
		||||
 | 
			
		||||
# using gsub() we substitute all matching characters with the empty string "" -
 | 
			
		||||
# this deletes the matching characters
 | 
			
		||||
# Test this:
 | 
			
		||||
gsub(myPatt, "", "Genus")                      # one word: unchanged
 | 
			
		||||
gsub(myPatt, "", "gEnus species")              # two words: return only first
 | 
			
		||||
gsub(myPatt, "", "geNus species strain 123")   # many words: return only first
 | 
			
		||||
 | 
			
		||||
# apply this to the "name" column and add the result as a separate column
 | 
			
		||||
# called "genus"
 | 
			
		||||
sDat$genus <- gsub(myPatt, "", sDat$name)
 | 
			
		||||
 | 
			
		||||
# what do we get?
 | 
			
		||||
c(head(unique(sDat$genus)),
 | 
			
		||||
  tail(unique(sDat$genus)))  # inspect the first and last few. Note that there
 | 
			
		||||
                             # is a problem that we have to keep in mind.
 | 
			
		||||
                             # (Always inspect your results!)
 | 
			
		||||
# Drop all rows for which the genus contains special characters -
 | 
			
		||||
# like "[Candida]"
 | 
			
		||||
sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
 | 
			
		||||
 | 
			
		||||
length(table(sDat$genus))    # how many genus?
 | 
			
		||||
hist(table(sDat$genus), col = "#E9F4FF")      # Distribution ...
 | 
			
		||||
                                              # most genus have very few, but
 | 
			
		||||
                                              # some have very many species.
 | 
			
		||||
sort(table(sDat$genus), decreasing = TRUE)[1:10]  # Top ten...
 | 
			
		||||
 | 
			
		||||
# We should have at least one species from each taxonomic order, but we can
 | 
			
		||||
# add a few genus until we have about 100 validated species.
 | 
			
		||||
 | 
			
		||||
# Let's add a column for species, by changing our regular expression a bit,
 | 
			
		||||
# using ^ (start of string), \\S (NOT a whitespace),
 | 
			
		||||
# and + (one or more matches), capturing the match (...), and returning
 | 
			
		||||
# it as the substitution (\\1) ...
 | 
			
		||||
 | 
			
		||||
myPatt <- "^(\\S+\\s\\S+)\\s.*$"
 | 
			
		||||
sDat$species <- gsub(myPatt, "\\1", sDat$name)
 | 
			
		||||
 | 
			
		||||
# And we reorder the columns, just for aesthetics:
 | 
			
		||||
sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
 | 
			
		||||
 | 
			
		||||
# Final check:
 | 
			
		||||
any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Now we check which of these have Mbp1 homologues ...
 | 
			
		||||
 | 
			
		||||
# =    4  BLAST SEARCH  ========================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# We run a BLAST search to find all proteins related to yeast Mbp1 in any
 | 
			
		||||
# fungus. With the results, we'll annotate our sDat table.
 | 
			
		||||
 | 
			
		||||
# ==   4.1  find homologous proteins  ==========================================
 | 
			
		||||
#
 | 
			
		||||
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
 | 
			
		||||
# contain them.
 | 
			
		||||
 | 
			
		||||
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
 | 
			
		||||
# amount of error handling involved that is not supported by the API in a
 | 
			
		||||
# principled way but requires rather ad hoc solutions. The code I threw together
 | 
			
		||||
# to make a BLAST interface (demo-quality, not research-quality) is in the file
 | 
			
		||||
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
 | 
			
		||||
# standard task of communicating with servers and parsing responses - everyday
 | 
			
		||||
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
 | 
			
		||||
# parser in currently available packages.
 | 
			
		||||
#
 | 
			
		||||
# DON'T use this for BLAST searches unless you have read the NCBI policy
 | 
			
		||||
# for automated tasks. If you indiscriminately pound on the NCBI's BLAST
 | 
			
		||||
# server, they will blacklist your IP-address. See:
 | 
			
		||||
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
 | 
			
		||||
#
 | 
			
		||||
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
 | 
			
		||||
# BLASThits <- BLAST("NP_010227",                  # Yeast Mbp1 RefSeq ID
 | 
			
		||||
#                    db = "refseq_protein",        # database to search in
 | 
			
		||||
#                    nHits = 3000,                 # 945 hits in 2020
 | 
			
		||||
#                    E = 0.01,                     #
 | 
			
		||||
#                    limits = "txid4751[ORGN]")    # = fungi
 | 
			
		||||
# saveRDS(BLASThits, file="data/BLASThits.rds")
 | 
			
		||||
#
 | 
			
		||||
# NO NEED TO ACTUALLY RUN THIS: you can load the results from the data directory
 | 
			
		||||
#
 | 
			
		||||
BLASThits <- readRDS(file = "data/BLASThits.rds")
 | 
			
		||||
 | 
			
		||||
# ==   4.2  Identify species in "hits"  ========================================
 | 
			
		||||
 | 
			
		||||
# This is a very big list that can't be usefully analyzed manually. Here
 | 
			
		||||
# we are only interested in the species names that it contains.
 | 
			
		||||
 | 
			
		||||
# How many hits in the list?
 | 
			
		||||
length(BLASThits$hits)      # 1,134
 | 
			
		||||
 | 
			
		||||
# Let's look at a hit somewhere down the list
 | 
			
		||||
# Spell out the full element name: "$hit" only works through partial matching
# and would break if another element starting with "hit" were ever added.
str(BLASThits$hits[[277]])
 | 
			
		||||
 | 
			
		||||
# A fair amount of parsing has gone into the BLAST.R code to prepare the results
 | 
			
		||||
# in a useful way. The species information is in the $species element of every
 | 
			
		||||
# hit.
 | 
			
		||||
 | 
			
		||||
# Run a loop to extract all the species names into a vector. We subset ...
 | 
			
		||||
# BLASThits$hits                 ... the list of hits, from which we choose ...
 | 
			
		||||
# BLASThits$hits[[i]]            ... the i-th hit, and get ...
 | 
			
		||||
# BLASThits$hits[[i]]$species    ... the species element from that.
 | 
			
		||||
# Subsetting FTW.
 | 
			
		||||
 | 
			
		||||
# Preallocate one slot per hit, so the loop below fills the vector in place
# instead of growing it by one element on every iteration.
BLASTspecies <- character(length(BLASThits$hits))
 | 
			
		||||
for (i in seq_along(BLASThits$hits)) {
 | 
			
		||||
    BLASTspecies[i] <- BLASThits$hits[[i]]$species
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# You can confirm that BLASTspecies has the expected size.
 | 
			
		||||
length(BLASTspecies)
 | 
			
		||||
 | 
			
		||||
# if we delete some of these later on, we still want to remember which hit
 | 
			
		||||
# they came from. Thus we name() the elements with their index, which is the
 | 
			
		||||
# same as the index of the hit in BLASThits
 | 
			
		||||
# seq_along() is safe for an empty vector; 1:length(x) would yield c(1, 0).
names(BLASTspecies) <- seq_along(BLASTspecies)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# let's plot the distribution of E-values
 | 
			
		||||
# Preallocate one slot per hit, so the loop below fills the vector in place.
eVals <- numeric(length(BLASThits$hits))
 | 
			
		||||
for (i in seq_along(BLASThits$hits)) {
 | 
			
		||||
  eVals[i] <- BLASThits$hits[[i]]$E
 | 
			
		||||
}
 | 
			
		||||
range(eVals)
 | 
			
		||||
sum(eVals == 0)
 | 
			
		||||
 | 
			
		||||
# let's plot the log of all values > 0 to see how they are distributed
 | 
			
		||||
# plotting only one vector of numbers plots their index as x, and
 | 
			
		||||
# their value as y ...
 | 
			
		||||
plot(log(eVals[eVals > 0]), col = "#CC0000")
 | 
			
		||||
 | 
			
		||||
# This is very informative: I would suspect that the first ten or so are
 | 
			
		||||
# virtually identical to the yeast protein, then we have about 800 hits with
 | 
			
		||||
# decreasing similarity, and then about 200 more that may actually be false
 | 
			
		||||
# positives. Also - we plotted them by index, that means the table is SORTED:
 | 
			
		||||
# Lower E-values strictly come before higher E-values.
 | 
			
		||||
 | 
			
		||||
# Again, some species appear more than once, e.g. ...
 | 
			
		||||
sum(BLASTspecies == "Saccharomyces cerevisiae")
 | 
			
		||||
 | 
			
		||||
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
 | 
			
		||||
 | 
			
		||||
# Therefore we remove duplicates. Removing duplicates will leave the FIRST
 | 
			
		||||
# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
 | 
			
		||||
# species, we will retain only the protein that has the highest similarity
 | 
			
		||||
# to yeast Mbp1, not any of its more distant paralogues.
 | 
			
		||||
sel <- ! duplicated(BLASTspecies)
 | 
			
		||||
BLASTspecies <- BLASTspecies[sel]
 | 
			
		||||
 | 
			
		||||
length(BLASTspecies)
 | 
			
		||||
# i.e. we got rid of about two thirds of the hits.
 | 
			
		||||
tail(BLASTspecies)  # see how the names are useful!
 | 
			
		||||
                    # again - there are some special characters ...
 | 
			
		||||
                    # what are they?
 | 
			
		||||
BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
 | 
			
		||||
 | 
			
		||||
# remove the brackets ...
 | 
			
		||||
BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
 | 
			
		||||
# drop any new duplicates ...
 | 
			
		||||
BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
 | 
			
		||||
 | 
			
		||||
# check the number again:
 | 
			
		||||
length(BLASTspecies)
 | 
			
		||||
# Think a bit about this: what may be the biological reason to find that
 | 
			
		||||
# on average, in 388 fungi across the entire phylogenetic tree, we have
 | 
			
		||||
# three sequences that are homologous to yeast Mbp1?
 | 
			
		||||
 | 
			
		||||
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
 | 
			
		||||
# we plot all values that are TRUE in the vector "sel" that we created above,
 | 
			
		||||
# AND greater than 0
 | 
			
		||||
plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  MERGE ENSEMBL AND BLAST RESULTS  =====================================
 | 
			
		||||
 | 
			
		||||
# Next we add the blast result to our sDat dataframe. We'll store the index,
 | 
			
		||||
# the E-value, and the Query-bounds from which we can estimate which domains
 | 
			
		||||
# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
 | 
			
		||||
# Mbp1's N-terminal APSES domain.)
 | 
			
		||||
#
 | 
			
		||||
# First we pull the hits we wanted from the BLASTspecies:
 | 
			
		||||
iHits <- as.numeric(names(BLASTspecies))
 | 
			
		||||
length(iHits)     # one index for each TRUE in sel
 | 
			
		||||
 | 
			
		||||
# add columns to sDat
 | 
			
		||||
l <- nrow(sDat)
 | 
			
		||||
sDat$iHit   <- numeric(l)  # index of the hit in the BLAST results
 | 
			
		||||
sDat$eVal   <- numeric(l)  # E-value of the hit
 | 
			
		||||
sDat$lAli   <- numeric(l)  # length of the aligned region
 | 
			
		||||
 | 
			
		||||
# extract and merge
 | 
			
		||||
for (iHit in iHits) {
  # Look up the species under its CLEANED name from BLASTspecies (brackets
  # were removed there, and the vector is named by hit index). The raw name
  # stored in BLASThits may still carry brackets like "[Candida]", and rows
  # with such a genus were dropped from sDat, so the raw form cannot match.
  # as.integer() guards against as.character() switching to scientific
  # notation for large indices.
  thisSp <- BLASTspecies[[as.character(as.integer(iHit))]]
  sel <- sDat$species == thisSp

  # Annotate every sDat row of this species with the hit's index, its
  # E-value and the length of the aligned region.
  sDat$iHit[sel]   <- iHit
  sDat$eVal[sel]   <- BLASThits$hits[[iHit]]$E
  sDat$lAli[sel]   <- BLASThits$hits[[iHit]]$lengthAli
}
 | 
			
		||||
 | 
			
		||||
# Are all reference species accounted for?
 | 
			
		||||
selA <- sDat$iHit != 0                 # all rows which matched to a BLAST hit
 | 
			
		||||
REFspecies %in% sDat$species[selA]     # yes, all there
 | 
			
		||||
 | 
			
		||||
selB <- sDat$species %in% REFspecies   # all rows which have one of REF species
 | 
			
		||||
 | 
			
		||||
sum(selA & selB)   # How many rows?
 | 
			
		||||
 | 
			
		||||
# sDat of course includes all duplicates. Some may be multiply sequenced, some
 | 
			
		||||
# may be different strains. We'll use the same strategy as before and keep
 | 
			
		||||
# only the best hit: order the rows by E-value, then drop all rows which
 | 
			
		||||
# are duplicated.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# drop all rows without BLAST hits ...
 | 
			
		||||
sDat <- sDat[ ! (sDat$iHit == 0) , ]
 | 
			
		||||
 | 
			
		||||
# order sDat by E-value ...
 | 
			
		||||
sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
 | 
			
		||||
 | 
			
		||||
# drop all rows with duplicated species ...
 | 
			
		||||
sDat <- sDat[ ! duplicated(sDat$species) , ]
 | 
			
		||||
 | 
			
		||||
# Let's look at the E-values ...
 | 
			
		||||
plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
 | 
			
		||||
 | 
			
		||||
# and alignment lengths ...
 | 
			
		||||
plot(sDat$lAli, col = "#00DDAA")
 | 
			
		||||
 | 
			
		||||
# How many ...
 | 
			
		||||
length(unique(sDat$name))
 | 
			
		||||
length(unique(sDat$species))
 | 
			
		||||
length(unique(sDat$genus))
 | 
			
		||||
length(unique(sDat$order))
 | 
			
		||||
 | 
			
		||||
# I need an extra species for admin purposes later on ...
 | 
			
		||||
# Use a logical mask rather than grep() indices: if the species were absent,
# grep() would return integer(0), and sDat[-integer(0), ] would drop ALL rows
# instead of none.
sel <- grepl("Sporothrix schenckii", sDat$species)
SPOSCdat <- sDat[sel, ]
sDat <- sDat[ ! sel, ]
 | 
			
		||||
 | 
			
		||||
# To get the final dataset, we remove the reference species with their
 | 
			
		||||
# entire orders ...
 | 
			
		||||
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
 | 
			
		||||
sel <- sDat$order %in% REForders
 | 
			
		||||
REFdat <- sDat[sel , ]
 | 
			
		||||
sDat   <- sDat[ ! sel , ]
 | 
			
		||||
 | 
			
		||||
# REFdat should now contain only the REFspecies ...
 | 
			
		||||
( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
 | 
			
		||||
 | 
			
		||||
# ... but all of them
 | 
			
		||||
sum(REFspecies %in% REFdat$species)
 | 
			
		||||
 | 
			
		||||
# ... and we have enough left in sDat to prune sDat to unique genus
 | 
			
		||||
sDat <- sDat[ ! duplicated(sDat$genus) , ]
 | 
			
		||||
nrow(sDat)   # 84
 | 
			
		||||
 | 
			
		||||
# I add back "Sporothrix schenckii" ...
 | 
			
		||||
sDat <- rbind(SPOSCdat, sDat)
 | 
			
		||||
 | 
			
		||||
# ... and save for future use.
 | 
			
		||||
# saveRDS(sDat, file = "data/sDat.rds")
 | 
			
		||||
# saveRDS(REFdat, file = "data/REFdat.rds")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    6  STUDENT NUMBERS  =====================================================
 | 
			
		||||
#
 | 
			
		||||
# An asymmetric function to retrieve a MYSPE species
 | 
			
		||||
#
 | 
			
		||||
sDat <- readRDS(file = "data/sDat.rds")
 | 
			
		||||
 | 
			
		||||
students <- read.csv("../BCH441-2021-students.csv")
 | 
			
		||||
sN <- students$Integration.ID
 | 
			
		||||
sN <- sN[! is.na(sN)]
 | 
			
		||||
sN <- as.character(sN)
 | 
			
		||||
sN <- c("1003141593", sN)  # will map to  "Sporothrix schenckii"
 | 
			
		||||
 | 
			
		||||
set.seed(112358)
 | 
			
		||||
# Shuffle the rows. seq_len() yields the same permutation as 1:nrow() for the
# given seed, but does not produce c(1, 0) if sDat has no rows.
theseSpecies <- sDat[sample(seq_len(nrow(sDat))), ]
 | 
			
		||||
all(sort(theseSpecies$name) == sort(sDat$name))
 | 
			
		||||
nrow((theseSpecies))
 | 
			
		||||
(iX <- grep("Sporothrix schenckii", theseSpecies$name))
# Move the matching row(s) to the top, keeping all others in their order.
# The remaining rows are selected with setdiff() rather than a negative
# subscript, because theseSpecies[-integer(0), ] would return zero rows if
# nothing had matched.
iRest <- setdiff(seq_len(nrow(theseSpecies)), iX)
theseSpecies <- theseSpecies[c(iX, iRest), ]
 | 
			
		||||
rndMin <-  992000000
 | 
			
		||||
rndMax <- 1020000000
 | 
			
		||||
N <- 10000
 | 
			
		||||
keys <- as.character(sample(rndMin:rndMax, N + 1000))
 | 
			
		||||
keys <- keys[! (keys %in% sN)]
 | 
			
		||||
keys <- keys[1:N]
 | 
			
		||||
keys[1:length(sN)] <- sN
 | 
			
		||||
 | 
			
		||||
# Recycle the rows of theseSpecies, in order, until there are exactly N rows.
# Indexing once with a recycled row index yields the same rows in the same
# order as repeatedly rbind()-ing copies and truncating to N, but does not
# grow the data frame inside a loop.
iRows <- rep(seq_len(nrow(theseSpecies)), length.out = N)
MYSPEdat <- theseSpecies[iRows, ]
 | 
			
		||||
# The row name of row i is the md5 digest of keys[i]. All digests are computed
# first and assigned in a single step, rather than through N separate
# one-element row-name assignments.
rownames(MYSPEdat) <- vapply(keys[seq_len(N)],
                             digest::digest,
                             character(1),
                             algo = "md5",
                             USE.NAMES = FALSE)
 | 
			
		||||
set.seed(NULL)
 | 
			
		||||
MYSPEdat <- MYSPEdat[sample(1:N), ]
 | 
			
		||||
 | 
			
		||||
# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
 | 
			
		||||
 | 
			
		||||
# === validate
 | 
			
		||||
x <- character()
 | 
			
		||||
for (n in sN) {
 | 
			
		||||
  sp <- getMYSPE(n)
 | 
			
		||||
  if (length(sp) != 1) {
 | 
			
		||||
    stop(print(as.character(n)))
 | 
			
		||||
  } else {
 | 
			
		||||
    x <- c(x, sp)
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# === species for late-comers
 | 
			
		||||
y <- unique(MYSPEdat$species)
 | 
			
		||||
print(y[!(y %in% x)])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# === validate
 | 
			
		||||
# Retrieve the species assigned to every student number, in the order of sN.
l <- length(sN)
sp <- character(l)
for (i in seq_len(l)) {   # seq_len() skips the loop if sN is empty
  sp[i] <- getMYSPE(sN[i])
}
 | 
			
		||||
any(duplicated(sp))
 | 
			
		||||
length(unique(sp))
 | 
			
		||||
which(! sDat$species %in% sp)  # these can be assigned to late-comers
 | 
			
		||||
 | 
			
		||||
# Done.
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,168 +1,168 @@
 | 
			
		||||
# tocID <- "scripts/ABC-makeSTRINGedges.R"
 | 
			
		||||
#
 | 
			
		||||
# Create a subnetwork of high-confidence human STRING edges.
 | 
			
		||||
#
 | 
			
		||||
# Notes:
 | 
			
		||||
#
 | 
			
		||||
#      The large source- datafile is NOT posted to github. If you want to
 | 
			
		||||
#      experiment with the original data, download it and place it into your
 | 
			
		||||
#      local  ./data  directory.
 | 
			
		||||
#
 | 
			
		||||
#      STRING data source:
 | 
			
		||||
#        Download page:
 | 
			
		||||
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
 | 
			
		||||
#        Data: (127.6 Mb)
 | 
			
		||||
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.0
 | 
			
		||||
#
 | 
			
		||||
# Date:     2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.0    Rewrite
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                             Line
 | 
			
		||||
#TOC> -------------------------------------------------
 | 
			
		||||
#TOC>   1        Initialize                          44
 | 
			
		||||
#TOC>   2        Read STRING Data                    51
 | 
			
		||||
#TOC>   3        Define cutoff and subset            63
 | 
			
		||||
#TOC>   4        Drop  duplicates                   103
 | 
			
		||||
#TOC>   5        Simple statistics                  127
 | 
			
		||||
#TOC>   6        Write to file                      160
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Initialize  ==========================================================
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("readr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("readr")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Read STRING Data  ====================================================
 | 
			
		||||
 | 
			
		||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
 | 
			
		||||
# The .gz compressed version is 127.6MB, the uncompressed version is probably
 | 
			
		||||
# 848 Mb. Fortunately readr:: can read from compressed
 | 
			
		||||
# files, and does so automatically, based on the file extension.
 | 
			
		||||
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
 | 
			
		||||
STR <- readr::read_delim(fn, delim = " ")
 | 
			
		||||
nrow(STR)  #  11,759,454 rows
 | 
			
		||||
head(STR)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Define cutoff and subset  ============================================
 | 
			
		||||
 | 
			
		||||
# approximate distribution of combined_score
 | 
			
		||||
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
 | 
			
		||||
 | 
			
		||||
# Let's table the counts >= 850 and plot them for better resolution.
 | 
			
		||||
 | 
			
		||||
myTb <- table(STR$combined_score[STR$combined_score >= 850])
 | 
			
		||||
is.unsorted(as.integer(names(myTb)))  # Good - they are all in order
 | 
			
		||||
 | 
			
		||||
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
 | 
			
		||||
myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that
 | 
			
		||||
                         # frequently assigns a combined score of 0.900
 | 
			
		||||
 | 
			
		||||
# Let's plot these counts as cumulative sums, in reverse order, scaled
 | 
			
		||||
# as combined scores.
 | 
			
		||||
# x-values, decreasing: take the scores from the table's own names (highest
# first) and scale them to 0..1. This stays aligned with the reversed counts
# plotted below even if some score level does not occur in the data.
myX <- rev(as.integer(names(myTb))) / 1000
 | 
			
		||||
plot(myX,
 | 
			
		||||
     cumsum(myTb[length(myTb):1]),   # cumulative sum, decreasing
 | 
			
		||||
     xlim = c(1.0, 0.85),            # reverse x-axis
 | 
			
		||||
     type = "l",
 | 
			
		||||
     main = "STRING interactions for 9606 (top 600,000)",
 | 
			
		||||
     xlab = "combined_score",
 | 
			
		||||
     ylab = "cumulative counts",
 | 
			
		||||
     col = "#CC0000")
 | 
			
		||||
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
 | 
			
		||||
 | 
			
		||||
# What's the cutoff for 100,000 edges?
 | 
			
		||||
which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
 | 
			
		||||
 | 
			
		||||
# confirm
 | 
			
		||||
sum(STR$combined_score >= 964) # 101,348
 | 
			
		||||
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
 | 
			
		||||
 | 
			
		||||
# subset the table, and use only the protein IDs and the combined_score
 | 
			
		||||
STR <- STR[STR$combined_score >= 964,
 | 
			
		||||
            c("protein1", "protein2", "combined_score")]
 | 
			
		||||
colnames(STR) <- c("a", "b", "score")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Drop  duplicates  ====================================================
 | 
			
		||||
 | 
			
		||||
# identify duplicate interactions by creating keys in a defined alphabetical
 | 
			
		||||
# sort order, then checking for  duplicated().
 | 
			
		||||
# e.g  if we have (X:U, U:X), we change U:X to X:U and now find that
 | 
			
		||||
# (X:U, X:U) has a duplicate.
 | 
			
		||||
 | 
			
		||||
AB <- STR$a < STR$b        # logical vector: genes we need to swap
 | 
			
		||||
tmp <- STR$b               # copy column b
 | 
			
		||||
STR$b[AB] <- STR$a[AB]     # copy a's into b
 | 
			
		||||
STR$a[AB] <- tmp[AB]       # copy tmp's into a
 | 
			
		||||
all(STR$a >= STR$b)        # confirm: TRUE
 | 
			
		||||
 | 
			
		||||
# now, make combined keys, like this:
 | 
			
		||||
paste0(STR$a[1:10], ":", STR$b[1:10])
 | 
			
		||||
 | 
			
		||||
tmp <- paste0(STR$a, ":", STR$b)
 | 
			
		||||
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
 | 
			
		||||
                     # both a:b and b:a !
 | 
			
		||||
 | 
			
		||||
# drop all duplicated interactions from tmp
 | 
			
		||||
STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  Simple statistics  ===================================================
 | 
			
		||||
 | 
			
		||||
# how many unique genes?
 | 
			
		||||
length(unique(c(STR$a, STR$b)))   # 8,445
 | 
			
		||||
 | 
			
		||||
# how many self-edges?
 | 
			
		||||
sum(STR$a == STR$b)  # none
 | 
			
		||||
 | 
			
		||||
# log(rank) / log(frequency)
 | 
			
		||||
myTbl <- table(c(STR$a, STR$b))
 | 
			
		||||
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
 | 
			
		||||
 | 
			
		||||
hist(myTbl, breaks = 40, col = "#FFEEBB")
 | 
			
		||||
 | 
			
		||||
# number of singletons
 | 
			
		||||
sum(myTbl == 1) # almost a quarter
 | 
			
		||||
 | 
			
		||||
# maximum?
 | 
			
		||||
myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465
 | 
			
		||||
                                   # Google: CDC5L
 | 
			
		||||
 | 
			
		||||
# Zipf-plot
 | 
			
		||||
plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
 | 
			
		||||
     type = "b", cex = 0.7,
 | 
			
		||||
     main = "STRINGedges - degrees",
 | 
			
		||||
     xlab = "log(rank)",
 | 
			
		||||
     ylab = "log(frequency)",
 | 
			
		||||
     col = "#FFBB88")
 | 
			
		||||
 | 
			
		||||
# Every edge a:b adds one interaction to BOTH of its endpoints, so the mean
# number of interactions per protein is twice the number of edges divided by
# the number of distinct proteins.
sprintf("Average number of interactions: %5.2f",
         2 * nrow(STR) / length(unique(c(STR$a, STR$b))))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    6  Write to file  =======================================================
 | 
			
		||||
 | 
			
		||||
saveRDS(STR, file = "./data/STRINGedges.rds")
 | 
			
		||||
 | 
			
		||||
# STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the
 | 
			
		||||
                                                    # object when needed
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "scripts/ABC-makeSTRINGedges.R"
 | 
			
		||||
#
 | 
			
		||||
# Create a subnetwork of high-confidence human STRING edges.
 | 
			
		||||
#
 | 
			
		||||
# Notes:
 | 
			
		||||
#
 | 
			
		||||
#      The large source- datafile is NOT posted to github. If you want to
 | 
			
		||||
#      experiment with the original data, download it and place it into your
 | 
			
		||||
#      local  ./data  directory.
 | 
			
		||||
#
 | 
			
		||||
#      STRING data source:
 | 
			
		||||
#        Download page:
 | 
			
		||||
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
 | 
			
		||||
#        Data: (127.6 Mb)
 | 
			
		||||
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.0
 | 
			
		||||
#
 | 
			
		||||
# Date:     2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.0    Rewrite
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                             Line
 | 
			
		||||
#TOC> -------------------------------------------------
 | 
			
		||||
#TOC>   1        Initialize                          44
 | 
			
		||||
#TOC>   2        Read STRING Data                    51
 | 
			
		||||
#TOC>   3        Define cutoff and subset            63
 | 
			
		||||
#TOC>   4        Drop  duplicates                   103
 | 
			
		||||
#TOC>   5        Simple statistics                  127
 | 
			
		||||
#TOC>   6        Write to file                      160
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  Initialize  ==========================================================
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("readr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("readr")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  Read STRING Data  ====================================================
 | 
			
		||||
 | 
			
		||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
 | 
			
		||||
# The .gz compressed version is 127.6MB, the uncompressed version is probably
 | 
			
		||||
# 848 Mb. Fortunately readr:: can read from compressed
 | 
			
		||||
# files, and does so automatically, based on the file extension.
 | 
			
		||||
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
 | 
			
		||||
STR <- readr::read_delim(fn, delim = " ")
 | 
			
		||||
nrow(STR)  #  11,759,454 rows
 | 
			
		||||
head(STR)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  Define cutoff and subset  ============================================
 | 
			
		||||
 | 
			
		||||
# approximate distribution of combined_score
 | 
			
		||||
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
 | 
			
		||||
 | 
			
		||||
# Let's table the counts >= 850 and plot them for better resolution.
 | 
			
		||||
 | 
			
		||||
myTb <- table(STR$combined_score[STR$combined_score >= 850])
 | 
			
		||||
is.unsorted(as.integer(names(myTb)))  # Good - they are all in order
 | 
			
		||||
 | 
			
		||||
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
 | 
			
		||||
myTb[myTb == max(myTb)]  # Apparently there is an algorithmic effect that
 | 
			
		||||
                         # frequently assigns a combined score of 0.900
 | 
			
		||||
 | 
			
		||||
# Let's plot these counts as cumulative sums, in reverse order, scaled
 | 
			
		||||
# as combined scores.
 | 
			
		||||
# x-values: the observed combined scores themselves, rescaled to [0, 1] and
# put in decreasing order. They are taken from the names of the table rather
# than computed from its length, so a score value that happens to be absent
# from the data (or a maximum other than 999) cannot shift the x-axis
# relative to the counts.
myX <- rev(as.integer(names(myTb))) / 1000
plot(myX,
     cumsum(rev(myTb)),              # cumulative sum, decreasing
     xlim = c(1.0, 0.85),            # reverse x-axis
     type = "l",
     main = "STRING interactions for 9606 (top 600,000)",
     xlab = "combined_score",
     ylab = "cumulative counts",
     col = "#CC0000")
 | 
			
		||||
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
 | 
			
		||||
 | 
			
		||||
# What's the cutoff for 100,000 edges?
 | 
			
		||||
which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
 | 
			
		||||
 | 
			
		||||
# confirm
 | 
			
		||||
sum(STR$combined_score >= 964) # 101,348
 | 
			
		||||
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
 | 
			
		||||
 | 
			
		||||
# subset the table, and use only the protein IDs and the combined_score
 | 
			
		||||
STR <- STR[STR$combined_score >= 964,
 | 
			
		||||
            c("protein1", "protein2", "combined_score")]
 | 
			
		||||
colnames(STR) <- c("a", "b", "score")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  Drop  duplicates  ====================================================
 | 
			
		||||
 | 
			
		||||
# identify duplicate interactions by creating keys in a defined alphabetical
 | 
			
		||||
# sort order, then checking for  duplicated().
 | 
			
		||||
# e.g  if we have (X:U, U:X), we change U:X to X:U and now find that
 | 
			
		||||
# (X:U, X:U) has a duplicate.
 | 
			
		||||
 | 
			
		||||
AB <- STR$a < STR$b        # logical vector: genes we need to swap
 | 
			
		||||
tmp <- STR$b               # copy column b
 | 
			
		||||
STR$b[AB] <- STR$a[AB]     # copy a's into b
 | 
			
		||||
STR$a[AB] <- tmp[AB]       # copy tmp's into a
 | 
			
		||||
all(STR$a >= STR$b)        # confirm: TRUE
 | 
			
		||||
 | 
			
		||||
# now, make combined keys, like this:
 | 
			
		||||
paste0(STR$a[1:10], ":", STR$b[1:10])
 | 
			
		||||
 | 
			
		||||
tmp <- paste0(STR$a, ":", STR$b)
 | 
			
		||||
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
 | 
			
		||||
                     # both a:b and b:a !
 | 
			
		||||
 | 
			
		||||
# drop all duplicated interactions from tmp
 | 
			
		||||
STR <- STR[ ! duplicated(tmp), ]   # 50,674 interactions remain
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    5  Simple statistics  ===================================================
 | 
			
		||||
 | 
			
		||||
# how many unique genes?
 | 
			
		||||
length(unique(c(STR$a, STR$b)))   # 8,445
 | 
			
		||||
 | 
			
		||||
# how many self-edges?
 | 
			
		||||
sum(STR$a == STR$b)  # none
 | 
			
		||||
 | 
			
		||||
# log(rank) / log(frequency)
 | 
			
		||||
myTbl <- table(c(STR$a, STR$b))
 | 
			
		||||
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
 | 
			
		||||
 | 
			
		||||
hist(myTbl, breaks = 40, col = "#FFEEBB")
 | 
			
		||||
 | 
			
		||||
# number of singletons
 | 
			
		||||
sum(myTbl == 1) # almost a quarter
 | 
			
		||||
 | 
			
		||||
# maximum?
 | 
			
		||||
myTbl[which(myTbl == max(myTbl))]  # 9606.ENSP00000360532: 465
 | 
			
		||||
                                   # Google: CDC5L
 | 
			
		||||
 | 
			
		||||
# Zipf-plot
 | 
			
		||||
# Zipf plot: log(rank) against log(degree). myTbl is sorted in decreasing
# order of counts, so the position of an entry is its rank. seq_along() is
# used instead of 1:length() so that an empty table does not produce the
# spurious ranks c(1, 0).
plot(log(seq_along(myTbl)), log(as.numeric(myTbl)),
     type = "b", cex = 0.7,
     main = "STRINGedges - degrees",
     xlab = "log(rank)",
     ylab = "log(frequency)",
     col = "#FFBB88")
 | 
			
		||||
 | 
			
		||||
sprintf("Average number of interactions: %5.2f",
 | 
			
		||||
         nrow(STR) / length(unique(c(STR$a, STR$b))))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    6  Write to file  =======================================================
 | 
			
		||||
 | 
			
		||||
saveRDS(STR, file = "./data/STRINGedges.rds")
 | 
			
		||||
 | 
			
		||||
# STRINGedges <- readRDS("./data/STRINGedges.rds")  # use this to restore the
 | 
			
		||||
                                                    # object when needed
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,167 +1,167 @@
 | 
			
		||||
# tocID <- "scripts/ABC-makeScCCnet.R"
 | 
			
		||||
#
 | 
			
		||||
# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
 | 
			
		||||
# GOSlim annotation.
 | 
			
		||||
#
 | 
			
		||||
# Boris Steipe for ABC learning units
 | 
			
		||||
#
 | 
			
		||||
# Notes:
 | 
			
		||||
#
 | 
			
		||||
#      The large source- datafiles are NOT posted to github. If you want to
 | 
			
		||||
#      experiment with your own code, download them and place them into your
 | 
			
		||||
#      local  ./data  directory.
 | 
			
		||||
#
 | 
			
		||||
#      STRING data source:
 | 
			
		||||
#        Download page:
 | 
			
		||||
# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
 | 
			
		||||
#        Data: (20.1 mb)
 | 
			
		||||
# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
 | 
			
		||||
#
 | 
			
		||||
#      GOSlim data source: (Note: this has moved from GO to SGD)
 | 
			
		||||
#        Info page: https://www.yeastgenome.org/downloads
 | 
			
		||||
#        Info page: http://sgd-archive.yeastgenome.org/curation/literature/
 | 
			
		||||
#        Data: (3 mb)
 | 
			
		||||
# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Update. GO Slim Yeast now at SGD
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.0    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
# SRCDIR <- "./instructor"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                           Line
 | 
			
		||||
#TOC> ---------------------------------------------------------------
 | 
			
		||||
#TOC>   1        INITIALIZE                                        58
 | 
			
		||||
#TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66
 | 
			
		||||
#TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96
 | 
			
		||||
#TOC>   3.1        Intersect interactions and annotations         122
 | 
			
		||||
#TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  INITIALIZE  ==========================================================
 | 
			
		||||
 | 
			
		||||
SRCDIR <- "./data"
 | 
			
		||||
if (! requireNamespace("readr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("readr")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  STRING FUNCTIONAL INTERACTION DATA  ==================================
 | 
			
		||||
 | 
			
		||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
 | 
			
		||||
# The .gz compressed version is 20MB, the uncompressed version is 110MB -
 | 
			
		||||
# really not necessary to uncompress since readr:: can read from compressed
 | 
			
		||||
# files, and does so automatically, based on the file extension.
 | 
			
		||||
( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
 | 
			
		||||
STR <- readr::read_delim(fn, delim = " ")
 | 
			
		||||
 | 
			
		||||
# Subset only IDs and combined_score column
 | 
			
		||||
STR <- STR[ , c("protein1", "protein2", "combined_score")]
 | 
			
		||||
 | 
			
		||||
# head(STR)
 | 
			
		||||
# sum(STR$combined_score > 909)  # 100270 edges
 | 
			
		||||
# subset for 100,000 highest confidence edges
 | 
			
		||||
STR <- STR[(STR$combined_score > 909), ]
 | 
			
		||||
head(STR)
 | 
			
		||||
 | 
			
		||||
# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
 | 
			
		||||
STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
 | 
			
		||||
STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
 | 
			
		||||
head(STR)
 | 
			
		||||
 | 
			
		||||
# get a vector of gene names in this list
 | 
			
		||||
myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene
 | 
			
		||||
                                                      # names
 | 
			
		||||
length(myIntxGenes)
 | 
			
		||||
sample(myIntxGenes, 10)  # choose 10 at random (sanity check)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  GOSlim FUNCTIONAL ANNOTATIONS  =======================================
 | 
			
		||||
#
 | 
			
		||||
# Read GOSlim data  (needs to be downloaded from database, see URL in Notes)
 | 
			
		||||
( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
 | 
			
		||||
 | 
			
		||||
Gsl <- readr::read_tsv(fn,
 | 
			
		||||
                       col_names = c("ID",
 | 
			
		||||
                                     "name",
 | 
			
		||||
                                     "SGDId",
 | 
			
		||||
                                     "Ontology",
 | 
			
		||||
                                     "termName",
 | 
			
		||||
                                     "termID",
 | 
			
		||||
                                     "status"))
 | 
			
		||||
 | 
			
		||||
head(Gsl)
 | 
			
		||||
 | 
			
		||||
# What cell cycle names does it contain?
 | 
			
		||||
myGslTermNames <- unique(Gsl$termName)  # 169 unique terms
 | 
			
		||||
myGslTermNames[grep("cycle", myGslTermNames)]
 | 
			
		||||
# [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle"
 | 
			
		||||
 | 
			
		||||
# Choose "mitotic cell cycle" as the GOslim term to subset with
 | 
			
		||||
 | 
			
		||||
scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
 | 
			
		||||
length(scCCgenes)  # 324 genes annotated to that term
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Intersect interactions and annotations  ============================
 | 
			
		||||
 | 
			
		||||
sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence
 | 
			
		||||
#                                # functional interactions
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  DEFINE THE CELL-CYCLE NETWORK  =======================================
 | 
			
		||||
#
 | 
			
		||||
# Define scCCnet ... the S. cerevisiae Cell Cycle network
 | 
			
		||||
# Subset all rows for which BOTH genes are in the GOslim cell cycle set
 | 
			
		||||
#
 | 
			
		||||
scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
 | 
			
		||||
               (STR$protein2 %in% scCCgenes), ]
 | 
			
		||||
 | 
			
		||||
# How many genes are there?
 | 
			
		||||
length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283
 | 
			
		||||
 | 
			
		||||
# Each edge is listed twice - now remove duplicates.
 | 
			
		||||
 | 
			
		||||
# Step 1: make a vector: sort two names so the first one is alphabetically
 | 
			
		||||
#         smaller than the second one. This brings the two names into a defined
 | 
			
		||||
#         order. Then concatenate them with a "." - the resulting string
 | 
			
		||||
#         is always the same, for any order. E.g. c("A", "B") gives "A.B"
 | 
			
		||||
#         and c("B", "A") also gives "A.B". This identifies duplicates.
 | 
			
		||||
 | 
			
		||||
x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
 | 
			
		||||
           1,
 | 
			
		||||
           FUN = function(x) { return(paste(sort(x), collapse = ".")) })
 | 
			
		||||
head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
 | 
			
		||||
 | 
			
		||||
sum(duplicated(x))  # 1453
 | 
			
		||||
 | 
			
		||||
# Step 2: drop all rows that contain duplicates in x
 | 
			
		||||
scCCnet <- scCCnet[! duplicated(x), ]
 | 
			
		||||
 | 
			
		||||
# Confirm we didn't lose genes
 | 
			
		||||
length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change
 | 
			
		||||
nrow(scCCnet)
 | 
			
		||||
# Network has 283 nodes, 1453 edges
 | 
			
		||||
 | 
			
		||||
saveRDS(scCCnet, file = "./data/scCCnet.rds")
 | 
			
		||||
 | 
			
		||||
# scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the
 | 
			
		||||
                                             #      object when needed
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "scripts/ABC-makeScCCnet.R"
 | 
			
		||||
#
 | 
			
		||||
# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
 | 
			
		||||
# GOSlim annotation.
 | 
			
		||||
#
 | 
			
		||||
# Boris Steipe for ABC learning units
 | 
			
		||||
#
 | 
			
		||||
# Notes:
 | 
			
		||||
#
 | 
			
		||||
#      The large source- datafiles are NOT posted to github. If you want to
 | 
			
		||||
#      experiment with your own code, download them and place them into your
 | 
			
		||||
#      local  ./data  directory.
 | 
			
		||||
#
 | 
			
		||||
#      STRING data source:
 | 
			
		||||
#        Download page:
 | 
			
		||||
# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
 | 
			
		||||
#        Data: (20.1 mb)
 | 
			
		||||
# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
 | 
			
		||||
#
 | 
			
		||||
#      GOSlim data source: (Note: this has moved from GO to SGD)
 | 
			
		||||
#        Info page: https://www.yeastgenome.org/downloads
 | 
			
		||||
#        Info page: http://sgd-archive.yeastgenome.org/curation/literature/
 | 
			
		||||
#        Data: (3 mb)
 | 
			
		||||
# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Version:  1.2
 | 
			
		||||
#
 | 
			
		||||
# Date:     2017-10  -  2020-09
 | 
			
		||||
# Author:   Boris Steipe (boris.steipe@utoronto.ca)
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#           1.2    2020 Update. GO Slim Yeast now at SGD
 | 
			
		||||
#           1.1    Change from require() to requireNamespace(),
 | 
			
		||||
#                      use <package>::<function>() idiom throughout
 | 
			
		||||
#           1.0    First code copied from 2016 material.
 | 
			
		||||
#
 | 
			
		||||
# TODO:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
# SRCDIR <- "./instructor"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC>   Section  Title                                           Line
 | 
			
		||||
#TOC> ---------------------------------------------------------------
 | 
			
		||||
#TOC>   1        INITIALIZE                                        58
 | 
			
		||||
#TOC>   2        STRING FUNCTIONAL INTERACTION DATA                66
 | 
			
		||||
#TOC>   3        GOSlim FUNCTIONAL ANNOTATIONS                     96
 | 
			
		||||
#TOC>   3.1        Intersect interactions and annotations         122
 | 
			
		||||
#TOC>   4        DEFINE THE CELL-CYCLE NETWORK                    128
 | 
			
		||||
#TOC> 
 | 
			
		||||
#TOC> ==========================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    1  INITIALIZE  ==========================================================
 | 
			
		||||
 | 
			
		||||
SRCDIR <- "./data"
 | 
			
		||||
if (! requireNamespace("readr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("readr")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    2  STRING FUNCTIONAL INTERACTION DATA  ==================================
 | 
			
		||||
 | 
			
		||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
 | 
			
		||||
# The .gz compressed version is 20MB, the uncompressed version is 110MB -
 | 
			
		||||
# really not necessary to uncompress since readr:: can read from compressed
 | 
			
		||||
# files, and does so automatically, based on the file extension.
 | 
			
		||||
( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
 | 
			
		||||
STR <- readr::read_delim(fn, delim = " ")
 | 
			
		||||
 | 
			
		||||
# Subset only IDs and combined_score column
 | 
			
		||||
STR <- STR[ , c("protein1", "protein2", "combined_score")]
 | 
			
		||||
 | 
			
		||||
# head(STR)
 | 
			
		||||
# sum(STR$combined_score > 909)  # 100270 edges
 | 
			
		||||
# subset for 100,000 highest confidence edges
 | 
			
		||||
STR <- STR[(STR$combined_score > 909), ]
 | 
			
		||||
head(STR)
 | 
			
		||||
 | 
			
		||||
# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
 | 
			
		||||
STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
 | 
			
		||||
STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
 | 
			
		||||
head(STR)
 | 
			
		||||
 | 
			
		||||
# get a vector of gene names in this list
 | 
			
		||||
myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene
 | 
			
		||||
                                                      # names
 | 
			
		||||
length(myIntxGenes)
 | 
			
		||||
sample(myIntxGenes, 10)  # choose 10 at random (sanity check)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    3  GOSlim FUNCTIONAL ANNOTATIONS  =======================================
 | 
			
		||||
#
 | 
			
		||||
# Read GOSlim data  (needs to be downloaded from database, see URL in Notes)
 | 
			
		||||
( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
 | 
			
		||||
 | 
			
		||||
Gsl <- readr::read_tsv(fn,
 | 
			
		||||
                       col_names = c("ID",
 | 
			
		||||
                                     "name",
 | 
			
		||||
                                     "SGDId",
 | 
			
		||||
                                     "Ontology",
 | 
			
		||||
                                     "termName",
 | 
			
		||||
                                     "termID",
 | 
			
		||||
                                     "status"))
 | 
			
		||||
 | 
			
		||||
head(Gsl)
 | 
			
		||||
 | 
			
		||||
# What cell cycle names does it contain?
 | 
			
		||||
myGslTermNames <- unique(Gsl$termName)  # 169 unique terms
 | 
			
		||||
myGslTermNames[grep("cycle", myGslTermNames)]
 | 
			
		||||
# [1] "regulation of cell cycle"  "mitotic cell cycle"  "meiotic cell cycle"
 | 
			
		||||
 | 
			
		||||
# Choose "mitotic cell cycle" as the GOslim term to subset with
 | 
			
		||||
 | 
			
		||||
scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
 | 
			
		||||
length(scCCgenes)  # 324 genes annotated to that term
 | 
			
		||||
 | 
			
		||||
# ==   3.1  Intersect interactions and annotations  ============================
 | 
			
		||||
 | 
			
		||||
sum(scCCgenes %in% myIntxGenes)  # 307 of these have high-confidence
 | 
			
		||||
#                                # functional interactions
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# =    4  DEFINE THE CELL-CYCLE NETWORK  =======================================
 | 
			
		||||
#
 | 
			
		||||
# Define scCCnet ... the S. cerevisiae Cell Cycle network
 | 
			
		||||
# Subset all rows for which BOTH genes are in the GOslim cell cycle set
 | 
			
		||||
#
 | 
			
		||||
scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
 | 
			
		||||
               (STR$protein2 %in% scCCgenes), ]
 | 
			
		||||
 | 
			
		||||
# How many genes are there?
 | 
			
		||||
length(unique(c(scCCnet$protein1, scCCnet$protein2)))  #283
 | 
			
		||||
 | 
			
		||||
# Each edge is listed twice - now remove duplicates.
 | 
			
		||||
 | 
			
		||||
# Step 1: make a vector: sort two names so the first one is alphabetically
 | 
			
		||||
#         smaller than the second one. This brings the two names into a defined
 | 
			
		||||
#         order. Then concatenate them with a "." - the resulting string
 | 
			
		||||
#         is always the same, for any order. E.g. c("A", "B") gives "A.B"
 | 
			
		||||
#         and c("B", "A") also gives "A.B". This identifies duplicates.
 | 
			
		||||
 | 
			
		||||
x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
 | 
			
		||||
           1,
 | 
			
		||||
           FUN = function(x) { return(paste(sort(x), collapse = ".")) })
 | 
			
		||||
head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
 | 
			
		||||
 | 
			
		||||
sum(duplicated(x))  # 1453
 | 
			
		||||
 | 
			
		||||
# Step 2: drop all rows that contain duplicates in x
 | 
			
		||||
scCCnet <- scCCnet[! duplicated(x), ]
 | 
			
		||||
 | 
			
		||||
# Confirm we didn't lose genes
 | 
			
		||||
length(unique(c(scCCnet$protein1, scCCnet$protein2)))  # 283, no change
 | 
			
		||||
nrow(scCCnet)
 | 
			
		||||
# Network has 283 nodes, 1453 edges
 | 
			
		||||
 | 
			
		||||
saveRDS(scCCnet, file = "./data/scCCnet.rds")
 | 
			
		||||
 | 
			
		||||
# scCCnet <- readRDS("./data/scCCnet.rds")   # <<<- use this to restore the
 | 
			
		||||
                                             #      object when needed
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,135 +1,135 @@
 | 
			
		||||
# tocID <- "scripts/ABC-writeALN.R"
 | 
			
		||||
#
 | 
			
		||||
# ToDo:    calculate consensus line
 | 
			
		||||
#          append sequence numbers
 | 
			
		||||
# Notes:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
writeALN <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 60) {
  # Purpose:
  #     Write a multiple sequence alignment to stdout() or a file in
  #     CLUSTAL W format: a "CLUSTAL W format. <note>" header line, an empty
  #     line, then blocks of name-prefixed sequence segments. Each block is
  #     followed by an empty consensus line and an empty separator line.
  # Version: 2.0
  # Date:    2017 10
  # Author:  Boris Steipe
  #
  # Parameters:
  #     ali             an Msa{AA|DNA|RNA}MultipleAlignment, an
  #                       {AA|DNA|RNA}StringSet, or a (named) character
  #                       vector.
  #     range      num  a two-integer vector of start and end positions if
  #                       only a range of the MSA should be written, e.g.
  #                       a domain. Defaults to the full alignment length.
  #                       Sequences shorter than range[2] are right-padded
  #                       with "-".
  #     note       chr  text that is appended to the header line. Multiple
  #                       elements are joined with a single blank.
  #     myCon           a connection (cf. the con argument for writeLines).
  #                       Defaults to stdout()
  #     blockWidth int  width of sequence block. Default 60 characters.
  # Value:
  #     NULL (invisible). The function is invoked for its side effect of
  #     printing an alignment to stdout() or file.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (blockWidth < 1) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }
  if (blockWidth > 60) {
    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings. inherits() is used instead of
  # class(ali) == "..." because class() can return more than one element,
  # which makes a comparison inside if() an error.

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali) # we use as.character(), not toString() thus
                              # we don't _have_ to load Biostrings
  } else if (inherits(ali, "character")) {
    sSet <- unclass(ali)      # drop extra class attributes, keep the names
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1L, max(nchar(sSet)))
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Keep the names before padding: paste0() does not preserve them.
  sNames <- names(sSet)

  # Right-pad any sequence with "-" that is shorter than range[2]
  nPad <- pmax(range[2] - nchar(sSet), 0L)
  sSet <- paste0(sSet, strrep("-", nPad))

  # Right-pad sequence names to the longest name plus two spaces. Unnamed
  # input is written without a name column.
  if (is.null(sNames)) {
    sNames <- character(length(sSet))
  } else {
    len <- max(nchar(sNames)) + 2
    sNames <- paste0(sNames, strrep(" ", len - nchar(sNames)))
  }

  # Start and end column of every block
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # One character vector per block: all sequence lines, then a blank
  # consensus line and a blank separator line.
  blocks <- lapply(seq_along(iStarts), function(i) {
    c(paste0(sNames, substring(sSet, iStarts[i], iEnds[i])), "", "")
  })

  txt <- c(paste0("CLUSTAL W format. ", paste(note, collapse = " ")),
           "",
           unlist(blocks))

  writeLines(txt, con = myCon)

  invisible(NULL)
}
 | 
			
		||||
 | 
			
		||||
# ====  TESTS  =================================================================
 | 
			
		||||
# Enter your function tests here...
 | 
			
		||||
 | 
			
		||||
if (FALSE) {
 | 
			
		||||
  # test ...
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# tocID <- "scripts/ABC-writeALN.R"
 | 
			
		||||
#
 | 
			
		||||
# ToDo:    calculate consensus line
 | 
			
		||||
#          append sequence numbers
 | 
			
		||||
# Notes:
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
writeALN <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 60) {
  # Purpose:
  #     Write an Msa{AA|DNA|RNA}MultipleAlignment or {AA|DNA|RNA}StringSet
  #     object, or a named character vector, to stdout() or a file in
  #     CLUSTAL W format.
  # Version: 2.0
  # Date:    2017 10
  # Author:  Boris Steipe
  #
  # Parameters:
  #     ali             Msa{AA|DNA|RNA}MultipleAlignment or
  #                       {AA|DNA|RNA}StringSet or character vector.
  #     range      num  a two-integer vector of start and end positions if
  #                       only a range of the MSA should be written, e.g.
  #                       a domain. Defaults to the full alignment length.
  #                       Sequences shorter than range[2] are right-padded
  #                       with "-".
  #     note       chr  text that is appended to the "CLUSTAL W format. "
  #                       header line. The elements of a longer vector are
  #                       joined with single blanks.
  #     myCon           a connection (cf. the con argument for writeLines).
  #                       Defaults to stdout()
  #     blockWidth int  width of sequence block. Default 60 characters.
  #                       A warning is issued for values > 60.
  # Value:
  #     NULL (invisibly). The function is invoked for its side effect of
  #     printing an alignment to stdout() or file.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (blockWidth < 1) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }
  if (blockWidth > 60) {
    warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings. inherits() is used rather than
  # class(x) == "..." because class() may return more than one element, which
  # is not a valid if() condition.

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali) # we use as.character(), not toString() thus
                              # we don't _have_ to load Biostrings
  } else if (is.character(ali)) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (length(sSet) == 0) {
    stop("PANIC: input contains no sequences.")
  }

  if (missing(range)) {
    range <- c(1L, max(nchar(sSet)))
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  # Right-pad any sequence with "-" that is shorter than range[2]
  isShort <- nchar(sSet) < range[2]
  sSet[isShort] <- paste0(sSet[isShort],
                          strrep("-", range[2] - nchar(sSet[isShort])))

  # Right-pad sequence names to a common width: longest name plus two spaces.
  # Unnamed input is written without a name column.
  sNames <- names(sSet)
  if (is.null(sNames)) {
    sNames <- character(length(sSet))
  } else {
    len <- max(nchar(sNames)) + 2
    sNames <- paste0(sNames, strrep(" ", len - nchar(sNames)))
  }

  # Header line, followed by one blank line
  txt <- c(paste0("CLUSTAL W format. ", paste(note, collapse = " ")), "")

  # First and last alignment column of each block
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # Each block: one line per sequence, a blank consensus line and a
  # blank separator line.
  blocks <- lapply(seq_along(iStarts), function(i) {
    c(paste0(sNames, substring(sSet, iStarts[i], iEnds[i])), "", "")
  })

  writeLines(c(txt, unlist(blocks, use.names = FALSE)), con = myCon)

}
 | 
			
		||||
 | 
			
		||||
# ====  TESTS  =================================================================
 | 
			
		||||
# Enter your function tests here...
 | 
			
		||||
 | 
			
		||||
if (FALSE) {
 | 
			
		||||
  # test ...
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,121 +1,121 @@
 | 
			
		||||
# ABC-writeMFA.R
 | 
			
		||||
#
 | 
			
		||||
# ToDo:
 | 
			
		||||
# Notes:  2.1  bugfix: empty notes caused superfluous blank after header.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
writeMFA <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 80) {
  # Purpose:
  #     Write an Msa{AA|DNA|RNA}MultipleAlignment or {AA|DNA|RNA}StringSet
  #     object, or a named character vector, to stdout() or a file in
  #     multi-FASTA format.
  # Version: 2.1
  # Date:    2017  10
  # Author:  Boris Steipe
  #
  # Parameters:
  #     ali             Msa{AA|DNA|RNA}MultipleAlignment or
  #                       {AA|DNA|RNA}StringSet or character vector
  #     range      num  a two-integer vector of start and end positions if
  #                       only a range of the MSA should be written, e.g.
  #                       a domain. Defaults to the full sequence length.
  #     note       chr  a vector of character that is appended to the name
  #                       of a sequence in the FASTA header. Recycling of
  #                       shorter vectors applies, thus a vector of length one
  #                       is added to all headers. Empty notes add nothing
  #                       (no trailing blank) to their header.
  #     myCon           a connection (cf. the con argument for writeLines).
  #                       Defaults to stdout()
  #     blockWidth int  width of sequence block. Default 80 characters.
  # Value:
  #     NULL (invisibly). The function is invoked for its side effect of
  #     printing an alignment to stdout() or file.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (! blockWidth > 0) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings. inherits() is used rather than
  # class(x) == "..." because class() may return more than one element, which
  # is not a valid if() condition.

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali) # we use as.character(), not toString() thus
                              # we don't _have_ to load Biostrings
  } else if (is.character(ali)) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (missing(range)) {
    range <- c(1L, max(0L, nchar(sSet)))
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  if (length(sSet) == 0) {  # nothing to write
    return(invisible(NULL))
  }

  # Construct one header per sequence. The note is recycled over the
  # sequences and appended only where it is not empty, so that a vector of
  # notes works and an empty note never leaves a trailing blank.
  headers <- names(sSet)
  if (is.null(headers)) {
    headers <- character(length(sSet))  # unnamed input: empty names
  }
  note <- rep_len(as.character(note), length(sSet))
  hasNote <- !is.na(note) & nzchar(note)
  headers[hasNote] <- paste(headers[hasNote], note[hasNote])

  # First and last position of each block; identical for all sequences.
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # Process each sequence: FASTA header, the sequence in blocks of blockWidth
  # per line (empty blocks dropped), and an empty line for readability.
  records <- lapply(seq_along(sSet), function(i) {
    thisSeq <- substring(sSet[i], iStarts, iEnds)
    c(sprintf(">%s", headers[i]), thisSeq[nzchar(thisSeq)], "")
  })

  writeLines(unlist(records, use.names = FALSE), con = myCon)

}
 | 
			
		||||
 | 
			
		||||
# ====  TESTS  =================================================================
 | 
			
		||||
# Enter your function tests here...
 | 
			
		||||
 | 
			
		||||
if (FALSE) {
 | 
			
		||||
  # test ...
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# ABC-writeMFA.R
 | 
			
		||||
#
 | 
			
		||||
# ToDo:
 | 
			
		||||
# Notes:  2.1  bugfix: empty notes caused superfluous blank after header.
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
writeMFA <- function(ali,
                     range,
                     note = "",
                     myCon = stdout(),
                     blockWidth = 80) {
  # Purpose:
  #     Write an Msa{AA|DNA|RNA}MultipleAlignment or {AA|DNA|RNA}StringSet
  #     object, or a named character vector, to stdout() or a file in
  #     multi-FASTA format.
  # Version: 2.1
  # Date:    2017  10
  # Author:  Boris Steipe
  #
  # Parameters:
  #     ali             Msa{AA|DNA|RNA}MultipleAlignment or
  #                       {AA|DNA|RNA}StringSet or character vector
  #     range      num  a two-integer vector of start and end positions if
  #                       only a range of the MSA should be written, e.g.
  #                       a domain. Defaults to the full sequence length.
  #     note       chr  a vector of character that is appended to the name
  #                       of a sequence in the FASTA header. Recycling of
  #                       shorter vectors applies, thus a vector of length one
  #                       is added to all headers. Empty notes add nothing
  #                       (no trailing blank) to their header.
  #     myCon           a connection (cf. the con argument for writeLines).
  #                       Defaults to stdout()
  #     blockWidth int  width of sequence block. Default 80 characters.
  # Value:
  #     NULL (invisibly). The function is invoked for its side effect of
  #     printing an alignment to stdout() or file.

  blockWidth <- as.integer(blockWidth)
  if (length(blockWidth) != 1 || is.na(blockWidth)) {
    stop("PANIC: parameter \"blockWidth\" must be numeric.")
  }
  if (! blockWidth > 0) {
    stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
  }

  # Extract the raw data from the objects depending on their respective class
  # and put it into a named vector of strings. inherits() is used rather than
  # class(x) == "..." because class() may return more than one element, which
  # is not a valid if() condition.

  # Extract XStringSet from MsaXMultipleAlignment ...
  if (inherits(ali, c("MsaAAMultipleAlignment",
                      "MsaDNAMultipleAlignment",
                      "MsaRNAMultipleAlignment"))) {
    ali <- ali@unmasked
  }

  # Process XStringSet
  if (inherits(ali, c("AAStringSet", "DNAStringSet", "RNAStringSet"))) {
    sSet <- as.character(ali) # we use as.character(), not toString() thus
                              # we don't _have_ to load Biostrings
  } else if (is.character(ali)) {
    sSet <- ali
  } else {
    stop(paste("Input object of class",
               paste(class(ali), collapse = "/"),
               "can't be handled by this function."))
  }

  if (missing(range)) {
    range <- c(1L, max(0L, nchar(sSet)))
  } else {
    range <- as.integer(range)
    if (length(range) != 2 ||
        any(is.na(range)) ||
        range[1] > range[2] ||
        range[1] < 1) {
      stop("PANIC: \"range\" parameter must contain valid start and end index.")
    }
  }

  if (length(sSet) == 0) {  # nothing to write
    return(invisible(NULL))
  }

  # Construct one header per sequence. The note is recycled over the
  # sequences and appended only where it is not empty, so that a vector of
  # notes works and an empty note never leaves a trailing blank.
  headers <- names(sSet)
  if (is.null(headers)) {
    headers <- character(length(sSet))  # unnamed input: empty names
  }
  note <- rep_len(as.character(note), length(sSet))
  hasNote <- !is.na(note) & nzchar(note)
  headers[hasNote] <- paste(headers[hasNote], note[hasNote])

  # First and last position of each block; identical for all sequences.
  iStarts <- seq(range[1], range[2], by = blockWidth)
  iEnds <- c((iStarts[-1] - 1), range[2])

  # Process each sequence: FASTA header, the sequence in blocks of blockWidth
  # per line (empty blocks dropped), and an empty line for readability.
  records <- lapply(seq_along(sSet), function(i) {
    thisSeq <- substring(sSet[i], iStarts, iEnds)
    c(sprintf(">%s", headers[i]), thisSeq[nzchar(thisSeq)], "")
  })

  writeLines(unlist(records, use.names = FALSE), con = myCon)

}
 | 
			
		||||
 | 
			
		||||
# ====  TESTS  =================================================================
 | 
			
		||||
# Enter your function tests here...
 | 
			
		||||
 | 
			
		||||
if (FALSE) {
 | 
			
		||||
  # test ...
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										768
									
								
								scripts/BLAST.R
									
									
									
									
									
								
							
							
						
						
									
										768
									
								
								scripts/BLAST.R
									
									
									
									
									
								
							@@ -1,384 +1,384 @@
 | 
			
		||||
# BLAST.R
 | 
			
		||||
#
 | 
			
		||||
# Purpose: Send off one BLAST search and return parsed list of results
 | 
			
		||||
#          This script uses the BLAST URL-API
 | 
			
		||||
#          (Application Programming Interface) at the NCBI.
 | 
			
		||||
#          Read about the constraints here:
 | 
			
		||||
#          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Version: 3.2
 | 
			
		||||
# Date:    2016 09 - 2020 09
 | 
			
		||||
# Author:  Boris Steipe
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#    3.2   2020 updates
 | 
			
		||||
#    3.1   Change from require() to requireNamespace(),
 | 
			
		||||
#          use <package>::<function>() idiom throughout
 | 
			
		||||
#    3.0   parsing logic had not been fully implemented; Fixed.
 | 
			
		||||
#    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
 | 
			
		||||
#          refactored parseBLASTalignment() to handle lists with multiple hits.
 | 
			
		||||
#    2.0   Completely rewritten because the interface completely changed.
 | 
			
		||||
#          Code adapted in part from NCBI Perl sample code:
 | 
			
		||||
#          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
 | 
			
		||||
#    1.0   first version posted for BCH441 2016, based on BLAST - API
 | 
			
		||||
#
 | 
			
		||||
# ToDo:    Return the organism/strain name in the output, and propagate
 | 
			
		||||
#          into MYSPE selection script.
 | 
			
		||||
#
 | 
			
		||||
# Notes:   This is somewhat pedestrian, but apparently there are currently
 | 
			
		||||
#          no R packages that contain such code.
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("httr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
BLAST <- function(Q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 0.1,
                  limits = "",
                  rid = "",
                  query = "",
                  quietly = FALSE,
                  myTimeout = 120) {
  # Purpose:
  #     Basic BLAST search: submit a blastp query to the NCBI BLAST URL-API
  #     (or pick up an earlier search via its request ID), wait for
  #     completion, retrieve the text report and parse the hits.
  #
  # Parameters:
  #     Q: query - either a valid ID or a sequence
  #     db: "refseq_protein" by default,
  #         other legal values include: "nr", "pdb", "swissprot" ...
  #     nHits: number of hits to maximally return
  #     E: E-value cutoff. Do not return hits whose score would be expected
  #        to occur E or more times in a database of random sequence.
  #     limits: a valid ENTREZ filter
  #     rid: a request ID - to retrieve earlier search results
  #     query: the actual query string (needed when retrieving results
  #            with an rid)
  #     quietly: controls printing of wait-time progress bar
  #     myTimeout: how much longer _after_ rtoe to wait for a result
  #                before giving up (seconds)
  # Value:
  #     result: list of process status or resulting hits, and some metadata:
  #               $query  the query URL (or the value of "query")
  #               $rid    the request ID
  #               $rtoe   the expected completion time reported by the server
  #               $hits   list of parsed hits (only if hits were retrieved)

  EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
  BLASTURL <- "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"

  fetchText <- function(url, action) {
    # GET url and return the response body as text; stop if the HTTP
    # status is not in the "Success" category.
    response <- httr::GET(url)
    if (httr::http_status(response)$category != "Success") {
      stop(sprintf("PANIC: Can't %s. BLAST server status error: %s",
                   action,
                   httr::http_status(response)$message))
    }
    httr::content(response, "text", encoding = "UTF-8")
  }

  results <- list()
  results$query <- query
  results$rid <- rid
  results$rtoe <- 0

  if (rid == "") {  # If no rid is available, spawn a search.
                    # Else, proceed directly to retrieval.

    # prepare query, GET(), and parse rid and rtoe from BLAST server response
    results$query <- paste0(BLASTURL,
                            "?",
                            "CMD=Put",
                            "&PROGRAM=", "blastp",
                            "&QUERY=", URLencode(Q),
                            "&DATABASE=", db,
                            "&MATRIX=", "BLOSUM62",
                            "&EXPECT=", as.character(E),
                            "&HITLIST_SIZE=", as.character(nHits),
                            "&ALIGNMENTS=", as.character(nHits),
                            "&FORMAT_TYPE=Text")

    if (limits != "") {
      results$query <- paste0(results$query,
                              "&ENTREZ_QUERY=", limits)
    }

    # send it off ...
    txt <- fetchText(results$query, "send query")

    patt <- "RID = (\\w+)" # match the request id
    results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]

    patt <- "RTOE = (\\d+)" # match the expected completion time
    results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])

    # Without a request ID there is nothing to poll for or retrieve.
    if (is.na(results$rid)) {
      stop("PANIC: BLAST server response did not contain a request ID (RID).")
    }
    # Without an expected completion time, start polling right away.
    if (is.na(results$rtoe)) {
      results$rtoe <- 0
    }

    # Now we wait ...
    if (quietly) {
      Sys.sleep(results$rtoe)
    } else {
      cat(sprintf("BLAST is processing %s:\n", results$rid))
      waitTimer(results$rtoe)
    }

  } # done sending query and retrieving rid, rtoe

  # Poll the server until the result is available, the search has failed,
  # or the timeout is used up.
  checkStatus <- paste0(BLASTURL,
                        "?",
                        "CMD=Get",
                        "&RID=", results$rid,
                        "&FORMAT_TYPE=Text",
                        "&FORMAT_OBJECT=SearchInfo")

  while (TRUE) {
    # Check whether the result is ready
    txt <- fetchText(checkStatus, "check status")

    if (grepl("Status=WAITING", txt, fixed = TRUE)) {
      myTimeout <- myTimeout - EXTRAWAIT

      if (myTimeout <= 0) { # abort
        cat("BLAST search not concluded before timeout. Aborting.\n")
        cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
                    "Trying checking back later with >",
                    results$rid))
        return(results)
      }

      if (quietly) {
        Sys.sleep(EXTRAWAIT)
      } else {
        cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
                    EXTRAWAIT,
                    myTimeout))
        waitTimer(EXTRAWAIT)
      }

    } else if (grepl("Status=FAILED", txt, fixed = TRUE)) {
      cat("BLAST search returned status \"FAILED\". Aborting.\n")
      return(results)

    } else if (grepl("Status=UNKNOWN", txt, fixed = TRUE)) {
      cat("BLAST search returned status \"UNKNOWN\".\n")
      cat("This probably means the rid has expired. Aborting.\n")
      return(results)

    } else if (grepl("Status=READY", txt, fixed = TRUE)) {  # Done

      if (! grepl("ThereAreHits=yes", txt, fixed = TRUE)) {  # No hits
        cat("BLAST search ready but no hits found. Aborting.\n")
        return(results)
      }
      break  # done ... retrieve search result

    } else {
      # None of the known status values was found. Abort rather than
      # re-querying the server in a tight loop without any delay.
      cat("BLAST server returned an unrecognized status. Aborting.\n")
      return(results)
    }
  } # end result-check loop

  # retrieve results from BLAST server
  retrieve <- paste0(BLASTURL,
                     "?",
                     "&CMD=Get",
                     "&RID=", results$rid,
                     "&FORMAT_TYPE=Text")

  txt <- fetchText(retrieve, "retrieve")

  # txt contains the whole set of results. Process:

  # First, we strsplit() on linebreaks:
  txt <- unlist(strsplit(txt, "\n"))

  # The alignments range from the first line that begins with ">" ...
  # ... to the last line that begins with "Sbjct"
  iDef <- grep("^>", txt)
  iSbj <- grep("^Sbjct", txt)
  if (length(iDef) == 0 || length(iSbj) == 0) {
    cat("BLAST result contains no alignments that can be parsed. Aborting.\n")
    return(results)
  }

  # Get the alignments block
  txt <- txt[iDef[1]:iSbj[length(iSbj)]]

  # Drop empty lines
  txt <- txt[nchar(txt) != 0]

  # A line that ends "]" but does not begin ">" seems to be a split
  # defline ... eg.
  #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
  #  [2] "EXF-2481]"
  #  Merge these lines to the preceding lines and delete them.
  #
  x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
  if (length(x) > 0) {
    txt[x - 1] <- paste0(txt[x - 1], txt[x])
    txt <- txt[-x]
  }

  # Special case: there may be multiple deflines when the BLAST hit is to
  # redundant, identical sequences. Keep only the first instance, i.e. drop
  # every defline that directly follows another defline. Deflines that stand
  # alone are always kept.
  isDef <- grepl("^>", txt)
  isRepeatedDef <- isDef & c(FALSE, isDef[-length(isDef)])
  txt <- txt[! isRepeatedDef]

  # After this preprocessing the following should be true:
  # - Every alignment block begins with a defline in which the
  #   first character is ">"
  # - There is only one defline in each block.
  # - Lines are not split.

  # Make a dataframe of first and last indices of alignment blocks
  x <- grep("^>", txt)
  blocks <- data.frame(iFirst = x,
                       iLast  = c((x[-1] - 1), length(txt)))

  # Build the hits list by parsing the blocks
  results$hits <- lapply(seq_len(nrow(blocks)), function(i) {
    parseBLASTalignment(txt[blocks$iFirst[i]:blocks$iLast[i]])
  })

  return(results)
}
 | 
			
		||||
 | 
			
		||||
parseBLASTalignment <- function(hit) {
  # Parse data from a character vector containing a BLAST hit
  # Parameters:
  #    hit  char   one BLAST hit as char vector: the defline (beginning with
  #                ">") as first element, followed by the lines of the
  #                alignment report, without empty lines.
  # Value:
  #          list   $def          chr   defline
  #                 $accession    chr   accession number
  #                 $organism     chr   complete organism definition,
  #                                       NA if the defline has no [...] part
  #                 $species      chr   binomial species, NA if there is
  #                                       no organism
  #                 $E            num   E value
  #                 $lengthAli    num   value of the "Length=" line
  #                 $nIdentities  num   number of identities
  #                 $nGaps        num   number of gaps
  #                 $Qbounds      num   2-element vector of query start-end
  #                 $Sbounds      num   2-element vector of subject start-end
  #                 $Qseq         chr   query sequence
  #                 $midSeq       chr   midline string
  #                 $Sseq         chr   subject sequence
  #
  # NOTE(review): all "Query" lines of the hit are concatenated; if a hit
  # contains more than one alignment (HSP) they are not separated - confirm
  # whether that is intended.

  getToken <- function(patt, v) {
    # get the first token identified by pattern patt in character vector v:
    # the first capture group of patt in the first element of v that matches.
    # NA if no element matches.
    v <- v[grep(patt, v)]
    if (length(v) == 0) {
      return(NA)
    }
    regmatches(v[1], regexec(patt, v[1]))[[1]][2]
  }

  h <- list()

  # FASTA defline
  h$def <- hit[1]

  # accession number (ID), use the first if there are several, separated by "|"
  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
  h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # organism: the text between square brackets, NA if there are none
  patt <- "\\[(.+)]"
  h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # species: first two words of the organism. A missing organism must give
  # NA; without this test strsplit(NA) would lead to the species "NA sp."
  if (is.na(h$organism)) {
    h$species <- NA
  } else {
    x <- unlist(strsplit(h$organism, "\\s+"))
    if (length(x) >= 2) {
      h$species <- paste(x[1], x[2])
    } else if (length(x) == 1) {
      h$species <- paste(x[1], "sp.")
    } else {
      h$species <- NA
    }
  }

  # E-value
  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))

  # length of alignment
  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))

  # number of identities
  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))

  # number of gaps
  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))

  # split up alignment section: each "Query" line is expected to be followed
  # by its midline and its "Sbjct" line
  idx <- grep("^Query ", hit)
  Que <- hit[idx]
  Mid <- hit[idx + 1]
  Sbj <- hit[idx + 2]

  # first and last positions: the number at the start of the first line and
  # the number at the end of the last line
  h$Qbounds <- c(start = 0, end = 0)
  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))

  h$Sbounds <- c(start = 0, end = 0)
  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))

  # aligned sequences: the columns that hold the aligned string are located
  # in the "Query" line and the same columns are cut from all three lines
  patt <- "^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)" # capture aligned string
  for (i in seq_along(Que)) {
    m <- regexec(patt, Que[i])
    iFirst <- m[[1]][2]
    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
    Que[i] <- substring(Que[i], iFirst, iLast)
    Mid[i] <- substring(Mid[i], iFirst, iLast)
    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
  }

  h$Qseq   <- paste0(Que, collapse = "")
  h$midSeq <- paste0(Mid, collapse = "")
  h$Sseq   <- paste0(Sbj, collapse = "")

  return(h)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==== TESTS ===================================================================
 | 
			
		||||
 | 
			
		||||
if (FALSE) {
  # Interactive usage example; never executed when the file is sourced.

  # Option 1: query with a raw sequence (Mbp1 APSES domain) ...
  q <- paste0("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI",
              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP")

  # Option 2: ... or query with a RefSeq ID instead.
  q <- "NP_010227"

  # Run a new search (empty rid), restricted to Fungi (taxonomy ID 4751).
  test <- BLAST(q,
                limits = "txid4751[ORGN]",
                nHits  = 100,
                E      = 0.001,
                rid    = "")

  # Inspect the result structure and count the parsed hits.
  str(test)
  length(test$hits)
}
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 | 
			
		||||
# BLAST.R
 | 
			
		||||
#
 | 
			
		||||
# Purpose: Send off one BLAST search and return parsed list of results
 | 
			
		||||
#          This script uses the BLAST URL-API
 | 
			
		||||
#          (Application Programming Interface) at the NCBI.
 | 
			
		||||
#          Read about the constraints here:
 | 
			
		||||
#          https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
 | 
			
		||||
#
 | 
			
		||||
#
 | 
			
		||||
# Version: 3.2
 | 
			
		||||
# Date:    2016 09 - 2020 09
 | 
			
		||||
# Author:  Boris Steipe
 | 
			
		||||
#
 | 
			
		||||
# Versions:
 | 
			
		||||
#    3.2   2020 updates
 | 
			
		||||
#    3.1   Change from require() to requireNamespace(),
 | 
			
		||||
#          use <package>::<function>() idiom throughout
 | 
			
		||||
#    3.0   parsing logic had not been fully implemented; Fixed.
 | 
			
		||||
#    2.1   bugfix in BLAST(), bug was blanking non-split deflines;
 | 
			
		||||
#          refactored parseBLASTalignment() to handle lists with multiple hits.
 | 
			
		||||
#    2.0   Completely rewritten because the interface completely changed.
 | 
			
		||||
#          Code adapted in part from NCBI Perl sample code:
 | 
			
		||||
#          $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
 | 
			
		||||
#    1.0   first version posted for BCH441 2016, based on BLAST - API
 | 
			
		||||
#
 | 
			
		||||
# ToDo:    Return the organism/strain name in the output, and propagate
 | 
			
		||||
#          into MYSPE selection script.
 | 
			
		||||
#
 | 
			
		||||
# Notes:   This is somewhat pedestrian, but apparently there are currently
 | 
			
		||||
#          no R packages that contain such code.
 | 
			
		||||
#
 | 
			
		||||
# ==============================================================================
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if (! requireNamespace("httr", quietly = TRUE)) {
 | 
			
		||||
  install.packages("httr")
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
BLAST <- function(Q,
                  db = "refseq_protein",
                  nHits = 30,
                  E = 0.1,
                  limits = "",
                  rid = "",
                  query = "",
                  quietly = FALSE,
                  myTimeout = 120) {
    # Purpose:
    #     Basic BLAST (blastp) search via the NCBI BLAST URL-API: submit a
    #     query (or re-use an existing request ID), poll until the search has
    #     finished, retrieve the text report and parse it into a list of hits.
    #
    # Parameters:
    #     Q: query - either a valid ID or a sequence. Only evaluated when a
    #        new search is submitted (rid == "").
    #     db: "refseq_protein" by default,
    #         other legal values include: "nr", "pdb", "swissprot" ...
    #     nHits: number of hits to maximally return
    #     E: E-value cutoff. Do not return hits whose score would be expected
    #        to occur E or more times in a database of random sequence.
    #     limits: a valid ENTREZ filter
    #     rid: a request ID - to retrieve earlier search results
    #     query: the actual query string (needed when retrieving results
    #            with an rid)
    #     quietly: if TRUE, wait silently with Sys.sleep(); otherwise print
    #              progress and wait via waitTimer()
    #     myTimeout: how much longer _after_ rtoe to wait for a result
    #                before giving up (seconds)
    # Value:
    #     result: list with elements $query, $rid, $rtoe and - if the search
    #             completed and alignments could be retrieved - $hits, a list
    #             with one parseBLASTalignment() result per hit. If the search
    #             is aborted (timeout, FAILED, UNKNOWN, no hits), the list is
    #             returned without $hits.

    EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
    BLASTURL  <- "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"

    getText <- function(url, what) {
      # GET url and return the response body as text; stop on HTTP failure.
      response <- httr::GET(url)
      if (httr::http_status(response)$category != "Success") {
        stop(sprintf("PANIC: Can't %s. BLAST server status error: %s",
                     what,
                     httr::http_status(response)$message))
      }
      return(httr::content(response, "text", encoding = "UTF-8"))
    }

    pause <- function(sec) {
      # Wait sec seconds: silently, or with the progress display.
      if (quietly) {
        Sys.sleep(sec)
      } else {
        waitTimer(sec)
      }
    }

    results <- list()
    results$query <- query
    results$rid <- rid
    results$rtoe <- 0

    if (rid == "") {  # If no rid is available, spawn a search.
                      # Else, proceed directly to retrieval.

      # prepare query, GET(), and parse rid and rtoe from BLAST server response
      results$query <- paste0(BLASTURL,
                              "?",
                              "CMD=Put",
                              "&PROGRAM=", "blastp",
                              "&QUERY=", URLencode(Q),
                              "&DATABASE=", db,
                              "&MATRIX=", "BLOSUM62",
                              "&EXPECT=", as.character(E),
                              "&HITLIST_SIZE=", as.character(nHits),
                              "&ALIGNMENTS=", as.character(nHits),
                              "&FORMAT_TYPE=Text")

      if (limits != "") {
        results$query <- paste0(results$query,
                                "&ENTREZ_QUERY=", limits)
      }

      # send it off ...
      txt <- getText(results$query, "send query")

      patt <- "RID = (\\w+)" # match the request id
      results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2]

      patt <- "RTOE = (\\d+)" # match the expected completion time
      results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])

      # Without a request ID there is nothing to poll for: fail with a clear
      # message instead of passing NA on to Sys.sleep() and the status URL.
      if (is.na(results$rid)) {
        stop("PANIC: BLAST server response did not contain a request ID (RID).")
      }
      if (is.na(results$rtoe)) {
        results$rtoe <- 0  # no estimate available: start polling right away
      }

      # Now we wait ...
      if (! quietly) {
        cat(sprintf("BLAST is processing %s:\n", results$rid))
      }
      pause(results$rtoe)

    } # done sending query and retrieving rid, rtoe

    # Poll for result availability until the search is done, has failed,
    # or the timeout is used up.
    checkStatus <- paste0(BLASTURL,
                          "?",
                          "CMD=Get",
                          "&RID=", results$rid,
                          "&FORMAT_TYPE=Text",
                          "&FORMAT_OBJECT=SearchInfo")

    while (TRUE) {
      # Check whether the result is ready
      txt <- getText(checkStatus, "check status")

      if (any(grepl("Status=FAILED", txt))) {
        cat("BLAST search returned status \"FAILED\". Aborting.\n")
        return(results)

      } else if (any(grepl("Status=UNKNOWN", txt))) {
        cat("BLAST search returned status \"UNKNOWN\".\n")
        cat("This probably means the rid has expired. Aborting.\n")
        return(results)

      } else if (any(grepl("Status=READY", txt))) {  # Done
        if (! any(grepl("ThereAreHits=yes", txt))) {  # No hits
          cat("BLAST search ready but no hits found. Aborting.\n")
          return(results)
        }
        break  # done ... retrieve search result

      } else {
        # Status=WAITING, or a page without a recognizable status. Both are
        # charged against the timeout and followed by a pause, so the loop
        # can neither run forever nor hit the server without delay.
        isWaiting <- any(grepl("Status=WAITING", txt))
        myTimeout <- myTimeout - EXTRAWAIT

        if (myTimeout <= 0) { # abort
          cat("BLAST search not concluded before timeout. Aborting.\n")
          cat(sprintf("%s  BLASThits <- BLAST(rid=\"%s\")\n",
                      "Trying checking back later with >",
                      results$rid))
          return(results)
        }

        if (! quietly) {
          if (isWaiting) {
            cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
                        EXTRAWAIT,
                        myTimeout))
          } else {
            cat(sprintf("Status: not recognized. Wait %d more seconds (max. %d more)",
                        EXTRAWAIT,
                        myTimeout))
          }
        }
        pause(EXTRAWAIT)
      }
    } # end result-check loop

    # retrieve results from BLAST server
    retrieve <- paste0(BLASTURL,
                       "?",
                       "CMD=Get",
                       "&RID=", results$rid,
                       "&FORMAT_TYPE=Text")

    txt <- getText(retrieve, "retrieve")

    # txt contains the whole set of results. Process:

    # First, we strsplit() on linebreaks:
    txt <- unlist(strsplit(txt, "\n"))

    # The alignments range from the first line that begins with ">" ...
    iFirst <- grep("^>", txt)[1]

    # ... to the last line that begins with "Sbjct"
    x <- grep("^Sbjct", txt)

    if (is.na(iFirst) || length(x) == 0) {
      # The report has no alignment section that could be parsed.
      cat("BLAST result contains no parsable alignments. Aborting.\n")
      return(results)
    }
    iLast <- x[length(x)]

    # Get the alignments block
    txt <- txt[iFirst:iLast]

    # Drop empty lines
    txt <- txt[nchar(txt) > 0]

    # A line that ends "]" but does not begin ">" seems to be a split
    # defline ... eg.
    #  [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
    #  [2] "EXF-2481]"
    #  Merge these lines to the preceding lines and delete them.
    #
    x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
    if (length(x) > 0) {
      txt[x-1] <- paste0(txt[x-1], txt[x])
      txt <- txt[-x]
    }

    # Special case: there may be multiple deflines when the BLAST hit is to
    # redundant, identical sequences. Keep only the first defline of every
    # run of consecutive deflines. Runs of length one must be kept as well,
    # otherwise hits with a single defline lose their header and are merged
    # into the preceding alignment block.
    isDef     <- grepl("^>", txt)
    runs      <- rle(isDef)
    runEnds   <- cumsum(runs$lengths)
    runStarts <- runEnds - runs$lengths + 1
    iKeep     <- ! isDef
    iKeep[runStarts[runs$values]] <- TRUE
    txt <- txt[iKeep]

    # After this preprocessing the following should be true:
    # - Every alignment block begins with a defline in which the
    #   first character is ">"
    # - There is only one defline in each block.
    # - Lines are not split.

    # First and last indices of the alignment blocks
    blockFirst <- grep("^>", txt)
    blockLast  <- c((blockFirst[-1] - 1), length(txt))

    # Build the hits list by parsing the blocks
    results$hits <- list()

    for (i in seq_along(blockFirst)) {
      thisBlock <- txt[blockFirst[i]:blockLast[i]]
      results$hits[[i]] <- parseBLASTalignment(thisBlock)
    }

    return(results)
}
 | 
			
		||||
 | 
			
		||||
parseBLASTalignment <- function(hit) {
  # Parse data from a character vector containing a BLAST hit
  # Parameters:
  #    hit  char   one BLAST hit as char vector: the defline (beginning
  #                with ">") first, followed by the lines of the text
  #                report for this hit, without empty lines.
  # Value:
  #          list   $def          chr   defline
  #                 $accession    chr   accession number
  #                 $organism     chr   complete organism definition
  #                                     (NA if the defline has no "[...]")
  #                 $species      chr   binomial species (NA if the
  #                                     organism is not available)
  #                 $E            num   E value
  #                 $lengthAli    num   length of the alignment
  #                 $nIdentities  num   number of identities
  #                 $nGaps        num   number of gaps
  #                 $Qbounds      num   2-element vector of query start-end
  #                 $Sbounds      num   2-element vector of subject start-end
  #                 $Qseq         chr   query sequence
  #                 $midSeq       chr   midline string
  #                 $Sseq         chr   subject sequence

  getToken <- function(patt, v) {
    # get the first token identified by pattern patt in character vector v;
    # the token is the first capture group of patt. NA if nothing matches.
    v <- v[grep(patt, v)]
    if (length(v) == 0) {
      return(NA)
    }
    return(regmatches(v[1], regexec(patt, v[1]))[[1]][2])
  }

  h <- list()

  # FASTA defline
  h$def <- hit[1]

  # accession number (ID), use the first if there are several, separated by "|"
  patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
  h$accession <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # organism: text between square brackets; NA if there is none
  patt <- "\\[(.+)]"
  h$organism <-  regmatches(h$def, regexec(patt, h$def))[[1]][2]

  # species: the first two words of the organism. The NA case needs to be
  # handled explicitly: strsplit() of NA returns a single NA, which would
  # otherwise be pasted into the string "NA sp.".
  if (is.na(h$organism)) {
    x <- character(0)
  } else {
    x <- unlist(strsplit(h$organism, "\\s+"))
  }
  if (length(x) >= 2) {
    h$species <- paste(x[1], x[2])
  } else if (length(x) == 1) {
    h$species <- paste(x[1], "sp.")
  } else {
    h$species <- NA_character_
  }

  # E-value
  h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))

  # length of alignment
  h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))

  # number of identities
  h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))

  # number of gaps
  h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))

  # split up alignment section: every "Query" line is followed by its
  # midline and its "Sbjct" line
  idx <- grep("^Query ", hit)
  Que <- hit[idx]
  Mid <- hit[idx + 1]
  Sbj <- hit[idx + 2]

  # first and last positions
  h$Qbounds <- c(start = 0, end = 0)
  h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
  h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))

  h$Sbounds <- c(start = 0, end = 0)
  h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
  h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))

  # aligned sequences: the columns occupied by the aligned string in the
  # "Query" line are the same columns in the midline and the "Sbjct" line.
  patt <- "^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)" # capture aligned string
  for (i in seq_along(Que)) {
    m <- regexec(patt, Que[i])
    iFirst <- m[[1]][2]
    iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
    Que[i] <- substring(Que[i], iFirst, iLast)
    Mid[i] <- substring(Mid[i], iFirst, iLast)
    Sbj[i] <- substring(Sbj[i], iFirst, iLast)
  }

  h$Qseq   <- paste0(Que, collapse = "")
  h$midSeq <- paste0(Mid, collapse = "")
  h$Sseq   <- paste0(Sbj, collapse = "")

  return(h)
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ==== TESTS ===================================================================
 | 
			
		||||
 | 
			
		||||
if (FALSE) {
  # Interactive usage example; never executed when the file is sourced.

  # Option 1: query with a raw sequence (Mbp1 APSES domain) ...
  q <- paste0("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI",
              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP")

  # Option 2: ... or query with a RefSeq ID instead.
  q <- "NP_010227"

  # Run a new search (empty rid), restricted to Fungi (taxonomy ID 4751).
  test <- BLAST(q,
                limits = "txid4751[ORGN]",
                nHits  = 100,
                E      = 0.001,
                rid    = "")

  # Inspect the result structure and count the parsed hits.
  str(test)
  length(test$hits)
}
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,32 +1,32 @@
 | 
			
		||||
# test_biCode.R
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# Test suite for the biCode() utility function.
context("biCode() utility function tests")

# Well-formed binomial names, with or without brackets and extra words.
test_that("expected input is processed correctly", {
  marsupials <- c("Phascolarctos cinereus", "Macropus rufus")

  expect_equal(biCode("homo sapiens"), "HOMSA")
  expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
  expect_equal(biCode(marsupials), c("PHACI", "MACRU"))
})

# Empty, blank, numeric or too-short input is padded with "." characters.
test_that("unexpected input is managed", {
  allDots <- "....."

  expect_equal(biCode(""), allDots)
  expect_equal(biCode(" "), allDots)
  expect_equal(biCode("123 12"), allDots)
  expect_equal(biCode("h sapiens"), "H..SA")
})

# NA stays NA, also as an element of a longer vector.
test_that("NA values are preserved", {
  mixed <- c("first", NA, "last")

  expect_true(is.na(biCode(NA)))
  expect_equal(biCode(mixed), c("FIRST", NA, "LAST."))
})

# Calling without an argument must fail with R's standard message.
test_that("Missing argument throws an error", {
  expect_error(biCode(), "argument \"s\" is missing, with no default")
})
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
# test_biCode.R
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
# Test suite for the biCode() utility function.
context("biCode() utility function tests")

# Well-formed binomial names, with or without brackets and extra words.
test_that("expected input is processed correctly", {
  marsupials <- c("Phascolarctos cinereus", "Macropus rufus")

  expect_equal(biCode("homo sapiens"), "HOMSA")
  expect_equal(biCode("[homo sapiens neanderthaliensis]"), "HOMSA")
  expect_equal(biCode(marsupials), c("PHACI", "MACRU"))
})

# Empty, blank, numeric or too-short input is padded with "." characters.
test_that("unexpected input is managed", {
  allDots <- "....."

  expect_equal(biCode(""), allDots)
  expect_equal(biCode(" "), allDots)
  expect_equal(biCode("123 12"), allDots)
  expect_equal(biCode("h sapiens"), "H..SA")
})

# NA stays NA, also as an element of a longer vector.
test_that("NA values are preserved", {
  mixed <- c("first", NA, "last")

  expect_true(is.na(biCode(NA)))
  expect_equal(biCode(mixed), c("FIRST", NA, "LAST."))
})

# Calling without an argument must fail with R's standard message.
test_that("Missing argument throws an error", {
  expect_error(biCode(), "argument \"s\" is missing, with no default")
})
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# [END]
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user