2020 updates - deactivate for maintenance
This commit is contained in:
parent
89bdd14d1c
commit
37ef655d47
42
.init.R
42
.init.R
@ -1,42 +0,0 @@
|
|||||||
# .init.R
|
|
||||||
# Functions to initialize this collection of learning units
|
|
||||||
# Boris Steipe
|
|
||||||
# ====================================================================
|
|
||||||
|
|
||||||
# Create a local copy of myScript.R if required, and not been done yet.
|
|
||||||
if (! file.exists("myScript.R") && file.exists(".tmp.R")) {
|
|
||||||
file.copy(".tmp.R", "myScript.R")
|
|
||||||
}
|
|
||||||
|
|
||||||
# If it doesn't exist yet, set up a profile:
|
|
||||||
if (! file.exists(".myProfile.R")) {
|
|
||||||
# setup profile data
|
|
||||||
cat("\nPlease enter the requested values correctly, no spaces, and\n")
|
|
||||||
cat("press <enter>.\n")
|
|
||||||
e <- readline("Please enter your UofT eMail address: ")
|
|
||||||
n <- readline("Please enter your Student Number: ")
|
|
||||||
|
|
||||||
conn <- file(".myProfile.R")
|
|
||||||
writeLines(c(sprintf("myEMail <- \"%s\"", e),
|
|
||||||
sprintf("myStudentNumber <- %d", as.numeric(n))),
|
|
||||||
conn)
|
|
||||||
close(conn)
|
|
||||||
rm(e, n, conn)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Patch YFO -> MYSPE if necessary:
|
|
||||||
tmp <- readLines(".myProfile.R")
|
|
||||||
if (length(grep("^YFO", tmp)) > 0) {
|
|
||||||
idx <- grep("^YFO", tmp)
|
|
||||||
tmp[idx] <- gsub("^YFO", "MYSPE", tmp[idx])
|
|
||||||
writeLines(tmp, ".myProfile.R")
|
|
||||||
}
|
|
||||||
rm(tmp)
|
|
||||||
|
|
||||||
source(".myProfile.R")
|
|
||||||
|
|
||||||
source(".utilities.R")
|
|
||||||
|
|
||||||
file.edit("ABC-units.R")
|
|
||||||
|
|
||||||
# [End]
|
|
20
.utilities.R
20
.utilities.R
@ -181,19 +181,29 @@ fetchMSAmotif <- function(ali, mot) {
|
|||||||
|
|
||||||
# ====== PDB ID selection ======================================================
|
# ====== PDB ID selection ======================================================
|
||||||
|
|
||||||
selectPDBrep <- function(n) {
|
selectPDBrep <- function(n, seed = as.numeric(Sys.time())) {
|
||||||
# Select n PDB IDs from a list of high-resolution, non-homologous, single
|
# Select n PDB IDs from a list of high-resolution, non-homologous, single
|
||||||
# domain, single chain structure files that represent a CATH topology
|
# domain, single chain structure files that represent a CATH topology
|
||||||
# group.
|
# group.
|
||||||
# Parameters n num number of IDs to return.
|
# Parameters:
|
||||||
|
# n num number of IDs to return
|
||||||
|
# seed num a seed for the RNG
|
||||||
|
#
|
||||||
# Value: char PDB IDs
|
# Value: char PDB IDs
|
||||||
# Note: the list is loaded from an RData file in the data directory
|
#
|
||||||
|
# Note: the list is loaded from an RData file in the "./data" directory.
|
||||||
|
# If you use this function for a course submission, it MUST be invoked as:
|
||||||
|
#
|
||||||
|
# selectPDBrep(n, seed = myStudentNumber)
|
||||||
|
#
|
||||||
|
# ... and myStudentNumber MUST be correctly initialized
|
||||||
|
|
||||||
load("./data/pdbRep.RData") # loads pdbRep
|
load("./data/pdbRep.RData") # loads pdbRep
|
||||||
if (n > length(pdbRep)) {
|
if (n > length(pdbRep)) {
|
||||||
stop(sprintf("You can select no more than %d IDs.", length(pdbRep)))
|
stop(sprintf("There are only %d PDB IDs in the table to choose from.",
|
||||||
|
length(pdbRep)))
|
||||||
}
|
}
|
||||||
set.seed(as.numeric(Sys.time()))
|
set.seed(seed)
|
||||||
return(sample(pdbRep, n))
|
return(sample(pdbRep, n))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
42
ABC-units.R
42
ABC-units.R
@ -2,11 +2,16 @@
|
|||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course: R code for learning units
|
# Purpose: A Bioinformatics Course: R code for learning units
|
||||||
#
|
#
|
||||||
# Version: 0.1
|
# Version: 4.0
|
||||||
#
|
#
|
||||||
# Date: 2017 08 18
|
# Date: 2020 09 16
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
|
# Versions:
|
||||||
|
# V 4.0 2020 version
|
||||||
|
# V 3.0 2019 version
|
||||||
|
# V 2.0 2018 version
|
||||||
|
# V 1.0 2017 version
|
||||||
# V 0.1 First code
|
# V 0.1 First code
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
@ -14,23 +19,19 @@
|
|||||||
#
|
#
|
||||||
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
# == HOW TO WORK WITH LEARNING UNIT FILES ======================================
|
||||||
#
|
#
|
||||||
# Expect that the learning unit files will be continuously updated.
|
# The R-scripts and datasets in this project will be continuously updated,
|
||||||
#
|
# and updates will be posted on GitHub. To bring your version into the latest
|
||||||
|
# state use the Git-pane (top left) and "pull" (blue downward arrow) from the
|
||||||
|
# repository. However, this will overwrite locally edited versions of files.
|
||||||
|
|
||||||
# If you wish to edit any of the code, for example to add your own comments and
|
# To edit code and experiment with it, for example to add your own comments and
|
||||||
# examples, save any edited version under a different name. Otherwise you will
|
# examples, save your edited version into the "myScripts" folder. Otherwise you
|
||||||
# have problems with git when you update the project to a new version.
|
# may have problems with git when you update the project to a new version. It's
|
||||||
|
# good practice to change the filename, for example by prepending your initials.
|
||||||
|
# This helps distinguish the files you are working with e.g. in a list of
|
||||||
|
# recent files. For example if your name is Honjo Tasuku, your edited
|
||||||
|
# BIN-Sequence.R might be named HT-BIN-Sequence.R
|
||||||
|
|
||||||
# DO NOT SIMPLY source() THESE FILES!
|
|
||||||
|
|
||||||
# If there are portions you don't understand, use R's help system, Google for an
|
|
||||||
# answer, or ask your instructor. Don't continue if you don't understand what's
|
|
||||||
# going on. That's not how it works ...
|
|
||||||
#
|
|
||||||
# While this file itself should not be edited by you this is YOUR project
|
|
||||||
# directory, and files that you create (notes etc.) will not be harmed when you
|
|
||||||
# pull updated version of the master, or other new files, from github.
|
|
||||||
#
|
|
||||||
# If you pull from github and get the following type of error ...
|
# If you pull from github and get the following type of error ...
|
||||||
# ---------------
|
# ---------------
|
||||||
# error: Your local changes to the following files would be
|
# error: Your local changes to the following files would be
|
||||||
@ -41,8 +42,11 @@
|
|||||||
# ... then, you need to bring the offending file into its original state.
|
# ... then, you need to bring the offending file into its original state.
|
||||||
# Open the Commit window, select the file, and click on the Revert button.
|
# Open the Commit window, select the file, and click on the Revert button.
|
||||||
#
|
#
|
||||||
# Of course, you can save a local copy under a different name before you revert,
|
# When working with these scripts, DO NOT SIMPLY source() THESE FILES!
|
||||||
# in case you want to keep your changes.
|
|
||||||
|
# If there are portions you don't understand, use R's help system, Google for an
|
||||||
|
# answer, or ask your instructor. Don't continue if you don't understand what's
|
||||||
|
# going on. That's not how it works ...
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-ALI-BLAST.R
|
# tocID <- "BIN-ALI-BLAST.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-BLAST unit.
|
# R code accompanying the BIN-ALI-BLAST unit.
|
||||||
@ -29,13 +35,13 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------
|
#TOC> ---------------------------------------------------
|
||||||
#TOC> 1 Defining the APSES domain 42
|
#TOC> 1 Defining the APSES domain 42
|
||||||
#TOC> 2 Executing the BLAST search 64
|
#TOC> 2 Executing the BLAST search 64
|
||||||
#TOC> 3 Analysing results 86
|
#TOC> 3 Analysing results 86
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-ALI-Dotplot.R
|
# tocID <- "BIN-ALI-Dotplot.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-Dotplot unit.
|
# R code accompanying the BIN-ALI-Dotplot unit.
|
||||||
@ -27,12 +33,12 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------
|
#TOC> --------------------------------------
|
||||||
#TOC> 1 ___Section___ 39
|
#TOC> 1 ___Section___ 39
|
||||||
#TOC> 2 Tasks 187
|
#TOC> 2 Tasks 187
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-ALI-MSA.R
|
# tocID <- "BIN-ALI-MSA.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-MSA unit.
|
# R code accompanying the BIN-ALI-MSA unit.
|
||||||
@ -29,7 +35,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ------------------------------------------------------------------
|
#TOC> ------------------------------------------------------------------
|
||||||
#TOC> 1 Preparations 54
|
#TOC> 1 Preparations 54
|
||||||
@ -47,7 +53,7 @@
|
|||||||
#TOC> 6 Sequence Logos 546
|
#TOC> 6 Sequence Logos 546
|
||||||
#TOC> 6.1 Subsetting an alignment by motif 555
|
#TOC> 6.1 Subsetting an alignment by motif 555
|
||||||
#TOC> 6.2 Plot a Sequence Logo 604
|
#TOC> 6.2 Plot a Sequence Logo 604
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -239,7 +245,7 @@ for (i in seq_along(highScoringRanges$lengths)) {
|
|||||||
# - adjust the sequence names
|
# - adjust the sequence names
|
||||||
# - convert to msaAAMultipleAlignment object
|
# - convert to msaAAMultipleAlignment object
|
||||||
|
|
||||||
# === 4.1.1 importing an .aln file
|
# === 4.1.1 importing an .aln file
|
||||||
|
|
||||||
# The seqinr package has a function to read CLUSTAL W formatted .aln files ...
|
# The seqinr package has a function to read CLUSTAL W formatted .aln files ...
|
||||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-ALI-Optimal_sequence_alignment.R
|
# tocID <- "BIN-ALI-Optimal_sequence_alignment.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
|
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-ALI-Similarity.R
|
# tocID <- "BIN-ALI-Similarity.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-Similarity unit.
|
# R code accompanying the BIN-ALI-Similarity unit.
|
||||||
@ -28,13 +34,13 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------------
|
#TOC> ----------------------------------------------
|
||||||
#TOC> 1 Amino Acid Properties 41
|
#TOC> 1 Amino Acid Properties 41
|
||||||
#TOC> 2 Mutation Data matrix 158
|
#TOC> 2 Mutation Data matrix 158
|
||||||
#TOC> 3 Background score 199
|
#TOC> 3 Background score 199
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-Data_integration.R
|
# tocID <- "BIN-Data_integration.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-Data_integration unit.
|
# R code accompanying the BIN-Data_integration unit.
|
||||||
@ -30,12 +36,12 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -------------------------------------------------
|
#TOC> -------------------------------------------------
|
||||||
#TOC> 1 Identifier mapping 42
|
#TOC> 1 Identifier mapping 42
|
||||||
#TOC> 2 Cross-referencing tables 165
|
#TOC> 2 Cross-referencing tables 165
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-FUNC-Domain_annotation.R
|
# tocID <- "BIN-FUNC-Domain_annotation.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-FUNC-Domain_annotation unit.
|
# R code accompanying the BIN-FUNC-Domain_annotation unit.
|
||||||
@ -25,7 +31,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------------------------------------------
|
#TOC> -----------------------------------------------------------------------------------
|
||||||
#TOC> 1 Update your database script 41
|
#TOC> 1 Update your database script 41
|
||||||
@ -34,7 +40,7 @@
|
|||||||
#TOC> 1.1.2 If you HAVE done the BIN-ALI-Optimal_sequence_alignment 93
|
#TOC> 1.1.2 If you HAVE done the BIN-ALI-Optimal_sequence_alignment 93
|
||||||
#TOC> 1.2 Execute and Validate 119
|
#TOC> 1.2 Execute and Validate 119
|
||||||
#TOC> 2 Plot Annotations 144
|
#TOC> 2 Plot Annotations 144
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -90,7 +96,7 @@
|
|||||||
# Then SKIP the next section.
|
# Then SKIP the next section.
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# === 1.1.2 If you HAVE done the BIN-ALI-Optimal_sequence_alignment
|
# === 1.1.2 If you HAVE done the BIN-ALI-Optimal_sequence_alignment
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# You DO already have a file called "<MYSPE>-Annotations.json" in the
|
# You DO already have a file called "<MYSPE>-Annotations.json" in the
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-FUNC_Semantic_similarity.R
|
# tocID <- "BIN-FUNC_Semantic_similarity.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
|
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
|
||||||
@ -28,14 +34,14 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------------------------
|
#TOC> --------------------------------------------------------------------
|
||||||
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 42
|
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 42
|
||||||
#TOC> 2 Fetch GO Annotations 98
|
#TOC> 2 Fetch GO Annotations 98
|
||||||
#TOC> 3 Semantic Similarities 107
|
#TOC> 3 Semantic Similarities 107
|
||||||
#TOC> 4 GO Term Enrichment in Gene Sets 125
|
#TOC> 4 GO Term Enrichment in Gene Sets 125
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -158,27 +164,9 @@ myEnr <- GOenrichment(mySet, allGenes)
|
|||||||
|
|
||||||
sort(myEnr$p.values) # Any significantly enriched terms? All of these are ...
|
sort(myEnr$p.values) # Any significantly enriched terms? All of these are ...
|
||||||
|
|
||||||
#Yes: most significantly enriched is GO:0071931. What is this?
|
#Most significantly enriched is GO:0071931. What is this?
|
||||||
getGOTerm("GO:0071931") # ... makes sense.
|
annotate::getGOTerm("GO:0071931") # ... makes sense.
|
||||||
|
|
||||||
(fullSet <- myEnr$genes$`GO:0071931`) # What genes are annotated to this term?
|
|
||||||
|
|
||||||
intersect(mySet, fullSet) # These are in both sets
|
|
||||||
setdiff(mySet, fullSet) # These mySet members are not annotated to that term
|
|
||||||
setdiff(fullSet, mySet) # These are annotated to that term but not in mySet.
|
|
||||||
# ... that's the most interesting set. From a set of
|
|
||||||
# genes we have identified a function that they
|
|
||||||
# share, and that shared function has allowed us
|
|
||||||
# to identify
|
|
||||||
|
|
||||||
# What are these genes?
|
|
||||||
# Select annotations from the annotation database:
|
|
||||||
AnnotationDbi::select(org.Sc.sgd.db,
|
|
||||||
keys = setdiff(fullSet, mySet),
|
|
||||||
columns = c("COMMON", "DESCRIPTION"))
|
|
||||||
|
|
||||||
# Note that these annotations are partially redundant to several different
|
|
||||||
# aliases of the same three genes.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-MYSPE.R
|
# tocID <- "BIN-MYSPE.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-MYSPE unit
|
# R code accompanying the BIN-MYSPE unit
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-PHYLO-Data_preparation.R
|
# tocID <- "BIN-PHYLO-Data_preparation.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PHYLO-Data_preparation unit.
|
# R code accompanying the BIN-PHYLO-Data_preparation unit.
|
||||||
@ -29,7 +35,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------------
|
#TOC> ---------------------------------------------------------
|
||||||
#TOC> 1 Preparations 44
|
#TOC> 1 Preparations 44
|
||||||
@ -37,7 +43,7 @@
|
|||||||
#TOC> 3 Multiple Sequence Alignment 117
|
#TOC> 3 Multiple Sequence Alignment 117
|
||||||
#TOC> 4 Reviewing and Editing Alignments 136
|
#TOC> 4 Reviewing and Editing Alignments 136
|
||||||
#TOC> 4.1 Masking workflow 152
|
#TOC> 4.1 Masking workflow 152
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-PHYLO-Tree_analysis.R
|
# tocID <- "BIN-PHYLO-Tree_analysis.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
|
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
|
||||||
@ -31,7 +37,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------
|
#TOC> --------------------------------------------------
|
||||||
#TOC> 1 Preparation and Tree Plot 46
|
#TOC> 1 Preparation and Tree Plot 46
|
||||||
@ -39,7 +45,7 @@
|
|||||||
#TOC> 2.1 Rooting Trees 145
|
#TOC> 2.1 Rooting Trees 145
|
||||||
#TOC> 2.2 Rotating Clades 190
|
#TOC> 2.2 Rotating Clades 190
|
||||||
#TOC> 2.3 Computing tree distances 241
|
#TOC> 2.3 Computing tree distances 241
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-PHYLO-Tree_building.R
|
# tocID <- "BIN-PHYLO-Tree_building.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PHYLO-Tree_building unit.
|
# R code accompanying the BIN-PHYLO-Tree_building unit.
|
||||||
@ -29,7 +35,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------------------
|
#TOC> -----------------------------------------------------------
|
||||||
#TOC> 1 Calculating Trees 46
|
#TOC> 1 Calculating Trees 46
|
||||||
@ -39,7 +45,7 @@
|
|||||||
#TOC> 1.1.3 ... on Linux 96
|
#TOC> 1.1.3 ... on Linux 96
|
||||||
#TOC> 1.1.4 Confirming PROMLPATH 101
|
#TOC> 1.1.4 Confirming PROMLPATH 101
|
||||||
#TOC> 1.2 Building a maximum likelihood tree 110
|
#TOC> 1.2 Building a maximum likelihood tree 110
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -68,7 +74,7 @@ if (! requireNamespace("Rphylip", quietly = TRUE)) {
|
|||||||
# on your computer Phylip has been installed and define the path
|
# on your computer Phylip has been installed and define the path
|
||||||
# to the proml program that calculates a maximum-likelihood tree.
|
# to the proml program that calculates a maximum-likelihood tree.
|
||||||
|
|
||||||
# === 1.1.1 ... on the Mac
|
# === 1.1.1 ... on the Mac
|
||||||
# On the Mac, the standard installation places a phylip folder
|
# On the Mac, the standard installation places a phylip folder
|
||||||
# in the /Applications directory. That folder contains all the
|
# in the /Applications directory. That folder contains all the
|
||||||
# individual phylip programs as <name>.app files. These are not
|
# individual phylip programs as <name>.app files. These are not
|
||||||
@ -79,7 +85,7 @@ if (! requireNamespace("Rphylip", quietly = TRUE)) {
|
|||||||
# directly to that subdirectory to find the program it needs:
|
# directly to that subdirectory to find the program it needs:
|
||||||
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
|
# PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS"
|
||||||
|
|
||||||
# === 1.1.2 ... on Windows
|
# === 1.1.2 ... on Windows
|
||||||
# On Windows you need to know where the programs have been installed, and you
|
# On Windows you need to know where the programs have been installed, and you
|
||||||
# need to specify a path that is correct for the Windows OS. Find the folder
|
# need to specify a path that is correct for the Windows OS. Find the folder
|
||||||
# that is named "exe", and right-click to inspect its properties. The path
|
# that is named "exe", and right-click to inspect its properties. The path
|
||||||
@ -93,12 +99,12 @@ if (! requireNamespace("Rphylip", quietly = TRUE)) {
|
|||||||
# I have heard that your path must not contain spaces, and it is prudent to
|
# I have heard that your path must not contain spaces, and it is prudent to
|
||||||
# avoid other special characters as well.
|
# avoid other special characters as well.
|
||||||
|
|
||||||
# === 1.1.3 ... on Linux
|
# === 1.1.3 ... on Linux
|
||||||
# If you are running Linux I trust you know what to do. It's probably
|
# If you are running Linux I trust you know what to do. It's probably
|
||||||
# something like
|
# something like
|
||||||
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
|
# PROMLPATH <- "/usr/local/phylip-3.695/bin"
|
||||||
|
|
||||||
# === 1.1.4 Confirming PROMLPATH
|
# === 1.1.4 Confirming PROMLPATH
|
||||||
# Confirm that the settings are right.
|
# Confirm that the settings are right.
|
||||||
PROMLPATH # returns the path
|
PROMLPATH # returns the path
|
||||||
list.dirs(PROMLPATH) # returns the directories in that path
|
list.dirs(PROMLPATH) # returns the directories in that path
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-PPI-Analysis.R
|
# tocID <- "BIN-PPI-Analysis.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PPI-Analysis unit.
|
# R code accompanying the BIN-PPI-Analysis unit.
|
||||||
@ -29,7 +35,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------------------
|
#TOC> ---------------------------------------------------------------
|
||||||
#TOC> 1 Setup and data 46
|
#TOC> 1 Setup and data 46
|
||||||
@ -39,7 +45,7 @@
|
|||||||
#TOC> 2.3 Betweenness Centrality 180
|
#TOC> 2.3 Betweenness Centrality 180
|
||||||
#TOC> 3 biomaRt 226
|
#TOC> 3 biomaRt 226
|
||||||
#TOC> 4 Task for submission 296
|
#TOC> 4 Task for submission 296
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-SEQA-Composition.R
|
# tocID <- "BIN-SEQA-Composition.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-SEQA-Comparison unit
|
# R code accompanying the BIN-SEQA-Comparison unit
|
||||||
@ -29,7 +35,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------------------------
|
#TOC> ----------------------------------------------------------
|
||||||
#TOC> 1 Preparation 47
|
#TOC> 1 Preparation 47
|
||||||
@ -40,7 +46,7 @@
|
|||||||
#TOC> 3.3 Plotting log ratios 185
|
#TOC> 3.3 Plotting log ratios 185
|
||||||
#TOC> 3.4 Sort by frequency 200
|
#TOC> 3.4 Sort by frequency 200
|
||||||
#TOC> 3.5 Color by amino acid type 215
|
#TOC> 3.5 Color by amino acid type 215
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-Sequence.R
|
# tocID <- "BIN-Sequence.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-Sequence unit.
|
# R code accompanying the BIN-Sequence unit.
|
||||||
@ -30,7 +36,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------------------
|
#TOC> ----------------------------------------------------
|
||||||
#TOC> 1 Prepare 63
|
#TOC> 1 Prepare 63
|
||||||
@ -50,7 +56,7 @@
|
|||||||
#TOC> 7.2 Sampling 306
|
#TOC> 7.2 Sampling 306
|
||||||
#TOC> 7.2.1 Equiprobable characters 308
|
#TOC> 7.2.1 Equiprobable characters 308
|
||||||
#TOC> 7.2.2 Defined probability vector 350
|
#TOC> 7.2.2 Defined probability vector 350
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -171,16 +177,16 @@ cat(sprintf("\n%s fish", c("one", "two", "red", "blue")))
|
|||||||
|
|
||||||
# = 6 Changing strings ====================================================
|
# = 6 Changing strings ====================================================
|
||||||
|
|
||||||
# === 6.1.1 Changing case
|
# === 6.1.1 Changing case
|
||||||
tolower(s)
|
tolower(s)
|
||||||
toupper(tolower(s))
|
toupper(tolower(s))
|
||||||
|
|
||||||
|
|
||||||
# === 6.1.2 Reverse
|
# === 6.1.2 Reverse
|
||||||
reverse(s)
|
reverse(s)
|
||||||
|
|
||||||
|
|
||||||
# === 6.1.3 Change characters
|
# === 6.1.3 Change characters
|
||||||
# chartr(old, new, x) maps all characters in x that appear in "old" to the
|
# chartr(old, new, x) maps all characters in x that appear in "old" to the
|
||||||
# corresponding character in "new."
|
# corresponding character in "new."
|
||||||
|
|
||||||
@ -208,7 +214,7 @@ chartr(myCypher, lett, x)
|
|||||||
# (Nb. substitution cyphers are easy to crack!)
|
# (Nb. substitution cyphers are easy to crack!)
|
||||||
|
|
||||||
|
|
||||||
# === 6.1.4 Substitute characters
|
# === 6.1.4 Substitute characters
|
||||||
(s <- gsub("IV", "i-v", s)) # gsub can change length, first argument is
|
(s <- gsub("IV", "i-v", s)) # gsub can change length, first argument is
|
||||||
# a "regular expression"!
|
# a "regular expression"!
|
||||||
|
|
||||||
@ -305,7 +311,7 @@ sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the
|
|||||||
|
|
||||||
# == 7.2 Sampling ==========================================================
|
# == 7.2 Sampling ==========================================================
|
||||||
|
|
||||||
# === 7.2.1 Equiprobable characters
|
# === 7.2.1 Equiprobable characters
|
||||||
|
|
||||||
# Assume you need a large random-nucleotide string for some statistical model.
|
# Assume you need a large random-nucleotide string for some statistical model.
|
||||||
# How to create such a string? sample() can easily create it:
|
# How to create such a string? sample() can easily create it:
|
||||||
@ -347,7 +353,7 @@ length(unlist(x))
|
|||||||
# of the smaller number of Cs and Gs - before biology even comes into play. How
|
# of the smaller number of Cs and Gs - before biology even comes into play. How
|
||||||
# do we account for that?
|
# do we account for that?
|
||||||
|
|
||||||
# === 7.2.2 Defined probability vector
|
# === 7.2.2 Defined probability vector
|
||||||
|
|
||||||
# This is where we need to know how to create samples with specific probability
|
# This is where we need to know how to create samples with specific probability
|
||||||
# distributions. A crude hack would be to create a sampling source vector with
|
# distributions. A crude hack would be to create a sampling source vector with
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# BIN-Storing_data.R
|
# tocID <- "BIN-Storing_data.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-Storing_data unit
|
# R code accompanying the BIN-Storing_data unit
|
||||||
@ -27,7 +33,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------------------------------
|
#TOC> -----------------------------------------------------------------------
|
||||||
#TOC> 1 A Relational Datamodel in R: review 57
|
#TOC> 1 A Relational Datamodel in R: review 57
|
||||||
@ -50,7 +56,7 @@
|
|||||||
#TOC> 3.3 Create an R script to create your own database 535
|
#TOC> 3.3 Create an R script to create your own database 535
|
||||||
#TOC> 3.3.1 Check and validate 555
|
#TOC> 3.3.1 Check and validate 555
|
||||||
#TOC> 3.4 Task: submit for credit (part 2/2) 596
|
#TOC> 3.4 Task: submit for credit (part 2/2) 596
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -205,7 +211,7 @@ str(philDB)
|
|||||||
# go back, re-read, play with it, and ask for help. This is essential.
|
# go back, re-read, play with it, and ask for help. This is essential.
|
||||||
|
|
||||||
|
|
||||||
# === 1.1.1 completing the database
|
# === 1.1.1 completing the database
|
||||||
|
|
||||||
|
|
||||||
# Next I'll add one more person, and create the other two tables:
|
# Next I'll add one more person, and create the other two tables:
|
||||||
@ -369,7 +375,7 @@ dbSanitizeSequence(x)
|
|||||||
|
|
||||||
# == 2.3 Create a protein table for our data model =========================
|
# == 2.3 Create a protein table for our data model =========================
|
||||||
|
|
||||||
# === 2.3.1 Initialize the database
|
# === 2.3.1 Initialize the database
|
||||||
|
|
||||||
|
|
||||||
# The function dbInit contains all the code to return a list of empty
|
# The function dbInit contains all the code to return a list of empty
|
||||||
@ -381,7 +387,7 @@ myDB <- dbInit()
|
|||||||
str(myDB)
|
str(myDB)
|
||||||
|
|
||||||
|
|
||||||
# === 2.3.2 Add data
|
# === 2.3.2 Add data
|
||||||
|
|
||||||
|
|
||||||
# fromJSON() returns a dataframe that we can readily process to add data
|
# fromJSON() returns a dataframe that we can readily process to add data
|
||||||
@ -428,7 +434,7 @@ source("./scripts/ABC-createRefDB.R")
|
|||||||
str(myDB)
|
str(myDB)
|
||||||
|
|
||||||
|
|
||||||
# === 2.4.1 Examples of navigating the database
|
# === 2.4.1 Examples of navigating the database
|
||||||
|
|
||||||
|
|
||||||
# You can look at the contents of the tables in the usual way we access
|
# You can look at the contents of the tables in the usual way we access
|
||||||
@ -552,7 +558,7 @@ myDB$taxonomy$species[sel]
|
|||||||
# in any of the JSON files. Later you will add more information ...
|
# in any of the JSON files. Later you will add more information ...
|
||||||
|
|
||||||
|
|
||||||
# === 3.3.1 Check and validate
|
# === 3.3.1 Check and validate
|
||||||
|
|
||||||
|
|
||||||
# Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
|
# Is your protein named according to the pattern "MBP1_MYSPE"? It should be.
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# FND-Genetic_code.R
|
# tocID <- "FND-Genetic_code.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the FND-Genetic_code unit.
|
# R code accompanying the FND-Genetic_code unit.
|
||||||
@ -28,7 +34,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------------------------------
|
#TOC> ----------------------------------------------------------------
|
||||||
#TOC> 1 Storing the genetic code 45
|
#TOC> 1 Storing the genetic code 45
|
||||||
@ -38,7 +44,7 @@
|
|||||||
#TOC> 3 An alternative representation: 3D array 212
|
#TOC> 3 An alternative representation: 3D array 212
|
||||||
#TOC> 3.1 Print a Genetic code table 246
|
#TOC> 3.1 Print a Genetic code table 246
|
||||||
#TOC> 4 Tasks 272
|
#TOC> 4 Tasks 272
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# FND-MAT-Graphs_and_networks.R
|
# tocID <- "FND-MAT-Graphs_and_networks.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the FND-MAT-Graphs_and_networks unit.
|
# R code accompanying the FND-MAT-Graphs_and_networks unit.
|
||||||
@ -29,7 +35,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ------------------------------------------------------------
|
#TOC> ------------------------------------------------------------
|
||||||
#TOC> 1 Review 50
|
#TOC> 1 Review 50
|
||||||
@ -43,7 +49,7 @@
|
|||||||
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 539
|
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 539
|
||||||
#TOC> 4.1 Diameter 576
|
#TOC> 4.1 Diameter 576
|
||||||
#TOC> 5 GRAPH CLUSTERING 645
|
#TOC> 5 GRAPH CLUSTERING 645
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -280,7 +286,7 @@ plot(GBA,
|
|||||||
vertex.color=heat.colors(max(igraph::degree(GBA)+1))[igraph::degree(GBA)+1],
|
vertex.color=heat.colors(max(igraph::degree(GBA)+1))[igraph::degree(GBA)+1],
|
||||||
vertex.size = 200 + (30 * igraph::degree(GBA)),
|
vertex.size = 200 + (30 * igraph::degree(GBA)),
|
||||||
vertex.label = NA)
|
vertex.label = NA)
|
||||||
par(oPar) # restore grphics state
|
par(oPar) # restore graphics state
|
||||||
|
|
||||||
# This is a very obviously different graph! Some biological networks have
|
# This is a very obviously different graph! Some biological networks have
|
||||||
# features that look like that - but in my experience the hub nodes are usually
|
# features that look like that - but in my experience the hub nodes are usually
|
||||||
|
@ -1,14 +1,21 @@
|
|||||||
# FND-STA-Information_theory.R
|
# tocID <- "FND-STA-Information_theory.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the FND-STA-Information_theory unit.
|
# R code accompanying the FND-STA-Information_theory unit.
|
||||||
#
|
#
|
||||||
# Version: 0.2
|
# Version: 0.2.1
|
||||||
#
|
#
|
||||||
# Date: 2017 MM DD
|
# Date: 2017 - 2019
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 0.2.1 Maintenance
|
||||||
# 0.2 Under development
|
# 0.2 Under development
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
@ -58,11 +65,33 @@ AAref["Y"] <- 0.0294
|
|||||||
sum(AAref)
|
sum(AAref)
|
||||||
|
|
||||||
# Function to calculate Shannon entropy
|
# Function to calculate Shannon entropy
|
||||||
H <- function(v) {
|
H <- function(pmf) {
|
||||||
# Shannon entropy (bits)
|
# Calculate Shannon entropy
|
||||||
return(-sum(v * (log(v) / log(2))))
|
# Parameters:
|
||||||
|
# pmf (numeric) probability mass function: a vector of states and
|
||||||
|
# associated probabilities. Each element of
|
||||||
|
# pmf must be in (0, 1] and sum(pmf) must be 1.
|
||||||
|
# Value:
|
||||||
|
# Shannon entropy in bits.
|
||||||
|
# Examples:
|
||||||
|
# H(c(A=0.25, C=0.25, G=0.25, T=0.25)) # 2 bits entropy in a random
|
||||||
|
# # nucleotide sequence
|
||||||
|
# H(1) # If all elements are the same, entropy is zero
|
||||||
|
#
|
||||||
|
if (any(pmf <= 0 | pmf > 1) || !isTRUE(all.equal(1.0, sum(pmf)))) {
|
||||||
|
stop("Input is not a discrete probability distribution.")
|
||||||
|
}
|
||||||
|
H <- -sum(pmf * (log(pmf) / log(2)))
|
||||||
|
return(H)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Why use all.equal()? Exact comparisons with floating point numbers are
|
||||||
|
# brittle. Consider for example:
|
||||||
|
1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1
|
||||||
|
print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777
|
||||||
|
# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Entropy of the database frequencies (in bits):
|
# Entropy of the database frequencies (in bits):
|
||||||
(Href <- H(AAref))
|
(Href <- H(AAref))
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# FND-STA-Probability_distribution.R
|
# tocID <- "FND-STA-Probability_distribution.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the FND-STA-Probability_distribution unit.
|
# R code accompanying the FND-STA-Probability_distribution unit.
|
||||||
@ -28,7 +34,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------------------------------------
|
#TOC> -----------------------------------------------------------------------------
|
||||||
#TOC> 1 Introduction 52
|
#TOC> 1 Introduction 52
|
||||||
@ -45,7 +51,7 @@
|
|||||||
#TOC> 4.2.1 An example from tossing dice 463
|
#TOC> 4.2.1 An example from tossing dice 463
|
||||||
#TOC> 4.2.2 An example from lognormal distributions 586
|
#TOC> 4.2.2 An example from lognormal distributions 586
|
||||||
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 629
|
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 629
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -460,7 +466,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000)
|
|||||||
# be applied to discrete distributions. But we need to talk a bit about
|
# be applied to discrete distributions. But we need to talk a bit about
|
||||||
# converting counts to p.m.f.'s.
|
# converting counts to p.m.f.'s.
|
||||||
|
|
||||||
# === 4.2.1 An example from tossing dice
|
# === 4.2.1 An example from tossing dice
|
||||||
|
|
||||||
# The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But
|
# The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But
|
||||||
# there is an issue when we convert sampled counts to frequencies, and estimate
|
# there is an issue when we convert sampled counts to frequencies, and estimate
|
||||||
@ -583,7 +589,7 @@ abline(v = KLdiv(rep(1/6, 6), pmfPC(counts, 1:6)), col="firebrick")
|
|||||||
# somewhat but not drastically atypical.
|
# somewhat but not drastically atypical.
|
||||||
|
|
||||||
|
|
||||||
# === 4.2.2 An example from lognormal distributions
|
# === 4.2.2 An example from lognormal distributions
|
||||||
|
|
||||||
# We had compared a set of lognormal and gamma distributions above, now we
|
# We had compared a set of lognormal and gamma distributions above, now we
|
||||||
# can use KL-divergence to quantify their similarity:
|
# can use KL-divergence to quantify their similarity:
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# FND-STA-Significance.R
|
# tocID <- "FND-STA-Significance.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the FND-STA-Significance unit.
|
# R code accompanying the FND-STA-Significance unit.
|
||||||
@ -25,7 +31,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ------------------------------------------------------------------
|
#TOC> ------------------------------------------------------------------
|
||||||
#TOC> 1 Significance and p-value 43
|
#TOC> 1 Significance and p-value 43
|
||||||
@ -36,7 +42,7 @@
|
|||||||
#TOC> 3 Significance by integration 198
|
#TOC> 3 Significance by integration 198
|
||||||
#TOC> 4 Significance by simulation or permutation 204
|
#TOC> 4 Significance by simulation or permutation 204
|
||||||
#TOC> 5 Final tasks 312
|
#TOC> 5 Final tasks 312
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -100,7 +106,7 @@ print(x, digits = 22)
|
|||||||
# curve, as a fraction of the whole.
|
# curve, as a fraction of the whole.
|
||||||
|
|
||||||
|
|
||||||
# === 1.2.1 p-value illustrated
|
# === 1.2.1 p-value illustrated
|
||||||
|
|
||||||
# Let's illustrate. First we draw a million random values from our
|
# Let's illustrate. First we draw a million random values from our
|
||||||
# standard, normal distribution:
|
# standard, normal distribution:
|
||||||
|
@ -1,2 +1,4 @@
|
|||||||
# ABC-units
|
# ABC-units
|
||||||
A Bioinformatics Course: R modules for learning units
|
A Bioinformatics Course: R modules for learning units
|
||||||
|
|
||||||
|
Follow the instructions in the learning unit to install your local copy of this R-project.
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# RPR-Biostrings.R
|
# tocID <- "RPR-Biostrings.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Biostrings unit.
|
# R code accompanying the RPR-Biostrings unit.
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# RPR-FASTA.R
|
# tocID <- "RPR-FASTA.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-FASTA unit.
|
# R code accompanying the RPR-FASTA unit.
|
||||||
|
12
RPR-GEO2R.R
12
RPR-GEO2R.R
@ -1,4 +1,10 @@
|
|||||||
# RPR_GEO2R.R
|
# tocID <- "RPR_GEO2R.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR_GEO2R unit.
|
# R code accompanying the RPR_GEO2R unit.
|
||||||
@ -34,7 +40,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------------------------------
|
#TOC> --------------------------------------------------------------------------
|
||||||
#TOC> 1 Preparations 56
|
#TOC> 1 Preparations 56
|
||||||
@ -49,7 +55,7 @@
|
|||||||
#TOC> 5.1 Final task: Gene descriptions 504
|
#TOC> 5.1 Final task: Gene descriptions 504
|
||||||
#TOC> 6 Improving on Discovery by Differential Expression 510
|
#TOC> 6 Improving on Discovery by Differential Expression 510
|
||||||
#TOC> 7 Annotation data 594
|
#TOC> 7 Annotation data 594
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# RPR-Genetic_code_optimality.R
|
# tocID <- "RPR-Genetic_code_optimality.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Genetic_code_optimality unit.
|
# R code accompanying the RPR-Genetic_code_optimality unit.
|
||||||
@ -30,7 +36,7 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------------------
|
#TOC> --------------------------------------------------------------
|
||||||
#TOC> 1 Designing a computational experiment 57
|
#TOC> 1 Designing a computational experiment 57
|
||||||
@ -43,7 +49,7 @@
|
|||||||
#TOC> 2.2.4 measure effect 213
|
#TOC> 2.2.4 measure effect 213
|
||||||
#TOC> 3 Run the experiment 260
|
#TOC> 3 Run the experiment 260
|
||||||
#TOC> 4 Task solutions 356
|
#TOC> 4 Task solutions 356
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -142,7 +148,7 @@ swappedGC <- function(GC) {
|
|||||||
# - we count the number of mutations and evaluate their severity.
|
# - we count the number of mutations and evaluate their severity.
|
||||||
|
|
||||||
|
|
||||||
# === 2.2.1 reverse-translate
|
# === 2.2.1 reverse-translate
|
||||||
|
|
||||||
# To reverse-translate an amino acid vector, we randomly pick one of its
|
# To reverse-translate an amino acid vector, we randomly pick one of its
|
||||||
# codons from a genetic code, and assemble all codons to a sequence.
|
# codons from a genetic code, and assemble all codons to a sequence.
|
||||||
@ -167,7 +173,7 @@ traRev <- function(s, GC) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# === 2.2.2 Randomly mutate
|
# === 2.2.2 Randomly mutate
|
||||||
|
|
||||||
# To mutate, we split a codon into its three nucleotides, then randomly replace
|
# To mutate, we split a codon into its three nucleotides, then randomly replace
|
||||||
# one of the three with another nucleotide.
|
# one of the three with another nucleotide.
|
||||||
@ -192,7 +198,7 @@ randMut <- function(vC) {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
# === 2.2.3 Forward- translate
|
# === 2.2.3 Forward- translate
|
||||||
|
|
||||||
traFor <- function(vC, GC) {
|
traFor <- function(vC, GC) {
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -210,7 +216,7 @@ traFor <- function(vC, GC) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# === 2.2.4 measure effect
|
# === 2.2.4 measure effect
|
||||||
|
|
||||||
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
|
# How do we evaluate the effect of the mutation? We'll take a simple ad hoc
|
||||||
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
|
# approach: we divide amino acids into hydrophobic, hydrophilic, and neutral
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# RPR-Introduction.R
|
# tocID <- "RPR-Introduction.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Introduction unit
|
# R code accompanying the RPR-Introduction unit
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# RPR-PROSITE_POST.R
|
# tocID <- "RPR-PROSITE_POST.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||||
@ -29,13 +35,13 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------------------------
|
#TOC> ---------------------------------------------------------------------
|
||||||
#TOC> 1 Constructing a POST command from a Web query 42
|
#TOC> 1 Constructing a POST command from a Web query 42
|
||||||
#TOC> 1.1 Task - fetchPrositeFeatures() function 142
|
#TOC> 1.1 Task - fetchPrositeFeatures() function 142
|
||||||
#TOC> 2 Task solutions 150
|
#TOC> 2 Task solutions 150
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# RPR-RegEx.R
|
# tocID <- "RPR-RegEx.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-RegEx unit
|
# R code accompanying the RPR-RegEx unit
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# RPR-SX-PDB.R
|
# tocID <- "RPR-SX-PDB.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-SX-PDB unit.
|
# R code accompanying the RPR-SX-PDB unit.
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# RPR-UniProt_GET.R
|
# tocID <- "RPR-UniProt_GET.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||||
@ -28,13 +34,13 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------------------------
|
#TOC> ----------------------------------------------------------
|
||||||
#TOC> 1 UniProt files via GET 41
|
#TOC> 1 UniProt files via GET 41
|
||||||
#TOC> 1.1 Task - fetchUniProtSeq() function 103
|
#TOC> 1.1 Task - fetchUniProtSeq() function 103
|
||||||
#TOC> 2 Task solutions 110
|
#TOC> 2 Task solutions 110
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# RPR-Unit_testing.R
|
# tocID <- "RPR-Unit_testing.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Unit_testing unit.
|
# R code accompanying the RPR-Unit_testing unit.
|
||||||
@ -29,10 +35,10 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -------------------------------------------------
|
#TOC> -------------------------------------------------
|
||||||
#TOC> 1 Unit Tests with testthat 40
|
#TOC> 1 Unit Tests with testthat 46
|
||||||
#TOC> 2 Organizing your tests 159
|
#TOC> 2 Organizing your tests 165
|
||||||
#TOC> 2.1 Testing scripts 183
|
#TOC> 2.1 Testing scripts 189
|
||||||
#TOC> 3 Task solutions 198
|
#TOC> 3 Task solutions 204
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
@ -1,4 +1,10 @@
|
|||||||
# RPR-eUtils_and_XML.R
|
# tocID <- "RPR-eUtils_and_XML.R"
|
||||||
|
#
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
|
# PATIENCE ... #
|
||||||
|
# Do not yet work with this code. Updates in progress. Thank you. #
|
||||||
|
# boris.steipe@utoronto.ca #
|
||||||
|
# ---------------------------------------------------------------------------- #
|
||||||
#
|
#
|
||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||||
@ -28,13 +34,13 @@
|
|||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------------------
|
#TOC> -----------------------------------------------------------
|
||||||
#TOC> 1 Working with NCBI eUtils 41
|
#TOC> 1 Working with NCBI eUtils 41
|
||||||
#TOC> 1.1 Task - fetchNCBItaxData() function 144
|
#TOC> 1.1 Task - fetchNCBItaxData() function 144
|
||||||
#TOC> 2 Task solutions 151
|
#TOC> 2 Task solutions 151
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@ -8,7 +8,7 @@
|
|||||||
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
||||||
#
|
#
|
||||||
# For the data model, see
|
# For the data model, see
|
||||||
# https://docs.google.com/drawings/d/1uupNvz18_FYFwyyVPebTM0CUxcJCPDQuxuIJGpjWQWg
|
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
|
||||||
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
|
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
|
||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
@ -1,12 +1,35 @@
|
|||||||
# ABC-dbUtilities.R
|
# tocID <- "scripts/ABC-dbUtilities.R"
|
||||||
|
#
|
||||||
# database utilities for ABC learning units
|
# database utilities for ABC learning units
|
||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
#
|
|
||||||
|
|
||||||
|
|
||||||
# ====== PACKAGES ==============================================================
|
#TOC> ==========================================================================
|
||||||
|
#TOC>
|
||||||
|
#TOC> Section Title Line
|
||||||
|
#TOC> -------------------------------------------------
|
||||||
|
#TOC> 1 PACKAGES 32
|
||||||
|
#TOC> 2 FUNCTIONS 50
|
||||||
|
#TOC> 2.01 dbSanitizeSequence() 53
|
||||||
|
#TOC> 2.02 dbConfirmUnique() 88
|
||||||
|
#TOC> 2.03 dbInit() 106
|
||||||
|
#TOC> 2.04 dbAutoincrement() 147
|
||||||
|
#TOC> 2.05 dbAddProtein() 160
|
||||||
|
#TOC> 2.06 dbAddFeature() 180
|
||||||
|
#TOC> 2.07 dbAddTaxonomy() 199
|
||||||
|
#TOC> 2.08 dbAddAnnotation() 215
|
||||||
|
#TOC> 2.09 dbFetchUniProtSeq() 243
|
||||||
|
#TOC> 2.10 dbFetchPrositeFeatures() 267
|
||||||
|
#TOC> 2.11 node2text() 311
|
||||||
|
#TOC> 2.12 dbFetchNCBItaxData() 323
|
||||||
|
#TOC> 2.13 UniProtIDmap() 362
|
||||||
|
#TOC> 3 TESTS 399
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
# = 1 PACKAGES ============================================================
|
||||||
|
|
||||||
|
|
||||||
if (! requireNamespace("jsonlite", quietly = TRUE)) {
|
if (! requireNamespace("jsonlite", quietly = TRUE)) {
|
||||||
@ -24,9 +47,10 @@ if (! requireNamespace("xml2", quietly = TRUE)) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# ====== FUNCTIONS =============================================================
|
# = 2 FUNCTIONS ===========================================================
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.01 dbSanitizeSequence() =============================================
|
||||||
dbSanitizeSequence <- function(s, unambiguous = TRUE) {
|
dbSanitizeSequence <- function(s, unambiguous = TRUE) {
|
||||||
# Remove FASTA header lines, if any,
|
# Remove FASTA header lines, if any,
|
||||||
# flatten any structure that s has,
|
# flatten any structure that s has,
|
||||||
@ -61,6 +85,7 @@ dbSanitizeSequence <- function(s, unambiguous = TRUE) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.02 dbConfirmUnique() ================================================
|
||||||
dbConfirmUnique <- function(x) {
|
dbConfirmUnique <- function(x) {
|
||||||
# x is a vector of logicals.
|
# x is a vector of logicals.
|
||||||
# returns x if x has exactly one TRUE element.
|
# returns x if x has exactly one TRUE element.
|
||||||
@ -78,24 +103,27 @@ dbConfirmUnique <- function(x) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.03 dbInit() =========================================================
|
||||||
dbInit <- function() {
|
dbInit <- function() {
|
||||||
# Return an empty instance of the protein database
|
# Return an empty instance of the protein database
|
||||||
|
# Open the link and study the schema:
|
||||||
|
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
|
||||||
|
|
||||||
db <- list()
|
db <- list()
|
||||||
|
|
||||||
|
db$version <- "1.0"
|
||||||
|
|
||||||
db$protein <- data.frame(
|
db$protein <- data.frame(
|
||||||
ID = numeric(),
|
ID = numeric(),
|
||||||
name = character(),
|
name = character(),
|
||||||
RefSeqID = character(),
|
RefSeqID = character(),
|
||||||
UniProtID = character(),
|
UniProtID = character(),
|
||||||
taxonomyID = numeric(),
|
taxonomyID = numeric(),
|
||||||
sequence = character(),
|
sequence = character())
|
||||||
stringsAsFactors = FALSE)
|
|
||||||
|
|
||||||
db$taxonomy <- data.frame(
|
db$taxonomy <- data.frame(
|
||||||
ID = numeric(),
|
ID = numeric(),
|
||||||
species = character(),
|
species = character())
|
||||||
stringsAsFactors = FALSE)
|
|
||||||
|
|
||||||
|
|
||||||
db$annotation <- data.frame(
|
db$annotation <- data.frame(
|
||||||
@ -103,21 +131,20 @@ dbInit <- function() {
|
|||||||
proteinID = numeric(),
|
proteinID = numeric(),
|
||||||
featureID = numeric(),
|
featureID = numeric(),
|
||||||
start = numeric(),
|
start = numeric(),
|
||||||
end = numeric(),
|
end = numeric())
|
||||||
stringsAsFactors = FALSE)
|
|
||||||
|
|
||||||
db$feature <- data.frame(
|
db$feature <- data.frame(
|
||||||
ID = numeric(),
|
ID = numeric(),
|
||||||
name = character(),
|
name = character(),
|
||||||
description = character(),
|
description = character(),
|
||||||
sourceDB = character(),
|
sourceDB = character(),
|
||||||
accession = character(),
|
accession = character())
|
||||||
stringsAsFactors = FALSE)
|
|
||||||
|
|
||||||
return(db)
|
return(db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.04 dbAutoincrement() ================================================
|
||||||
dbAutoincrement <- function(tb) {
|
dbAutoincrement <- function(tb) {
|
||||||
# Return a unique integer that can be used as a primary key
|
# Return a unique integer that can be used as a primary key
|
||||||
# Value:
|
# Value:
|
||||||
@ -130,6 +157,7 @@ dbAutoincrement <- function(tb) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.05 dbAddProtein() ===================================================
|
||||||
dbAddProtein <- function(db, jsonDF) {
|
dbAddProtein <- function(db, jsonDF) {
|
||||||
# Add one or more protein entries to the database db.
|
# Add one or more protein entries to the database db.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -142,14 +170,14 @@ dbAddProtein <- function(db, jsonDF) {
|
|||||||
RefSeqID = jsonDF$RefSeqID[i],
|
RefSeqID = jsonDF$RefSeqID[i],
|
||||||
UniProtID = jsonDF$UniProtID[i],
|
UniProtID = jsonDF$UniProtID[i],
|
||||||
taxonomyID = jsonDF$taxonomyID[i],
|
taxonomyID = jsonDF$taxonomyID[i],
|
||||||
sequence = dbSanitizeSequence(jsonDF$sequence[i]),
|
sequence = dbSanitizeSequence(jsonDF$sequence[i]))
|
||||||
stringsAsFactors = FALSE)
|
|
||||||
db$protein <- rbind(db$protein, x)
|
db$protein <- rbind(db$protein, x)
|
||||||
}
|
}
|
||||||
return(db)
|
return(db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.06 dbAddFeature() ===================================================
|
||||||
dbAddFeature <- function(db, jsonDF) {
|
dbAddFeature <- function(db, jsonDF) {
|
||||||
# Add one or more feature entries to the database db.
|
# Add one or more feature entries to the database db.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -161,14 +189,14 @@ dbAddFeature <- function(db, jsonDF) {
|
|||||||
name = jsonDF$name[i],
|
name = jsonDF$name[i],
|
||||||
description = jsonDF$description[i],
|
description = jsonDF$description[i],
|
||||||
sourceDB = jsonDF$sourceDB[i],
|
sourceDB = jsonDF$sourceDB[i],
|
||||||
accession = jsonDF$accession[i],
|
accession = jsonDF$accession[i])
|
||||||
stringsAsFactors = FALSE)
|
|
||||||
db$feature <- rbind(db$feature, x)
|
db$feature <- rbind(db$feature, x)
|
||||||
}
|
}
|
||||||
return(db)
|
return(db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.07 dbAddTaxonomy() ==================================================
|
||||||
dbAddTaxonomy <- function(db, jsonDF) {
|
dbAddTaxonomy <- function(db, jsonDF) {
|
||||||
# Add one or more taxonomy entries to the database db.
|
# Add one or more taxonomy entries to the database db.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -178,13 +206,13 @@ dbAddTaxonomy <- function(db, jsonDF) {
|
|||||||
for (i in seq_len(nrow(jsonDF))) {
|
for (i in seq_len(nrow(jsonDF))) {
|
||||||
x <- data.frame(
|
x <- data.frame(
|
||||||
ID = jsonDF$ID[i],
|
ID = jsonDF$ID[i],
|
||||||
species = jsonDF$species[i],
|
species = jsonDF$species[i])
|
||||||
stringsAsFactors = FALSE)
|
|
||||||
db$taxonomy <- rbind(db$taxonomy, x)
|
db$taxonomy <- rbind(db$taxonomy, x)
|
||||||
}
|
}
|
||||||
return(db)
|
return(db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# == 2.08 dbAddAnnotation() ================================================
|
||||||
dbAddAnnotation <- function(db, jsonDF) {
|
dbAddAnnotation <- function(db, jsonDF) {
|
||||||
# Add one or more annotation entries to the database db.
|
# Add one or more annotation entries to the database db.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -205,14 +233,14 @@ dbAddAnnotation <- function(db, jsonDF) {
|
|||||||
proteinID = pID,
|
proteinID = pID,
|
||||||
featureID = fID,
|
featureID = fID,
|
||||||
start = as.integer(jsonDF$start[i]),
|
start = as.integer(jsonDF$start[i]),
|
||||||
end = as.integer(jsonDF$end[i]),
|
end = as.integer(jsonDF$end[i]))
|
||||||
stringsAsFactors = FALSE)
|
|
||||||
db$annotation <- rbind(db$annotation, x)
|
db$annotation <- rbind(db$annotation, x)
|
||||||
}
|
}
|
||||||
return(db)
|
return(db)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.09 dbFetchUniProtSeq() ==============================================
|
||||||
dbFetchUniProtSeq <- function(ID) {
|
dbFetchUniProtSeq <- function(ID) {
|
||||||
# Fetch a protein sequence from UniProt.
|
# Fetch a protein sequence from UniProt.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -236,6 +264,7 @@ dbFetchUniProtSeq <- function(ID) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.10 dbFetchPrositeFeatures() =========================================
|
||||||
dbFetchPrositeFeatures <- function(ID) {
|
dbFetchPrositeFeatures <- function(ID) {
|
||||||
# Fetch feature annotations from ScanProsite.
|
# Fetch feature annotations from ScanProsite.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -272,14 +301,14 @@ dbFetchPrositeFeatures <- function(ID) {
|
|||||||
start = as.numeric(tokens[4]),
|
start = as.numeric(tokens[4]),
|
||||||
end = as.numeric(tokens[5]),
|
end = as.numeric(tokens[5]),
|
||||||
psID = tokens[6],
|
psID = tokens[6],
|
||||||
psName = tokens[7],
|
psName = tokens[7]))
|
||||||
stringsAsFactors = FALSE))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return(myFeatures)
|
return(myFeatures)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.11 node2text() ======================================================
|
||||||
node2text <- function(doc, tag) {
|
node2text <- function(doc, tag) {
|
||||||
# an extractor function for the contents of elements
|
# an extractor function for the contents of elements
|
||||||
# between given tags in an XML response.
|
# between given tags in an XML response.
|
||||||
@ -291,6 +320,7 @@ node2text <- function(doc, tag) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.12 dbFetchNCBItaxData() =============================================
|
||||||
dbFetchNCBItaxData <- function(ID) {
|
dbFetchNCBItaxData <- function(ID) {
|
||||||
# Fetch feature taxID and Organism from the NCBI.
|
# Fetch feature taxID and Organism from the NCBI.
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -329,6 +359,7 @@ dbFetchNCBItaxData <- function(ID) {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.13 UniProtIDmap() ===================================================
|
||||||
UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
||||||
# Use UniProt ID mapping service to map one or more IDs
|
# Use UniProt ID mapping service to map one or more IDs
|
||||||
# Parameters:
|
# Parameters:
|
||||||
@ -351,8 +382,7 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
|||||||
|
|
||||||
if (httr::status_code(response) == 200) { # 200: oK
|
if (httr::status_code(response) == 200) { # 200: oK
|
||||||
myMap <- read.delim(file = textConnection(httr::content(response)),
|
myMap <- read.delim(file = textConnection(httr::content(response)),
|
||||||
sep = "\t",
|
sep = "\t")
|
||||||
stringsAsFactors = FALSE)
|
|
||||||
myMap <- myMap[ , c(1,3)]
|
myMap <- myMap[ , c(1,3)]
|
||||||
colnames(myMap) <- c("From", "To")
|
colnames(myMap) <- c("From", "To")
|
||||||
} else {
|
} else {
|
||||||
@ -366,7 +396,7 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# ====== TESTS =================================================================
|
# = 3 TESTS ===============================================================
|
||||||
|
|
||||||
if (FALSE) {
|
if (FALSE) {
|
||||||
if (! requireNamespace("testthat", quietly = TRUE)) {
|
if (! requireNamespace("testthat", quietly = TRUE)) {
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
# ABC-makeScCCnet.R
|
# tocID <- "scripts/ABC-makeScCCnet.R"
|
||||||
#
|
#
|
||||||
# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
|
# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
|
||||||
# GOSlim annotation.
|
# GOSlim annotation.
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
# ABC-writeALN.R
|
# tocID <- "scripts/ABC-writeALN.R"
|
||||||
#
|
#
|
||||||
# ToDo: calculate consensus line
|
# ToDo: calculate consensus line
|
||||||
# append sequence numbers
|
# append sequence numbers
|
||||||
|
@ -40,7 +40,7 @@ writeMFA <- function(ali,
|
|||||||
if (is.na(blockWidth)) {
|
if (is.na(blockWidth)) {
|
||||||
stop("PANIC: parameter \"blockWidth\" must be numeric.")
|
stop("PANIC: parameter \"blockWidth\" must be numeric.")
|
||||||
}
|
}
|
||||||
if (blockWidth < 1){
|
if (! blockWidth > 0){
|
||||||
stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
|
stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -105,7 +105,7 @@ writeMFA <- function(ali,
|
|||||||
txt <- c(txt, "") # append an empty line for readability
|
txt <- c(txt, "") # append an empty line for readability
|
||||||
}
|
}
|
||||||
|
|
||||||
writeLines(txt, con= myCon)
|
writeLines(txt, con = myCon)
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -357,20 +357,23 @@ parseBLASTalignment <- function(hit) {
|
|||||||
|
|
||||||
# ==== TESTS ===================================================================
|
# ==== TESTS ===================================================================
|
||||||
|
|
||||||
# define query:
|
if (FALSE) {
|
||||||
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
|
# define query:
|
||||||
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
|
q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
|
||||||
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
|
"LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
|
||||||
# sep="")
|
"GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
|
||||||
# or ...
|
sep="")
|
||||||
# q <- "NP_010227" # refseq ID
|
# or ...
|
||||||
#
|
q <- "NP_010227" # refseq ID
|
||||||
# test <- BLAST(q,
|
|
||||||
# nHits = 100,
|
test <- BLAST(q,
|
||||||
# E = 0.001,
|
nHits = 100,
|
||||||
# rid = "",
|
E = 0.001,
|
||||||
# limits = "txid4751[ORGN]")
|
rid = "",
|
||||||
# length(test$hits)
|
limits = "txid4751[ORGN]")
|
||||||
|
str(test)
|
||||||
|
length(test$hits)
|
||||||
|
}
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user