From 37ef655d47f6e016b4fa1fbb809eac1bcd905d38 Mon Sep 17 00:00:00 2001 From: hyginn Date: Fri, 18 Sep 2020 21:56:30 +1000 Subject: [PATCH] 2020 updates - deactivate for maintenance --- .init.R | 42 -------------- .utilities.R | 20 +++++-- ABC-units.R | 42 +++++++------- BIN-ALI-BLAST.R | 12 +++- BIN-ALI-Dotplot.R | 12 +++- BIN-ALI-MSA.R | 14 +++-- BIN-ALI-Optimal_sequence_alignment.R | 8 ++- BIN-ALI-Similarity.R | 12 +++- BIN-Data_integration.R | 12 +++- BIN-FUNC-Domain_annotation.R | 14 +++-- BIN-FUNC-Semantic_similarity.R | 34 ++++-------- BIN-MYSPE.R | 8 ++- BIN-PHYLO-Data_preparation.R | 12 +++- BIN-PHYLO-Tree_analysis.R | 12 +++- BIN-PHYLO-Tree_building.R | 20 ++++--- BIN-PPI-Analysis.R | 12 +++- BIN-SEQA-Composition.R | 12 +++- BIN-Sequence.R | 24 +++++--- BIN-Storing_data.R | 22 +++++--- FND-Genetic_code.R | 12 +++- FND-MAT-Graphs_and_networks.R | 14 +++-- FND-STA-Information_theory.R | 41 ++++++++++++-- FND-STA-Probability_distribution.R | 16 ++++-- FND-STA-Significance.R | 14 +++-- README.md | 2 + RPR-Biostrings.R | 8 ++- RPR-FASTA.R | 8 ++- RPR-GEO2R.R | 12 +++- RPR-Genetic_code_optimality.R | 20 ++++--- RPR-Introduction.R | 8 ++- RPR-PROSITE_POST.R | 12 +++- RPR-RegEx.R | 8 ++- RPR-SX-PDB.R | 8 ++- RPR-UniProt_GET.R | 12 +++- RPR-Unit_testing.R | 16 ++++-- RPR-eUtils_XML.R | 12 +++- scripts/ABC-createRefDB.R | 2 +- scripts/ABC-dbUtilities.R | 82 +++++++++++++++++++--------- scripts/ABC-makeScCCnet.R | 2 +- scripts/ABC-writeALN.R | 2 +- scripts/ABC-writeMFA.R | 4 +- scripts/BLAST.R | 31 ++++++----- 42 files changed, 447 insertions(+), 243 deletions(-) delete mode 100644 .init.R diff --git a/.init.R b/.init.R deleted file mode 100644 index 33551f1..0000000 --- a/.init.R +++ /dev/null @@ -1,42 +0,0 @@ -# .init.R -# Functions to initialize this collection of learning units -# Boris Steipe -# ==================================================================== - -# Create a local copy of myScript.R if required, and not been done yet. -if (! file.exists("myScript.R") && file.exists(".tmp.R")) { - file.copy(".tmp.R", "myScript.R") -} - -# If it doesn't exist yet, set up a profile: -if (! file.exists(".myProfile.R")) { - # setup profile data - cat("\nPlease enter the requested values correctly, no spaces, and\n") - cat("press .\n") - e <- readline("Please enter your UofT eMail address: ") - n <- readline("Please enter your Student Number: ") - - conn <- file(".myProfile.R") - writeLines(c(sprintf("myEMail <- \"%s\"", e), - sprintf("myStudentNumber <- %d", as.numeric(n))), - conn) - close(conn) - rm(e, n, conn) -} - -# Patch YFO -> MYSPE if necessary: -tmp <- readLines(".myProfile.R") -if (length(grep("^YFO", tmp)) > 0) { - idx <- grep("^YFO", tmp) - tmp[idx] <- gsub("^YFO", "MYSPE", tmp[idx]) - writeLines(tmp, ".myProfile.R") -} -rm(tmp) - -source(".myProfile.R") - -source(".utilities.R") - -file.edit("ABC-units.R") - -# [End] diff --git a/.utilities.R b/.utilities.R index 09f56d4..8d9e7f5 100644 --- a/.utilities.R +++ b/.utilities.R @@ -181,19 +181,29 @@ fetchMSAmotif <- function(ali, mot) { # ====== PDB ID selection ====================================================== -selectPDBrep <- function(n) { +selectPDBrep <- function(n, seed = as.numeric(Sys.time())) { # Select n PDB IDs from a list of high-resolution, non-homologous, single # domain, single chain structure files that represent a CATH topology # group. - # Parameters n num number of IDs to return. + # Parameters: + # n num number of IDs to return + # seed num a seed for the RNG + # # Value: char PDB IDs - # Note: the list is loaded from an RData file in the data directory + # + # Note: the list is loaded from an RData file in the "./data" directory. + # If you use this function for a course submissio, it MUST be invoked as: + # + # selectPDBrep(n, seed = myStudentNumber) + # + # ... and myStudentNumber MUST be correctly initialized load("./data/pdbRep.RData") # loads pdbRep if (n > length(pdbRep)) { - stop(sprintf("You can select no more than %d IDs.", length(pdbRep))) + stop(sprintf("There are only %d PDB IDs in the table to choose from.", + length(pdbRep))) } - set.seed(as.numeric(Sys.time())) + set.seed(seed) return(sample(pdbRep, n)) } diff --git a/ABC-units.R b/ABC-units.R index 5632b36..e5af3ad 100644 --- a/ABC-units.R +++ b/ABC-units.R @@ -2,11 +2,16 @@ # # Purpose: A Bioinformatics Course: R code for learning units # -# Version: 0.1 +# Version: 4.0 # -# Date: 2017 08 18 +# Date: 2020 09 16 # Author: Boris Steipe (boris.steipe@utoronto.ca) # +# Versions: +# V 4.0 2020 version +# V 3.0 2019 version +# V 2.0 2018 version +# V 1.0 2017 version # V 0.1 First code # # TODO: @@ -14,23 +19,19 @@ # # == HOW TO WORK WITH LEARNING UNIT FILES ====================================== # -# Expect that the learning unit files will be continuously updated. -# +# The R-scripts and datasets in this project will be continuously updated, +# and updates will be posted on GitHub. To bring your version into the latest +# state use the Git-pane (top left) and "pull" (blue downward arrow) from the +# repository. However, this will overwrite locally edited version of files. -# If you wish to edit any of the code, for example to add your own comments and -# examples, save any edited version under a different name. Otherwise you will -# have problems with git when you update the project to a new version. +# To edit code and experiment with it, for example to add your own comments and +# examples, save your edited version into the "myScripts" folder. Otherwise you +# may have problems with git when you update the project to a new version. It's +# good practice to change the filename, for example by prepending your initials. +# This helps distinguish the files you are working with e.g. in a list of +# recent files. For example if your name is Honjo Tasuku, your edited +# BIN-Sequence.R might be named HT-BIN-Sequence.R -# DO NOT SIMPLY source() THESE FILES! - -# If there are portions you don't understand, use R's help system, Google for an -# answer, or ask your instructor. Don't continue if you don't understand what's -# going on. That's not how it works ... -# -# While this file itself should not be edited by you this is YOUR project -# directory, and files that you create (notes etc.) will not be harmed when you -# pull updated version of the master, or other new files, from github. -# # If you pull from github and get the following type of error ... # --------------- # error: Your local changes to the following files would be @@ -41,8 +42,11 @@ # ... then, you need to bring the offending file into its original state. # Open the Commit window, select the file, and click on the Revert button. # -# Of course, you can save a local copy under a different name before you revert, -# in case you want to keep your changes. +# When working with these script DO NOT SIMPLY source() THESE FILES! + +# If there are portions you don't understand, use R's help system, Google for an +# answer, or ask your instructor. Don't continue if you don't understand what's +# going on. That's not how it works ... # # # ============================================================================== diff --git a/BIN-ALI-BLAST.R b/BIN-ALI-BLAST.R index 39477f3..55ac089 100644 --- a/BIN-ALI-BLAST.R +++ b/BIN-ALI-BLAST.R @@ -1,4 +1,10 @@ -# BIN-ALI-BLAST.R +# tocID <- "BIN-ALI-BLAST.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-ALI-BLAST unit. @@ -29,13 +35,13 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> --------------------------------------------------- #TOC> 1 Defining the APSES domain 42 #TOC> 2 Executing the BLAST search 64 #TOC> 3 Analysing results 86 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/BIN-ALI-Dotplot.R b/BIN-ALI-Dotplot.R index 1982bd4..661cdde 100644 --- a/BIN-ALI-Dotplot.R +++ b/BIN-ALI-Dotplot.R @@ -1,4 +1,10 @@ -# BIN-ALI-Dotplot.R +# tocID <- "BIN-ALI-Dotplot.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-ALI-Dotplot unit. @@ -27,12 +33,12 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> -------------------------------------- #TOC> 1 ___Section___ 39 #TOC> 2 Tasks 187 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/BIN-ALI-MSA.R b/BIN-ALI-MSA.R index ddb58a2..dea1855 100644 --- a/BIN-ALI-MSA.R +++ b/BIN-ALI-MSA.R @@ -1,4 +1,10 @@ -# BIN-ALI-MSA.R +# tocID <- "BIN-ALI-MSA.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-ALI-MSA unit. @@ -29,7 +35,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ------------------------------------------------------------------ #TOC> 1 Preparations 54 @@ -47,7 +53,7 @@ #TOC> 6 Sequence Logos 546 #TOC> 6.1 Subsetting an alignment by motif 555 #TOC> 6.2 Plot a Sequence Logo 604 -#TOC> +#TOC> #TOC> ========================================================================== @@ -239,7 +245,7 @@ for (i in seq_along(highScoringRanges$lengths)) { # - adjust the sequence names # - convert to msaAAMultipleAlignment object -# === 4.1.1 importing an .aln file +# === 4.1.1 importing an .aln file # The seqinr package has a function to read CLUSTAL W formatted .aln files ... if (! requireNamespace("seqinr", quietly=TRUE)) { diff --git a/BIN-ALI-Optimal_sequence_alignment.R b/BIN-ALI-Optimal_sequence_alignment.R index 4cc2854..cb7bef8 100644 --- a/BIN-ALI-Optimal_sequence_alignment.R +++ b/BIN-ALI-Optimal_sequence_alignment.R @@ -1,4 +1,10 @@ -# BIN-ALI-Optimal_sequence_alignment.R +# tocID <- "BIN-ALI-Optimal_sequence_alignment.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-ALI-Optimal_sequence_alignment unit. diff --git a/BIN-ALI-Similarity.R b/BIN-ALI-Similarity.R index b7a1cb2..447d512 100644 --- a/BIN-ALI-Similarity.R +++ b/BIN-ALI-Similarity.R @@ -1,4 +1,10 @@ -# BIN-ALI-Similarity.R +# tocID <- "BIN-ALI-Similarity.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-ALI-Similarity unit. @@ -28,13 +34,13 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ---------------------------------------------- #TOC> 1 Amino Acid Properties 41 #TOC> 2 Mutation Data matrix 158 #TOC> 3 Background score 199 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/BIN-Data_integration.R b/BIN-Data_integration.R index 38b8d01..4630918 100644 --- a/BIN-Data_integration.R +++ b/BIN-Data_integration.R @@ -1,4 +1,10 @@ -# BIN-Data_integration.R +# tocID <- "BIN-Data_integration.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-Data_integration unit. @@ -30,12 +36,12 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ------------------------------------------------- #TOC> 1 Identifier mapping 42 #TOC> 2 Cross-referencing tables 165 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/BIN-FUNC-Domain_annotation.R b/BIN-FUNC-Domain_annotation.R index 3a7f106..e200381 100644 --- a/BIN-FUNC-Domain_annotation.R +++ b/BIN-FUNC-Domain_annotation.R @@ -1,4 +1,10 @@ -# BIN-FUNC-Domain_annotation.R +# tocID <- "BIN-FUNC-Domain_annotation.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-FUNC-Domain_annotation unit. @@ -25,7 +31,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ----------------------------------------------------------------------------------- #TOC> 1 Update your database script 41 @@ -34,7 +40,7 @@ #TOC> 1.1.2 If you HAVE done the BIN-ALI-Optimal_sequence_alignment 93 #TOC> 1.2 Execute and Validate 119 #TOC> 2 Plot Annotations 144 -#TOC> +#TOC> #TOC> ========================================================================== @@ -90,7 +96,7 @@ # Then SKIP the next section. # # -# === 1.1.2 If you HAVE done the BIN-ALI-Optimal_sequence_alignment +# === 1.1.2 If you HAVE done the BIN-ALI-Optimal_sequence_alignment # # # You DO already have a file called "-Annotations.json" in the diff --git a/BIN-FUNC-Semantic_similarity.R b/BIN-FUNC-Semantic_similarity.R index 5295bcc..f739efe 100644 --- a/BIN-FUNC-Semantic_similarity.R +++ b/BIN-FUNC-Semantic_similarity.R @@ -1,4 +1,10 @@ -# BIN-FUNC_Semantic_similarity.R +# tocID <- "BIN-FUNC_Semantic_similarity.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-FUNC_Semantic_similarity unit. @@ -28,14 +34,14 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> -------------------------------------------------------------------- #TOC> 1 Preparations: Packages, AnnotationDB, Setup 42 #TOC> 2 Fetch GO Annotations 98 #TOC> 3 Semantic Similarities 107 #TOC> 4 GO Term Enrichment in Gene Sets 125 -#TOC> +#TOC> #TOC> ========================================================================== @@ -158,27 +164,9 @@ myEnr <- GOenrichment(mySet, allGenes) sort(myEnr$p.values) # Any significantly enriched terms? All of these are ... -#Yes: most significantly enriched is GO:0071931. What is this? -getGOTerm("GO:0071931") # ... makes sense. +#Most significantly enriched is GO:0071931. What is this? +annotate::getGOTerm("GO:0071931") # ... makes sense. -(fullSet <- myEnr$genes$`GO:0071931`) # What genes are annotated to this term? - -intersect(mySet, fullSet) # These are in both sets -setdiff(mySet, fullSet) # These mySet members are not annotated to that term -setdiff(fullSet, mySet) # These are annotated to that term but not in mySet. - # ... that's the most interesting set. From a set of - # genes we have identified a function that they - # share, and that shared function has allowed us - # to identify - -# What are these genes? -# Select annotations from the annotation database: -AnnotationDbi::select(org.Sc.sgd.db, - keys = setdiff(fullSet, mySet), - columns = c("COMMON", "DESCRIPTION")) - -# Note that these annotations are partially redundant to several different -# aliases of the same three genes. diff --git a/BIN-MYSPE.R b/BIN-MYSPE.R index cb481b9..f8b1e93 100644 --- a/BIN-MYSPE.R +++ b/BIN-MYSPE.R @@ -1,4 +1,10 @@ -# BIN-MYSPE.R +# tocID <- "BIN-MYSPE.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-MYSPE unit diff --git a/BIN-PHYLO-Data_preparation.R b/BIN-PHYLO-Data_preparation.R index e0532a4..382817d 100644 --- a/BIN-PHYLO-Data_preparation.R +++ b/BIN-PHYLO-Data_preparation.R @@ -1,4 +1,10 @@ -# BIN-PHYLO-Data_preparation.R +# tocID <- "BIN-PHYLO-Data_preparation.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-PHYLO-Data_preparation unit. @@ -29,7 +35,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> --------------------------------------------------------- #TOC> 1 Preparations 44 @@ -37,7 +43,7 @@ #TOC> 3 Multiple Sequence Alignment 117 #TOC> 4 Reviewing and Editing Alignments 136 #TOC> 4.1 Masking workflow 152 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/BIN-PHYLO-Tree_analysis.R b/BIN-PHYLO-Tree_analysis.R index 950a000..cf60bec 100644 --- a/BIN-PHYLO-Tree_analysis.R +++ b/BIN-PHYLO-Tree_analysis.R @@ -1,4 +1,10 @@ -# BIN-PHYLO-Tree_analysis.R +# tocID <- "BIN-PHYLO-Tree_analysis.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-PHYLO-Tree_analysis unit. @@ -31,7 +37,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> -------------------------------------------------- #TOC> 1 Preparation and Tree Plot 46 @@ -39,7 +45,7 @@ #TOC> 2.1 Rooting Trees 145 #TOC> 2.2 Rotating Clades 190 #TOC> 2.3 Computing tree distances 241 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/BIN-PHYLO-Tree_building.R b/BIN-PHYLO-Tree_building.R index 544b9a2..2ba15f9 100644 --- a/BIN-PHYLO-Tree_building.R +++ b/BIN-PHYLO-Tree_building.R @@ -1,4 +1,10 @@ -# BIN-PHYLO-Tree_building.R +# tocID <- "BIN-PHYLO-Tree_building.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-PHYLO-Tree_building unit. @@ -29,7 +35,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ----------------------------------------------------------- #TOC> 1 Calculating Trees 46 @@ -39,7 +45,7 @@ #TOC> 1.1.3 ... on Linux 96 #TOC> 1.1.4 Confirming PROMLPATH 101 #TOC> 1.2 Building a maximum likelihood tree 110 -#TOC> +#TOC> #TOC> ========================================================================== @@ -68,7 +74,7 @@ if (! requireNamespace("Rphylip", quietly = TRUE)) { # on your computer Phylip has been installed and define the path # to the proml program that calculates a maximum-likelihood tree. -# === 1.1.1 ... on the Mac +# === 1.1.1 ... on the Mac # On the Mac, the standard installation places a phylip folder # in the /Applications directory. That folder contains all the # individual phylip programs as .app files. These are not @@ -79,7 +85,7 @@ if (! requireNamespace("Rphylip", quietly = TRUE)) { # directly to that subdirectory to find the program it needs: # PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS" -# === 1.1.2 ... on Windows +# === 1.1.2 ... on Windows # On Windows you need to know where the programs have been installed, and you # need to specify a path that is correct for the Windows OS. Find the folder # that is named "exe", and right-click to inspect its properties. The path @@ -93,12 +99,12 @@ if (! requireNamespace("Rphylip", quietly = TRUE)) { # I have heard that your path must not contain spaces, and it is prudent to # avoid other special characters as well. -# === 1.1.3 ... on Linux +# === 1.1.3 ... on Linux # If you are running Linux I trust you know what to do. It's probably # something like # PROMLPATH <- "/usr/local/phylip-3.695/bin" -# === 1.1.4 Confirming PROMLPATH +# === 1.1.4 Confirming PROMLPATH # Confirm that the settings are right. PROMLPATH # returns the path list.dirs(PROMLPATH) # returns the directories in that path diff --git a/BIN-PPI-Analysis.R b/BIN-PPI-Analysis.R index c75e0fa..becd7f8 100644 --- a/BIN-PPI-Analysis.R +++ b/BIN-PPI-Analysis.R @@ -1,4 +1,10 @@ -# BIN-PPI-Analysis.R +# tocID <- "BIN-PPI-Analysis.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-PPI-Analysis unit. @@ -29,7 +35,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> --------------------------------------------------------------- #TOC> 1 Setup and data 46 @@ -39,7 +45,7 @@ #TOC> 2.3 Betweenness Centrality 180 #TOC> 3 biomaRt 226 #TOC> 4 Task for submission 296 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/BIN-SEQA-Composition.R b/BIN-SEQA-Composition.R index 4458ffd..59b64a4 100644 --- a/BIN-SEQA-Composition.R +++ b/BIN-SEQA-Composition.R @@ -1,4 +1,10 @@ -# BIN-SEQA-Composition.R +# tocID <- "BIN-SEQA-Composition.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-SEQA-Comparison unit @@ -29,7 +35,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ---------------------------------------------------------- #TOC> 1 Preparation 47 @@ -40,7 +46,7 @@ #TOC> 3.3 Plotting log ratios 185 #TOC> 3.4 Sort by frequency 200 #TOC> 3.5 Color by amino acid type 215 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/BIN-Sequence.R b/BIN-Sequence.R index 2861cb3..00315f3 100644 --- a/BIN-Sequence.R +++ b/BIN-Sequence.R @@ -1,4 +1,10 @@ -# BIN-Sequence.R +# tocID <- "BIN-Sequence.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-Sequence unit. @@ -30,7 +36,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ---------------------------------------------------- #TOC> 1 Prepare 63 @@ -50,7 +56,7 @@ #TOC> 7.2 Sampling 306 #TOC> 7.2.1 Equiprobable characters 308 #TOC> 7.2.2 Defined probability vector 350 -#TOC> +#TOC> #TOC> ========================================================================== @@ -171,16 +177,16 @@ cat(sprintf("\n%s fish", c("one", "two", "red", "blue"))) # = 6 Changing strings ==================================================== -# === 6.1.1 Changing case +# === 6.1.1 Changing case tolower(s) toupper(tolower(s)) -# === 6.1.2 Reverse +# === 6.1.2 Reverse reverse(s) -# === 6.1.3 Change characters +# === 6.1.3 Change characters # chartr(old, new, x) maps all characters in x that appear in "old" to the # correpsonding character in "new." @@ -208,7 +214,7 @@ chartr(myCypher, lett, x) # (Nb. substitution cyphers are easy to crack!) -# === 6.1.4 Substitute characters +# === 6.1.4 Substitute characters (s <- gsub("IV", "i-v", s)) # gsub can change length, first argument is # a "regular expression"! @@ -305,7 +311,7 @@ sum(d <= 2.5) # 276. 276 of our 10000 samples are just as bunched near the # == 7.2 Sampling ========================================================== -# === 7.2.1 Equiprobable characters +# === 7.2.1 Equiprobable characters # Assume you need a large random-nucleotide string for some statistical model. # How to create such a string? sample() can easily create it: @@ -347,7 +353,7 @@ length(unlist(x)) # of the smaller number of Cs and Gs - before biology even comes into play. How # do we account for that? -# === 7.2.2 Defined probability vector +# === 7.2.2 Defined probability vector # This is where we need to know how to create samples with specific probability # distributions. A crude hack would be to create a sampling source vector with diff --git a/BIN-Storing_data.R b/BIN-Storing_data.R index fb9891b..479c285 100644 --- a/BIN-Storing_data.R +++ b/BIN-Storing_data.R @@ -1,4 +1,10 @@ -# BIN-Storing_data.R +# tocID <- "BIN-Storing_data.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the BIN-Storing_data unit @@ -27,7 +33,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ----------------------------------------------------------------------- #TOC> 1 A Relational Datamodel in R: review 57 @@ -50,7 +56,7 @@ #TOC> 3.3 Create an R script to create your own database 535 #TOC> 3.3.1 Check and validate 555 #TOC> 3.4 Task: submit for credit (part 2/2) 596 -#TOC> +#TOC> #TOC> ========================================================================== @@ -205,7 +211,7 @@ str(philDB) # go back, re-read, play with it, and ask for help. This is essential. -# === 1.1.1 completing the database +# === 1.1.1 completing the database # Next I'll add one more person, and create the other two tables: @@ -369,7 +375,7 @@ dbSanitizeSequence(x) # == 2.3 Create a protein table for our data model ========================= -# === 2.3.1 Initialize the database +# === 2.3.1 Initialize the database # The function dbInit contains all the code to return a list of empty @@ -381,7 +387,7 @@ myDB <- dbInit() str(myDB) -# === 2.3.2 Add data +# === 2.3.2 Add data # fromJSON() returns a dataframe that we can readily process to add data @@ -428,7 +434,7 @@ source("./scripts/ABC-createRefDB.R") str(myDB) -# === 2.4.1 Examples of navigating the database +# === 2.4.1 Examples of navigating the database # You can look at the contents of the tables in the usual way we access @@ -552,7 +558,7 @@ myDB$taxonomy$species[sel] # in any of the JSON files. Later you will add more information ... -# === 3.3.1 Check and validate +# === 3.3.1 Check and validate # Is your protein named according to the pattern "MBP1_MYSPE"? It should be. diff --git a/FND-Genetic_code.R b/FND-Genetic_code.R index 5220449..267b127 100644 --- a/FND-Genetic_code.R +++ b/FND-Genetic_code.R @@ -1,4 +1,10 @@ -# FND-Genetic_code.R +# tocID <- "FND-Genetic_code.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the FND-Genetic_code unit. @@ -28,7 +34,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ---------------------------------------------------------------- #TOC> 1 Storing the genetic code 45 @@ -38,7 +44,7 @@ #TOC> 3 An alternative representation: 3D array 212 #TOC> 3.1 Print a Genetic code table 246 #TOC> 4 Tasks 272 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/FND-MAT-Graphs_and_networks.R b/FND-MAT-Graphs_and_networks.R index 2348f12..d560fb9 100644 --- a/FND-MAT-Graphs_and_networks.R +++ b/FND-MAT-Graphs_and_networks.R @@ -1,4 +1,10 @@ -# FND-MAT-Graphs_and_networks.R +# tocID <- "FND-MAT-Graphs_and_networks.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the FND-MAT-Graphs_and_networks unit. @@ -29,7 +35,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ------------------------------------------------------------ #TOC> 1 Review 50 @@ -43,7 +49,7 @@ #TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 539 #TOC> 4.1 Diameter 576 #TOC> 5 GRAPH CLUSTERING 645 -#TOC> +#TOC> #TOC> ========================================================================== @@ -280,7 +286,7 @@ plot(GBA, vertex.color=heat.colors(max(igraph::degree(GBA)+1))[igraph::degree(GBA)+1], vertex.size = 200 + (30 * igraph::degree(GBA)), vertex.label = NA) -par(oPar) # restore grphics state +par(oPar) # restore graphics state # This is a very obviously different graph! Some biological networks have # features that look like that - but in my experience the hub nodes are usually diff --git a/FND-STA-Information_theory.R b/FND-STA-Information_theory.R index 49f2589..3161bad 100644 --- a/FND-STA-Information_theory.R +++ b/FND-STA-Information_theory.R @@ -1,14 +1,21 @@ -# FND-STA-Information_theory.R +# tocID <- "FND-STA-Information_theory.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the FND-STA-Information_theory unit. # -# Version: 0.2 +# Version: 0.2.1 # -# Date: 2017 MM DD +# Date: 2017 - 2019 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 0.2.1 Maintenance # 0.2 Under development # 0.1 First code copied from 2016 material. # @@ -58,11 +65,33 @@ AAref["Y"] <- 0.0294 sum(AAref) # Function to calculate Shannon entropy -H <- function(v) { - # Shannon entropy (bits) - return(-sum(v * (log(v) / log(2)))) +H <- function(pmf) { + # Calculate Shannon entropy + # Parameters: + # pmf (numeric) probability mass function: a vector of states and + # associated probabilities. Each element of + # pmf must be in (0, 1] and sum(pmf) must be 1. + # Value: + # Shannon entropy in bits. + # Examples: + # H(c(A=0.25, C=0.25, G=0.25, T=0.25)) # 2 bits entropy in a random + # # nucleotide sequence + # H(1) # If all elements are the same, entropy is zero + # + if (any(pmf <= 0 | pmf > 1) || isFALSE(all.equal(1.0, sum(pmf)))) { + stop("Input is not a discrete probability distribution.") + } + H <- -sum(pmf * (log(pmf) / log(2))) + return(H) } +# Why use all.equal()? Exact comparisons with floating point numbers are +# brittle. Consider for example: +1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6 == 1 +print(1/6 + 1/6 + 1/6 + 1/6 + 1/6 + 1/6, digits = 22) # 0.9999999999999998889777 +# all.equal() tests for _near_ equality with tolerance of ~ 1.5e-8 + + # Entropy of the database frequencies (in bits): (Href <- H(AAref)) diff --git a/FND-STA-Probability_distribution.R b/FND-STA-Probability_distribution.R index e77dc01..77397e5 100644 --- a/FND-STA-Probability_distribution.R +++ b/FND-STA-Probability_distribution.R @@ -1,4 +1,10 @@ -# FND-STA-Probability_distribution.R +# tocID <- "FND-STA-Probability_distribution.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the FND-STA-Probability_distribution unit. @@ -28,7 +34,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ----------------------------------------------------------------------------- #TOC> 1 Introduction 52 @@ -45,7 +51,7 @@ #TOC> 4.2.1 An example from tossing dice 463 #TOC> 4.2.2 An example from lognormal distributions 586 #TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 629 -#TOC> +#TOC> #TOC> ========================================================================== @@ -460,7 +466,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000) # be applied to discrete distributions. But we need to talk a bit about # converting counts to p.m.f.'s. -# === 4.2.1 An example from tossing dice +# === 4.2.1 An example from tossing dice # The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But # there is an issue when we convert sampled counts to frequencies, and estimate @@ -583,7 +589,7 @@ abline(v = KLdiv(rep(1/6, 6), pmfPC(counts, 1:6)), col="firebrick") # somewhat but not drastically atypical. -# === 4.2.2 An example from lognormal distributions +# === 4.2.2 An example from lognormal distributions # We had compared a set of lognormal and gamma distributions above, now we # can use KL-divergence to quantify their similarity: diff --git a/FND-STA-Significance.R b/FND-STA-Significance.R index 4748c7b..08abcdb 100644 --- a/FND-STA-Significance.R +++ b/FND-STA-Significance.R @@ -1,4 +1,10 @@ -# FND-STA-Significance.R +# tocID <- "FND-STA-Significance.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the FND-STA-Significance unit. @@ -25,7 +31,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ------------------------------------------------------------------ #TOC> 1 Significance and p-value 43 @@ -36,7 +42,7 @@ #TOC> 3 Significance by integration 198 #TOC> 4 Significance by simulation or permutation 204 #TOC> 5 Final tasks 312 -#TOC> +#TOC> #TOC> ========================================================================== @@ -100,7 +106,7 @@ print(x, digits = 22) # curve, as a fraction of the whole. -# === 1.2.1 p-value illustrated +# === 1.2.1 p-value illustrated # Let's illustrate. First we draw a million random values from our # standard, normal distribution: diff --git a/README.md b/README.md index 9c792a2..edf67e4 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,4 @@ # ABC-units A Bioinformatics Course: R modules for learning units + +Follow the instructions in the learning unit to install your local copy of this R-project. diff --git a/RPR-Biostrings.R b/RPR-Biostrings.R index b24c08e..5e8a730 100644 --- a/RPR-Biostrings.R +++ b/RPR-Biostrings.R @@ -1,4 +1,10 @@ -# RPR-Biostrings.R +# tocID <- "RPR-Biostrings.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Biostrings unit. diff --git a/RPR-FASTA.R b/RPR-FASTA.R index 56f9ff8..d7f9706 100644 --- a/RPR-FASTA.R +++ b/RPR-FASTA.R @@ -1,4 +1,10 @@ -# RPR-FASTA.R +# tocID <- "RPR-FASTA.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-FASTA unit. diff --git a/RPR-GEO2R.R b/RPR-GEO2R.R index 7338ce9..3bea234 100644 --- a/RPR-GEO2R.R +++ b/RPR-GEO2R.R @@ -1,4 +1,10 @@ -# RPR_GEO2R.R +# tocID <- "RPR_GEO2R.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR_GEO2R unit. @@ -34,7 +40,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> -------------------------------------------------------------------------- #TOC> 1 Preparations 56 @@ -49,7 +55,7 @@ #TOC> 5.1 Final task: Gene descriptions 504 #TOC> 6 Improving on Discovery by Differential Expression 510 #TOC> 7 Annotation data 594 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/RPR-Genetic_code_optimality.R b/RPR-Genetic_code_optimality.R index bdf68dd..d3a5cd1 100644 --- a/RPR-Genetic_code_optimality.R +++ b/RPR-Genetic_code_optimality.R @@ -1,4 +1,10 @@ -# RPR-Genetic_code_optimality.R +# tocID <- "RPR-Genetic_code_optimality.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Genetic_code_optimality unit. @@ -30,7 +36,7 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> -------------------------------------------------------------- #TOC> 1 Designing a computational experiment 57 @@ -43,7 +49,7 @@ #TOC> 2.2.4 measure effect 213 #TOC> 3 Run the experiment 260 #TOC> 4 Task solutions 356 -#TOC> +#TOC> #TOC> ========================================================================== @@ -142,7 +148,7 @@ swappedGC <- function(GC) { # - we count the number of mutations and evaluate their severity. -# === 2.2.1 reverse-translate +# === 2.2.1 reverse-translate # To reverse-translate an amino acid vector, we randomly pick one of its # codons from a genetic code, and assemble all codons to a sequence. @@ -167,7 +173,7 @@ traRev <- function(s, GC) { } -# === 2.2.2 Randomly mutate +# === 2.2.2 Randomly mutate # To mutate, we split a codon into it's three nucleotides, then randomly replace # one of the three with another nucleotide. @@ -192,7 +198,7 @@ randMut <- function(vC) { -# === 2.2.3 Forward- translate +# === 2.2.3 Forward- translate traFor <- function(vC, GC) { # Parameters: @@ -210,7 +216,7 @@ traFor <- function(vC, GC) { } -# === 2.2.4 measure effect +# === 2.2.4 measure effect # How do we evaluate the effect of the mutation? We'll take a simple ad hoc # approach: we divide amino acids into hydrophobic, hydrophilic, and neutral diff --git a/RPR-Introduction.R b/RPR-Introduction.R index 2598d93..45be703 100644 --- a/RPR-Introduction.R +++ b/RPR-Introduction.R @@ -1,4 +1,10 @@ -# RPR-Introduction.R +# tocID <- "RPR-Introduction.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Introduction unit diff --git a/RPR-PROSITE_POST.R b/RPR-PROSITE_POST.R index 4e85457..392dcee 100644 --- a/RPR-PROSITE_POST.R +++ b/RPR-PROSITE_POST.R @@ -1,4 +1,10 @@ -# RPR-PROSITE_POST.R +# tocID <- "RPR-PROSITE_POST.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Scripting_data_downloads unit. @@ -29,13 +35,13 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> --------------------------------------------------------------------- #TOC> 1 Constructing a POST command from a Web query 42 #TOC> 1.1 Task - fetchPrositeFeatures() function 142 #TOC> 2 Task solutions 150 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/RPR-RegEx.R b/RPR-RegEx.R index aa96cfe..a9173c0 100644 --- a/RPR-RegEx.R +++ b/RPR-RegEx.R @@ -1,4 +1,10 @@ -# RPR-RegEx.R +# tocID <- "RPR-RegEx.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-RegEx unit diff --git a/RPR-SX-PDB.R b/RPR-SX-PDB.R index d566b8e..a97c591 100644 --- a/RPR-SX-PDB.R +++ b/RPR-SX-PDB.R @@ -1,4 +1,10 @@ -# RPR-SX-PDB.R +# tocID <- "RPR-SX-PDB.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-SX-PDB unit. diff --git a/RPR-UniProt_GET.R b/RPR-UniProt_GET.R index 74578ee..b8e9c30 100644 --- a/RPR-UniProt_GET.R +++ b/RPR-UniProt_GET.R @@ -1,4 +1,10 @@ -# RPR-UniProt_GET.R +# tocID <- "RPR-UniProt_GET.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Scripting_data_downloads unit. @@ -28,13 +34,13 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ---------------------------------------------------------- #TOC> 1 UniProt files via GET 41 #TOC> 1.1 Task - fetchUniProtSeq() function 103 #TOC> 2 Task solutions 110 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/RPR-Unit_testing.R b/RPR-Unit_testing.R index 3fa2e0f..34b6b68 100644 --- a/RPR-Unit_testing.R +++ b/RPR-Unit_testing.R @@ -1,4 +1,10 @@ -# RPR-Unit_testing.R +# tocID <- "RPR-Unit_testing.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Unit_testing unit. @@ -29,10 +35,10 @@ #TOC> #TOC> Section Title Line #TOC> ------------------------------------------------- -#TOC> 1 Unit Tests with testthat 40 -#TOC> 2 Organizing your tests 159 -#TOC> 2.1 Testing scripts 183 -#TOC> 3 Task solutions 198 +#TOC> 1 Unit Tests with testthat 46 +#TOC> 2 Organizing your tests 165 +#TOC> 2.1 Testing scripts 189 +#TOC> 3 Task solutions 204 #TOC> #TOC> ========================================================================== diff --git a/RPR-eUtils_XML.R b/RPR-eUtils_XML.R index 1f8c23b..6dec655 100644 --- a/RPR-eUtils_XML.R +++ b/RPR-eUtils_XML.R @@ -1,4 +1,10 @@ -# RPR-eUtils_and_XML.R +# tocID <- "RPR-eUtils_and_XML.R" +# +# ---------------------------------------------------------------------------- # +# PATIENCE ... # +# Do not yet work wih this code. Updates in progress. Thank you. # +# boris.steipe@utoronto.ca # +# ---------------------------------------------------------------------------- # # # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Scripting_data_downloads unit. @@ -28,13 +34,13 @@ #TOC> ========================================================================== -#TOC> +#TOC> #TOC> Section Title Line #TOC> ----------------------------------------------------------- #TOC> 1 Working with NCBI eUtils 41 #TOC> 1.1 Task - fetchNCBItaxData() function 144 #TOC> 2 Task solutions 151 -#TOC> +#TOC> #TOC> ========================================================================== diff --git a/scripts/ABC-createRefDB.R b/scripts/ABC-createRefDB.R index c4a45b9..3df8d08 100644 --- a/scripts/ABC-createRefDB.R +++ b/scripts/ABC-createRefDB.R @@ -8,7 +8,7 @@ # http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi # # For the data model, see -# https://docs.google.com/drawings/d/1uupNvz18_FYFwyyVPebTM0CUxcJCPDQuxuIJGpjWQWg +# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0 # For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R # # ============================================================================== diff --git a/scripts/ABC-dbUtilities.R b/scripts/ABC-dbUtilities.R index 3137611..e4b21d3 100644 --- a/scripts/ABC-dbUtilities.R +++ b/scripts/ABC-dbUtilities.R @@ -1,12 +1,35 @@ -# ABC-dbUtilities.R - +# tocID <- "scripts/ABC-dbUtilities.R" +# # database utilities for ABC learning units # # ============================================================================== -# -# ====== PACKAGES ============================================================== +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> ------------------------------------------------- +#TOC> 1 PACKAGES 32 +#TOC> 2 FUNCTIONS 50 +#TOC> 2.01 dbSanitizeSequence() 53 +#TOC> 2.02 dbConfirmUnique() 88 +#TOC> 2.03 dbInit() 106 +#TOC> 2.04 dbAutoincrement() 147 +#TOC> 2.05 dbAddProtein() 160 +#TOC> 2.06 dbAddFeature() 180 +#TOC> 2.07 dbAddTaxonomy() 199 +#TOC> 2.08 dbAddAnnotation() 215 +#TOC> 2.09 dbFetchUniProtSeq() 243 +#TOC> 2.10 dbFetchPrositeFeatures() 267 +#TOC> 2.11 node2text() 311 +#TOC> 2.12 dbFetchNCBItaxData() 323 +#TOC> 2.13 UniProtIDmap() 362 +#TOC> 3 TESTS 399 +#TOC> +#TOC> ========================================================================== + + +# = 1 PACKAGES ============================================================ if (! requireNamespace("jsonlite", quietly = TRUE)) { @@ -24,9 +47,10 @@ if (! requireNamespace("xml2", quietly = TRUE)) { } -# ====== FUNCTIONS ============================================================= +# = 2 FUNCTIONS =========================================================== +# == 2.01 dbSanitizeSequence() ============================================= dbSanitizeSequence <- function(s, unambiguous = TRUE) { # Remove FASTA header lines, if any, # flatten any structure that s has, @@ -61,6 +85,7 @@ dbSanitizeSequence <- function(s, unambiguous = TRUE) { } +# == 2.02 dbConfirmUnique() ================================================ dbConfirmUnique <- function(x) { # x is a vector of logicals. # returns x if x has exactly one TRUE element. @@ -78,24 +103,27 @@ dbConfirmUnique <- function(x) { } +# == 2.03 dbInit() ========================================================= dbInit <- function() { # Return an empty instance of the protein database + # Open the link and study the schema: + # https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0 db <- list() + db$version <- "1.0" + db$protein <- data.frame( ID = numeric(), name = character(), RefSeqID = character(), UniProtID = character(), taxonomyID = numeric(), - sequence = character(), - stringsAsFactors = FALSE) + sequence = character()) db$taxonomy <- data.frame( ID = numeric(), - species = character(), - stringsAsFactors = FALSE) + species = character()) db$annotation <- data.frame( @@ -103,21 +131,20 @@ dbInit <- function() { proteinID = numeric(), featureID = numeric(), start = numeric(), - end = numeric(), - stringsAsFactors = FALSE) + end = numeric()) db$feature <- data.frame( ID = numeric(), name = character(), description = character(), sourceDB = character(), - accession = character(), - stringsAsFactors = FALSE) + accession = character()) return(db) } +# == 2.04 dbAutoincrement() ================================================ dbAutoincrement <- function(tb) { # Return a unique integer that can be used as a primary key # Value: @@ -130,6 +157,7 @@ dbAutoincrement <- function(tb) { } +# == 2.05 dbAddProtein() =================================================== dbAddProtein <- function(db, jsonDF) { # Add one or more protein entries to the database db. # Parameters: @@ -142,14 +170,14 @@ dbAddProtein <- function(db, jsonDF) { RefSeqID = jsonDF$RefSeqID[i], UniProtID = jsonDF$UniProtID[i], taxonomyID = jsonDF$taxonomyID[i], - sequence = dbSanitizeSequence(jsonDF$sequence[i]), - stringsAsFactors = FALSE) + sequence = dbSanitizeSequence(jsonDF$sequence[i])) db$protein <- rbind(db$protein, x) } return(db) } +# == 2.06 dbAddFeature() =================================================== dbAddFeature <- function(db, jsonDF) { # Add one or more feature entries to the database db. # Parameters: @@ -161,14 +189,14 @@ dbAddFeature <- function(db, jsonDF) { name = jsonDF$name[i], description = jsonDF$description[i], sourceDB = jsonDF$sourceDB[i], - accession = jsonDF$accession[i], - stringsAsFactors = FALSE) + accession = jsonDF$accession[i]) db$feature <- rbind(db$feature, x) } return(db) } +# == 2.07 dbAddTaxonomy() ================================================== dbAddTaxonomy <- function(db, jsonDF) { # Add one or more taxonomy entries to the database db. # Parameters: @@ -178,13 +206,13 @@ dbAddTaxonomy <- function(db, jsonDF) { for (i in seq_len(nrow(jsonDF))) { x <- data.frame( ID = jsonDF$ID[i], - species = jsonDF$species[i], - stringsAsFactors = FALSE) + species = jsonDF$species[i]) db$taxonomy <- rbind(db$taxonomy, x) } return(db) } +# == 2.08 dbAddAnnotation() ================================================ dbAddAnnotation <- function(db, jsonDF) { # Add one or more annotation entries to the database db. # Parameters: @@ -205,14 +233,14 @@ dbAddAnnotation <- function(db, jsonDF) { proteinID = pID, featureID = fID, start = as.integer(jsonDF$start[i]), - end = as.integer(jsonDF$end[i]), - stringsAsFactors = FALSE) + end = as.integer(jsonDF$end[i])) db$annotation <- rbind(db$annotation, x) } return(db) } +# == 2.09 dbFetchUniProtSeq() ============================================== dbFetchUniProtSeq <- function(ID) { # Fetch a protein sequence from UniProt. # Parameters: @@ -236,6 +264,7 @@ dbFetchUniProtSeq <- function(ID) { } +# == 2.10 dbFetchPrositeFeatures() ========================================= dbFetchPrositeFeatures <- function(ID) { # Fetch feature annotations from ScanProsite. # Parameters: @@ -272,14 +301,14 @@ dbFetchPrositeFeatures <- function(ID) { start = as.numeric(tokens[4]), end = as.numeric(tokens[5]), psID = tokens[6], - psName = tokens[7], - stringsAsFactors = FALSE)) + psName = tokens[7])) } } return(myFeatures) } +# == 2.11 node2text() ====================================================== node2text <- function(doc, tag) { # an extractor function for the contents of elements # between given tags in an XML response. @@ -291,6 +320,7 @@ node2text <- function(doc, tag) { } +# == 2.12 dbFetchNCBItaxData() ============================================= dbFetchNCBItaxData <- function(ID) { # Fetch feature taxID and Organism from the NCBI. # Parameters: @@ -329,6 +359,7 @@ dbFetchNCBItaxData <- function(ID) { +# == 2.13 UniProtIDmap() =================================================== UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { # Use UniProt ID mapping service to map one or more IDs # Parameters: @@ -351,8 +382,7 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { if (httr::status_code(response) == 200) { # 200: oK myMap <- read.delim(file = textConnection(httr::content(response)), - sep = "\t", - stringsAsFactors = FALSE) + sep = "\t") myMap <- myMap[ , c(1,3)] colnames(myMap) <- c("From", "To") } else { @@ -366,7 +396,7 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { } -# ====== TESTS ================================================================= +# = 3 TESTS =============================================================== if (FALSE) { if (! requireNamespace("testthat", quietly = TRUE)) { diff --git a/scripts/ABC-makeScCCnet.R b/scripts/ABC-makeScCCnet.R index 5138590..54185e3 100644 --- a/scripts/ABC-makeScCCnet.R +++ b/scripts/ABC-makeScCCnet.R @@ -1,4 +1,4 @@ -# ABC-makeScCCnet.R +# tocID <- "scripts/ABC-makeScCCnet.R" # # Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle" # GOSlim annotation. diff --git a/scripts/ABC-writeALN.R b/scripts/ABC-writeALN.R index 0f52e7f..a8b7d80 100644 --- a/scripts/ABC-writeALN.R +++ b/scripts/ABC-writeALN.R @@ -1,4 +1,4 @@ -# ABC-writeALN.R +# tocID <- "scripts/ABC-writeALN.R" # # ToDo: calculate consensus line # append sequence numbers diff --git a/scripts/ABC-writeMFA.R b/scripts/ABC-writeMFA.R index 3753156..90471d3 100644 --- a/scripts/ABC-writeMFA.R +++ b/scripts/ABC-writeMFA.R @@ -40,7 +40,7 @@ writeMFA <- function(ali, if (is.na(blockWidth)) { stop("PANIC: parameter \"blockWidth\" must be numeric.") } - if (blockWidth < 1){ + if (! blockWidth > 0){ stop("PANIC: parameter \"blockWidth\" must be greater than zero.") } @@ -105,7 +105,7 @@ writeMFA <- function(ali, txt <- c(txt, "") # append an empty line for readability } - writeLines(txt, con= myCon) + writeLines(txt, con = myCon) } diff --git a/scripts/BLAST.R b/scripts/BLAST.R index 32c7e2f..80fd7bd 100644 --- a/scripts/BLAST.R +++ b/scripts/BLAST.R @@ -357,20 +357,23 @@ parseBLASTalignment <- function(hit) { # ==== TESTS =================================================================== -# define query: -# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain -# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ", -# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP", -# sep="") -# or ... -# q <- "NP_010227" # refseq ID -# -# test <- BLAST(q, -# nHits = 100, -# E = 0.001, -# rid = "", -# limits = "txid4751[ORGN]") -# length(test$hits) +if (FALSE) { + # define query: + q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain + "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ", + "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP", + sep="") + # or ... + q <- "NP_010227" # refseq ID + + test <- BLAST(q, + nHits = 100, + E = 0.001, + rid = "", + limits = "txid4751[ORGN]") + str(test) + length(test$hits) +} # [END]