diff --git a/.utilities.R b/.utilities.R index fc8f88a..827051d 100644 --- a/.utilities.R +++ b/.utilities.R @@ -23,21 +23,26 @@ #TOC> #TOC> Section Title Line #TOC> ----------------------------------------------------------- -#TOC> 1 SCRIPTS TO SOURCE 45 -#TOC> 2 PACKAGES 51 -#TOC> 3 SUPPORT FUNCTIONS 62 -#TOC> 3.1 objectInfo() 65 -#TOC> 3.2 biCode() 93 -#TOC> 3.3 sameSpecies() 127 -#TOC> 3.4 pBar() 146 -#TOC> 3.5 waitTimer() 168 -#TOC> 3.6 fetchMSAmotif() 196 -#TOC> 3.7 H() (Shannon entropy) 240 -#TOC> 4 DATA 254 -#TOC> 4.1 REFspecies 256 -#TOC> 5 FUNCTIONS TO CUSTOMIZE ASSIGNMENTS 271 -#TOC> 5.1 getMYSPE() 274 -#TOC> 5.2 selectPDBrep() 283 +#TOC> 1 SCRIPTS TO SOURCE 50 +#TOC> 2 PACKAGES 56 +#TOC> 3 DATA & CONSTANTS 67 +#TOC> 4 SUPPORT FUNCTIONS 113 +#TOC> 4.01 objectInfo() 116 +#TOC> 4.02 biCode() 144 +#TOC> 4.03 sameSpecies() 178 +#TOC> 4.04 validateFA() 198 +#TOC> 4.05 readFASTA() 247 +#TOC> 4.06 writeFASTA() 282 +#TOC> 4.07 pBar() 315 +#TOC> 4.08 waitTimer() 337 +#TOC> 4.09 fetchMSAmotif() 365 +#TOC> 4.10 H() (Shannon entropy) 409 +#TOC> 5 FUNCTIONS TO CUSTOMIZE ASSIGNMENTS 422 +#TOC> 5.01 seal() 424 +#TOC> 5.02 getMYSPE() 428 +#TOC> 5.03 selectPDBrep() 437 +#TOC> 5.04 selectChi2() 473 +#TOC> 5.05 selectENSP() 486 #TOC> #TOC> ========================================================================== @@ -59,10 +64,56 @@ if (! requireNamespace("jsonlite", quietly = TRUE)) { } -# = 3 SUPPORT FUNCTIONS =================================================== +# = 3 DATA & CONSTANTS ==================================================== + +# cf. https://www.bioinformatics.org/sms/iupac.html +AAVALID <- "acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY*-" +NUCVALID <- "acgtuACGTU-" +NUCAMBIG <- "acgtACGTryswkmbdhvnRYSWKMBDHVN-" + +# A colorpallette for amino acid properties +AACOLS <- character() +AACOLS["R"] <- "#577EFF" # Positive +AACOLS["K"] <- "#479EEE" # +AACOLS["H"] <- "#37BFDE" # +AACOLS["E"] <- "#ffa587" # Negative +AACOLS["D"] <- "#ff87ad" # +AACOLS["N"] <- "#9FC6FC" # Hydrophilic +AACOLS["Q"] <- "#A7CFF5" # +AACOLS["S"] <- "#AFD8EE" # +AACOLS["T"] <- "#B7E2E8" # +AACOLS["Y"] <- "#F5FFD9" # Hydrophobic +AACOLS["W"] <- "#F1FFDB" # +AACOLS["F"] <- "#EDFFDD" # +AACOLS["I"] <- "#E9FFDF" # +AACOLS["L"] <- "#E5FFE2" # +AACOLS["M"] <- "#E1FFE4" # +AACOLS["V"] <- "#DDFFE6" # +AACOLS["A"] <- "#D9FFE9" # +AACOLS["G"] <- "#e0e0e0" # Glycine +AACOLS["C"] <- "#fffb91" # Cysteine +AACOLS["P"] <- "#e8f7e1" # Proline +# barplot(rep(1, 20), col = AACOLS) -# == 3.1 objectInfo() ====================================================== +# 10 species of fungi for reference analysis. +# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi +REFspecies <- c("Aspergillus nidulans", + "Bipolaris oryzae", + "Coprinopsis cinerea", + "Cryptococcus neoformans", + "Neurospora crassa", + "Puccinia graminis", + "Saccharomyces cerevisiae", + "Schizosaccharomyces pombe", + "Ustilago maydis", + "Wallemia mellicola") + + +# = 4 SUPPORT FUNCTIONS =================================================== + + +# == 4.01 objectInfo() ===================================================== objectInfo <- function(x) { # Function to combine various information items about R objects # @@ -90,7 +141,7 @@ objectInfo <- function(x) { } -# == 3.2 biCode() ========================================================== +# == 4.02 biCode() ========================================================= biCode <- function(s) { # Make a 5 character "biCode" from a binomial name by concatening # the uppercased first three letter of the first word and the first @@ -124,7 +175,7 @@ biCode <- function(s) { } -# == 3.3 sameSpecies() ===================================================== +# == 4.03 sameSpecies() ==================================================== sameSpecies <- function(a, b) { # Parameters: a, b two vectors that contain # binomial species names and maybe additional strain information. @@ -143,7 +194,125 @@ sameSpecies <- function(a, b) { } -# == 3.4 pBar() ============================================================ + +# == 4.04 validateFA() ===================================================== + +validateFA <- function(txt) { + # validates txt according to FASTA assumptions + # Parameters: + # txt char a putative vector of FASTA formatted text + # Value: invisible(NULL) + # + # The function is used for its side-effect of throwing an error of + # FASTA assumptions are violated in txt. + # - At least one header line + # - No adjacent header lines + # - All header lines followed by at least one sequence line + # - All sequence lines have at least one valid character + # - Valid characters are AAVALID (which includes valid nucleotides) + + if ( ! any(grepl("^>", txt)) ) { + stop("no header lines in input") + } + + sel <- grepl("^>", txt) + sel <- sel[- length(sel)] & sel[-1] + if ( any(sel) ) { + i <- which(sel)[1] + stop(sprintf("adjacent header lines in input (lines %d and %d)", + i, i+1)) + } + + selA <- grepl("^>", txt) + selB <- grepl(sprintf("[^%s]", AAVALID), txt) + if ( any( (! selA) & selB) ) { # (not header) AND (has invalid character) + i <- which( (! selA) & selB)[1] + stop(sprintf("invalid character(s) in sequence (cf. line %d)", + i, i+1)) + } + + sel <- grep("^>", txt) + 1 + sel <- grepl(sprintf("[%s]+", AAVALID), txt[sel]) + if ( ! all(sel) ) { + i <- which( ! sel)[1] + stop(sprintf("a header has no adjacent sequence (line %d)", + grep("^>", txt)[i])) + } + + # all good, if we get to here. + return(invisible(NULL)) +} + + +# == 4.05 readFASTA() ====================================================== + +readFASTA <- function(FA) { + # Read FASTA formatted text, validate it, + # return a dataframe of headers and collapsed sequences. + # Parameters: + # FA chr Input file name (or text connection) + # Value: + # data.frame + # $header char the FASTA header lines + # $ seq char the actual sequences + # + # Note: if length(FA) is one, it is assumed to be a filename + # + # Example: + # refAPSES <- readFASTA("./data/refAPSES.mfa") + # readFASTA(c("> This", "acdef", "ghi", > That", "k-l")) + # + + if (length(FA) == 1) { FA <- readLines(FA) } + validateFA(FA) + FA <- FA[! grepl("^$", FA)] # drop all empty lines + iHead <- grep("^>", FA) # find all headers + myFA <- data.frame(head = FA[iHead], + seq = character(length(iHead))) + + for (i in seq_along(iHead)) { + first <- iHead[i] + 1 # first line of each sequence + last <- ifelse(i < length(iHead), iHead[i + 1] - 1, length(FA)) # ...last + myFA$seq[i] <- paste0(FA[first:last], collapse = "") + } + return(myFA) +} + + +# == 4.06 writeFASTA() ===================================================== +writeFASTA <- function(fa, fn = NULL, width = 60) { + # Write the contents of dataframe "fa" as a FASTA formatted file. + # Parameters: + # fa dataframe + # $head chr vector of FASTA headers, + # $seq chr vector of sequences in one-letter code + # fn chr filename for output; if NULL (default) the output is + # returned instead + # width int max number of sequence characters per line of output. + # Value: + # FASTA formatted character vector IF fn was NULL. invisible(NULL) + # otherwise. + + out <- character() + for (i in seq_along(fa$head)) { + out <- c(out, fa$head[i]) # add header line + from <- seq(1, nchar(fa$seq[i]), by = width) # starting indices of chunks + to <- c((from - 1)[-1], nchar(fa$seq[i])) # ending indices of chunks + out <- c(out, substring(fa$seq[i], from, to)) # add chunks to txt + out <- c(out, "") # add empty line for better readability + } + out <- out[ - length(out)] # drop the last empty line + + if (length(fn) == 1) { + writeLines(out, fn) + return(invisible(NULL)) + } else { + return(out) + } +} + + +# == 4.07 pBar() =========================================================== pBar <- function(i, l, nCh = 50) { # Draw a progress bar in the console # i: the current iteration @@ -165,7 +334,7 @@ sameSpecies <- function(a, b) { } -# == 3.5 waitTimer() ======================================================= +# == 4.08 waitTimer() ====================================================== waitTimer <- function(t, nIntervals = 50) { # pause and wait for t seconds and display a progress bar as # you are waiting @@ -193,7 +362,7 @@ waitTimer <- function(t, nIntervals = 50) { } -# == 3.6 fetchMSAmotif() =================================================== +# == 4.09 fetchMSAmotif() ================================================== fetchMSAmotif <- function(ali, mot) { # Retrieve a subset from ali that spans the sequence in mot. # Biostrings package must be installed. @@ -237,7 +406,7 @@ fetchMSAmotif <- function(ali, mot) { } -# == 3.7 H() (Shannon entropy) ============================================= +# == 4.10 H() (Shannon entropy) ============================================ H <- function(x, N) { # calculate the Shannon entropy of the vector x given N possible states # (in bits). @@ -250,31 +419,13 @@ H <- function(x, N) { } - -# = 4 DATA ================================================================ - -# == 4.1 REFspecies ======================================================== -# 10 species of fungi for reference analysis. -# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi -REFspecies <- c("Aspergillus nidulans", - "Bipolaris oryzae", - "Coprinopsis cinerea", - "Cryptococcus neoformans", - "Neurospora crassa", - "Puccinia graminis", - "Saccharomyces cerevisiae", - "Schizosaccharomyces pombe", - "Ustilago maydis", - "Wallemia mellicola") - - # = 5 FUNCTIONS TO CUSTOMIZE ASSIGNMENTS ================================== -# == 5.1 seal() ======================================================== +# == 5.01 seal() =========================================================== seal <- function(x.1L) { .Call(digest:::digest_impl,x.1L,3L,-1L,-0,-0,-0) } -# == 5.1 getMYSPE() ======================================================== +# == 5.02 getMYSPE() ======================================================= getMYSPE <- function(x) { dat <- readRDS("./data/sDat.rds") map <- readRDS("./data/MYSPEmap.rds") @@ -283,7 +434,7 @@ getMYSPE <- function(x) { } -# == 5.2 selectPDBrep() ==================================================== +# == 5.03 selectPDBrep() =================================================== selectPDBrep <- function(n, forCredit = FALSE) { # Select n PDB IDs from a list of high-resolution, non-homologous, single # domain, single chain structure files that represent a CATH topology @@ -319,7 +470,7 @@ selectPDBrep <- function(n, forCredit = FALSE) { } -# == 5.2 selectChi2() ==================================================== +# == 5.04 selectChi2() ===================================================== selectChi2 <- function() { # Select one random Amino acid from those that have a Chi2 angle @@ -332,7 +483,7 @@ selectChi2 <- function() { } -# == 5.2 selectENSP() ==================================================== +# == 5.05 selectENSP() ===================================================== selectENSP <- function(x) { oldSeed <- .Random.seed set.seed(myStudentNumber)