diff --git a/BIN-ALI-BLAST.R b/BIN-ALI-BLAST.R index d601f66..39477f3 100644 --- a/BIN-ALI-BLAST.R +++ b/BIN-ALI-BLAST.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-ALI-BLAST unit. # -# Version: 1.1 +# Version: 1.2 # -# Date: 2017 10 23 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.2 Change from require() to requireNamespace(), +# use ::() idiom throughout # 1.1 Fixed parsing logic. # 1.0 First live version 2017. # 0.1 First code copied from 2016 material. @@ -27,31 +29,17 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> --------------------------------------------- -#TOC> 1 Preparations 41 -#TOC> 2 Defining the APSES domain 54 -#TOC> 3 Executing the BLAST search 76 -#TOC> 4 Analysing results 98 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> --------------------------------------------------- +#TOC> 1 Defining the APSES domain 42 +#TOC> 2 Executing the BLAST search 64 +#TOC> 3 Analysing results 86 +#TOC> #TOC> ========================================================================== -# = 1 Preparations ======================================================== - -if (!require(Biostrings, quietly=TRUE)) { - source("https://bioconductor.org/biocLite.R") - biocLite("Biostrings") - library(Biostrings) -} -# Package information: -# library(help = Biostrings) # basic information -# browseVignettes("Biostrings") # available vignettes -# data(package = "Biostrings") # available datasets - - -# = 2 Defining the APSES domain =========================================== +# = 1 Defining the APSES domain =========================================== # Load your protein database source("makeProteinDB.R") @@ -73,7 +61,7 @@ source("makeProteinDB.R") # BLAST search. -# = 3 Executing the BLAST search ========================================== +# = 2 Executing the BLAST search ========================================== # The ./scripts/BLAST.R code defines two functions to access the BLAST interface # through its Web API, and to parse results. Have a look at the script, then @@ -91,11 +79,11 @@ BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence limits = "txid559292[ORGN]") # S. cerevisiae S288c -length(BLASTresults$hits) # There should be at least one hit there. Ask for advice - # in case this step fails. +length(BLASTresults$hits) # There should be at least one hit there. Ask for + # advice in case this step fails. -# = 4 Analysing results =================================================== +# = 3 Analysing results =================================================== (topHit <- BLASTresults$hits[[1]]) # Get the top hit diff --git a/BIN-ALI-Dotplot.R b/BIN-ALI-Dotplot.R index ab12437..1982bd4 100644 --- a/BIN-ALI-Dotplot.R +++ b/BIN-ALI-Dotplot.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-ALI-Dotplot unit. # -# Version: 0.1 +# Version: 0.2 # -# Date: 2017 08 28 +# Date: 2019 01 07 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 0.2 Change from require() to requireNamespace(), +# use ::() idiom throughout # 0.1 First code copied from 2016 material. # # @@ -23,24 +25,37 @@ # # ============================================================================== -# = 1 ___Section___ -# First, we install and load the Biostrings package. -if (!require(Biostrings, quietly=TRUE)) { - if (! 
exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("Biostrings") - library(Biostrings) +#TOC> ========================================================================== +#TOC> +#TOC> Section Title Line +#TOC> -------------------------------------- +#TOC> 1 ___Section___ 39 +#TOC> 2 Tasks 187 +#TOC> +#TOC> ========================================================================== + + +# = 1 ___Section___ ======================================================= + +if (!requireNamespace("BiocManager", quietly=TRUE)) { + install.packages("BiocManager") } +if (!requireNamespace("Biostrings", quietly=TRUE)) { + BiocManager::install("Biostrings") +} +# Package information: # library(help = Biostrings) # basic information # browseVignettes("Biostrings") # available vignettes # data(package = "Biostrings") # available datasets +if (!requireNamespace("seqinr", quietly=TRUE)) { + install.packages("seqinr") +} # Let's load BLOSUM62 -data(BLOSUM62) +data(BLOSUM62, package = "Biostrings") # Now let's craft code for a dotplot. That's surprisingly simple. We build a # matrix that has as many rows as one sequence, as many columns as another. Then @@ -51,10 +66,10 @@ data(BLOSUM62) # First we fetch our sequences and split them into single characters. sel <- myDB$protein$name == "MBP1_SACCE" -MBP1_SACCE <- s2c(myDB$protein$sequence[sel]) +MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel]) sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") -MBP1_MYSPE <- s2c(myDB$protein$sequence[sel]) +MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel]) # Check that we have two character vectors of the expected length. str(MBP1_SACCE) @@ -136,7 +151,7 @@ axis(4, at = c(1, seq(10, len, by=10))) # utilities file and called it dotPlot2(). Why not dotPlot() ... that's because # there already is a dotplot function in the seqinr package: -dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr +seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Our's # Which one do you prefer? You can probably see the block patterns that arise @@ -169,7 +184,7 @@ dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter) -# = 1 Tasks +# = 2 Tasks =============================================================== diff --git a/BIN-ALI-MSA.R b/BIN-ALI-MSA.R index 164c420..ddb58a2 100644 --- a/BIN-ALI-MSA.R +++ b/BIN-ALI-MSA.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-ALI-MSA unit. # -# Version: 1.1 +# Version: 1.2 # -# Date: 2017 10 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.2 Change from require() to requireNamespace(), +# use ::() idiom throughout # 1.1 Added fetchMSAmotif() # 1.0 Fully refactored and rewritten for 2017 # 0.1 First code copied from 2016 material. 
@@ -27,25 +29,25 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> ------------------------------------------------------------ -#TOC> 1 Preparations 51 -#TOC> 2 Aligning full length MBP1 proteins 99 -#TOC> 2.1 Preparing Sequences 110 -#TOC> 2.2 Compute the MSA 135 -#TOC> 3 Analyzing an MSA 156 -#TOC> 4 Comparing MSAs 227 -#TOC> 4.1 Importing an alignment to msa 236 -#TOC> 4.1.1 importing an .aln file 245 -#TOC> 4.1.2 Creating an MsaAAMultipleAlignment object 276 -#TOC> 4.2 More alignments 313 -#TOC> 4.3 Computing comparison metrics 325 -#TOC> 5 Profile-Profile alignments 462 -#TOC> 6 Sequence Logos 539 -#TOC> 6.1 Subsetting an alignment by motif 548 -#TOC> 6.2 Plot a Sequence Logo 591 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> ------------------------------------------------------------------ +#TOC> 1 Preparations 54 +#TOC> 2 Aligning full length MBP1 proteins 96 +#TOC> 2.1 Preparing Sequences 107 +#TOC> 2.2 Compute the MSA 132 +#TOC> 3 Analyzing an MSA 153 +#TOC> 4 Comparing MSAs 224 +#TOC> 4.1 Importing an alignment to msa 233 +#TOC> 4.1.1 importing an .aln file 242 +#TOC> 4.1.2 Creating an MsaAAMultipleAlignment object 273 +#TOC> 4.2 More alignments 324 +#TOC> 4.3 Computing comparison metrics 336 +#TOC> 5 Profile-Profile alignments 473 +#TOC> 6 Sequence Logos 546 +#TOC> 6.1 Subsetting an alignment by motif 555 +#TOC> 6.2 Plot a Sequence Logo 604 +#TOC> #TOC> ========================================================================== @@ -59,28 +61,22 @@ source("makeProteinDB.R") -# Multiple sequence alignment algorithms are provided in -# the Bioconductor msa package. - -if (! require(Biostrings, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("Biostrings") - library(Biostrings) +if (! requireNamespace("BiocManager", quietly=TRUE)) { + install.packages("BiocManager") +} +if (! requireNamespace("Biostrings", quietly=TRUE)) { + BiocManager::install("Biostrings") } # Package information: # library(help = Biostrings) # basic information # browseVignettes("Biostrings") # available vignettes # data(package = "Biostrings") # available datasets +# Multiple sequence alignment algorithms are provided in +# the Bioconductor msa package. -if (! require(msa, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("msa") - library(msa) +if (! requireNamespace("msa", quietly=TRUE)) { + BiocManager::install("msa") } # Package information: # library(help=msa) # basic information @@ -115,7 +111,7 @@ help(package = "msa") # of sequence. sel <- grep("MBP1", myDB$protein$name) -MBP1set <- AAStringSet(myDB$protein$sequence[sel]) +MBP1set <- Biostrings::AAStringSet(myDB$protein$sequence[sel]) # To help us make sense of the alignment we need to add the names for # the sequences. Names for a seqSet object are held in the ranges slot... @@ -142,10 +138,10 @@ MBP1set # Let's run an alignment with "Muscle" -(msaM <- msaMuscle( MBP1set, order = "aligned")) +(msaM <- msa::msaMuscle( MBP1set, order = "aligned")) # ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... 
print method):
-print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
+msa::print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)

# You see that the alignment object has sequence strings with hyphens as
@@ -173,7 +169,7 @@ print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
data("BLOSUM62") # fetch the BLOSUM62 package from the Biostrings package

-msaMScores <- msaConservationScore(msaM, substitutionMatrix = BLOSUM62)
+msaMScores <- msa::msaConservationScore(msaM, substitutionMatrix = BLOSUM62)
plot(msaMScores, type = "l", col = "#205C5E", xlab = "Alignment Position")

# That plot shows the well-aligned regions (domains ?) of the sequence, but it
@@ -243,20 +239,20 @@ for (i in seq_along(highScoringRanges$lengths)) {
#  - adjust the sequence names
#  - convert to msaAAMultipleAlignment object

-# ===  4.1.1  importing an .aln file
+# ===  4.1.1  importing an .aln file

# The seqinr package has a function to read CLUSTAL W formatted .aln files ...

-if (! require(seqinr, quietly=TRUE)) {
-  install.packages(seqinr)
-  library(seqinr)
+if (! requireNamespace("seqinr", quietly=TRUE)) {
+  install.packages("seqinr")
}
# Package information:
#  library(help=seqinr)       # basic information
#  browseVignettes("seqinr")  # available vignettes
#  data(package = "seqinr")   # available datasets

-# read the donwloaded file
-tmp <- read.alignment("msaT.aln", format = "clustal")
+# read the T-coffee aligned file that you downloaded from the EBI MSA tools
+# (cf. http://steipe.biochemistry.utoronto.ca/abc/index.php/BIN-ALI-MSA)
+tmp <- seqinr::read.alignment("msaT.aln", format = "clustal")

# read.alignment() returns a list. $seq is a list of strings, one for each
# complete alignment. However, they are converted to lower case.
@@ -274,16 +270,16 @@ for (i in seq_along(x)) {
  names(x)[i] <- myDB$protein$name[sel]  # get the name
}

-# ===  4.1.2  Creating an MsaAAMultipleAlignment object
+# ===  4.1.2  Creating an MsaAAMultipleAlignment object

# MsaAAMultipleAlignment objects are S4 objects that contain AAStringSet objects
# in their @unmasked slot, and a few additional items. Rather then build the
-# object from scratch, we copy an axisting object, and overwrite the dta in its
+# object from scratch, we copy an existing object, and overwrite the data in its
# slots with what we need. Our goal is pragmatic, we want an object that msa's
# functions will accept as input.

# First: convert our named char vector into an AAstringSet
-x <- AAStringSet(x)
+x <- Biostrings::AAStringSet(x)

# Then: create a new MsaAAMultipleAlignment S4 object. The msa package has
# defined what such an object should look like, with the SetClass() function. To
@@ -294,8 +290,22 @@ x <- AAStringSet(x)
str(msaM)

+# There is a catch, however, in the way R dispatches such operations to the
+# package that defines them: the function that creates the class is defined
+# as a "generic", and when it is called, R looks in the package namespace for
+# a more specific method with precise instructions on what to do. But we have
+# not loaded the package namespace - so far we have accessed all of its
+# functions directly with the msa:: prefix. That approach breaks down when
+# generic functions are involved: we could make it work, but the amount of
+# extra code would be unreasonable. The straightforward way is to load the
+# package. We can still use the prefix notation for its functions, to
+# emphasize where each function comes from; but since the namespace is then
+# loaded and attached, generics are properly dispatched.
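# (An optional aside: before building the object with new() - as done right
#  below - it can help to look at what the class definition actually expects.
#  This is only a sketch; it assumes the msa package is installed as above.
#  showClass() and slotNames() come from the base "methods" package.)

library(msa)                                  # the script loads msa next anyway
methods::showClass("MsaAAMultipleAlignment")  # slots and their classes
methods::slotNames("MsaAAMultipleAlignment")  # just the slot names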
+
+library(msa)  # load the msa package namespace
+
msaT <- new("MsaAAMultipleAlignment",  # create new MsaAAMultipleAlignment object
-           unmasked = x,              # "unmasked" slot takes an AASringSet
+           unmasked = x,              # "unmasked" slot takes an AAStringSet
            version = "T-Coffee",      # "version" slot takes a string
            params = list(),           # "params" takes a list(), we leave the
                                       #   list empty, but we could add the
@@ -309,18 +319,18 @@ str(msaT)
msaT

# Now we have fabricated an msaAAMultipleAlignment object, and we can
# use the msa package functions on it

-msaTScores <- msaConservationScore(msaT, substitutionMatrix = BLOSUM62)
+msaTScores <- msa::msaConservationScore(msaT, substitutionMatrix = BLOSUM62)


# ==   4.2  More alignments ===================================================

# Next, we calculate alignments with msa's two other alignment options:

# CLUSTAL Omega
-(msaO <- msaClustalOmega( MBP1set, order = "aligned"))
-msaOScores <- msaConservationScore(msaO, substitutionMatrix = BLOSUM62)
+(msaO <- msa::msaClustalOmega( MBP1set, order = "aligned"))
+msaOScores <- msa::msaConservationScore(msaO, substitutionMatrix = BLOSUM62)

# CLUSTAL W
-(msaW <- msaClustalW( MBP1set, order = "aligned"))
-msaWScores <- msaConservationScore(msaW, substitutionMatrix = BLOSUM62)
+(msaW <- msa::msaClustalW( MBP1set, order = "aligned"))
+msaWScores <- msa::msaConservationScore(msaW, substitutionMatrix = BLOSUM62)


# ==   4.3  Computing comparison metrics ======================================

@@ -454,7 +464,7 @@ legend("bottomright",
# Your alignment is going to be different from mine, due to the inclusion of
# MYSPE - but what I see is that MUSCLE gives the highest score overall, and
-# achieves this with fewer indels then most, and the lowest number of gaps of
+# achieves this with fewer indels than most, and the lowest number of gaps of
# all algorithms.

# To actually compare regions of alignments, we need to align alignments.
@@ -470,12 +480,8 @@ legend("bottomright",
# to compare two MSAs with each other, by aligning them. The algorithm is
# provided by the DECIPHER package.

-if (! require(DECIPHER, quietly=TRUE)) {
-  if (! exists("biocLite")) {
-    source("https://bioconductor.org/biocLite.R")
-  }
-  biocLite("DECIPHER")
-  library(DECIPHER)
+if (! requireNamespace("DECIPHER", quietly=TRUE)) {
+  BiocManager::install("DECIPHER")
}
# Package information:
#  library(help = DECIPHER)       # basic information
@@ -484,14 +490,14 @@ if (! require(DECIPHER, quietly=TRUE)) {
# AlignProfiles() takes two AAStringSets as input. Let's compare the MUSCLE and
# CLUSTAL W alignments: we could do this directly ...

-AlignProfiles(msaW@unmasked, msaM@unmasked)
+DECIPHER::AlignProfiles(msaW@unmasked, msaM@unmasked)

# But for ease of comparison, we'll reorder the sequences of the CLUSTAL W
# alignment into the same order as the MUSCLE alignment:
m <- as.character(msaM)
w <- as.character(msaW)[names(m)]

-(ppa <- AlignProfiles(AAStringSet(w), AAStringSet(m)))
+(ppa <- DECIPHER::AlignProfiles(Biostrings::AAStringSet(w),
+                                Biostrings::AAStringSet(m)))

# Conveniently, AlignProfiles() returns an AAStringSet, so we can use our
# writeALN function to show it. Here is an arbitrary block, from somewhere in
@@ -533,8 +539,8 @@ writeALN(ppa2, range = c(800, 960))
# Again, go explore, and get a sense of what's going on. You may find that
# CLUSTAL W has a tendency to insert short gaps all over the alignment, whereas
# MUSCLE keeps indels in blocks. CLUSTAL's behaviour is exactly what I would
-# expect from an algorithm that builds alignments from pairwise local
-# alignments, without global refinement.
+# expect from an algorithm that builds alignments incrementally from pairwise
+# local alignments, without global refinement.


# =  6  Sequence Logos ======================================================

@@ -602,16 +608,15 @@ writeALN(fetchMSAmotif(msaM, wing))
# ggseqlogo written by by Omar Waghi, a former UofT BCB student who is now at
# the EBI.

-if (! require(ggseqlogo, quietly=TRUE)) {
+if (! requireNamespace("ggseqlogo", quietly=TRUE)) {
  install.packages(("ggseqlogo"))
-  library(ggseqlogo)
}
# Package information:
#  library(help=ggseqlogo)       # basic information
#  browseVignettes("ggseqlogo")  # available vignettes
#  data(package = "ggseqlogo")   # available datasets

-ggseqlogo(as.character(motifAli))
+ggseqlogo::ggseqlogo(as.character(motifAli))

diff --git a/BIN-ALI-Optimal_sequence_alignment.R b/BIN-ALI-Optimal_sequence_alignment.R
index ef9fb11..3c6c600 100644
--- a/BIN-ALI-Optimal_sequence_alignment.R
+++ b/BIN-ALI-Optimal_sequence_alignment.R
@@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
#              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
#
-# Version: 1.4
+# Version: 1.5
#
-# Date: 2017 09 - 2017 11
+# Date: 2017 09 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
+#          1.5  Change from require() to requireNamespace(),
+#               use ::() idiom throughout
#          1.4  Pull s2c() from seqinr package, rather then loading the
#               entire library.
#          1.3  Updated confirmation task with correct logic
@@ -32,29 +34,32 @@
#TOC> ==========================================================================
-#TOC>
-#TOC>   Section  Title                                                     Line
-#TOC> --------------------------------------------------------------------
-#TOC>   1        Prepare                                                     52
-#TOC>   2        Biostrings Pairwise Alignment                               66
-#TOC>   2.1      Optimal global alignment                                    84
-#TOC>   2.2      Optimal local alignment                                    147
-#TOC>   3        APSES Domain annotation by alignment                       171
-#TOC>   4        Update your database script                                252
-#TOC>   4.1      Preparing an annotation file ...                           258
-#TOC>   4.1.1    If you HAVE NOT done the BIN-FUNC-Annotation unit          260
-#TOC>   4.1.2    If you HAVE done the BIN-FUNC-Annotation unit              303
-#TOC>   4.2      Execute and Validate                                       327
-#TOC>
+#TOC>
+#TOC>   Section  Title                                                      Line
+#TOC> --------------------------------------------------------------------------
+#TOC>   1        Prepare                                                      54
+#TOC>   2        Biostrings Pairwise Alignment                                71
+#TOC>   2.1      Optimal global alignment                                     89
+#TOC>   2.2      Optimal local alignment                                     152
+#TOC>   3        APSES Domain annotation by alignment                        176
+#TOC>   4        Update your database script                                 257
+#TOC>   4.1      Preparing an annotation file ...                            263
+#TOC>   4.1.1    If you HAVE NOT done the BIN-FUNC-Annotation unit           265
+#TOC>   4.1.2    If you HAVE done the BIN-FUNC-Annotation unit               308
+#TOC>   4.2      Execute and Validate                                        332
+#TOC>
#TOC> ==========================================================================


# =  1  Prepare =============================================================

-# To simplify code, we pull the function s2c(x) from the seqinr package,
-# rather than using the lengthier idiom unlist(strsplit(x, "").
-# This assumes that the seqinr package has been installed previously.
s2c <- seqinr::s2c
+if (! 
requireNamespace("seqinr", quietly=TRUE)) { + install.packages("seqinr") +} +# You can get package information with the following commands: +# library(help = seqinr) # basic information +# browseVignettes("seqinr") # available vignettes +# data(package = "seqinr") # available datasets # You need to recreate the protein database that you have constructed in the @@ -66,13 +71,13 @@ source("makeProteinDB.R") # = 2 Biostrings Pairwise Alignment ======================================= -if (!require(Biostrings, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("Biostrings") - library(Biostrings) +if (!requireNamespace("BiocManager", quietly=TRUE)) { + install.packages("BiocManager") } +if (!requireNamespace("Biostrings", quietly=TRUE)) { + BiocManager::install("Biostrings") +} +# Package information: # library(help = Biostrings) # basic information # browseVignettes("Biostrings") # available vignettes # data(package = "Biostrings") # available datasets @@ -88,15 +93,15 @@ if (!require(Biostrings, quietly=TRUE)) { # First: make AAString objects ... sel <- myDB$protein$name == "MBP1_SACCE" -aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel]) +aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel]) sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") -aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel]) +aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel]) ?pairwiseAlignment # ... and align. # Global optimal alignment with end-gap penalties is default. -ali1 <- pairwiseAlignment( +ali1 <- Biostrings::pairwiseAlignment( aaMBP1_SACCE, aaMBP1_MYSPE, substitutionMatrix = "BLOSUM62", @@ -108,7 +113,7 @@ str(ali1) # ... it's complicated # This is a Biostrings alignment object. But we can use Biostrings functions to # tame it: ali1 -writePairwiseAlignments(ali1) # That should look familiar +Biostrings::writePairwiseAlignments(ali1) # That should look familiar # And we can make the internal structure work for us (@ is for classes as # $ is for lists ...) @@ -137,9 +142,9 @@ sum(s2c(as.character(ali1@pattern)) == percentID <- function(al) { # returns the percent-identity of a Biostrings alignment object return(100 * - sum(seqinr::s2c(as.character(al@pattern)) == - seqinr::s2c(as.character(al@subject))) / - nchar(al@pattern)) + sum(seqinr::s2c(as.character(al@pattern)) == + seqinr::s2c(as.character(al@subject))) / + nchar(al@pattern)) } percentID(ali1) @@ -147,7 +152,7 @@ percentID(ali1) # == 2.2 Optimal local alignment =========================================== # Compare with local optimal alignment (like EMBOSS Water) -ali2 <- pairwiseAlignment( +ali2 <- Biostrings::pairwiseAlignment( aaMBP1_SACCE, aaMBP1_MYSPE, type = "local", @@ -155,9 +160,9 @@ ali2 <- pairwiseAlignment( gapOpening = 50, gapExtension = 10) -writePairwiseAlignments(ali2) # This has probably only aligned the N-terminal - # DNA binding domain - but that one has quite - # high sequence identity: +Biostrings::writePairwiseAlignments(ali2) +# This has probably only aligned the N-terminal DNA binding domain - but that +# one has quite high sequence identity: percentID(ali2) # == TASK: == @@ -209,14 +214,14 @@ myDB$annotation[myDB$annotation$ID == proID & # the sequence, and used the start and end coordinates to extract a substring. 
# Let's convert this to an AAstring and assign it: -aaMB1_SACCE_APSES <- AAString(apses) +aaMB1_SACCE_APSES <- Biostrings::AAString(apses) # Now let's align these two sequences of very different length without end-gap # penalties using the "overlap" type. "overlap" turns the # end-gap penalties off and that is crucially important since # the sequences have very different length. -aliApses <- pairwiseAlignment( +aliApses <- Biostrings::pairwiseAlignment( aaMB1_SACCE_APSES, aaMBP1_MYSPE, type = "overlap", @@ -228,7 +233,7 @@ aliApses <- pairwiseAlignment( # homologous, and have (almost) no indels. The entire "pattern" # sequence from QIYSAR ... to ... KPLFDF should be matched # with the "query". Is this correct? -writePairwiseAlignments(aliApses) +Biostrings::writePairwiseAlignments(aliApses) # If this is correct, you can extract the matched sequence from # the alignment object. The syntax is a bit different from what @@ -257,7 +262,7 @@ aliApses@subject@range@start + aliApses@subject@range@width - 1 # == 4.1 Preparing an annotation file ... ================================== # -# === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit +# === 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit # # # You DON'T already have a file called "-Annotations.json" in the @@ -300,7 +305,7 @@ aliApses@subject@range@start + aliApses@subject@range@width - 1 # Then SKIP the next section. # # -# === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit +# === 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit # # # You DO already have a file called "-Annotations.json" in the diff --git a/BIN-ALI-Similarity.R b/BIN-ALI-Similarity.R index 370f9cd..b7a1cb2 100644 --- a/BIN-ALI-Similarity.R +++ b/BIN-ALI-Similarity.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-ALI-Similarity unit. # -# Version: 1.0 +# Version: 1.1 # -# Date: 2017 10 20 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout # 1.0 Refactored for 2017; add aaindex, ternary plot. # 0.1 First code copied from 2016 material. # @@ -27,11 +29,11 @@ #TOC> ========================================================================== #TOC> -#TOC> Section Title Line -#TOC> ---------------------------------------- -#TOC> 1 Amino Acid Properties 43 -#TOC> 2 Mutation Data matrix 163 -#TOC> 3 Background score 205 +#TOC> Section Title Line +#TOC> ---------------------------------------------- +#TOC> 1 Amino Acid Properties 41 +#TOC> 2 Mutation Data matrix 158 +#TOC> 3 Background score 199 #TOC> #TOC> ========================================================================== @@ -41,9 +43,8 @@ # A large collection of amino acid property tables is available via the seqinr # package: -if (!require(seqinr)) { +if (! requireNamespace("seqinr", quietly=TRUE)) { install.packages("seqinr") - library(seqinr) } # Package information: # library(help = seqinr) # basic information @@ -127,9 +128,8 @@ text(Y$I, K$I, names(Y$I)) # plots are in general unintuitive and hard to interpret. One alternative is a # so-called "ternary plot": -if (!require(ggtern)) { +if (! 
requireNamespace("ggtern", quietly=TRUE)) { install.packages("ggtern") - library(ggtern) } # Package information: # library(help = ggtern) # basic information @@ -145,12 +145,11 @@ myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05, stringsAsFactors = FALSE) rownames(myDat) <- names(Y$I) -ggtern(data = myDat, - aes(x = vol, - y = phi, - z = pK, - label = rownames(myDat))) + - geom_text() +ggtern::ggtern(data = myDat, + ggplot2::aes(x = vol, + y = phi, + z = pK, + label = rownames(myDat))) + ggplot2::geom_text() # This results in a mapping of amino acids relative to each other that is # similar to the Venn diagram you have seen in the notes. @@ -162,12 +161,11 @@ ggtern(data = myDat, # The Biostrings package contains the most common mutation data matrices. -if (!require(Biostrings, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("Biostrings") - library(Biostrings) +if (! requireNamespace("BiocManager", quietly=TRUE)) { + install.packages("BiocManager") +} +if (! requireNamespace("Biostrings", quietly=TRUE)) { + BiocManager::install("Biostrings") } # Package information: # library(help=Biostrings) # basic information @@ -200,7 +198,9 @@ BLOSUM62["W", "R"] # = 3 Background score ==================================================== -# The mutation data matrix is designed to give high scores to homologous sequences, low scores to non-homologous sequences. What score on average should we expect for a random sequence? +# The mutation data matrix is designed to give high scores to homologous +# sequences, low scores to non-homologous sequences. What score on average +# should we expect for a random sequence? # If we sample amino acid pairs at random, we will get a score that is the # average of the individual pairscores in the matrix. Omitting the ambiguity @@ -219,12 +219,12 @@ sum(BLOSUM62[1:20, 1:20])/400 # PDB ID 3FG7 - a villin headpiece structure with a large amount of # low-complexity amino acid sequence ... -aa3FG7 <- readAAStringSet("./data/3FG7.fa")[[1]] +aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]] # ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C) # with an exceptionally high percentage of hydrophobic residues. -aa2F1C <- readAAStringSet("./data/2F1C.fa")[[1]] +aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]] # Here is a function that takes two sequences and # returns their average pairscore. diff --git a/BIN-Data_integration.R b/BIN-Data_integration.R index cfc7cec..38b8d01 100644 --- a/BIN-Data_integration.R +++ b/BIN-Data_integration.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-Data_integration unit. # -# Version: 1.0.1 +# Version: 1.1 # -# Date: 2018 10 30 +# Date: 2018 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout # 1.0.1 Bugfix: UniProt ID Mapping service API change # 1.0 First live version # @@ -31,8 +33,8 @@ #TOC> #TOC> Section Title Line #TOC> ------------------------------------------------- -#TOC> 1 Identifier mapping 40 -#TOC> 2 Cross-referencing tables 164 +#TOC> 1 Identifier mapping 42 +#TOC> 2 Cross-referencing tables 165 #TOC> #TOC> ========================================================================== @@ -54,9 +56,8 @@ # To begin, we load httr, which supports sending and receiving data via the # http protocol, just like a Web browser. 
-if (!require(httr, quietly=TRUE)) {
-  install.packages("httr")
-  library(httr)
+if (! requireNamespace("httr", quietly=TRUE)) {
+  install.packages("httr")
}
# Package information:
#  library(help = httr)       # basic information
@@ -75,22 +76,22 @@ myQueryIDs <- "NP_010227 NP_00000 NP_011036"
# of the request. GET() and POST() are functions from httr.

URL <- "https://www.uniprot.org/mapping/"
-response <- POST(URL,
-                 body = list(from = "P_REFSEQ_AC",  # Refseq Protein
-                             to = "ACC",            # UniProt ID
-                             format = "tab",
-                             query = myQueryIDs))
+response <- httr::POST(URL,
+                       body = list(from = "P_REFSEQ_AC",  # Refseq Protein
+                                   to = "ACC",            # UniProt ID
+                                   format = "tab",
+                                   query = myQueryIDs))

-cat(content(response))
+cat(httr::content(response))

# We need to check the status code - if it is not 200, an error ocurred and we
# can't process the result:
-status_code(response)
+httr::status_code(response)

# If the query is successful, tabbed text is returned. We can assign that to a
# data frame. Note that we use textConnection() to read data directly from a
# char object, which can go in the spot where read.delim() expects a file-name
# argument.
-myMappedIDs <- read.delim(file = textConnection(content(response)),
+myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
                          sep = "\t",
                          stringsAsFactors = FALSE)
myMappedIDs
@@ -132,14 +133,14 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
  # for IDs that are not mapped.

  URL <- "https://www.uniprot.org/uploadlists/"
-  response <- POST(URL,
-                   body = list(from = mapFrom,
-                               to = mapTo,
-                               format = "tab",
-                               query = s))
+  response <- httr::POST(URL,
+                         body = list(from = mapFrom,
+                                     to = mapTo,
+                                     format = "tab",
+                                     query = s))

-  if (status_code(response) == 200) {  # 200: oK
-    myMap <- read.delim(file = textConnection(content(response)),
+  if (httr::status_code(response) == 200) {  # 200: OK
+    myMap <- read.delim(file = textConnection(httr::content(response)),
                        sep = "\t",
                        stringsAsFactors = FALSE)
    myMap <- myMap[ , c(1,3)]
@@ -148,7 +149,7 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
    myMap <- data.frame()
    warning(paste("No uniProt ID mapping returned:",
                  "server sent status",
-                 status_code(response)))
+                 httr::status_code(response)))
  }

  return(myMap)
@@ -168,7 +169,8 @@ myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
# Nomenclature commission. How do we map one set of identifiers to another one?
# The function to use is match().

-# Here is a tiny set of identifiers taken from a much larger table to illustrate the principle:
+# Here is a tiny set of identifiers taken from a much larger table to
+# illustrate the principle:
#
myIDs <- data.frame(uID = c("P38903", "P31383", "P47177", "P47096", "Q07747",

diff --git a/BIN-FUNC-Semantic_similarity.R b/BIN-FUNC-Semantic_similarity.R
index c922237..5295bcc 100644
--- a/BIN-FUNC-Semantic_similarity.R
+++ b/BIN-FUNC-Semantic_similarity.R
@@ -3,12 +3,15 @@
# Purpose: A Bioinformatics Course:
#              R code accompanying the BIN-FUNC_Semantic_similarity unit.
#
-# Version: 1.0
+# Version: 1.1
#
-# Date: 2017 11 12
+# Date: 2017 11 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
+#          1.1  Change from require() to requireNamespace(),
+#               use ::() idiom throughout,
+#               use BiocManager:: not biocLite()
#          1.0  New code.
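# (A small self-contained sketch of the match()-based cross-referencing idea
#  discussed at the end of the BIN-Data_integration unit above. The identifiers
#  and all "demo*" names are made up for illustration only.)

demoA <- data.frame(uID  = c("P00001", "P00002", "P00003"),
                    name = c("geneA",  "geneB",  "geneC"),
                    stringsAsFactors = FALSE)
demoB <- data.frame(uID   = c("P00003", "P00001"),
                    value = c(42, 7),
                    stringsAsFactors = FALSE)

# match(x, table) returns, for each element of x, the index of its first
# occurrence in table (or NA if it is absent). Those indices let us pull
# matching rows from the other data frame:
(idx <- match(demoB$uID, demoA$uID))      # 3 1
demoB$name <- demoA$name[idx]             # annotate demoB with names from demoA
demoB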
# # @@ -25,64 +28,70 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> -------------------------------------------------------------- -#TOC> 1 Preparations: Packages, AnnotationDB, Setup 39 -#TOC> 2 Fetch GO Annotations 89 -#TOC> 3 Semantic Similarities 98 -#TOC> 4 GO Term Enrichment in Gene Sets 116 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> -------------------------------------------------------------------- +#TOC> 1 Preparations: Packages, AnnotationDB, Setup 42 +#TOC> 2 Fetch GO Annotations 98 +#TOC> 3 Semantic Similarities 107 +#TOC> 4 GO Term Enrichment in Gene Sets 125 +#TOC> #TOC> ========================================================================== # = 1 Preparations: Packages, AnnotationDB, Setup ========================= +if (! requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} # GOSim is an R-package in the Bioconductor project. -if (! require(GOSim, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("GOSim") - library(GOSim) +if (! requireNamespace("GOSim", quietly = TRUE)) { + BiocManager::install("GOSim") } # Package information: # library(help = GOSim) # basic information # browseVignettes("GOSim") # available vignettes # data(package = "GOSim") # available datasets +# GOSim makes extensive assumptions about loaded packages, and many base +# methods are masked. We will thus use library(GOSim) to load it +# in its entirety and with all packages it depends on. We will still use +# the ::() syntax in the code below, but this now serves +# more of a didactic purpose, rather than actual syntax requirements. + +library(GOSim) # GOSim loads human annotations by default. We load yeast annotations instead... -if (!require(org.Sc.sgd.db, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("org.Sc.sgd.db") - library(org.Sc.sgd.db) +if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) { + BiocManager::install("org.Sc.sgd.db") } +# Bioconductor annotation packages won't work stably unless we actually load +# them: +library(org.Sc.sgd.db) + # org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such # databases exist for all model organisms. It's a kind of a fancy data frame # from which we can get annotations by rows (genes) with the keys() funtion ... -keys(org.Sc.sgd.db)[1500:1510] +AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510] # ... and the types of available annotations with the columns() function -columns(org.Sc.sgd.db) +AnnotationDbi::columns(org.Sc.sgd.db) # Note that one of the columns is "GO" ... and we load that into the # datastructures used by GOSim: # Choose GOterms to use -setEvidenceLevel(evidences="all", - organism=org.Sc.sgdORGANISM, - gomap=org.Sc.sgdGO) +GOSim::setEvidenceLevel(evidences = "all", + organism = org.Sc.sgdORGANISM, + gomap = org.Sc.sgdGO) # Use Biological Process ontology -setOntology("BP", loadIC=FALSE) +GOSim::setOntology("BP", loadIC = FALSE) # confirm that we loaded the correct ontology -head(get("gomap", envir=GOSimEnv)) +head(get("gomap", envir = GOSimEnv)) @@ -92,7 +101,7 @@ head(get("gomap", envir=GOSimEnv)) # All keys being used here are yeast systematic names. 
# Get one set of annotations -getGOInfo(c("YDL056W")) # Mbp1 +GOSim::getGOInfo(c("YDL056W")) # Mbp1 # = 3 Semantic Similarities =============================================== @@ -104,31 +113,31 @@ getGOInfo(c("YDL056W")) # Mbp1 # There are _many_ different metrics of term similarity implemented # in this package. - # Mbp1 and... -getGeneSim("YDL056W", "YLR182W", similarity = "OA") # Swi6 - MCB complex -getGeneSim("YDL056W", "YER111C", similarity = "OA") # Swi4 - collaborators -getGeneSim("YDL056W", "YBR160W", similarity = "OA") # Cdc28 - mediator -getGeneSim("YDL056W", "YGR108W", similarity = "OA") # Clb1 - antagonist -getGeneSim("YDL056W", "YLR079W", similarity = "OA") # Sic1 - antagonist -getGeneSim("YDL056W", "YJL130C", similarity = "OA") # Pgk1 - Gluconeogenesis + # Mbp1 and... +GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex +GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators +GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator +GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist +GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist +GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis # = 4 GO Term Enrichment in Gene Sets ===================================== -# Calculating GO term enrichment in gene sets is done with the topGO package. -if (! require(topGO, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("topGO") - library(topGO) +# Calculating GO term enrichment in gene sets is done with the Bioconductor +# topGO package. +if (! requireNamespace("topGO", quietly = TRUE)) { + BiocManager::install("topGO") } # Package information: # library(help = topGO) # basic information # browseVignettes("topGO") # available vignettes # data(package = "topGO") # available datasets +# Once again - assumptions are made by GOsim that require us to load the +# topGO package wholesale: +library(topGO) # Let's define a gene set: GOterm enrichment for G1/S switch activators: mySet <- c("YFR028C", # Cdc14 @@ -141,7 +150,7 @@ mySet <- c("YFR028C", # Cdc14 "YPL256C", # Cln2 "YAL040C") # Cln3 -allGenes <- keys(org.Sc.sgd.db) +allGenes <- AnnotationDbi::keys(org.Sc.sgd.db) allGenes <- allGenes[grep("^Y", allGenes)] # This is the context against which # we define enrichment @@ -164,9 +173,9 @@ setdiff(fullSet, mySet) # These are annotated to that term but not in mySet. # What are these genes? # Select annotations from the annotation database: -select(org.Sc.sgd.db, - keys = setdiff(fullSet, mySet), - columns = c("COMMON", "DESCRIPTION")) +AnnotationDbi::select(org.Sc.sgd.db, + keys = setdiff(fullSet, mySet), + columns = c("COMMON", "DESCRIPTION")) # Note that these annotations are partially redundant to several different # aliases of the same three genes. diff --git a/BIN-PHYLO-Data_preparation.R b/BIN-PHYLO-Data_preparation.R index 21f6a6b..e0532a4 100644 --- a/BIN-PHYLO-Data_preparation.R +++ b/BIN-PHYLO-Data_preparation.R @@ -3,12 +3,15 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-PHYLO-Data_preparation unit. # -# Version: 1.0 +# Version: 1.1 # -# Date: 2017 10 31 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout, +# use Biocmanager:: not biocLite() # 1.0 First 2017 version # 0.1 First code copied from 2016 material. 
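# (To compare the similarity values computed in the Semantic_similarity unit
#  above at a glance, one could collect them into a named vector and plot them.
#  This sketch simply reuses the GOSim::getGeneSim() calls shown there and
#  assumes GOSim has been set up as described; the "demo*" names are arbitrary.)

demoPartners <- c(Swi6  = "YLR182W",
                  Swi4  = "YER111C",
                  Cdc28 = "YBR160W",
                  Clb1  = "YGR108W",
                  Sic1  = "YLR079W",
                  Pgk1  = "YJL130C")

demoSim <- numeric(length(demoPartners))
names(demoSim) <- names(demoPartners)
for (i in seq_along(demoPartners)) {
  demoSim[i] <- GOSim::getGeneSim("YDL056W",            # Mbp1 ...
                                  demoPartners[i],      # ... vs. each partner
                                  similarity = "OA")
}
barplot(demoSim,
        las = 2,
        col = "#C9F4E3",
        ylab = "OA similarity to Mbp1 (YDL056W)")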
# @@ -26,15 +29,15 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> --------------------------------------------------- -#TOC> 1 Preparations 41 -#TOC> 2 Fetching sequences 78 -#TOC> 3 Multiple Sequence Alignment 119 -#TOC> 4 Reviewing and Editing Alignments 138 -#TOC> 4.1 Masking workflow 154 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> --------------------------------------------------------- +#TOC> 1 Preparations 44 +#TOC> 2 Fetching sequences 76 +#TOC> 3 Multiple Sequence Alignment 117 +#TOC> 4 Reviewing and Editing Alignments 136 +#TOC> 4.1 Masking workflow 152 +#TOC> #TOC> ========================================================================== @@ -49,12 +52,11 @@ source("makeProteinDB.R") # Load packages we need -if (! require(Biostrings, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("Biostrings") - library(Biostrings) +if (! requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +if (! requireNamespace("Biostrings", quietly = TRUE)) { + BiocManager::install("Biostrings") } # Package information: # library(help = Biostrings) # basic information @@ -62,12 +64,8 @@ if (! require(Biostrings, quietly=TRUE)) { # data(package = "Biostrings") # available datasets -if (! require(msa, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("msa") - library(msa) +if (! requireNamespace("msa", quietly = TRUE)) { + BiocManager::install("msa") } # Package information: # library(help = msa) # basic information @@ -123,8 +121,8 @@ tail(APSI) # the MSA algorithms in Biostrings. # -APSESSet <- AAStringSet(APSI) -APSESMsa <- msaMuscle(APSESSet, order = "aligned") +APSESSet <- Biostrings::AAStringSet(APSI) +APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned") # Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If # that happens in your case, just use msaClustalOmega() instead. diff --git a/BIN-PHYLO-Tree_analysis.R b/BIN-PHYLO-Tree_analysis.R index a7a0b8e..950a000 100644 --- a/BIN-PHYLO-Tree_analysis.R +++ b/BIN-PHYLO-Tree_analysis.R @@ -3,12 +3,15 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-PHYLO-Tree_analysis unit. # -# Version: 1.0.2 +# Version: 1.1 # -# Date: 2017 10 31 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout, +# use Biocmanager:: not biocLite() # 1.0.2 Typo in variable name, style changes # 1.0.1 Wrong section heading # 1.0 First 2017 version @@ -31,11 +34,11 @@ #TOC> #TOC> Section Title Line #TOC> -------------------------------------------------- -#TOC> 1 Preparation and Tree Plot 43 -#TOC> 2 Tree Analysis 82 -#TOC> 2.1 Rooting Trees 141 -#TOC> 2.2 Rotating Clades 187 -#TOC> 2.3 Computing tree distances 234 +#TOC> 1 Preparation and Tree Plot 46 +#TOC> 2 Tree Analysis 86 +#TOC> 2.1 Rooting Trees 145 +#TOC> 2.2 Rotating Clades 190 +#TOC> 2.3 Computing tree distances 241 #TOC> #TOC> ========================================================================== @@ -43,19 +46,17 @@ # = 1 Preparation and Tree Plot =========================================== -if (!require(Rphylip, quietly=TRUE)) { - install.packages("Rphylip") - library(Rphylip) +if (! 
requireNamespace("ape", quietly = TRUE)) { + install.packages("ape") } # Package information: -# library(help = Rphylip) # basic information -# browseVignettes("Rphylip") # available vignettes -# data(package = "Rphylip") # available datasets - +# library(help = ape) # basic information +# browseVignettes("ape") # available vignettes +# data(package = "ape") # available datasets # Read the species tree that you have created at the phyloT Website: -fungiTree <- read.tree("fungiTree.txt") +fungiTree <- ape::read.tree("fungiTree.txt") plot(fungiTree) @@ -73,7 +74,10 @@ for (i in seq_along(fungiTree$tip.label)) { # Plot the tree plot(fungiTree, cex = 1.0, root.edge = TRUE, no.margin = TRUE) -nodelabels(text = fungiTree$node.label, cex = 0.6, adj = 0.2, bg = "#D4F2DA") +ape::nodelabels(text = fungiTree$node.label, + cex = 0.6, + adj = 0.2, + bg = "#D4F2DA") # Note that you can use the arrow buttons in the menu above the plot to scroll # back to plots you have created earlier - so you can reference back to the # species tree. @@ -91,10 +95,10 @@ nodelabels(text = fungiTree$node.label, cex = 0.6, adj = 0.2, bg = "#D4F2DA") # trees in Newick format and visualize them elsewhere. # The "phylo" class object is one of R's "S3" objects and methods to plot and -# print it have been defined with the Rphylip package, and the package ape that -# Rphylip has loaded. You can simply call plot() and R knows what to -# do with and how to plot it. The underlying function is -# plot.phylo(), and documentation for its many options can by found by typing: +# print it have been defined with the Rphylip package, and in ape. You can +# simply call plot() and R knows what to do with and how +# to plot it. The underlying function is plot.phylo(), and documentation for its +# many options can by found by typing: ?plot.phylo @@ -125,40 +129,39 @@ apsTree$edge.length # show the node / edge and tip labels on a plot plot(apsTree) -nodelabels() -edgelabels() -tiplabels() +ape::nodelabels() +ape::edgelabels() +ape::tiplabels() # show the number of nodes, edges and tips -Nnode(apsTree) -Nedge(apsTree) -Ntip(apsTree) +ape::Nnode(apsTree) +ape::Nedge(apsTree) +ape::Ntip(apsTree) # Finally, write the tree to console in Newick format -write.tree(apsTree) +ape::write.tree(apsTree) # == 2.1 Rooting Trees ===================================================== # In order to analyse the tree, it is helpful to root it first and reorder its # clades. Contrary to documentation, Rproml() returns an unrooted tree. -is.rooted(apsTree) +ape::is.rooted(apsTree) -# You can root the tree with the command root() from the "ape" package. ape is -# automatically installed and loaded with Rphylip. +# You can root the tree with the command root() from the "ape" package. plot(apsTree) # add labels for internal nodes and tips -nodelabels(cex = 0.5, frame = "circle") -tiplabels(cex = 0.5, frame = "rect") +ape::nodelabels(cex = 0.5, frame = "circle") +ape::tiplabels(cex = 0.5, frame = "rect") # The outgroup of the tree is tip "11" in my sample tree, it may be a different # number in yours. Substitute the correct node number below for "outgroup". -apsTree <- root(apsTree, outgroup = 11, resolve.root = TRUE) +apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE) plot(apsTree) -is.rooted(apsTree) +ape::is.rooted(apsTree) # This tree _looks_ unchanged, beacuse when the root trifurcation was resolved, # an edge of length zero was added to connect the MRCA (Most Recent Common @@ -172,7 +175,7 @@ apsTree$edge.length # overlap. 
apsTree$edge.length[1] <- 0.1 plot(apsTree, cex = 0.7) -nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866") +ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866") # This procedure does however not assign an actual length to a root edge, and @@ -181,7 +184,7 @@ nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866") apsTree$root.edge <- mean(apsTree$edge.length) * 1.5 plot(apsTree, cex = 0.7, root.edge = TRUE) -nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") +ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") # == 2.2 Rotating Clades =================================================== @@ -192,9 +195,9 @@ nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") # We can either rotate around individual internal nodes ... layout(matrix(1:2, 1, 2)) plot(apsTree, no.margin = TRUE, root.edge = TRUE) -nodelabels(node = 17, cex = 0.7, bg = "#ff8866") -plot(rotate(apsTree, node = 17), no.margin = TRUE, root.edge = TRUE) -nodelabels(node = 17, cex = 0.7, bg = "#88ff66") +ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866") +plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE) +ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66") # Note that the species at the bottom of the clade descending from node # 17 is now plotted at the top. layout(matrix(1), widths = 1.0, heights = 1.0) @@ -211,11 +214,15 @@ nOrg <- length(apsTree$tip.label) layout(matrix(1:2, 1, 2)) plot(fungiTree, no.margin = TRUE, root.edge = TRUE) -nodelabels(text = fungiTree$node.label, cex = 0.5, adj = 0.2, bg = "#D4F2DA") +ape::nodelabels(text = fungiTree$node.label, + cex = 0.5, + adj = 0.2, + bg = "#D4F2DA") -plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]), - no.margin = TRUE, root.edge = TRUE) -add.scale.bar(length = 0.5) +plot(ape::rotateConstr(apsTree, apsTree$tip.label[nOrg:1]), + no.margin = TRUE, + root.edge = TRUE) +ape::add.scale.bar(length = 0.5) layout(matrix(1), widths = 1.0, heights = 1.0) # Task: Study the two trees and consider their similarities and differences. @@ -236,9 +243,8 @@ layout(matrix(1), widths = 1.0, heights = 1.0) # Many superb phylogeny tools are contributed by the phangorn package. -if (!require(phangorn, quietly=TRUE)) { +if (! requireNamespace("phangorn", quietly = TRUE)) { install.packages("phangorn") - library(phangorn) } # Package information: # library(help = phangorn) # basic information @@ -253,20 +259,20 @@ apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label) # phangorn provides several functions to compute tree-differences (and there # is a _whole_ lot of theory on how to compare trees). treedist() returns the # "symmetric difference" -treedist(fungiTree, apsTree2, check.labels = TRUE) +phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE) # Numbers. What do they mean? How much more similar is our apsTree to the # (presumably) ground truth of fungiTree than a random tree would be? -# The ape package (which was loaded with RPhylip) provides the function rtree() +# The ape package provides the function rtree() # to compute random trees. -rtree(n = length(apsTree2$tip.label), # number of tips - rooted = TRUE, # we rooted the tree above, - # and fungiTree is rooted anyway - tip.label = apsTree2$tip.label, # use the apsTree2 labels - br = NULL) # don't generate branch lengths since - # fungiTree has none, so we can't - # compare them anyway. 
+ape::rtree(n = length(apsTree2$tip.label), # number of tips + rooted = TRUE, # we rooted the tree above, + # and fungiTree is rooted anyway + tip.label = apsTree2$tip.label, # use the apsTree2 labels + br = NULL) # don't generate branch lengths since + # fungiTree has none, so we can't + # compare them anyway. # Let's compute some random trees this way, calculate the distances to # fungiTree, and then compare the values we get for apsTree2. The random @@ -278,17 +284,17 @@ colnames(myTreeDistances) <- c("symm", "path") set.seed(112358) for (i in 1:N) { - xTree <- rtree(n = length(apsTree2$tip.label), - rooted = TRUE, - tip.label = apsTree2$tip.label, - br = NULL) - myTreeDistances[i, ] <- treedist(fungiTree, xTree) + xTree <- ape::rtree(n = length(apsTree2$tip.label), + rooted = TRUE, + tip.label = apsTree2$tip.label, + br = NULL) + myTreeDistances[i, ] <- phangorn::treedist(fungiTree, xTree) } set.seed(NULL) # reset the random number generator table(myTreeDistances[, "symm"]) -(symmObs <- treedist(fungiTree, apsTree2)[1]) +(symmObs <- phangorn::treedist(fungiTree, apsTree2)[1]) # Random events less-or-equal to observation, divided by total number of # events gives us the empirical p-value. @@ -298,7 +304,7 @@ cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n hist(myTreeDistances[, "path"], col = "aliceblue", main = "Distances of random Trees to fungiTree") -(pathObs <- treedist(fungiTree, apsTree2)[2]) +(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2]) abline(v = pathObs, col = "chartreuse") # Random events less-or-equal to observation, divided by total number of diff --git a/BIN-PHYLO-Tree_building.R b/BIN-PHYLO-Tree_building.R index 5f013b1..544b9a2 100644 --- a/BIN-PHYLO-Tree_building.R +++ b/BIN-PHYLO-Tree_building.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-PHYLO-Tree_building unit. # -# Version: 1.0 +# Version: 1.1 # # Date: 2017 10. 31 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout, # 1.0 First 2017 version # 0.1 First code copied from 2016 material. # @@ -27,17 +29,17 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> ------------------------------------------------------- -#TOC> 1 Calculating Trees 43 -#TOC> 1.1 PROMLPATH ... 64 -#TOC> 1.1.1 ... on the Mac 69 -#TOC> 1.1.2 ... on Windows 80 -#TOC> 1.1.3 ... on Linux 94 -#TOC> 1.1.4 Confirming PROMLPATH 99 -#TOC> 1.2 Building a maximum likelihood tree 108 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> ----------------------------------------------------------- +#TOC> 1 Calculating Trees 46 +#TOC> 1.1 PROMLPATH ... 66 +#TOC> 1.1.1 ... on the Mac 71 +#TOC> 1.1.2 ... on Windows 82 +#TOC> 1.1.3 ... on Linux 96 +#TOC> 1.1.4 Confirming PROMLPATH 101 +#TOC> 1.2 Building a maximum likelihood tree 110 +#TOC> #TOC> ========================================================================== @@ -50,9 +52,8 @@ # After you have installed Phylip on your computer, install the R package that # provides an interface to the Phylip functions. -if (!require(Rphylip, quietly=TRUE)) { +if (! 
requireNamespace("Rphylip", quietly = TRUE)) { install.packages("Rphylip") - library(Rphylip) } # Package information: # library(help = Rphylip) # basic information @@ -67,7 +68,7 @@ if (!require(Rphylip, quietly=TRUE)) { # on your computer Phylip has been installed and define the path # to the proml program that calculates a maximum-likelihood tree. -# === 1.1.1 ... on the Mac +# === 1.1.1 ... on the Mac # On the Mac, the standard installation places a phylip folder # in the /Applications directory. That folder contains all the # individual phylip programs as .app files. These are not @@ -78,7 +79,7 @@ if (!require(Rphylip, quietly=TRUE)) { # directly to that subdirectory to find the program it needs: # PROMLPATH <- "/Applications/phylip-3.695/exe/proml.app/Contents/MacOS" -# === 1.1.2 ... on Windows +# === 1.1.2 ... on Windows # On Windows you need to know where the programs have been installed, and you # need to specify a path that is correct for the Windows OS. Find the folder # that is named "exe", and right-click to inspect its properties. The path @@ -92,12 +93,12 @@ if (!require(Rphylip, quietly=TRUE)) { # I have heard that your path must not contain spaces, and it is prudent to # avoid other special characters as well. -# === 1.1.3 ... on Linux +# === 1.1.3 ... on Linux # If you are running Linux I trust you know what to do. It's probably # something like # PROMLPATH <- "/usr/local/phylip-3.695/bin" -# === 1.1.4 Confirming PROMLPATH +# === 1.1.4 Confirming PROMLPATH # Confirm that the settings are right. PROMLPATH # returns the path list.dirs(PROMLPATH) # returns the directories in that path @@ -110,7 +111,7 @@ list.files(PROMLPATH) # lists the files [1] "proml" "proml.command" # Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit, # as a "proseq" object with the read.protein() function of the RPhylip package: -apsIn <- read.protein("APSESphyloSet.mfa") +apsIn <- Rphylip::read.protein("APSESphyloSet.mfa") # ... and you are ready to build a tree. @@ -125,7 +126,7 @@ apsIn <- read.protein("APSESphyloSet.mfa") # process will take us about 5 to 10 minutes. Run this, and anjoy a good cup # of coffee while you are waiting. -apsTree <- Rproml(apsIn, path=PROMLPATH) +apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH) # A quick first look: diff --git a/BIN-PPI-Analysis.R b/BIN-PPI-Analysis.R index af1c84d..c75e0fa 100644 --- a/BIN-PPI-Analysis.R +++ b/BIN-PPI-Analysis.R @@ -3,12 +3,15 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-PPI-Analysis unit. # -# Version: 1.0 +# Version: 1.1 # -# Date: 2017 08 - 2017 11 +# Date: 2017 08 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout, +# use Biocmanager:: not biocLite() # 1.0 First live version # 0.1 First code copied from 2016 material. 
# @@ -29,13 +32,13 @@ #TOC> #TOC> Section Title Line #TOC> --------------------------------------------------------------- -#TOC> 1 Setup and data 43 -#TOC> 2 Functional Edges in the Human Proteome 80 -#TOC> 2.1 Cliques 123 -#TOC> 2.2 Communities 164 -#TOC> 2.3 Betweenness Centrality 178 -#TOC> 3 biomaRt 224 -#TOC> 4 Task for submission 295 +#TOC> 1 Setup and data 46 +#TOC> 2 Functional Edges in the Human Proteome 82 +#TOC> 2.1 Cliques 125 +#TOC> 2.2 Communities 166 +#TOC> 2.3 Betweenness Centrality 180 +#TOC> 3 biomaRt 226 +#TOC> 4 Task for submission 296 #TOC> #TOC> ========================================================================== @@ -45,9 +48,8 @@ # Not surprisingly, the analysis of PPI networks needs iGraph: -if (!require(igraph, quietly=TRUE)) { +if (! requireNamespace("igraph", quietly = TRUE)) { install.packages("igraph") - library(igraph) } # Package information: # library(help = igraph) # basic information @@ -88,9 +90,9 @@ head(STRINGedges) # Make a graph from this dataframe -?graph_from_data_frame +?igraph::graph_from_data_frame -gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE) +gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE) # CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges - # layout of such large graphs is possible, but requires specialized code. Google @@ -99,13 +101,13 @@ gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE) # Of course simple computations on this graph are reasonably fast: -compSTR <- components(gSTR) +compSTR <- igraph::components(gSTR) summary(compSTR) # our graph is fully connected! -hist(log(degree(gSTR)), col="#FEE0AF") +hist(log(igraph::degree(gSTR)), col="#FEE0AF") # this actually does look rather scale-free -(freqRank <- table(degree(gSTR))) +(freqRank <- table(igraph::degree(gSTR))) plot(log10(as.numeric(names(freqRank)) + 1), log10(as.numeric(freqRank)), type = "b", pch = 21, bg = "#FEE0AF", @@ -126,29 +128,29 @@ abline(regressionLine, col = "firebrick") # subgraph, i.e. a subgraph in which every node is connected to every other. # Biological complexes often appear as cliques in interaction graphs. -clique_num(gSTR) +igraph::clique_num(gSTR) # The largest clique has 63 members. -(C <- largest_cliques(gSTR)[[1]]) +(C <- igraph::largest_cliques(gSTR)[[1]]) # Pick one of the proteins and find out what this fully connected cluster of 63 # proteins is (you can simply Google for any of the IDs). Is this expected? # Plot this ... -R <- induced_subgraph(gSTR, C) # makes a graph from a selected set of vertices +R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices # color the vertices along a color spectrum -vCol <- rainbow(gorder(R)) # gorder(): order of a graph = number of nodes +vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes # color the edges to have the same color as the originating node eCol <- character() for (i in seq_along(vCol)) { - eCol <- c(eCol, rep(vCol[i], gorder(R))) + eCol <- c(eCol, rep(vCol[i], igraph::gorder(R))) } oPar <- par(mar= rep(0,4)) # Turn margins off plot(R, - layout = layout_in_circle(R), + layout = igraph::layout_in_circle(R), vertex.size = 3, vertex.color = vCol, edge.color = eCol, @@ -164,14 +166,14 @@ par(oPar) # == 2.2 Communities ======================================================= set.seed(112358) # set RNG seed for repeatable randomness -gSTRclusters <- cluster_infomap(gSTR) +gSTRclusters <- igraph::cluster_infomap(gSTR) set.seed(NULL) # reset the RNG -modularity(gSTRclusters) # ... 
measures how separated the different membership - # types are from each other -tMem <- table(membership(gSTRclusters)) +igraph::modularity(gSTRclusters) # ... measures how separated the different + # membership types are from each other +tMem <- table(igraph::membership(gSTRclusters)) length(tMem) # More than 2000 communities identified -hist(tMem, breaks = 50) # most clusters are small ... +hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ... range(tMem) # ... but one has > 100 members @@ -179,14 +181,14 @@ range(tMem) # ... but one has > 100 members # Let's find the nodes with the 10 - highest betweenness centralities. # -BC <- centr_betw(gSTR) +BC <- igraph::centr_betw(gSTR) # remember: BC$res contains the results head(BC$res) BC$res[1] # betweeness centrality of node 1 in the graph ... # ... which one is node 1? -V(gSTR)[1] +igraph::V(gSTR)[1] # to get the ten-highest nodes, we simply label the elements of BC with their # index ... @@ -203,7 +205,7 @@ head(sBC) # We can use the first ten labels to subset the nodes in gSTR and fetch the # IDs... -(ENSPsel <- names(V(gSTR)[BCsel])) +(ENSPsel <- names(igraph::V(gSTR)[BCsel])) # We are going to use these IDs to produce some output for a submitted task: # so I need you to personalize ENSPsel with the following @@ -231,12 +233,11 @@ set.seed(NULL) # reset the random number generator # day), simply a few lines of sample code to get you started on the specific use # case of retrieving descriptions for ensembl protein IDs. -if (!require(biomaRt, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("biomaRt") - library(biomaRt) +if (! requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +if (! requireNamespace("biomaRt", quietly = TRUE)) { + BiocManager::install("biomaRt") } # Package information: # library(help = biomaRt) # basic information @@ -244,14 +245,14 @@ if (!require(biomaRt, quietly=TRUE)) { # data(package = "biomaRt") # available datasets # define which dataset to use ... -myMart <- useMart("ensembl", dataset="hsapiens_gene_ensembl") +myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl") # what filters are defined? -(filters <- listFilters(myMart)) +(filters <- biomaRt::listFilters(myMart)) # and what attributes can we filter for? -(attributes <- listAttributes(myMart)) +(attributes <- biomaRt::listAttributes(myMart)) # Soooo many options - let's look for the correct name of filters that are @@ -259,18 +260,18 @@ myMart <- useMart("ensembl", dataset="hsapiens_gene_ensembl") filters[grep("ENSP", filters$description), ] # ... and the correct attribute names for gene symbols and descriptions ... -attributes[grep("symbol", attributes$description, ignore.case=TRUE), ] -attributes[grep("description", attributes$description, ignore.case=TRUE), ] +attributes[grep("symbol", attributes$description, ignore.case = TRUE), ] +attributes[grep("description", attributes$description, ignore.case = TRUE), ] # ... 
so we can put this together: here is a syntax example: -getBM(filters = "ensembl_peptide_id", - attributes = c("hgnc_symbol", - "wikigene_description", - "interpro_description", - "phenotype_description"), - values = "ENSP00000000442", - mart = myMart) +biomaRt::getBM(filters = "ensembl_peptide_id", + attributes = c("hgnc_symbol", + "wikigene_description", + "interpro_description", + "phenotype_description"), + values = "ENSP00000000442", + mart = myMart) # A simple loop will now get us the information for our 10 most central genes # from the human subset of STRING. @@ -279,13 +280,13 @@ CPdefs <- list() # Since we don't know how many matches one of our queries # will return, we'll put the result dataframes into a list. for (ID in ENSPsel) { - CPdefs[[ID]] <- getBM(filters = "ensembl_peptide_id", - attributes = c("hgnc_symbol", - "wikigene_description", - "interpro_description", - "phenotype_description"), - values = ID, - mart = myMart) + CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id", + attributes = c("hgnc_symbol", + "wikigene_description", + "interpro_description", + "phenotype_description"), + values = ID, + mart = myMart) } # So what are the proteins with the ten highest betweenness centralities? diff --git a/BIN-SEQA-Composition.R b/BIN-SEQA-Composition.R index 0942c56..4458ffd 100644 --- a/BIN-SEQA-Composition.R +++ b/BIN-SEQA-Composition.R @@ -3,13 +3,17 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-SEQA-Comparison unit # -# Version: 1.0 +# Version: 1.1 # -# Date: 2017 11 17 +# Date: 2017 11 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # -# V 1.0 First live version 2017 -# V 0.1 First code copied from BCH441_A03_makeYFOlist.R +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout, +# use Biocmanager:: not biocLite() +# Versions: +# 1.0 First live version 2017 +# 0.1 First code copied from BCH441_A03_makeYFOlist.R # # TODO: # @@ -25,26 +29,25 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> ---------------------------------------------------- -#TOC> 1 Preparation 41 -#TOC> 2 Aggregate properties 63 -#TOC> 3 Sequence Composition Enrichment 106 -#TOC> 3.1 Barplot, and side-by-side barplot 129 -#TOC> 3.2 Plotting ratios 164 -#TOC> 3.3 Plotting log ratios 180 -#TOC> 3.4 Sort by frequency 195 -#TOC> 3.5 Color by amino acid type 210 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> ---------------------------------------------------------- +#TOC> 1 Preparation 47 +#TOC> 2 Aggregate properties 68 +#TOC> 3 Sequence Composition Enrichment 111 +#TOC> 3.1 Barplot, and side-by-side barplot 134 +#TOC> 3.2 Plotting ratios 169 +#TOC> 3.3 Plotting log ratios 185 +#TOC> 3.4 Sort by frequency 200 +#TOC> 3.5 Color by amino acid type 215 +#TOC> #TOC> ========================================================================== # = 1 Preparation ========================================================= -if (!require(seqinr, quietly=TRUE)) { +if (! requireNamespace("seqinr", quietly = TRUE)) { install.packages("seqinr") - library(seqinr) } # Package information: # library(help = seqinr) # basic information @@ -66,7 +69,7 @@ if (!require(seqinr, quietly=TRUE)) { # Let's try a simple function from seqinr: computing the pI of the sequence -?computePI +?seqinr::computePI # This takes as input a vector of upper-case AA codes @@ -82,13 +85,13 @@ s <- unlist(s) # strsplit() returns a list! Why? 
# the function s2c() to convert strings into # character vectors (and c2s to convert them back). -s2c(mySeq) +seqinr::s2c(mySeq) -computePI(s2c(mySeq)) # isoelectric point -pmw(s2c(mySeq)) # molecular weight -AAstat(s2c(mySeq)) # This also plots the distribution of - # values along the sequence +seqinr::computePI(s2c(mySeq)) # isoelectric point +seqinr::pmw(s2c(mySeq)) # molecular weight +seqinr::AAstat(s2c(mySeq)) # This also plots the distribution of + # values along the sequence # A true Labor of Love has gone into the # compilation of the "aaindex" data: @@ -116,9 +119,9 @@ aaindex[[459]]$D # Let's construct an enrichment plot to compare average frequencies # with the amino acid counts in our sequence. -(refData <- aaindex[[459]]$I) # reference frequencies in % -names(refData) <- a(names(refData)) # change names to single-letter - # code using seqinr's "a()" function +(refData <- aaindex[[459]]$I) # reference frequencies in % +names(refData) <- seqinr::a(names(refData)) # change names to single-letter + # code using seqinr's "a()" function sum(refData) refData # ... in % diff --git a/BIN-Sequence.R b/BIN-Sequence.R index 61b0900..2861cb3 100644 --- a/BIN-Sequence.R +++ b/BIN-Sequence.R @@ -3,12 +3,15 @@ # Purpose: A Bioinformatics Course: # R code accompanying the BIN-Sequence unit. # -# Version: 1.3 +# Version: 1.4 # # Date: 2017 09 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.4 Change from require() to requireNamespace(), +# use ::() idiom throughout, +# use Biocmanager:: not biocLite() # 1.3 Update set.seed() usage # 1.2 Removed irrelevant task. How did that even get in there? smh # 1.1 Add chartr() @@ -30,23 +33,23 @@ #TOC> #TOC> Section Title Line #TOC> ---------------------------------------------------- -#TOC> 1 Prepare 60 -#TOC> 2 Storing Sequence 78 -#TOC> 3 String properties 107 -#TOC> 4 Substrings 114 -#TOC> 5 Creating strings: sprintf() 135 -#TOC> 6 Changing strings 170 -#TOC> 6.1.1 Changing case 172 -#TOC> 6.1.2 Reverse 177 -#TOC> 6.1.3 Change characters 181 -#TOC> 6.1.4 Substitute characters 209 -#TOC> 6.2 stringi and stringr 229 -#TOC> 6.3 dbSanitizeSequence() 239 -#TOC> 7 Permuting and sampling 251 -#TOC> 7.1 Permutations 258 -#TOC> 7.2 Sampling 304 -#TOC> 7.2.1 Equiprobable characters 306 -#TOC> 7.2.2 Defined probability vector 348 +#TOC> 1 Prepare 63 +#TOC> 2 Storing Sequence 80 +#TOC> 3 String properties 109 +#TOC> 4 Substrings 116 +#TOC> 5 Creating strings: sprintf() 137 +#TOC> 6 Changing strings 172 +#TOC> 6.1.1 Changing case 174 +#TOC> 6.1.2 Reverse 179 +#TOC> 6.1.3 Change characters 183 +#TOC> 6.1.4 Substitute characters 211 +#TOC> 6.2 stringi and stringr 231 +#TOC> 6.3 dbSanitizeSequence() 241 +#TOC> 7 Permuting and sampling 253 +#TOC> 7.1 Permutations 260 +#TOC> 7.2 Sampling 306 +#TOC> 7.2.1 Equiprobable characters 308 +#TOC> 7.2.2 Defined probability vector 350 #TOC> #TOC> ========================================================================== @@ -62,12 +65,11 @@ # Much basic sequence handling is supported by the Bioconductor package # Biostrings. -if (! require(Biostrings, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("Biostrings") - library(Biostrings) +if (! requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +if (! requireNamespace("Biostrings", quietly = TRUE)) { + BiocManager::install("Biostrings") } # Package information: # library(help = Biostrings) # basic information @@ -86,7 +88,7 @@ if (! 
require(Biostrings, quietly=TRUE)) { # ... or as more complex objects with rich metadata e.g. as a Biostrings # DNAstring, RNAstring, AAString, etc. -(a <- AAString("DIVMTQ")) +(a <- Biostrings::AAString("DIVMTQ")) # ... and all of these representations can be interconverted: @@ -314,6 +316,7 @@ N <- 100 set.seed(16818) # set RNG seed for repeatable randomness v <- sample(nuc, N, replace = TRUE) set.seed(NULL) # reset the RNG + (mySeq <- paste(v, collapse = "")) # What's the GC content? @@ -323,9 +326,8 @@ sum(table(v)[c("G", "C")]) # 51 is close to expected # What's the number of CpG motifs? Easy to check with the stringi # stri_match_all() function -if (! require(stringi, quietly=TRUE)) { +if (! requireNamespace("stringi", quietly = TRUE)) { install.packages("stringi") - library(stringi) } # Package information: # library(help = stringi) # basic information diff --git a/BIN-Storing_data.R b/BIN-Storing_data.R index e0cf4cf..fb9891b 100644 --- a/BIN-Storing_data.R +++ b/BIN-Storing_data.R @@ -27,30 +27,30 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> ----------------------------------------------------------------- -#TOC> 1 A Relational Datamodel in R: review 62 -#TOC> 1.1 Building a sample database structure 102 -#TOC> 1.1.1 completing the database 213 -#TOC> 1.2 Querying the database 248 -#TOC> 1.3 Task: submit for credit (part 1/2) 277 -#TOC> 2 Implementing the protein datamodel 289 -#TOC> 2.1 JSON formatted source data 315 -#TOC> 2.2 "Sanitizing" sequence data 355 -#TOC> 2.3 Create a protein table for our data model 375 -#TOC> 2.3.1 Initialize the database 377 -#TOC> 2.3.2 Add data 389 -#TOC> 2.4 Complete the database 409 -#TOC> 2.4.1 Examples of navigating the database 436 -#TOC> 2.5 Updating the database 468 -#TOC> 3 Add your own data 480 -#TOC> 3.1 Find a protein 488 -#TOC> 3.2 Put the information into JSON files 517 -#TOC> 3.3 Create an R script to create your own database 540 -#TOC> 3.3.1 Check and validate 560 -#TOC> 3.4 Task: submit for credit (part 2/2) 601 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> ----------------------------------------------------------------------- +#TOC> 1 A Relational Datamodel in R: review 57 +#TOC> 1.1 Building a sample database structure 97 +#TOC> 1.1.1 completing the database 208 +#TOC> 1.2 Querying the database 243 +#TOC> 1.3 Task: submit for credit (part 1/2) 272 +#TOC> 2 Implementing the protein datamodel 284 +#TOC> 2.1 JSON formatted source data 310 +#TOC> 2.2 "Sanitizing" sequence data 350 +#TOC> 2.3 Create a protein table for our data model 370 +#TOC> 2.3.1 Initialize the database 372 +#TOC> 2.3.2 Add data 384 +#TOC> 2.4 Complete the database 404 +#TOC> 2.4.1 Examples of navigating the database 431 +#TOC> 2.5 Updating the database 463 +#TOC> 3 Add your own data 475 +#TOC> 3.1 Find a protein 483 +#TOC> 3.2 Put the information into JSON files 512 +#TOC> 3.3 Create an R script to create your own database 535 +#TOC> 3.3.1 Check and validate 555 +#TOC> 3.4 Task: submit for credit (part 2/2) 596 +#TOC> #TOC> ========================================================================== @@ -205,7 +205,7 @@ str(philDB) # go back, re-read, play with it, and ask for help. This is essential. -# === 1.1.1 completing the database +# === 1.1.1 completing the database # Next I'll add one more person, and create the other two tables: @@ -328,11 +328,11 @@ file.show("./data/MBP1_SACCE.json") # sanitize the sequence at some point. 
But since we need to do that # anyway, it is easier to see the whole sequence if we store it in chunks. -# Let's load the "jsonlite" package and have a look at how it reads this data. +# Let's make sure the "jsonlite" package exists on your computer, then we'll +# explore how it reads this data. -if (! require(jsonlite, quietly=TRUE)) { +if (! requireNamespace("jsonlite", quietly = TRUE)) { install.packages("jsonlite") - library(jsonlite) } # Package information: # library(help = jsonlite) # basic information @@ -340,7 +340,7 @@ if (! require(jsonlite, quietly=TRUE)) { # data(package = "jsonlite") # available datasets -x <- fromJSON("./data/MBP1_SACCE.json") +x <- jsonlite::fromJSON("./data/MBP1_SACCE.json") str(x) x$name @@ -369,7 +369,7 @@ dbSanitizeSequence(x) # == 2.3 Create a protein table for our data model ========================= -# === 2.3.1 Initialize the database +# === 2.3.1 Initialize the database # The function dbInit contains all the code to return a list of empty @@ -381,7 +381,7 @@ myDB <- dbInit() str(myDB) -# === 2.3.2 Add data +# === 2.3.2 Add data # fromJSON() returns a dataframe that we can readily process to add data @@ -389,7 +389,7 @@ str(myDB) dbAddProtein -myDB <- dbAddProtein(myDB, fromJSON("./data/MBP1_SACCE.json")) +myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json")) str(myDB) # Lets check that the 833 amino acids of the yeast MBP1 sequence have @@ -428,7 +428,7 @@ source("./scripts/ABC-createRefDB.R") str(myDB) -# === 2.4.1 Examples of navigating the database +# === 2.4.1 Examples of navigating the database # You can look at the contents of the tables in the usual way we access @@ -552,7 +552,7 @@ myDB$taxonomy$species[sel] # in any of the JSON files. Later you will add more information ... -# === 3.3.1 Check and validate +# === 3.3.1 Check and validate # Is your protein named according to the pattern "MBP1_MYSPE"? It should be. diff --git a/FND-Genetic_code.R b/FND-Genetic_code.R index 77142fb..5220449 100644 --- a/FND-Genetic_code.R +++ b/FND-Genetic_code.R @@ -3,12 +3,15 @@ # Purpose: A Bioinformatics Course: # R code accompanying the FND-Genetic_code unit. # -# Version: 1.0.1 +# Version: 1.1 # -# Date: 2017 10 12 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout, +# use Biocmanager:: not biocLite() # 1.0.1 Comment on "incomplete final line" warning in FASTA # 1.0 First live version # @@ -25,17 +28,17 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> ---------------------------------------------------------- -#TOC> 1 Storing the genetic code 47 -#TOC> 1.1 Genetic code in Biostrings 65 -#TOC> 2 Working with the genetic code 97 -#TOC> 2.1 Translate a sequence. 126 -#TOC> 3 An alternative representation: 3D array 208 -#TOC> 3.1 Print a Genetic code table 241 -#TOC> 4 Tasks 267 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> ---------------------------------------------------------------- +#TOC> 1 Storing the genetic code 45 +#TOC> 1.1 Genetic code in Biostrings 63 +#TOC> 2 Working with the genetic code 94 +#TOC> 2.1 Translate a sequence. 129 +#TOC> 3 An alternative representation: 3D array 212 +#TOC> 3.1 Print a Genetic code table 246 +#TOC> 4 Tasks 272 +#TOC> #TOC> ========================================================================== @@ -63,12 +66,11 @@ x["TAA"] # available in the Bioconductor "Biostrings" package: -if (! 
require(Biostrings, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("Biostrings") - library(Biostrings) +if (! requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +if (! requireNamespace("Biostrings", quietly = TRUE)) { + BiocManager::install("Biostrings") } # Package information: # library(help = Biostrings) # basic information @@ -77,45 +79,51 @@ if (! require(Biostrings, quietly=TRUE)) { # The standard genetic code vector -GENETIC_CODE +Biostrings::GENETIC_CODE # The table of genetic codes. This information corresponds to this page # at the NCBI: # https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes -GENETIC_CODE_TABLE +Biostrings::GENETIC_CODE_TABLE # Most of the alternative codes are mitochondrial codes. The id of the # Alternative Yeast Nuclear code is "12" -getGeneticCode("12") # Alternative Yeast Nuclear +Biostrings::getGeneticCode("12") # Alternative Yeast Nuclear # = 2 Working with the genetic code ======================================= -# GENETIC_CODE is a "named vector" +# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it +# to a "local" variable, rather than retrieving it from the package all the +# time. -str(GENETIC_CODE) +genCode <- Biostrings::GENETIC_CODE + +# This is a named vector of characters ... + +str(genCode) # ... which also stores the alternative initiation codons TTG and CTG in # an attribute of the vector. (Alternative initiation codons sometimes are # used instead of ATG to intiate translation, if if not ATG they are translated # with fMet.) -attr(GENETIC_CODE, "alt_init_codons") +attr(genCode, "alt_init_codons") # But the key to use this vector is in the "names" which we use for subsetting # the list of amino acids in whatever way we need. -names(GENETIC_CODE) +names(genCode) # The translation of "TGG" ... -GENETIC_CODE["TGG"] +genCode["TGG"] # All stop codons -names(GENETIC_CODE)[GENETIC_CODE == "*"] +names(genCode)[genCode == "*"] # All start codons -names(GENETIC_CODE)[GENETIC_CODE == "M"] # ... or -c(names(GENETIC_CODE)[GENETIC_CODE == "M"], - attr(GENETIC_CODE, "alt_init_codons")) +names(genCode)[genCode == "M"] # ... or +c(names(genCode)[genCode == "M"], + attr(genCode, "alt_init_codons")) # == 2.1 Translate a sequence. ============================================= @@ -165,7 +173,7 @@ nchar(mbp1)/3 # attributes that are useful for Biostrings. Thus we convert the sequence first # with DNAstring(), then split it up, then convert it into a plain # character vector. -mbp1Codons <- as.character(codons(DNAString(mbp1))) +mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1))) head(mbp1Codons) @@ -173,7 +181,7 @@ head(mbp1Codons) mbp1AA <- character(834) for (i in seq_along(mbp1Codons)) { - mbp1AA[i] <- GENETIC_CODE[mbp1Codons[i]] + mbp1AA[i] <- genCode[mbp1Codons[i]] } head(mbp1Codons) @@ -196,7 +204,8 @@ sort(table(mbp1AA), decreasing = TRUE) mbp1AA <- mbp1AA[-(length(mbp1AA))] tail(mbp1AA) # Note the stop is gone! -# paste it together, collapsing the elements without separation-character +# paste it together, collapsing the elements using an empty string as the +# separation-character (i.e.: nothing) (Mbp1 <- paste(mbp1AA, sep = "", collapse = "")) @@ -204,14 +213,15 @@ tail(mbp1AA) # Note the stop is gone! 
# We don't use 3D arrays often - usually just 2D tables and data frames, so -# here is a good opportunity to review the syntax with a genetic code cube: +# here is a good opportunity to review the syntax of 3D arrays with a +# genetic code cube: -# Initialize, using A C G T as the names of the elements in each dimension +# Initialize, using A G C T as the names of the elements in each dimension cCube <- array(data = character(64), dim = c(4, 4, 4), - dimnames = list(c("A", "C", "G", "T"), - c("A", "C", "G", "T"), - c("A", "C", "G", "T"))) + dimnames = list(c("A", "G", "C", "T"), + c("A", "G", "C", "T"), + c("A", "G", "C", "T"))) # fill it with amino acid codes using three nested loops for (i in 1:4) { @@ -222,7 +232,7 @@ for (i in 1:4) { dimnames(cCube)[[3]][k], sep = "", collapse = "") - cCube[i, j, k] <- GENETIC_CODE[myCodon] + cCube[i, j, k] <- genCode[myCodon] } } } @@ -290,25 +300,25 @@ for (i in nuc) { # Solution: - # Fetch the code - GENETIC_CODE_TABLE - GENETIC_CODE_TABLE$name[GENETIC_CODE_TABLE$id == "12"] - altYcode <- getGeneticCode("12") + # Fetch the code + Biostrings::GENETIC_CODE_TABLE + Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"] + altYcode <- Biostrings::getGeneticCode("12") - # what's the difference? - (delta <- which(GENETIC_CODE != altYcode)) + # what's the difference? + (delta <- which(Biostrings::GENETIC_CODE != altYcode)) - GENETIC_CODE[delta] - altYcode[delta] + Biostrings::GENETIC_CODE[delta] + altYcode[delta] - # translate - altYAA <- character(834) - for (i in seq_along(mbp1Codons)) { - altYAA[i] <- altYcode[mbp1Codons[i]] - } + # translate + altYAA <- character(834) + for (i in seq_along(mbp1Codons)) { + altYAA[i] <- altYcode[mbp1Codons[i]] + } - table(mbp1AA) - table(altYAA) + table(mbp1AA) + table(altYAA) # Task: The genetic code has significant redundacy, i.e. there are up to six # codons that code for the same amino acid. Write code that lists how @@ -319,7 +329,7 @@ for (i in nuc) { # # # Solution: -table(table(GENETIC_CODE)) +table(table(Biostrings::GENETIC_CODE)) # [END] diff --git a/FND-MAT-Graphs_and_networks.R b/FND-MAT-Graphs_and_networks.R index c2b3be0..2348f12 100644 --- a/FND-MAT-Graphs_and_networks.R +++ b/FND-MAT-Graphs_and_networks.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the FND-MAT-Graphs_and_networks unit. # -# Version: 1.1 +# Version: 1.2 # # Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.2 Change from require() to requireNamespace(), +# use ::() idiom throughout # 1.1 Update set.seed() usage # 1.0 First final version for learning units. # 0.1 First code copied from 2016 material. 
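# Before the graphs unit starts, one more look back at the genetic code: the
# named-vector subsetting used above also translates a whole codon vector in
# a single vectorized step, without a loop. A small sketch - the codons are a
# toy example, and genCode is re-assigned here only so the snippet is
# self-contained:

genCode  <- Biostrings::GENETIC_CODE
myCodons <- c("ATG", "TCT", "AAC", "CAA", "TAA")  # a toy "ORF"
genCode[myCodons]                                 # vectorized lookup
paste0(genCode[myCodons], collapse = "")          # "MSNQ*"

# ... and the redundancy of the code counted per amino acid, rather than per
# degeneracy class:
sort(table(genCode), decreasing = TRUE)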
@@ -30,17 +32,17 @@ #TOC> #TOC> Section Title Line #TOC> ------------------------------------------------------------ -#TOC> 1 Review 48 -#TOC> 2 DEGREE DISTRIBUTIONS 201 -#TOC> 2.1 Random graph 207 -#TOC> 2.2 scale-free graph (Barabasi-Albert) 255 -#TOC> 2.3 Random geometric graph 320 -#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 442 -#TOC> 3.1 Basics 445 -#TOC> 3.2 Components 517 -#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 536 -#TOC> 4.1 Diameter 573 -#TOC> 5 GRAPH CLUSTERING 641 +#TOC> 1 Review 50 +#TOC> 2 DEGREE DISTRIBUTIONS 204 +#TOC> 2.1 Random graph 210 +#TOC> 2.2 scale-free graph (Barabasi-Albert) 258 +#TOC> 2.3 Random geometric graph 323 +#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 445 +#TOC> 3.1 Basics 448 +#TOC> 3.2 Components 520 +#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 539 +#TOC> 4.1 Diameter 576 +#TOC> 5 GRAPH CLUSTERING 645 #TOC> #TOC> ========================================================================== @@ -123,9 +125,8 @@ set.seed(NULL) # reset the RNG # standard package for work with graphs in r is "igraph". We'll go into more # details of the igraph package a bit later, for now we just use it to plot: -if (! require(igraph, quietly=TRUE)) { +if (! requireNamespace("igraph", quietly = TRUE)) { install.packages("igraph") - library(igraph) } # Package information: # library(help = igraph) # basic information @@ -133,10 +134,12 @@ if (! require(igraph, quietly=TRUE)) { # data(package = "igraph") # available datasets -myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected") +myG <- igraph::graph_from_adjacency_matrix(myRandAM, mode = "undirected") set.seed(112358) # set RNG seed for repeatable randomness -myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates + # calculate layout coordinates +myGxy <- igraph::layout_with_graphopt(myG, + charge=0.0012) set.seed(NULL) # reset the RNG @@ -157,9 +160,9 @@ plot(myG, rescale = FALSE, xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), - vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1], - vertex.size = 1600 + (300 * degree(myG)), - vertex.label = sprintf("%s(%i)", names(V(myG)), degree(myG)), + vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1], + vertex.size = 1600 + (300 * igraph::degree(myG)), + vertex.label = sprintf("%s(%i)", names(igraph::V(myG)), igraph::degree(myG)), vertex.label.family = "sans", vertex.label.cex = 0.7) par(oPar) # reset plot window @@ -179,10 +182,10 @@ sum(myRandAM) rowSums(myRandAM) + colSums(myRandAM) # check this against the plot! 
# The function degree() gives the same values -degree(myG) +igraph::degree(myG) # Let's plot the degree distribution in a histogram: -degG <- degree(myG) +degG <- igraph::degree(myG) brk <- seq(min(degG)-0.5, max(degG)+0.5, by=1) # define histogram breaks hist(degG, breaks=brk, col="#A5CCF5", xlim = c(-1,8), xaxt = "n", @@ -212,9 +215,9 @@ set.seed(31415927) # set RNG seed for repeatable randomness my200AM <- makeRandomAM(as.character(1:N), p = 0.015) set.seed(NULL) # reset the RNG -myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected") -myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout - # coordinates +myG200 <- igraph::graph_from_adjacency_matrix(my200AM, mode = "undirected") +myGxy <- igraph::layout_with_graphopt(myG200, charge=0.0001) # calculate layout + # coordinates oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state plot(myG200, @@ -222,8 +225,8 @@ plot(myG200, rescale = FALSE, xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), - vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1], - vertex.size = 150 + (60 * degree(myG200)), + vertex.color=heat.colors(max(igraph::degree(myG200)+1))[igraph::degree(myG200)+1], + vertex.size = 150 + (60 * igraph::degree(myG200)), vertex.label = NA) par(oPar) # restore graphics state @@ -231,7 +234,7 @@ par(oPar) # restore graphics state # biological graphs look approximately like this. # Calculate degree distributions -dg <- degree(myG200) +dg <- igraph::degree(myG200) brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1) hist(dg, breaks=brk, col="#A5F5CC", xlim = c(-1,11), xaxt = "n", @@ -263,10 +266,10 @@ plot(log10(as.numeric(names(freqRank)) + 1), N <- 200 set.seed(31415927) # set RNG seed for repeatable randomness -GBA <- sample_pa(N, power = 0.8, directed = FALSE) +GBA <- igraph::sample_pa(N, power = 0.8, directed = FALSE) set.seed(NULL) # reset the RNG -GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates +GBAxy <- igraph::layout_with_graphopt(GBA, charge=0.0001) oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state plot(GBA, @@ -274,8 +277,8 @@ plot(GBA, rescale = FALSE, xlim = c(min(GBAxy[,1]) * 0.99, max(GBAxy[,1]) * 1.01), ylim = c(min(GBAxy[,2]) * 0.99, max(GBAxy[,2]) * 1.01), - vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1], - vertex.size = 200 + (30 * degree(GBA)), + vertex.color=heat.colors(max(igraph::degree(GBA)+1))[igraph::degree(GBA)+1], + vertex.size = 200 + (30 * igraph::degree(GBA)), vertex.label = NA) par(oPar) # restore grphics state @@ -287,7 +290,7 @@ par(oPar) # restore grphics state # singletons. # What's the degree distribution of this graph? -(dg <- degree(GBA)) +(dg <- igraph::degree(GBA)) brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1) hist(dg, breaks=brk, col="#DCF5B5", xlim = c(0,max(dg)+1), xaxt = "n", @@ -307,8 +310,8 @@ plot(log10(as.numeric(names(freqRank)) + 1), # Sort-of linear, but many of the higher ranked nodes have a frequency of only # one. 
That behaviour smooths out in larger graphs: # -X <- sample_pa(100000, power = 0.8, directed = FALSE) # 100,000 nodes -freqRank <- table(degree(X)) +X <- igraph::sample_pa(1e5, power = 0.8, directed = FALSE) # 100,000 nodes +freqRank <- table(igraph::degree(X)) plot(log10(as.numeric(names(freqRank)) + 1), log10(as.numeric(freqRank)), type = "b", xlab = "log(Rank)", ylab = "log(frequency)", @@ -367,7 +370,7 @@ makeRandomGeometricAM <- function(nam, B = 25, Q = 0.001, t = 0.6) { for (iCol in (iRow+1):N) { # geometric distance ... d <- sqrt((AM$x[iRow] - AM$x[iCol])^2 + - (AM$y[iRow] - AM$y[iCol])^2) # Pythagoras + (AM$y[iRow] - AM$y[iCol])^2) # Pythagoras # distance dependent probability p <- 1 - 1/((1 + (Q * (exp(-B * (d-t)))))^(1 / nu)) if (runif(1) < p) { @@ -404,7 +407,7 @@ rGAM <- makeRandomGeometricAM(as.character(1:N), t = 0.4) set.seed(NULL) # reset the RNG -myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected") +myGRG <- igraph::graph_from_adjacency_matrix(rGAM$mat, mode = "undirected") oPar <- par(mar= rep(0,4)) # Turn margins off plot(myGRG, @@ -412,13 +415,13 @@ plot(myGRG, rescale = FALSE, xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1), ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1), - vertex.color=heat.colors(max(degree(myGRG)+1))[degree(myGRG)+1], - vertex.size = 0.1 + (0.2 * degree(myGRG)), + vertex.color=heat.colors(max(igraph::degree(myGRG)+1))[igraph::degree(myGRG)+1], + vertex.size = 0.1 + (0.2 * igraph::degree(myGRG)), vertex.label = NA) par(oPar) # degree distribution: -(dg <- degree(myGRG)) +(dg <- igraph::degree(myGRG)) brk <- seq(min(dg) - 0.5, max(dg) + 0.5, by = 1) hist(dg, breaks = brk, col = "#FCC6D2", xlim = c(0, 25), xaxt = "n", @@ -450,7 +453,7 @@ summary(myG) # This output means: this is an IGRAPH graph, with U = UN-directed edges # and N = named nodes, that has 20 nodes and 20 edges. For details, see -?print.igraph +?igraph::print.igraph mode(myG) class(myG) @@ -463,11 +466,11 @@ class(myG) # recipes, called _games_ in this package. # Two basic functions retrieve nodes "Vertices", and "Edges": -V(myG) -E(myG) +igraph::V(myG) +igraph::E(myG) # additional properties can be retrieved from the Vertices ... -V(myG)$name +igraph::V(myG)$name # As with many R objects, loading the package provides special functions that @@ -487,12 +490,12 @@ plot(myG) # this is the result of default plot parameters # Plot with some customizing parameters oPar <- par(mar= rep(0,4)) # Turn margins off plot(myG, - layout = layout_with_fr(myG), - vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1], - vertex.size = 9 + (2 * degree(myG)), - vertex.label.cex = 0.5 + (0.05 * degree(myG)), + layout = igraph::layout_with_fr(myG), + vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1], + vertex.size = 9 + (2 * igraph::degree(myG)), + vertex.label.cex = 0.5 + (0.05 * igraph::degree(myG)), edge.width = 2, - vertex.label = V(myG)$name, + vertex.label = igraph::V(myG)$name, vertex.label.family = "sans", vertex.label.cex = 0.9) par(oPar) @@ -500,12 +503,12 @@ par(oPar) # ... 
or with a different layout: oPar <- par(mar= rep(0,4)) # Turn margins off plot(myG, - layout = layout_in_circle(myG), - vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1], - vertex.size = 9 + (2 * degree(myG)), - vertex.label.cex = 0.5 + (0.05 * degree(myG)), + layout = igraph::layout_in_circle(myG), + vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1], + vertex.size = 9 + (2 * igraph::degree(myG)), + vertex.label.cex = 0.5 + (0.05 * igraph::degree(myG)), edge.width = 2, - vertex.label = V(myG)$name, + vertex.label = igraph::V(myG)$name, vertex.label.family = "sans", vertex.label.cex = 0.9) par(oPar) @@ -518,18 +521,18 @@ par(oPar) # The igraph function components() tells us whether there are components of the # graph in which there is no path to other components. -components(myG) +igraph::components(myG) # In the _membership_ vector, nodes are annotated with the index of the # component they are part of. Sui7 is the only node of component 2, Cyj1 is in # the third component etc. This is perhaps more clear if we sort by component # index -sort(components(myG)$membership, decreasing = TRUE) +sort(igraph::components(myG)$membership, decreasing = TRUE) # Retrieving e.g. the members of the first component from the list can be done by subsetting: -(sel <- components(myG)$membership == 1) # boolean vector .. -(c1 <- components(myG)$membership[sel]) +(sel <- igraph::components(myG)$membership == 1) # boolean vector .. +(c1 <- igraph::components(myG)$membership[sel]) names(c1) @@ -542,9 +545,9 @@ names(c1) # preferential-attachment ... but igraph has ways to simulate the basic ones # (and we could easily simulate our own). Look at the following help pages: -?sample_gnm # see also sample_gnp for the Erdös-Rényi models -?sample_smallworld # for the Watts & Strogatz model -?sample_pa # for the Barabasi-Albert model +?igraph::sample_gnm # see also sample_gnp for the Erdös-Rényi models +?igraph::sample_smallworld # for the Watts & Strogatz model +?igraph::sample_pa # for the Barabasi-Albert model # But note that there are many more sample_ functions. Check out the docs! @@ -554,7 +557,7 @@ names(c1) # layout drawas them, obviously. set.seed(112358) # set RNG seed for repeatable randomness -myGxy <- layout_with_fr(myG) # calculate layout coordinates +myGxy <- igraph::layout_with_fr(myG) # calculate layout coordinates set.seed(NULL) # reset the RNG oPar <- par(mar = rep(0, 4)) # turn margins off, save graphics state @@ -563,30 +566,31 @@ plot(myG, rescale = FALSE, xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), - vertex.color=heat.colors(max(degree(myG) + 1))[degree(myG) + 1], - vertex.size = 20 + (10 * degree(myG)), - vertex.label = V(myG)$name, + vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1], + vertex.size = 20 + (10 * igraph::degree(myG)), + vertex.label = igraph::V(myG)$name, vertex.label.family = "sans", vertex.label.cex = 0.8) par(oPar) # restore graphics state # == 4.1 Diameter ========================================================== -diameter(myG) # The diameter of a graph is its maximum length shortest path. +igraph::diameter(myG) # The diameter of a graph is its maximum length + # shortest path. # let's plot this path: here are the nodes ... -get_diameter(myG) +igraph::get_diameter(myG) # ... and we can get the x, y coordinates from iGxy by subsetting with the node # names. 
The we draw the diameter-path with a transparent, thick pink line: -lines(myGxy[get_diameter(myG),], lwd=10, col="#ff63a788") +lines(myGxy[igraph::get_diameter(myG),], lwd=10, col="#ff63a788") # == Centralization scores -?centralize +?igraph::centralize # replot our graph, and color by log_betweenness: -bC <- centr_betw(myG) # calculate betweenness centrality +bC <- igraph::centr_betw(myG) # calculate betweenness centrality nodeBetw <- bC$res nodeBetw <- round(log(nodeBetw +1)) + 1 @@ -597,8 +601,8 @@ plot(myG, xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), vertex.color=heat.colors(max(nodeBetw))[nodeBetw], - vertex.size = 20 + (10 * degree(myG)), - vertex.label = V(myG)$name, + vertex.size = 20 + (10 * igraph::degree(myG)), + vertex.label = igraph::V(myG)$name, vertex.label.family = "sans", vertex.label.cex = 0.7) par(oPar) @@ -613,7 +617,7 @@ par(oPar) # # Lets plot betweenness centrality for our random geometric graph: -bCmyGRG <- centr_betw(myGRG) # calculate betweenness centrality +bCmyGRG <- igraph::centr_betw(myGRG) # calculate betweenness centrality nodeBetw <- bCmyGRG$res nodeBetw <- round((log(nodeBetw +1))^2.5) + 1 @@ -630,9 +634,9 @@ plot(myGRG, vertex.label = NA) par(oPar) -diameter(myGRG) -lines(rGAM$x[get_diameter(myGRG)], - rGAM$y[get_diameter(myGRG)], +igraph::diameter(myGRG) +lines(rGAM$x[igraph::get_diameter(myGRG)], + rGAM$y[igraph::get_diameter(myGRG)], lwd = 10, col = "#ff335533") @@ -648,11 +652,11 @@ lines(rGAM$x[get_diameter(myGRG)], # http://www.ncbi.nlm.nih.gov/pubmed/18216267 and htttp://www.mapequation.org -myGRGclusters <- cluster_infomap(myGRG) -modularity(myGRGclusters) # ... measures how separated the different membership - # types are from each other -membership(myGRGclusters) # which nodes are in what cluster? -table(membership(myGRGclusters)) # how large are the clusters? +myGRGclusters <- igraph::cluster_infomap(myGRG) +igraph::modularity(myGRGclusters) # ... measures how separated the different + # membership types are from each other +igraph::membership(myGRGclusters) # which nodes are in what cluster? +table(igraph::membership(myGRGclusters)) # how large are the clusters? # The largest cluster has 48 members, the second largest has 25, etc. @@ -661,7 +665,7 @@ table(membership(myGRGclusters)) # how large are the clusters? # their cluster membership: # first, make a vector with as many grey colors as we have communities ... -commColors <- rep("#f1eef6", max(membership(myGRGclusters))) +commColors <- rep("#f1eef6", max(igraph::membership(myGRGclusters))) # ... then overwrite the first five with "real colors" - something like rust, # lilac, pink, and mauve or so. commColors[1:5] <- c("#980043", "#dd1c77", "#df65b0", "#c994c7", "#d4b9da") @@ -673,8 +677,8 @@ plot(myGRG, rescale = FALSE, xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1), ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1), - vertex.color=commColors[membership(myGRGclusters)], - vertex.size = 0.1 + (0.1 * degree(myGRG)), + vertex.color=commColors[igraph::membership(myGRGclusters)], + vertex.size = 0.1 + (0.1 * igraph::degree(myGRG)), vertex.label = NA) par(oPar) diff --git a/FND-STA-Probability_distribution.R b/FND-STA-Probability_distribution.R index 8b32f63..e77dc01 100644 --- a/FND-STA-Probability_distribution.R +++ b/FND-STA-Probability_distribution.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the FND-STA-Probability_distribution unit. 
# -# Version: 1.2 +# Version: 1.3 # # Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.3 Change from require() to requireNamespace(), +# use ::() idiom throughout, # 1.2 Update set.seed() usage # 1.1 Corrected empirical p-value # 1.0 First code live version @@ -26,24 +28,24 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> ------------------------------------------------------------------------- -#TOC> 1 Introduction 50 -#TOC> 2 Three fundamental distributions 113 -#TOC> 2.1 The Poisson Distribution 116 -#TOC> 2.2 The uniform distribution 170 -#TOC> 2.3 The Normal Distribution 190 -#TOC> 3 quantile-quantile comparison 231 -#TOC> 3.1 qqnorm() 241 -#TOC> 3.2 qqplot() 307 -#TOC> 4 Quantifying the difference 324 -#TOC> 4.1 Chi2 test for discrete distributions 359 -#TOC> 4.2 Kullback-Leibler divergence 451 -#TOC> 4.2.1 An example from tossing dice 462 -#TOC> 4.2.2 An example from lognormal distributions 585 -#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 628 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> ----------------------------------------------------------------------------- +#TOC> 1 Introduction 52 +#TOC> 2 Three fundamental distributions 115 +#TOC> 2.1 The Poisson Distribution 118 +#TOC> 2.2 The uniform distribution 172 +#TOC> 2.3 The Normal Distribution 192 +#TOC> 3 quantile-quantile comparison 233 +#TOC> 3.1 qqnorm() 243 +#TOC> 3.2 qqplot() 309 +#TOC> 4 Quantifying the difference 326 +#TOC> 4.1 Chi2 test for discrete distributions 361 +#TOC> 4.2 Kullback-Leibler divergence 452 +#TOC> 4.2.1 An example from tossing dice 463 +#TOC> 4.2.2 An example from lognormal distributions 586 +#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 629 +#TOC> #TOC> ========================================================================== @@ -385,9 +387,8 @@ hist(rG1.5, breaks = myBreaks, col = myCols[4]) # package information - plotrix has _many_ useful utilities to enhance # plots or produce informative visualizations. -if (! require(plotrix, quietly=TRUE)) { +if (! requireNamespace("plotrix", quietly = TRUE)) { install.packages("plotrix") - library(plotrix) } # Package information: # library(help = plotrix) # basic information @@ -395,9 +396,9 @@ if (! require(plotrix, quietly=TRUE)) { # data(package = "plotrix") # available datasets -h <- multhist(list(rL1, rL2, rG1.2, rG1.5, rG1.9 ), - breaks = myBreaks, - col = myCols) +h <- plotrix::multhist(list(rL1, rL2, rG1.2, rG1.5, rG1.9 ), + breaks = myBreaks, + col = myCols) legend("topright", legend = c("rL1", "rL2", "rG1.2", "rG1.5", "rG1.9"), pch=15, @@ -459,7 +460,7 @@ chisq.test(countsL1, countsG1.9, simulate.p.value = TRUE, B = 10000) # be applied to discrete distributions. But we need to talk a bit about # converting counts to p.m.f.'s. -# === 4.2.1 An example from tossing dice +# === 4.2.1 An example from tossing dice # The p.m.f of an honest die is (1:1/6, 2:1/6, 3:1/6, 4:1/6, 5:1/6, 6:1/6). But # there is an issue when we convert sampled counts to frequencies, and estimate @@ -582,7 +583,7 @@ abline(v = KLdiv(rep(1/6, 6), pmfPC(counts, 1:6)), col="firebrick") # somewhat but not drastically atypical. 
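# The divergence used in the dice example can be written out directly from
# its definition - a two-line sketch for orientation only; the course's own
# KLdiv() and pmfPC() helpers live in the ./scripts utilities and may differ
# in their details:

klDiv <- function(p, q) {
  # p, q: discrete p.m.f.s over the same outcomes, all elements > 0
  sum(p * log(p / q))
}

pFair   <- rep(1/6, 6)                        # an honest die
obs     <- c(10, 11, 9, 12, 8, 10)            # some observed tosses
pObs    <- (obs + 0.5) / sum(obs + 0.5)       # pseudocounts: no zero bins
pLoaded <- c(0.3, 0.3, 0.1, 0.1, 0.1, 0.1)    # a clearly loaded die

klDiv(pFair, pObs)                            # small: close to expectation
klDiv(pFair, pLoaded)                         # substantially larger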
-# === 4.2.2 An example from lognormal distributions +# === 4.2.2 An example from lognormal distributions # We had compared a set of lognormal and gamma distributions above, now we # can use KL-divergence to quantify their similarity: diff --git a/RPR-Biostrings.R b/RPR-Biostrings.R index 3a68b25..b24c08e 100644 --- a/RPR-Biostrings.R +++ b/RPR-Biostrings.R @@ -3,12 +3,15 @@ # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Biostrings unit. # -# Version: 1.0 +# Version: 1.1 # -# Date: 2017 10 20 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout, +# use Biocmanager:: not biocLite() # 1.0 2017 Revisions # 0.1 First code copied from 2016 material. # @@ -27,20 +30,20 @@ #TOC> ========================================================================== #TOC> -#TOC> Section Title Line -#TOC> --------------------------------------------------------- -#TOC> 1 The Biostrings Package 52 -#TOC> 2 Getting Data into Biostrings Objects 85 -#TOC> 3 Working with Biostrings Objects 106 -#TOC> 3.1 Properties 109 -#TOC> 3.2 Subsetting 146 -#TOC> 3.3 Operators 158 -#TOC> 3.4 Transformations 165 -#TOC> 4 Getting Data out of Biostrings Objects 172 -#TOC> 5 More 181 -#TOC> 5.1 Views 183 -#TOC> 5.2 Iranges 195 -#TOC> 5.3 StringSets 201 +#TOC> Section Title Line +#TOC> --------------------------------------------------------------- +#TOC> 1 The Biostrings Package 55 +#TOC> 2 Getting Data into Biostrings Objects 86 +#TOC> 3 Working with Biostrings Objects 108 +#TOC> 3.1 Properties 125 +#TOC> 3.2 Subsetting 163 +#TOC> 3.3 Operators 175 +#TOC> 3.4 Transformations 182 +#TOC> 4 Getting Data out of Biostrings Objects 189 +#TOC> 5 More 198 +#TOC> 5.1 Views 200 +#TOC> 5.2 Iranges 214 +#TOC> 5.3 StringSets 220 #TOC> #TOC> ========================================================================== @@ -54,14 +57,12 @@ # First, we install and load the Biostrings package from bioconductor -if (! require(Biostrings, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("Biostrings") - library(Biostrings) +if (! requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +if (! requireNamespace("Biostrings", quietly = TRUE)) { + BiocManager::install("Biostrings") } - # Examine the package information: library(help = Biostrings) # basic information browseVignettes("Biostrings") # available vignettes @@ -72,72 +73,88 @@ data(package = "Biostrings") # available datasets # of a "class" in R as a special kind of list), that can take on particular # flavours for RNA, DNA or amino acid sequence information. -class(RNAString("AUG")) -class(DNAString("ATG")) -class(AAString("M")) +class(Biostrings::RNAString("AUG")) +class(Biostrings::DNAString("ATG")) +class(Biostrings::AAString("M")) # An essential property of Biostrings objects is that they only allow letters # from the applicable IUPAC alphabet: -RNAString("AUG") -DNAString("AUG") # Error! No "U" in IUPAC DNA codes +Biostrings::RNAString("AUG") +Biostrings::DNAString("AUG") # Error! No "U" in IUPAC DNA codes # = 2 Getting Data into Biostrings Objects ================================ # Example: read FASTA. Extract sequence. Convert to DNAString object. 
-x <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") -x <- dbSanitizeSequence(x) -myDNAseq <- DNAString(x) # takes the nucleotide sequence and converts into a -# object of class DNAstring +rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") +rawSeq <- dbSanitizeSequence(rawSeq) +biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence + # into an object of class DNAstring # Multi FASTA files can be read directly as a "XStringSet) ... -(myDNASet <- readDNAStringSet("./data/S288C_YDL056W_MBP1_coding.fsa")) +rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa" +(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile)) # ... and if you subset one sequence from the set, you get an XString object # back again. -(Xseq <- myDNASet[[1]]) +(Xseq <- biosDNASet[[1]]) -myDNAseq == Xseq # the comparison evaluates to TRUE ... -identical(myDNAseq, Xseq) # ... and indeed the objects are deemed identical. +biosDNAseq == Xseq # the comparison evaluates to TRUE ... +identical(biosDNAseq, Xseq) # ... and indeed the objects are deemed identical. # = 3 Working with Biostrings Objects ===================================== +# Biostrings is a highly engineered package that is tightly integrated into +# the Bioconductor world - unfortunately that brings with it a somewhat +# undesirable level of computational overhead and dependencies. Using the +# package as we normally do - i.e. calling required functions with their +# explicit package prefix is therefore not advisable. There are generics +# that won't be propery dispatched. If you only need a small number of +# functions for a very specific context, you will probably get away with +# Biostrings::() - but even in the demonstration code of this script +# not everything works out of the box. We'll therefore load the library, +# but we'll (redundantly) use the prefix anyway so as to emphasize where +# the functions come from. + +library(Biostrings) + # == 3.1 Properties ======================================================== -str(myDNAseq) -length(myDNAseq) # This gives you the _number of nucleotides_! -# By comparison ... -length(x) # ... is 1: one string only. To get the number of -# characters in a string, you need nchar(). -nchar(x) # However ... -nchar(myDNAseq) # ... also works. +str(rawSeq) +str(biosDNAseq) -uniqueLetters(myDNAseq) +length(rawSeq) # ... is 1: one string only. To get the number of + # characters in a string, you need nchar(). +length(biosDNAseq) # but the length of a "Bstring" is the number of elements +nchar(rawSeq) +nchar(biosDNAseq) # ... but nchar() works too. + +(uL <- Biostrings::uniqueLetters(biosDNAseq)) # Count frequencies - with strings, you would strsplit() into a character # vector and then use table(). biost -alphabetFrequency(myDNAseq) +Biostrings::alphabetFrequency(biosDNAseq) # letterFrequency() works with a defined alphabet - such as what uniqueLetters() # returns. 
-letterFrequency(myDNAseq, uniqueLetters(myDNAseq)) +Biostrings::letterFrequency(biosDNAseq, uL) +sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) / + length(biosDNAseq) # GC contents -sum(letterFrequency(myDNAseq, c("G", "C"))) / length(myDNAseq) # GC contents +Biostrings::dinucleotideFrequency(biosDNAseq) +barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5) -dinucleotideFrequency(myDNAseq) -barplot(sort(dinucleotideFrequency(myDNAseq)), cex.names = 0.5) - -(triNuc <- trinucleotideFrequency(myDNAseq)) +(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq)) barplot(sort(triNuc), col="#4499EE33") triNuc[triNuc == max(triNuc)] triNuc[triNuc == min(triNuc)] max(triNuc) / min(triNuc) # AAA is more than 13 times as frequent as CGT # compare to a shuffled sequence: -(triNuc <- trinucleotideFrequency(sample(myDNAseq))) +(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq))) barplot(sort(triNuc), col="#EEEE4433", add = TRUE) # Interpret this plot. @@ -146,34 +163,34 @@ barplot(sort(triNuc), col="#EEEE4433", add = TRUE) # == 3.2 Subsetting ======================================================== # Subsetting any XString object works as expected: -myDNAseq[4:15] +biosDNAseq[4:15] -# ... well - maybe not expected, because x[4:15] would not work. +# ... well - maybe not expected, because rawSeq[4:15] would not work. # Alternatively to the "[" operator, use the subseq() function - especially for # long sequences. This is far more efficient. -subseq(myDNAseq, start = 1, end = 30) +Biostrings::subseq(biosDNAseq, start = 1, end = 30) # == 3.3 Operators ========================================================= # RNAstring() and DNAstring() objects compare U and T as equals! -RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") == - DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT") + Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") == + Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT") # == 3.4 Transformations =================================================== -myDNAseq[4:15] -reverseComplement(myDNAseq[4:15]) -translate(myDNAseq[4:15]) +biosDNAseq[4:15] +Biostrings::reverseComplement(biosDNAseq[4:15]) +Biostrings::translate(biosDNAseq[4:15]) # = 4 Getting Data out of Biostrings Objects ============================== # If you need a character object, use toString(): -toString(myDNAseq[4:15]) +Biostrings::toString(biosDNAseq[4:15]) # save() and load() works like on all other R objects. @@ -185,7 +202,9 @@ toString(myDNAseq[4:15]) # Biostring "Views" are objects that store multiple substrings of one # Biostring object. -(myView <- Views(myDNAseq, start = c(1, 19, 37), end = c(15, 30, 45))) +(myView <- Biostrings::Views(biosDNAseq, + start = c(1, 19, 37), + end = c(15, 30, 45))) # Views are convenient to store feature annotations names(myView) <- c("Feature-A", "Feature-B", "Feature-C") @@ -202,20 +221,20 @@ cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView )) # Biostring "StringSets" store multiple sequences. 
# -ompA <- AAString("MKKTAIAIAVALAGFATVAQA") +ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA") sample(ompA) # sample can work directly on a Biostring object to shuffle it -x[1] <- toString(ompA) +x <- Biostrings::toString(ompA) for (i in 2:10) { - x[i] <- toString(sample(ompA)) + x[i] <- Biostrings::toString(sample(ompA)) } -shuffledPeptideSet <- AAStringSet(x) +shuffledPeptideSet <- Biostrings::AAStringSet(x) names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep="")) shuffledPeptideSet length(shuffledPeptideSet) -width(shuffledPeptideSet) -alphabetFrequency(shuffledPeptideSet) +Biostrings::width(shuffledPeptideSet) +Biostrings::alphabetFrequency(shuffledPeptideSet) # [END] diff --git a/RPR-GEO2R.R b/RPR-GEO2R.R index bc53d92..7338ce9 100644 --- a/RPR-GEO2R.R +++ b/RPR-GEO2R.R @@ -3,12 +3,15 @@ # Purpose: A Bioinformatics Course: # R code accompanying the RPR_GEO2R unit. # -# Version: 1.1 +# Version: 1.2 # -# Date: 2017 09 - 2018 01 +# Date: 2017 09 - 2019 01 # Author: Boris Steipe # # Versions: +# 1.2 Change from require() to requireNamespace(), +# use ::() idiom throughout, +# use Biocmanager:: not biocLite() # 1.1 Add section on GPL annotations # 1.0 Updates for BCH441 2017 # 0.1 First code copied from 2016 material. @@ -31,22 +34,22 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> -------------------------------------------------------------------- -#TOC> 1 Preparations 53 -#TOC> 2 Loading a GEO Dataset 84 -#TOC> 3 Column wise analysis - time points 154 -#TOC> 3.1 Task - Comparison of experiments 160 -#TOC> 3.2 Grouped Samples 207 -#TOC> 4 Row-wise Analysis: Expression Profiles 242 -#TOC> 4.1 Task - Read a table of features 277 -#TOC> 4.2 Selected Expression profiles 325 -#TOC> 5 Differential Expression 366 -#TOC> 5.1 Final task: Gene descriptions 490 -#TOC> 6 Improving on Discovery by Differential Expression 495 -#TOC> 7 Annotation data 577 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> -------------------------------------------------------------------------- +#TOC> 1 Preparations 56 +#TOC> 2 Loading a GEO Dataset 82 +#TOC> 3 Column wise analysis - time points 152 +#TOC> 3.1 Task - Comparison of experiments 158 +#TOC> 3.2 Grouped Samples 205 +#TOC> 4 Row-wise Analysis: Expression Profiles 240 +#TOC> 4.1 Task - Read a table of features 275 +#TOC> 4.2 Selected Expression profiles 323 +#TOC> 5 Differential Expression 364 +#TOC> 5.1 Final task: Gene descriptions 504 +#TOC> 6 Improving on Discovery by Differential Expression 510 +#TOC> 7 Annotation data 594 +#TOC> #TOC> ========================================================================== @@ -55,12 +58,11 @@ # To load and analyze GEO datasets we use a number of Bioconductor packages: -if (! require(Biobase, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("Biobase") - library(Biobase) +if (! requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +if (! requireNamespace("Biobase", quietly = TRUE)) { + BiocManager::install("Biobase") } # Package information: # library(help = Biobase) # basic information @@ -68,12 +70,8 @@ if (! require(Biobase, quietly=TRUE)) { # data(package = "Biobase") # available datasets -if (! require(GEOquery, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("GEOquery") - library(GEOquery) +if (! 
requireNamespace("GEOquery", quietly = TRUE)) { + BiocManager::install("GEOquery") } # Package information: # library(help = GEOquery) # basic information @@ -94,7 +92,7 @@ if (! require(GEOquery, quietly=TRUE)) { # I have experienced outages over several hours. If the command below does # not work for you, skip ahead to the fallback procedure. -GSE3635 <- getGEO("GSE3635", GSEMatrix =TRUE, getGPL=FALSE) +GSE3635 <- GEOquery::getGEO("GSE3635", GSEMatrix =TRUE, getGPL=FALSE) # Note: GEO2R scripts call the expression data set # "gset" throughout ... in this script I give # it the name "GSE3635" for clarity. @@ -136,14 +134,14 @@ help("ExpressionSet-class") GSE3635 # Access contents via methods: -featureNames(GSE3635)[1:20] # Rows. What are these features? -sampleNames(GSE3635)[1:10] # Columns. What are these columns? +Biobase::featureNames(GSE3635)[1:20] # Rows. What are these features? +Biobase::sampleNames(GSE3635)[1:10] # Columns. What are these columns? # Access contents by subsetting: ( tmp <- GSE3635[12:17, 1:6] ) # Access data -exprs(tmp) # exprs() gives us the actual expression values. +Biobase::exprs(tmp) # exprs() gives us the actual expression values. #TASK> What are the data: @@ -160,9 +158,9 @@ exprs(tmp) # exprs() gives us the actual expression values. # == 3.1 Task - Comparison of experiments ================================== # Get an overview of the distribution of data values in individual columns -summary(exprs(GSE3635)[ , 1]) -summary(exprs(GSE3635)[ , 4]) -summary(exprs(GSE3635)[ , 7]) +summary(Biobase::exprs(GSE3635)[ , 1]) +summary(Biobase::exprs(GSE3635)[ , 4]) +summary(Biobase::exprs(GSE3635)[ , 7]) # as a boxplot cyclicPalette <- colorRampPalette(c("#00AAFF", @@ -173,7 +171,7 @@ cyclicPalette <- colorRampPalette(c("#00AAFF", "#FFAA00", "#00AAFF")) tCols <- cyclicPalette(13) -boxplot(exprs(GSE3635), col = tCols) +boxplot(Biobase::exprs(GSE3635), col = tCols) #TASK> Study this boxplot. What's going on? Are these expression values? 
@@ -181,11 +179,11 @@ boxplot(exprs(GSE3635), col = tCols) # Lets plot the distributions of values in a more fine-grained manner: -hT0 <- hist(exprs(GSE3635)[ , 1], breaks = 100) -hT3 <- hist(exprs(GSE3635)[ , 4], breaks = 100) -hT6 <- hist(exprs(GSE3635)[ , 7], breaks = 100) -hT9 <- hist(exprs(GSE3635)[ , 10], breaks = 100) -hT12 <- hist(exprs(GSE3635)[ , 13], breaks = 100) +hT0 <- hist(Biobase::exprs(GSE3635)[ , 1], breaks = 100) +hT3 <- hist(Biobase::exprs(GSE3635)[ , 4], breaks = 100) +hT6 <- hist(Biobase::exprs(GSE3635)[ , 7], breaks = 100) +hT9 <- hist(Biobase::exprs(GSE3635)[ , 10], breaks = 100) +hT12 <- hist(Biobase::exprs(GSE3635)[ , 13], breaks = 100) plot( hT0$mids, hT0$counts, type = "l", col = tCols[1], xlim = c(-0.5, 0.5)) @@ -218,7 +216,7 @@ for (i in 1:nchar(gsms)) { sml <- paste("G", sml, sep="") # set group names # order samples by group -ex <- exprs(GSE3635)[ , order(sml)] +ex <- Biobase::exprs(GSE3635)[ , order(sml)] sml <- sml[order(sml)] fl <- as.factor(sml) labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we @@ -231,8 +229,8 @@ labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we GEOcols <- c("#dfeaf4", "#f4dfdf", "#f2cb98", "#dcdaa5", "#dff4e4", "#f4dff4", "#AABBCC") dev.new(width = 4 + dim(GSE3635)[[2]] / 5, height = 6) # plot into a new window -par(mar = c(2 + round(max(nchar(sampleNames(GSE3635))) / 2), 4, 2, 1)) -title <- paste ("GSE3635", '/', annotation(GSE3635), +par(mar = c(2 + round(max(nchar(Biobase::sampleNames(GSE3635))) / 2), 4, 2, 1)) +title <- paste ("GSE3635", '/', Biobase::annotation(GSE3635), " grouped samples", sep ='') boxplot(ex, boxwex = 0.6, notch = TRUE, main = title, outline=FALSE, las = 2, col = GEOcols[fl]) @@ -331,7 +329,7 @@ gName <- "MBP1" (iFeature <- which(SGD_features$name == gName)) (iExprs <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature])) plot(seq(0, 120, by = 10), - exprs(GSE3635)[iExprs, ], + Biobase::exprs(GSE3635)[iExprs, ], main = paste("Expression profile for", gName), xlab = "time (min)", ylab = "expression", @@ -368,12 +366,8 @@ SGD_features$description[iFeature] # GEO2R discovers the top differentially expressed expressed genes by # using functions in the Bioconductor limma package. -if (! require(limma, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("limma") - library(limma) +if (! requireNamespace("limma", quietly = TRUE)) { + BiocManager::install("limma") } # Package information: # library(help = limma) # basic information @@ -392,6 +386,20 @@ if (! require(limma, quietly=TRUE)) { # the groups # 4. Format results. +# Biobase is a highly engineered package that is tightly integrated into +# the Bioconductor world - unfortunately that brings with it a somewhat +# undesirable level of computational overhead and dependencies. Using the +# package as we normally do - i.e. calling required functions with their +# explicit package prefix is therefore not advisable. There are generics +# that won't be propery dispatched. If you only need a small number of +# functions for a very specific context, you will probably get away with +# Biobase::() - but even in the demonstration code of this script +# not everything works out of the box. We'll therefore load the library, +# but we'll (redundantly) use the prefix anyway so as to emphasize where +# the functions come from. 
+ +library(Biobase) + # We are recapitulating the experiment in which we assigned the 0, 10, 60 and # 70 minute samples to one group, the 30, 40, 90 and 100 minute samples to # another group, and calculated differential expression values between these @@ -415,15 +423,15 @@ myDesign # Now we can calculate the fit of all rows to a linear model that depends # on the two groups as specified in the design: -myFit <- lmFit(mySet, myDesign) +myFit <- limma::lmFit(mySet, myDesign) # Next we calculate the contrasts, given the fit ... -myCont.matrix <- makeContrasts(A - B, levels = myDesign) -myFit2 <- contrasts.fit(myFit, myCont.matrix) +myCont.matrix <- limma::makeContrasts(A - B, levels = myDesign) +myFit2 <- limma::contrasts.fit(myFit, myCont.matrix) # ... compute appropriate probabilites from a modified t-test # (empirical Bayes) ... -myFit2 <- eBayes(myFit2, 0.01) +myFit2 <- limma::eBayes(myFit2, 0.01) # ... add the gene names to the fit - object ... myFit2$genes <- featureNames(mySet) @@ -433,7 +441,10 @@ myFit2$genes <- featureNames(mySet) # gave us only the top 250 genes, but we might as well do 1000, just so we # can be reasonable sure that our gens of interest are included. N <- 1000 -myTable <- topTable(myFit2, adjust.method = "fdr", sort.by = "B", number = N) +myTable <- limma::topTable(myFit2, + adjust.method = "fdr", + sort.by = "B", + number = N) str(myTable) # The gene names are now in the $ID column @@ -461,7 +472,7 @@ abline(h = 0, col = "#00000055") for (i in 1:10) { thisID <- myTable$ID[i] - points(seq(0, 120, by = 10), exprs(GSE3635)[thisID, ], type = "b") + points(seq(0, 120, by = 10), Biobase::exprs(GSE3635)[thisID, ], type = "b") } # Our guess that we might discover interesting genes be selecting groups A and B @@ -480,7 +491,10 @@ for (i in 1:10) { myControls <- c("Cdc14", "Mbp1", "Swi6", "Swi4", "Whi5", "Cln1", "Cln2", "Cln3") for (name in toupper(myControls)) { thisID <- SGD_features$sysName[which(SGD_features$name == name)] - points(seq(0, 120, by=10), exprs(GSE3635)[thisID, ], type="b", col="#AA0000") + points(seq(0, 120, by=10), + Biobase::exprs(GSE3635)[thisID, ], + type="b", + col="#AA0000") } # Indeed, the discovered gene profiles look much "cleaner" than the real cycle @@ -504,7 +518,7 @@ for (name in toupper(myControls)) { gName <- "CLN2" (iFeature <- which(SGD_features$name == gName)) (iExprs <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature])) -Cln2Profile <- exprs(GSE3635)[iExprs, ] +Cln2Profile <- Biobase::exprs(GSE3635)[iExprs, ] plot(seq(0, 120, by = 10), Cln2Profile, ylim = c(-1, 1), @@ -519,16 +533,16 @@ abline(v = 60, col = "#00000055") # Set up a vector of correlation values -myCorrelations <- numeric(nrow(exprs(GSE3635))) -names(myCorrelations) <- featureNames(GSE3635) +myCorrelations <- numeric(nrow(Biobase::exprs(GSE3635))) +names(myCorrelations) <- Biobase::featureNames(GSE3635) for (i in 1:length(myCorrelations)) { - myCorrelations[i] <- cor(Cln2Profile, exprs(GSE3635)[i, ]) + myCorrelations[i] <- cor(Cln2Profile, Biobase::exprs(GSE3635)[i, ]) } myTopC <- order(myCorrelations, decreasing = TRUE)[1:10] # top ten # Number 1 -(ID <- featureNames(GSE3635)[myTopC[1]]) +(ID <- Biobase::featureNames(GSE3635)[myTopC[1]]) # Get information SGD_features[which(SGD_features$sysName == ID), ] @@ -537,12 +551,13 @@ SGD_features[which(SGD_features$sysName == ID), ] # Let's plot the rest for (i in 2:length(myTopC)) { - ID <- featureNames(GSE3635)[myTopC[i]] + ID <- Biobase::featureNames(GSE3635)[myTopC[i]] points(seq(0, 120, by = 10), - 
exprs(GSE3635)[ID, ], + Biobase::exprs(GSE3635)[ID, ], type = "b", col= "chartreuse") - print(SGD_features[which(SGD_features$sysName == ID), c("name", "description")]) + print(SGD_features[which(SGD_features$sysName == ID), + c("name", "description")]) } # Note that all of these genes are highly correlated with a known cell cycle @@ -554,12 +569,13 @@ for (i in 2:length(myTopC)) { # And we haven't even looked at the anticorrelated genes yet... myBottomC <- order(myCorrelations, decreasing = FALSE)[1:10] # bottom ten for (i in 1:length(myBottomC)) { - ID <- featureNames(GSE3635)[myBottomC[i]] + ID <- Biobase::featureNames(GSE3635)[myBottomC[i]] points(seq(0, 120, by = 10), - exprs(GSE3635)[ID, ], + Biobase::exprs(GSE3635)[ID, ], type = "b", col= "coral") - print(SGD_features[which(SGD_features$sysName == ID), c("name", "description")]) + print(SGD_features[which(SGD_features$sysName == ID), + c("name", "description")]) } # ... which are very interesting in their own right. @@ -583,7 +599,7 @@ for (i in 1:length(myBottomC)) { # we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL # annotations were not loaded. We could use getGPL = TRUE instead ... -GSE3635annot <- getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE) +GSE3635annot <- GEOquery::getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE) GSE3635annot <- GSE3635annot[[1]] # ... and the feature data is then available in the GSE3635@featureData@data @@ -597,13 +613,8 @@ GSE3635annot@featureData@data[ 1:20 , ] myAnnot <- GSE3635annot@featureData@data[ , c("SPOT_ID", "Gene")] str(myAnnot) -# ... Note that this is a data frame, but - oy veh - the gene symbols are -# factors. Really, we need to fix this! To convert a factor into a string, -# we need to cast it to character. - -myAnnot$Gene <- as.character(myAnnot$Gene) - -# ... whereupon we can find things we might be looking for ... +# ... Note that this is a data frame and it is easy to find things we +# might be looking for ... myAnnot[which(myAnnot$Gene == "MBP1"), ] # ... or identify rows that might give us trouble, such as probes that @@ -614,13 +625,11 @@ myAnnot[which(myAnnot$Gene == "MBP1"), ] GSE3635@annotation # "GPL1914" # ... and downloaded it directly from NCBI: -GPL1914 <- getGEO("GPL1914") +GPL1914 <- GEOquery::getGEO("GPL1914") str(GPL1914) # ... from which we can get the data - which is however NOT necessarily -# matched to the rows of our expression dataset. Note that here too: the -# majority of data elements are factors and will likely have to be converted -# before use. +# matched to the rows of our expression dataset. # [END] diff --git a/RPR-Genetic_code_optimality.R b/RPR-Genetic_code_optimality.R index 75178a5..bdf68dd 100644 --- a/RPR-Genetic_code_optimality.R +++ b/RPR-Genetic_code_optimality.R @@ -3,12 +3,15 @@ # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Genetic_code_optimality unit. # -# Version: 1.1 +# Version: 1.2 # # Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.2 Change from require() to requireNamespace(), +# use ::() idiom throughout, +# use Biocmanager:: not biocLite() # 1.1 Update set.seed() usage # 1.0.1 Fixed two bugs discovered by Suan Chin Yeo. # 1.0 New material. 
@@ -30,16 +33,16 @@ #TOC> #TOC> Section Title Line #TOC> -------------------------------------------------------------- -#TOC> 1 Designing a computational experiment 54 -#TOC> 2 Setting up the tools 70 -#TOC> 2.1 Natural and alternative genetic codes 73 -#TOC> 2.2 Effect of mutations 132 -#TOC> 2.2.1 reverse-translate 143 -#TOC> 2.2.2 Randomly mutate 168 -#TOC> 2.2.3 Forward- translate 193 -#TOC> 2.2.4 measure effect 211 -#TOC> 3 Run the experiment 258 -#TOC> 4 Task solutions 351 +#TOC> 1 Designing a computational experiment 57 +#TOC> 2 Setting up the tools 73 +#TOC> 2.1 Natural and alternative genetic codes 76 +#TOC> 2.2 Effect of mutations 134 +#TOC> 2.2.1 reverse-translate 145 +#TOC> 2.2.2 Randomly mutate 170 +#TOC> 2.2.3 Forward- translate 195 +#TOC> 2.2.4 measure effect 213 +#TOC> 3 Run the experiment 260 +#TOC> 4 Task solutions 356 #TOC> #TOC> ========================================================================== @@ -73,12 +76,11 @@ # == 2.1 Natural and alternative genetic codes ============================= # Load genetic code tables from the Biostrings package -if (! require(Biostrings, quietly=TRUE)) { - if (! exists("biocLite")) { - source("https://bioconductor.org/biocLite.R") - } - biocLite("Biostrings") - library(Biostrings) +if (! requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +if (! requireNamespace("Biostrings", quietly = TRUE)) { + BiocManager::install("Biostrings") } # Package information: # library(help = Biostrings) # basic information @@ -257,52 +259,55 @@ evalMut <- function(nat, mut) { # = 3 Run the experiment ================================================== +# Fetch the standard Genetic code from Biostrings:: + +stdCode <- Biostrings::GENETIC_CODE # Fetch the nucleotide sequence for MBP1: myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1] myDNA <- paste0(myDNA, collapse = "") -myDNA <- as.character(codons(DNAString(myDNA))) +myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA))) myDNA <- myDNA[-length(myDNA)] # drop the stop codon -myAA <- traFor(myDNA, GENETIC_CODE) +myAA <- traFor(myDNA, stdCode) # Mutate and evaluate set.seed(112358) x <- randMut(myDNA) set.seed(NULL) -x <- traFor(x, GENETIC_CODE) +x <- traFor(x, stdCode) evalMut(myAA, x) # 166.4 # Try this 200 times, and see how the values are distributed. N <- 200 -valUGC <- numeric(N) +valSTDC <- numeric(N) set.seed(112358) # set RNG seed for repeatable randomness for (i in 1:N) { x <- randMut(myDNA) # mutate - x <- traFor(x, GENETIC_CODE) # translate - valUGC[i] <- evalMut(myAA, x) # evaluate + x <- traFor(x, stdCode) # translate + valSTDC[i] <- evalMut(myAA, x) # evaluate } set.seed(NULL) # reset the RNG -hist(valUGC, +hist(valSTDC, breaks = 15, col = "palegoldenrod", xlim = c(0, 400), ylim = c(0, N/4), - main = "Universal vs. Synthetic Genetic Code", + main = "Standard vs. Synthetic Genetic Code", xlab = "Mutation penalty") # This looks like a normal distribution. 
Let's assume the effect of mutations -# under the universal genetic code is the mean of this distribution: -effectUGC <- mean(valUGC) # 178.1 +# under the standard genetic code is the mean of this distribution: +effectSTDC <- mean(valSTDC) # 178.1 # Now we can look at the effects of alternate genetic codes: set.seed(112358) # choose a new code -GC <- randomGC(GENETIC_CODE) +GC <- randomGC(stdCode) set.seed(NULL) # reverse translate hypothetical sequence according to the new code @@ -321,7 +326,7 @@ valXGC <- numeric(N) set.seed(1414214) # set RNG seed for repeatable randomness for (i in 1:N) { - GC <- randomGC(GENETIC_CODE) # Choose code + GC <- randomGC(stdCode) # Choose code x <- traRev(myAA, GC) # reverse translate x <- randMut(x) # mutate x <- traFor(x, GC) # translate @@ -355,7 +360,7 @@ valSGC <- numeric(N) set.seed(2718282) # set RNG seed for repeatable randomness for (i in 1:N) { - GC <- swappedGC(GENETIC_CODE) # Choose code + GC <- swappedGC(stdCode) # Choose code x <- traRev(myAA, GC) # reverse translate x <- randMut(x) # mutate x <- traFor(x, GC) # translate diff --git a/RPR-PROSITE_POST.R b/RPR-PROSITE_POST.R index 09116d9..4e85457 100644 --- a/RPR-PROSITE_POST.R +++ b/RPR-PROSITE_POST.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Scripting_data_downloads unit. # -# Version: 1.0.1 +# Version: 1.1 # -# Date: 2017 10 - 2018 12 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout, # 1.0.1 Updates for slightly changed interfaces # 1.0 First ABC units version # 0.1 First code copied from 2016 material. @@ -27,22 +29,21 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> --------------------------------------------------------------- -#TOC> 1 Constructing a POST command from a Web query 44 -#TOC> 1.1 Task - fetchPrositeFeatures() function 145 -#TOC> 2 Task solutions 153 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> --------------------------------------------------------------------- +#TOC> 1 Constructing a POST command from a Web query 42 +#TOC> 1.1 Task - fetchPrositeFeatures() function 142 +#TOC> 2 Task solutions 150 +#TOC> #TOC> ========================================================================== # = 1 Constructing a POST command from a Web query ======================== -if (! require(httr, quietly=TRUE)) { +if (! requireNamespace("httr", quietly = TRUE)) { install.packages("httr") - library(httr) } # Package information: # library(help = httr) # basic information @@ -60,24 +61,24 @@ UniProtID <- "P39678" URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi" -response <- POST(URL, - body = list(meta = "opt1", - meta1_protein = "opt1", - seq = UniProtID, - skip = "on", - output = "tabular")) +response <- httr::POST(URL, + body = list(meta = "opt1", + meta1_protein = "opt1", + seq = UniProtID, + skip = "on", + output = "tabular")) # Send off this request, and you should have a response in a few # seconds. Let's check the status first: -status_code(response) # If this is not 200, something went wrong and it - # makes no sense to continue. If this persists, ask - # on the mailing list what to do. +httr::status_code(response) # If this is not 200, something went wrong and it + # makes no sense to continue. If this persists, ask + # on the mailing list what to do. 
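# A small sketch, assuming only the httr:: functions used above: httr also
# bundles this check into a convenience function that turns an unsuccessful
# request into a regular R error - often the cleaner choice inside scripts.
httr::stop_for_status(response)  # returns invisibly on success, otherwise
                                 # it throws an informative error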
# The text contents of the response is available with the # content() function: -content(response, "text") +httr::content(response, "text") # ... should show you the same as the page contents that # you have seen in the browser. The date we need Now we need to extract @@ -86,7 +87,7 @@ content(response, "text") # individual lines, since each of our data elements is on # its own line. We simply split on the "\\n" newline character. -lines <- unlist(strsplit(content(response, "text"), "\\n")) +lines <- unlist(strsplit(httr::content(response, "text"), "\\n")) head(lines) # Now we define a query pattern for the lines we want: diff --git a/RPR-SX-PDB.R b/RPR-SX-PDB.R index 9928dc5..55c588e 100644 --- a/RPR-SX-PDB.R +++ b/RPR-SX-PDB.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the RPR-SX-PDB unit. # -# Version: 1.0 +# Version: 1.1 # -# Date: 2017 10 19 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout # 1.0 First live version, completely refactores 2016 code # with remarkable speed gains. Added section on x, y, z # (density) plots. @@ -28,22 +30,22 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> ---------------------------------------------------- -#TOC> 1 Introduction to the bio3D package 63 -#TOC> 2 A Ramachandran plot 151 -#TOC> 3 Density plots 227 -#TOC> 3.1 Density-based colours 241 -#TOC> 3.2 Plotting with smoothScatter() 260 -#TOC> 3.3 Plotting hexbins 275 -#TOC> 3.4 Plotting density contours 299 -#TOC> 3.4.1 ... as overlay on a colored grid 333 -#TOC> 3.4.2 ... as filled countour 350 -#TOC> 3.4.3 ... as a perspective plot 381 -#TOC> 4 cis-peptide bonds 399 -#TOC> 5 H-bond lengths 414 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> ---------------------------------------------------------- +#TOC> 1 Introduction to the bio3D package 61 +#TOC> 2 A Ramachandran plot 152 +#TOC> 3 Density plots 228 +#TOC> 3.1 Density-based colours 242 +#TOC> 3.2 Plotting with smoothScatter() 261 +#TOC> 3.3 Plotting hexbins 276 +#TOC> 3.4 Plotting density contours 304 +#TOC> 3.4.1 ... as overlay on a colored grid 337 +#TOC> 3.4.2 ... as filled countour 354 +#TOC> 3.4.3 ... as a perspective plot 385 +#TOC> 4 cis-peptide bonds 403 +#TOC> 5 H-bond lengths 418 +#TOC> #TOC> ========================================================================== @@ -59,9 +61,8 @@ # = 1 Introduction to the bio3D package =================================== -if (! require(bio3d, quietly=TRUE)) { +if (! requireNamespace("bio3d", quietly = TRUE)) { install.packages("bio3d") - library(bio3d) } # Package information: # library(help = bio3d) # basic information @@ -89,8 +90,8 @@ file.show("./data/1BM8.pdb") # Are all atoms of the N-terminal residue present? # Are all atoms of the C-terminal residue present? -apses <- read.pdb("1bm8") # load a molecule directly from the PDB via the - # Internet. (This is not your local version.) +apses <- bio3d::read.pdb("1bm8") # load a molecule directly from the PDB via + # the Internet. # check what we have: apses @@ -121,10 +122,11 @@ apses$atom[apses$atom[,"resno"] == i, ] apses$seqres[1:10] # the "A"s here identify chain "A" # Can we convert this to one letter code? -aa321(apses$seqres[1:10]) +bio3d::aa321(apses$seqres[1:10]) # Lets get the implicit sequence: -aa321((apses$atom$resid[apses$calpha])[1:10]) # Do you understand this code? 
+bio3d::aa321((apses$atom$resid[apses$calpha])[1:10]) +# Do you understand this code? # Do explicit and implicit sequence have the same length? length(apses$seqres) @@ -140,7 +142,10 @@ apses$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")] # The introduction to bio3d tutorial at # http://thegrantlab.org/bio3d/tutorials/structure-analysis # has the following example: -plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor") +bio3d::plot.bio3d(apses$atom$b[apses$calpha], + sse=apses, + typ="l", + ylab="B-factor") # Good for now. Let's do some real work. @@ -149,7 +154,7 @@ plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor") # Calculate a Ramachandran plot for the structure. The torsion.pdb() function # calculates all dihedral angles for backbone and sidechain bonds, NA where # the bond does not exist in an amino acid. -tor <- torsion.pdb(apses) +tor <- bio3d::torsion.pdb(apses) plot(tor$phi, tor$psi, xlim = c(-180, 180), ylim = c(-180, 180), main = "Ramachandran plot for 1BM8", @@ -164,7 +169,7 @@ abline(v = 0, lwd = 0.5, col = "#00000044") # color the points for glycine residues differently. First, we # get a vector of glycine residue indices in the structure: -mySeq <- pdbseq(apses) +mySeq <- bio3d::pdbseq(apses) # Explore the result object and extract the indices of GLY resiues. mySeq == "G" @@ -210,7 +215,7 @@ for (i in 1:nrow(dat)) { points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000") text(dat$phi[i], dat$psi[i], - labels = sprintf("%s%d", aa321(dat$resid[i]), dat$resno[i]), + labels = sprintf("%s%d", bio3d::aa321(dat$resid[i]), dat$resno[i]), pos = 4, offset = 0.4, cex = 0.7) @@ -272,9 +277,8 @@ abline(v = 0, lwd = 0.5, col = "#00000044") # If we wish to approximate values in a histogram-like fashion, we can use # hexbin() -if (! require(hexbin, quietly=TRUE)) { +if (! requireNamespace("hexbin", quietly = TRUE)) { install.packages("hexbin") - library(hexbin) } # Package information: # library(help = hexbin) # basic information @@ -285,11 +289,16 @@ if (! require(hexbin, quietly=TRUE)) { myColorRamp <- colorRampPalette(c("#EEEEEE", "#3399CC", "#2266DD")) -plot(hexbin(phi, psi, xbins = 10), - colramp = myColorRamp, - main = "phi-psi Density Bins for 1BM8", - xlab = expression(phi), - ylab = expression(psi)) +hexbin::gplot.hexbin(hexbin::hexbin(phi, psi, xbins = 10), + colramp = myColorRamp, + main = "phi-psi Density Bins for 1BM8", + xlab = expression(phi), + ylab = expression(psi)) + +# Note: +# Had we loaded hexbin with library(hexbin), the plot function would have +# been dispatched by the plot() generic, and we could simply have written: +# plot(hexbin(phi, psi, xbins = 10), ... # == 3.4 Plotting density contours ========================================= @@ -305,19 +314,18 @@ plot(hexbin(phi, psi, xbins = 10), # distributions. But for 2D data like or phi-psi plots, we need a function from # the MASS package: kde2d() -if (! require(MASS, quietly=TRUE)) { +if (! requireNamespace("MASS", quietly = TRUE)) { install.packages("MASS") - library(MASS) } # Package information: # library(help = MASS) # basic information # browseVignettes("MASS") # available vignettes # data(package = "MASS") # available datasets -?kde2d -dPhiPsi <-kde2d(phi, psi, - n = 60, - lims = c(-180, 180, -180, 180)) +?MASS::kde2d +dPhiPsi <-MASS::kde2d(phi, psi, + n = 60, + lims = c(-180, 180, -180, 180)) str(dPhiPsi) # This is a list, with gridpoints in x and y, and the estimated densities in z. 
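# A minimal sketch (base R, using the dPhiPsi list just created): the grid
# point with the highest estimated density can be looked up directly - a
# quick way to locate the most populated region of the phi-psi plane.
iMax <- which(dPhiPsi$z == max(dPhiPsi$z), arr.ind = TRUE)
dPhiPsi$x[iMax[1, "row"]]   # phi value at the density maximum
dPhiPsi$y[iMax[1, "col"]]   # psi value at the density maximum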
@@ -326,7 +334,7 @@ str(dPhiPsi) contour(dPhiPsi) -# === 3.4.1 ... as overlay on a colored grid +# === 3.4.1 ... as overlay on a colored grid image(dPhiPsi, col = myColorRamp(100), @@ -343,7 +351,7 @@ abline(h = 0, lwd = 0.5, col = "#00000044") abline(v = 0, lwd = 0.5, col = "#00000044") -# === 3.4.2 ... as filled countour +# === 3.4.2 ... as filled countour filled.contour(dPhiPsi, xlim = c(-180, 180), ylim = c(-180, 180), @@ -374,7 +382,7 @@ filled.contour(dPhiPsi, abline(v = 0, lwd = 0.5, col = "#00000044") }) -# === 3.4.3 ... as a perspective plot +# === 3.4.3 ... as a perspective plot persp(dPhiPsi, xlab = "phi", @@ -469,12 +477,12 @@ ssSelect <- function(PDB, myChain = "A", ssType, myElety) { # get id's from PDB - x <- atom.select(PDB, - string = "protein", - type = "ATOM", - chain = myChain, - resno = myResno, - elety = myElety) + x <- bio3d::atom.select(PDB, + string = "protein", + type = "ATOM", + chain = myChain, + resno = myResno, + elety = myElety) return(x$atom) } @@ -506,7 +514,7 @@ pairDist <- function(PDB, a, b) { A <- PDB$atom[a, c("x", "y", "z")] B <- PDB$atom[b, c("x", "y", "z")] - dMat <- dist.xyz(A, B) + dMat <- bio3d::dist.xyz(A, B) } return(dMat) diff --git a/RPR-UniProt_GET.R b/RPR-UniProt_GET.R index 79557f2..74578ee 100644 --- a/RPR-UniProt_GET.R +++ b/RPR-UniProt_GET.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Scripting_data_downloads unit. # -# Version: 1.0 +# Version: 1.1 # -# Date: 2017 10 05 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout # 1.0 First ABC units version # 0.1 First code copied from 2016 material. # @@ -27,11 +29,11 @@ #TOC> ========================================================================== #TOC> -#TOC> Section Title Line -#TOC> ---------------------------------------------------- -#TOC> 1 UniProt files via GET 44 -#TOC> 1.1 Task - fetchUniProtSeq() function 107 -#TOC> 2 Task solutions 114 +#TOC> Section Title Line +#TOC> ---------------------------------------------------------- +#TOC> 1 UniProt files via GET 41 +#TOC> 1.1 Task - fetchUniProtSeq() function 103 +#TOC> 2 Task solutions 110 #TOC> #TOC> ========================================================================== @@ -48,9 +50,8 @@ # a Web browser. Since this is a short and simple request, the GET verb is the # right tool: -if (! require(httr, quietly=TRUE)) { +if (! requireNamespace("httr", quietly = TRUE)) { install.packages("httr") - library(httr) } # Package information: # library(help = httr) # basic information @@ -69,7 +70,7 @@ UniProtID <- "P39678" (URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)) # the GET() function from httr will get the data. -response <- GET(URL) +response <- httr::GET(URL) str(response) # the response object is a bit complex ... as.character(response) # ... but it is easy to pull out the data. @@ -82,21 +83,21 @@ dbSanitizeSequence(x) # Simple. # But what happens if there is an error, e.g. the uniprot ID does not exist? -response <- GET("http://www.uniprot.org/uniprot/X000000.fasta") +response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta") as.character(response) # this is a large HTML page that tells us the URL was not found. So we need to -# check for errors. The Right way to do this is to evaluate the staus code that +# check for errors. The Right Way to do this is to evaluate the staus code that # every Web server returns for every transaction. 
# -status_code(response) # 404 == Page Not Found +httr::status_code(response) # 404 == Page Not Found # There are many possible codes, but the only code we will be happy with # is 200 - oK. # (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes ) URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID) -response <- GET(URL) -status_code(response) +response <- httr::GET(URL) +httr::status_code(response) # == 1.1 Task - fetchUniProtSeq() function ================================= diff --git a/RPR-Unit_testing.R b/RPR-Unit_testing.R index 285db5a..3fa2e0f 100644 --- a/RPR-Unit_testing.R +++ b/RPR-Unit_testing.R @@ -3,12 +3,13 @@ # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Unit_testing unit. # -# Version: 1.0 +# Version: 1.1 # -# Date: 2017 10 15 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace() # 1.0 New code # # @@ -25,13 +26,14 @@ #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> ------------------------------------------- -#TOC> 1 Unit Tests with testthat 43 -#TOC> 2 Organizing your tests 156 -#TOC> 3 Task solutions 181 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> ------------------------------------------------- +#TOC> 1 Unit Tests with testthat 40 +#TOC> 2 Organizing your tests 159 +#TOC> 2.1 Testing scripts 183 +#TOC> 3 Task solutions 198 +#TOC> #TOC> ========================================================================== @@ -39,15 +41,22 @@ # The testthat package supports writing and executing unit tests in many ways. -if (! require(testthat, quietly=TRUE)) { +if (! requireNamespace("testthat", quietly = TRUE)) { install.packages("testthat") - library(testthat) } # Package information: # library(help = testthat) # basic information # browseVignettes("testthat") # available vignettes # data(package = "testthat") # available datasets +# testthat is one of those packages that we either use A LOT in a script, +# or not at all. Therfore it's more reasonable to depart from our usual +# ::() idiom, and load the entire library. In fact, if +# we author packages, it is common practice to load testthat in the part +# of the package that automates testing. + +library(testthat) + # An atomic test consists of an expectation about the bahaviour of a function or # the existence of an object. testthat provides a number of useful expectations: @@ -171,6 +180,20 @@ test_file("./tests/test_biCode.R") # .. or execute all the test files in the directory: test_dir("./tests") +# == 2.1 Testing scripts =================================================== + +# Scripts need special consideration since we do not necessarily source() them +# entirely. Therefore automated testing is not reasonable. What you can do +# instead is to place a conditional block at the end of your script, that +# never gets executed - then you can manually execute the code in the block +# whenever you wish to test your functions. For example: + +if (FALSE) { + # ... your tests go here + +} + + # = 3 Task solutions ====================================================== diff --git a/RPR-eUtils_XML.R b/RPR-eUtils_XML.R index bdc5eaa..1f8c23b 100644 --- a/RPR-eUtils_XML.R +++ b/RPR-eUtils_XML.R @@ -3,12 +3,14 @@ # Purpose: A Bioinformatics Course: # R code accompanying the RPR-Scripting_data_downloads unit. 
# -# Version: 1.0 +# Version: 1.1 # # Date: 2017 10 05 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout # 1.0 First ABC units version # 0.1 First code copied from 2016 material. # @@ -27,11 +29,11 @@ #TOC> ========================================================================== #TOC> -#TOC> Section Title Line -#TOC> ----------------------------------------------------- -#TOC> 1 Working with NCBI eUtils 44 -#TOC> 1.1 Task - fetchNCBItaxData() function 162 -#TOC> 2 Task solutions 169 +#TOC> Section Title Line +#TOC> ----------------------------------------------------------- +#TOC> 1 Working with NCBI eUtils 41 +#TOC> 1.1 Task - fetchNCBItaxData() function 144 +#TOC> 2 Task solutions 151 #TOC> #TOC> ========================================================================== @@ -40,26 +42,11 @@ -# To begin, we load some libraries with functions -# we need... - -# ... the package httr, which sends and receives information via the http -# protocol, just like a Web browser. -if (! require(httr, quietly=TRUE)) { - install.packages("httr") - library(httr) -} -# Package information: -# library(help = httr) # basic information -# browseVignettes("httr") # available vignettes -# data(package = "httr") # available datasets - - -# ...plus the package xml2: NCBI's eUtils send information in XML format so we -# need to be able to parse XML. -if (! require(xml2, quietly=TRUE)) { +# To begin, we load the xml2 package that contains functions +# we need to receive and parse html data. NCBI's eUtils send information in +# XML format so we need to be able to parse XML. +if (! requireNamespace("xml2", quietly=TRUE)) { install.packages("xml2") - library(xml2) } # Package information: # library(help = xml2) # basic information @@ -91,24 +78,23 @@ URL <- paste(eUtilsBase, # what the response should look like. URL -# To fetch a response in R, we use the function GET() from the httr package +# To fetch a response in R, we use the function read_xml() # with our URL as its argument. -myXML <- read_xml(URL) -myXML +(myXML <- xml2::read_xml(URL)) # This is XML. We can take the response apart into # its indvidual components with the as_list() function. -as_list(myXML) +xml2::as_list(myXML) # Note how the XML "tree" is represented as a list of # lists of lists ... # If we know exactly what elelement we are looking for, # we can extract it from this structure: -as_list(myXML)[["IdList"]][["Id"]][[1]] +xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]] # But this is not very robust, it would break with the -# slightest change that the NCBI makes to their response +# slightest change that the NCBI makes to their data format - # and the NCBI changes things A LOT! # Somewhat more robust is to specify the type of element @@ -116,11 +102,12 @@ as_list(myXML)[["IdList"]][["Id"]][[1]] # element, and use the XPath XML parsing language to # retrieve it. -xml_find_all(myXML, "//Id") # returns a "node set" +xml2::xml_find_all(myXML, "//Id") # returns a "node set" -xml_text(xml_find_all(myXML, "//Id")) # returns the contents of the node set +xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents + # of the node set -# We will need doing this a lot, so we write a function +# We will need to do this more than once, so we write a function # for it... 
node2text <- function(doc, tag) { # an extractor function for the contents of elements @@ -128,8 +115,8 @@ node2text <- function(doc, tag) { # Contents of all matching elements is returned in # a vector of strings. path <- paste0("//", tag) - nodes <- xml_find_all(doc, path) - return(xml_text(nodes)) + nodes <- xml2::xml_find_all(doc, path) + return(xml2::xml_text(nodes)) } # using node2text() ... @@ -145,7 +132,7 @@ URL <- paste0(eUtilsBase, "&id=", GID, "&version=2.0") -(myXML <- read_xml(URL)) +(myXML <- xml2::read_xml(URL)) (taxID <- node2text(myXML, "TaxId")) (organism <- node2text(myXML, "Organism")) diff --git a/scriptTemplate.R b/scriptTemplate.R index 1db9d7a..0f252e9 100644 --- a/scriptTemplate.R +++ b/scriptTemplate.R @@ -22,17 +22,18 @@ setwd("") # ==== PACKAGES ============================================================== -# Load all required packages. +# Check that required packages have been installed. Install if needed. -if (! require(seqinr, quietly=TRUE)) { +if (! requireNamespace("seqinr", quietly=TRUE)) { install.packages("seqinr") - library(seqinr) } # Package information: # library(help = seqinr) # basic information # browseVignettes("seqinr") # available vignettes # data(package = "seqinr") # available datasets +# Note: use package functions with the :: operator - eg. +# seqinr::aaa("K") diff --git a/scripts/ABC-dbUtilities.R b/scripts/ABC-dbUtilities.R index 2a582a5..3137611 100644 --- a/scripts/ABC-dbUtilities.R +++ b/scripts/ABC-dbUtilities.R @@ -9,21 +9,18 @@ # ====== PACKAGES ============================================================== -if (! require(jsonlite, quietly = TRUE)) { +if (! requireNamespace("jsonlite", quietly = TRUE)) { install.packages("jsonlite") - library(jsonlite) } -if (! require(httr, quietly = TRUE)) { +if (! requireNamespace("httr", quietly = TRUE)) { install.packages("httr") - library(httr) } -if (! require(xml2, quietly = TRUE)) { +if (! requireNamespace("xml2", quietly = TRUE)) { install.packages("xml2") - library(xml2) } @@ -226,10 +223,10 @@ dbFetchUniProtSeq <- function(ID) { URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID) - response <- GET(URL) + response <- httr::GET(URL) mySeq <- character() - if (status_code(response) == 200) { + if (httr::status_code(response) == 200) { x <- as.character(response) x <- strsplit(x, "\n") mySeq <- dbSanitizeSequence(x) @@ -253,17 +250,17 @@ dbFetchPrositeFeatures <- function(ID) { URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi" - response <- POST(URL, - body = list(meta = "opt1", - meta1_protein = "opt1", - seq = ID, - skip = "on", - output = "tabular")) + response <- httr::POST(URL, + body = list(meta = "opt1", + meta1_protein = "opt1", + seq = ID, + skip = "on", + output = "tabular")) myFeatures <- data.frame() - if (status_code(response) == 200) { + if (httr::status_code(response) == 200) { - lines <- unlist(strsplit(content(response, "text"), "\\n")) + lines <- unlist(strsplit(httr::content(response, "text"), "\\n")) patt <- sprintf("\\|%s\\|", UniProtID) lines <- lines[grep(patt, lines)] @@ -289,8 +286,8 @@ node2text <- function(doc, tag) { # Contents of all matching elements is returned in # a vector of strings. 
path <- paste0("//", tag) - nodes <- xml_find_all(doc, path) - return(xml_text(nodes)) + nodes <- xml2::xml_find_all(doc, path) + return(xml2::xml_text(nodes)) } @@ -309,7 +306,7 @@ dbFetchNCBItaxData <- function(ID) { "db=protein", "&term=", ID, sep="") - myXML <- read_xml(URL) + myXML <- xml2::read_xml(URL) GID <- node2text(myXML, "Id") URL <- paste0(eUtilsBase, @@ -318,7 +315,7 @@ dbFetchNCBItaxData <- function(ID) { "&id=", GID, "&version=2.0") - myXML <- read_xml(URL) + myXML <- xml2::read_xml(URL) x <- as.integer(node2text(myXML, "TaxId")) y <- node2text(myXML, "Organism") @@ -346,14 +343,14 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { # for IDs that are not mapped. URL <- "https://www.uniprot.org/uploadlists/" - response <- POST(URL, - body = list(from = mapFrom, - to = mapTo, - format = "tab", - query = s)) + response <- httr::POST(URL, + body = list(from = mapFrom, + to = mapTo, + format = "tab", + query = s)) - if (status_code(response) == 200) { # 200: oK - myMap <- read.delim(file = textConnection(content(response)), + if (httr::status_code(response) == 200) { # 200: oK + myMap <- read.delim(file = textConnection(httr::content(response)), sep = "\t", stringsAsFactors = FALSE) myMap <- myMap[ , c(1,3)] @@ -362,12 +359,23 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { myMap <- data.frame() warning(paste("No uniProt ID mapping returned:", "server sent status", - status_code(response))) + httr::status_code(response))) } return(myMap) } +# ====== TESTS ================================================================= + +if (FALSE) { + if (! requireNamespace("testthat", quietly = TRUE)) { + install.packages("testthat") + } + + # ToDo: test everything here + +} + # [END] diff --git a/scripts/ABC-makeMYSPElist.R b/scripts/ABC-makeMYSPElist.R index 8d44f06..555d6b3 100644 --- a/scripts/ABC-makeMYSPElist.R +++ b/scripts/ABC-makeMYSPElist.R @@ -3,14 +3,16 @@ # Purpose: Create a list of genome sequenced fungi with protein annotations and # Mbp1 homologues. # -# Version: 1.1.2 +# Version: 1.2 # -# Date: 2016 09 - 2017 09 +# Date: 2016 09 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # -# V 1.1.2 Moved BLAST.R to ./scripts directory -# V 1.1 Update 2017 -# V 1.0 First code 2016 +# Versions +# 1.2 Change from require() to requireNamespace() +# 1.1.2 Moved BLAST.R to ./scripts directory +# 1.1 Update 2017 +# 1.0 First code 2016 # # TODO: # @@ -31,27 +33,25 @@ # the respective intermediate results. 
# + #TOC> ========================================================================== -#TOC> -#TOC> Section Title Line -#TOC> --------------------------------------------------- -#TOC> 1 The strategy 54 -#TOC> 2 GOLD species 66 -#TOC> 2.1 Initialize 71 -#TOC> 2.2 Import 77 -#TOC> 2.3 Unique species 129 -#TOC> 3 BLAST species 171 -#TOC> 3.1 find homologous proteins 178 -#TOC> 3.2 Identify species in "hits" 202 -#TOC> 4 Intersect GOLD and BLAST species 247 -#TOC> 5 Cleanup and finish 265 -#TOC> +#TOC> +#TOC> Section Title Line +#TOC> --------------------------------------------------------- +#TOC> 1 The strategy 55 +#TOC> 2 GOLD species 67 +#TOC> 2.1 Initialize 72 +#TOC> 2.2 Import 79 +#TOC> 2.3 Unique species 131 +#TOC> 3 BLAST species 173 +#TOC> 3.1 find homologous proteins 180 +#TOC> 3.2 Identify species in "hits" 204 +#TOC> 4 Intersect GOLD and BLAST species 249 +#TOC> 5 Cleanup and finish 267 +#TOC> #TOC> ========================================================================== -#TOC> -#TOC> - # = 1 The strategy ======================================================== # This script will create a list of "MYSPE" species and save it in an R object @@ -70,9 +70,10 @@ # (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI. # == 2.1 Initialize ======================================================== -if (! require(httr)) { # httr provides interfaces to Webservers on the Internet - install.packages("httr") - library(httr) + +# httr provides interfaces to Webservers on the Internet +if (! requireNamespace("httr", quietly = TRUE)) { + install.packages("httr") } # == 2.2 Import ============================================================ diff --git a/scripts/ABC-makeScCCnet.R b/scripts/ABC-makeScCCnet.R index a19fd83..5138590 100644 --- a/scripts/ABC-makeScCCnet.R +++ b/scripts/ABC-makeScCCnet.R @@ -15,12 +15,14 @@ # Data: (3 mb) https://downloads.yeastgenome.org/curation/literature/go_slim_mapping.tab # # -# Version: 1.0 +# Version: 1.1 # -# Date: 2017 10 06 +# Date: 2017 10 - 2019 01 # Author: Boris Steipe (boris.steipe@utoronto.ca) # # Versions: +# 1.1 Change from require() to requireNamespace(), +# use ::() idiom throughout # 1.0 First code copied from 2016 material. # # TODO: @@ -28,16 +30,16 @@ # # ============================================================================== -if (! require(readr, quietly = TRUE)) { +if (! 
requireNamespace("readr", quietly = TRUE)) { install.packages("readr") - library(readr) } # STRING functional interaction data # Read STRING Data (needs to be downloaded from database, see URL in Notes) -STR <- read_delim("./data/4932.protein.links.full.v10.5.txt", delim = " ") +STR <- readr::read_delim("./data/4932.protein.links.full.v10.5.txt", + delim = " ") # Subset only IDs and combined_score column STR <- STR[ , c("protein1", "protein2", "combined_score")] @@ -61,14 +63,14 @@ myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene # # Read GOSlim data (needs to be downloaded from database, see URL in Notes) -Gsl <- read_tsv("./data/go_slim_mapping.tab", - col_names = c("ID", - "name", - "SGDId", - "Ontology", - "termName", - "termID", - "status")) +Gsl <- readr::read_tsv("./data/go_slim_mapping.tab", + col_names = c("ID", + "name", + "SGDId", + "Ontology", + "termName", + "termID", + "status")) # head(Gsl) # diff --git a/scripts/BLAST.R b/scripts/BLAST.R index b4cc198..32c7e2f 100644 --- a/scripts/BLAST.R +++ b/scripts/BLAST.R @@ -7,11 +7,13 @@ # https://ncbi.github.io/blast-cloud/dev/api.html # # -# Version: 3 -# Date: 2016 09 - 2017 11 +# Version: 3.1 +# Date: 2016 09 - 2019 01 # Author: Boris Steipe # # Versions: +# 3.1 Change from require() to requireNamespace(), +# use ::() idiom throughout # 3 parsing logic had not been fully implemented; Fixed. # 2.1 bugfix in BLAST(), bug was blanking non-split deflines; # refactored parseBLASTalignment() to handle lists with multiple hits. @@ -29,9 +31,8 @@ # ============================================================================== -if (! require(httr, quietly = TRUE)) { +if (! requireNamespace(httr, quietly = TRUE)) { install.packages("httr") - library(httr) } @@ -92,13 +93,13 @@ BLAST <- function(q, } # send it off ... - response <- GET(results$query) - if (http_status(response)$category != "Success" ) { + response <- httr::GET(results$query) + if (httr::http_status(response)$category != "Success" ) { stop(sprintf("PANIC: Can't send query. BLAST server status error: %s", - http_status(response)$message)) + httr::http_status(response)$message)) } - txt <- content(response, "text", encoding = "UTF-8") + txt <- httr::content(response, "text", encoding = "UTF-8") patt <- "RID = (\\w+)" # match the request id results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2] @@ -127,13 +128,13 @@ BLAST <- function(q, while (TRUE) { # Check whether the result is ready - response <- GET(checkStatus) - if (http_status(response)$category != "Success" ) { + response <- httr::GET(checkStatus) + if (httr::http_status(response)$category != "Success" ) { stop(sprintf("PANIC: Can't check status. BLAST server status error: %s", - http_status(response)$message)) + httr::http_status(response)$message)) } - txt <- content(response, "text", encoding = "UTF-8") + txt <- httr::content(response, "text", encoding = "UTF-8") if (length(grep("Status=WAITING", txt)) > 0) { myTimeout <- myTimeout - EXTRAWAIT @@ -184,13 +185,13 @@ BLAST <- function(q, "&FORMAT_TYPE=Text", sep = "") - response <- GET(retrieve) - if (http_status(response)$category != "Success" ) { + response <- httr::GET(retrieve) + if (httr::http_status(response)$category != "Success" ) { stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s", - http_status(response)$message)) + httr::http_status(response)$message)) } - txt <- content(response, "text", encoding = "UTF-8") + txt <- httr::content(response, "text", encoding = "UTF-8") # txt contains the whole set of results. 
Process: @@ -357,7 +358,7 @@ parseBLASTalignment <- function(hit) { # ==== TESTS =================================================================== # define query: -# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain sequence +# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain # "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ", # "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP", # sep="")
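# A quick sanity check as a sketch (base R only; the file path is merely an
# example): after a refactoring pass like the one in this patch set, one can
# look for calls that still lack their package prefix - e.g. bare GET( calls
# that should now read httr::GET(.
myLines <- readLines("./scripts/BLAST.R")
grep("(^|[^[:alnum:]._:])GET\\(", myLines, value = TRUE)  # unprefixed calls, if any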