Use requireNamespace(), <package>::<function>() idiom,
BiocManager:: - not biocLite()
This commit is contained in:
		| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-ALI-BLAST unit. | #              R code accompanying the BIN-ALI-BLAST unit. | ||||||
| # | # | ||||||
| # Version:  1.1 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017  10  23 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.2    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.1    Fixed parsing logic. | #           1.1    Fixed parsing logic. | ||||||
| #           1.0    First live version 2017. | #           1.0    First live version 2017. | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| @@ -29,29 +31,15 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                               Line | #TOC>   Section  Title                               Line | ||||||
| #TOC> --------------------------------------------- | #TOC> --------------------------------------------------- | ||||||
| #TOC>   1        Preparations                    41 | #TOC>   1        Defining the APSES domain             42 | ||||||
| #TOC>   2        Defining the APSES domain       54 | #TOC>   2        Executing the BLAST search            64 | ||||||
| #TOC>   3        Executing the BLAST search      76 | #TOC>   3        Analysing results                     86 | ||||||
| #TOC>   4        Analysing results               98 |  | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Preparations  ======================================================== | # =    1  Defining the APSES domain  =========================================== | ||||||
|  |  | ||||||
| if (!require(Biostrings, quietly=TRUE)) { |  | ||||||
|   source("https://bioconductor.org/biocLite.R") |  | ||||||
|   biocLite("Biostrings") |  | ||||||
|   library(Biostrings) |  | ||||||
| } |  | ||||||
| # Package information: |  | ||||||
| #  library(help = Biostrings)       # basic information |  | ||||||
| #  browseVignettes("Biostrings")    # available vignettes |  | ||||||
| #  data(package = "Biostrings")     # available datasets |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Defining the APSES domain  =========================================== |  | ||||||
|  |  | ||||||
| # Load your protein database | # Load your protein database | ||||||
| source("makeProteinDB.R") | source("makeProteinDB.R") | ||||||
| @@ -73,7 +61,7 @@ source("makeProteinDB.R") | |||||||
| # BLAST search. | # BLAST search. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Executing the BLAST search  ========================================== | # =    2  Executing the BLAST search  ========================================== | ||||||
|  |  | ||||||
| # The ./scripts/BLAST.R code defines two functions to access the BLAST interface | # The ./scripts/BLAST.R code defines two functions to access the BLAST interface | ||||||
| # through its Web API, and to parse results. Have a look at the script, then | # through its Web API, and to parse results. Have a look at the script, then | ||||||
| @@ -91,11 +79,11 @@ BLASTresults <- BLAST(apses,                       # MYSPE APSES domain sequence | |||||||
|                      limits = "txid559292[ORGN]")  # S. cerevisiae S288c |                      limits = "txid559292[ORGN]")  # S. cerevisiae S288c | ||||||
|  |  | ||||||
|  |  | ||||||
| length(BLASTresults$hits)  # There should be at least one hit there. Ask for advice | length(BLASTresults$hits)  # There should be at least one hit there. Ask for | ||||||
|                            # in case this step fails. |                            # advice in case this step fails. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Analysing results  =================================================== | # =    3  Analysing results  =================================================== | ||||||
|  |  | ||||||
| (topHit <- BLASTresults$hits[[1]])   # Get the top hit | (topHit <- BLASTresults$hits[[1]])   # Get the top hit | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-ALI-Dotplot unit. | #              R code accompanying the BIN-ALI-Dotplot unit. | ||||||
| # | # | ||||||
| # Version:  0.1 | # Version:  0.2 | ||||||
| # | # | ||||||
| # Date:     2017  08  28 | # Date:     2019  01  07 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           0.2    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # | # | ||||||
| @@ -23,24 +25,37 @@ | |||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
| # = 1 ___Section___ |  | ||||||
|  |  | ||||||
| # First, we install and load the Biostrings package. | #TOC> ========================================================================== | ||||||
| if (!require(Biostrings, quietly=TRUE)) { | #TOC>  | ||||||
|   if (! exists("biocLite")) { | #TOC>   Section  Title                  Line | ||||||
|     source("https://bioconductor.org/biocLite.R") | #TOC> -------------------------------------- | ||||||
|  | #TOC>   1        ___Section___            39 | ||||||
|  | #TOC>   2        Tasks                   187 | ||||||
|  | #TOC>  | ||||||
|  | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # =    1  ___Section___  ======================================================= | ||||||
|  |  | ||||||
|  | if (!requireNamespace("BiocManager", quietly=TRUE)) { | ||||||
|  |   install.packages("BiocManager") | ||||||
| } | } | ||||||
|   biocLite("Biostrings") | if (!requireNamespace("Biostrings", quietly=TRUE)) { | ||||||
|   library(Biostrings) |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
|  | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| #  browseVignettes("Biostrings")    # available vignettes | #  browseVignettes("Biostrings")    # available vignettes | ||||||
| #  data(package = "Biostrings")     # available datasets | #  data(package = "Biostrings")     # available datasets | ||||||
|  |  | ||||||
|  | if (!requireNamespace("seqinr", quietly=TRUE)) { | ||||||
|  |   install.packages("seqinr") | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # Let's load BLOSUM62 | # Let's load BLOSUM62 | ||||||
| data(BLOSUM62) | data(BLOSUM62, package = "Biostrings") | ||||||
|  |  | ||||||
| # Now let's craft code for a dotplot. That's surprisingly simple. We build a | # Now let's craft code for a dotplot. That's surprisingly simple. We build a | ||||||
| # matrix that has as many rows as one sequence, as many columns as another. Then | # matrix that has as many rows as one sequence, as many columns as another. Then | ||||||
| @@ -51,10 +66,10 @@ data(BLOSUM62) | |||||||
|  |  | ||||||
| # First we fetch our sequences and split them into single characters. | # First we fetch our sequences and split them into single characters. | ||||||
| sel <- myDB$protein$name == "MBP1_SACCE" | sel <- myDB$protein$name == "MBP1_SACCE" | ||||||
| MBP1_SACCE <- s2c(myDB$protein$sequence[sel]) | MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel]) | ||||||
|  |  | ||||||
| sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | ||||||
| MBP1_MYSPE <- s2c(myDB$protein$sequence[sel]) | MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel]) | ||||||
|  |  | ||||||
| # Check that we have two character vectors of the expected length. | # Check that we have two character vectors of the expected length. | ||||||
| str(MBP1_SACCE) | str(MBP1_SACCE) | ||||||
| @@ -136,7 +151,7 @@ axis(4, at = c(1, seq(10, len, by=10))) | |||||||
| # utilities file and called it dotPlot2(). Why not dotPlot() ... that's because | # utilities file and called it dotPlot2(). Why not dotPlot() ... that's because | ||||||
| # there already is a dotplot function in the seqinr package: | # there already is a dotplot function in the seqinr package: | ||||||
|  |  | ||||||
| dotPlot(MBP1_SACCE, MBP1_MYSPE)                                 # seqinr | seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE)                           # seqinr | ||||||
| dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Ours | dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE")  # Ours | ||||||
|  |  | ||||||
| # Which one do you prefer? You can probably see the block patterns that arise | # Which one do you prefer? You can probably see the block patterns that arise | ||||||
| @@ -169,7 +184,7 @@ dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter) | |||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # = 1 Tasks | # =    2  Tasks  =============================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										131
									
								
								BIN-ALI-MSA.R
									
									
									
									
									
								
							
							
						
						
									
										131
									
								
								BIN-ALI-MSA.R
									
									
									
									
									
								
							| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-ALI-MSA unit. | #              R code accompanying the BIN-ALI-MSA unit. | ||||||
| # | # | ||||||
| # Version:  1.1 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017  10 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.2    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.1    Added fetchMSAmotif() | #           1.1    Added fetchMSAmotif() | ||||||
| #           1.0    Fully refactored and rewritten for 2017 | #           1.0    Fully refactored and rewritten for 2017 | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| @@ -29,22 +31,22 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                              Line | #TOC>   Section  Title                                              Line | ||||||
| #TOC> ------------------------------------------------------------ | #TOC> ------------------------------------------------------------------ | ||||||
| #TOC>   1        Preparations                                   51 | #TOC>   1        Preparations                                         54 | ||||||
| #TOC>   2        Aligning full length MBP1 proteins             99 | #TOC>   2        Aligning full length MBP1 proteins                   96 | ||||||
| #TOC>   2.1      Preparing Sequences                           110 | #TOC>   2.1        Preparing Sequences                               107 | ||||||
| #TOC>   2.2      Compute the MSA                               135 | #TOC>   2.2        Compute the MSA                                   132 | ||||||
| #TOC>   3        Analyzing an MSA                              156 | #TOC>   3        Analyzing an MSA                                    153 | ||||||
| #TOC>   4        Comparing MSAs                                227 | #TOC>   4        Comparing MSAs                                      224 | ||||||
| #TOC>   4.1      Importing an alignment to msa                 236 | #TOC>   4.1        Importing an alignment to msa                     233 | ||||||
| #TOC>   4.1.1    importing an .aln file                        245 | #TOC>   4.1.1          importing an .aln file                        242 | ||||||
| #TOC>   4.1.2    Creating an MsaAAMultipleAlignment object     276 | #TOC>   4.1.2          Creating an MsaAAMultipleAlignment object     273 | ||||||
| #TOC>   4.2      More alignments                               313 | #TOC>   4.2        More alignments                                   324 | ||||||
| #TOC>   4.3      Computing comparison metrics                  325 | #TOC>   4.3        Computing comparison metrics                      336 | ||||||
| #TOC>   5        Profile-Profile alignments                    462 | #TOC>   5        Profile-Profile alignments                          473 | ||||||
| #TOC>   6        Sequence Logos                                539 | #TOC>   6        Sequence Logos                                      546 | ||||||
| #TOC>   6.1      Subsetting an alignment by motif              548 | #TOC>   6.1        Subsetting an alignment by motif                  555 | ||||||
| #TOC>   6.2      Plot a Sequence Logo                          591 | #TOC>   6.2        Plot a Sequence Logo                              604 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -59,28 +61,22 @@ | |||||||
| source("makeProteinDB.R") | source("makeProteinDB.R") | ||||||
|  |  | ||||||
|  |  | ||||||
| # Multiple sequence alignment algorithms are provided in | if (! requireNamespace("BiocManager", quietly=TRUE)) { | ||||||
| # the Bioconductor msa package. |   install.packages("BiocManager") | ||||||
|  |  | ||||||
| if (! require(Biostrings, quietly=TRUE)) { |  | ||||||
|   if (! exists("biocLite")) { |  | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
| } | } | ||||||
|   biocLite("Biostrings") | if (! requireNamespace("Biostrings", quietly=TRUE)) { | ||||||
|   library(Biostrings) |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| #  browseVignettes("Biostrings")    # available vignettes | #  browseVignettes("Biostrings")    # available vignettes | ||||||
| #  data(package = "Biostrings")     # available datasets | #  data(package = "Biostrings")     # available datasets | ||||||
|  |  | ||||||
|  | # Multiple sequence alignment algorithms are provided in | ||||||
|  | # the Bioconductor msa package. | ||||||
|  |  | ||||||
| if (! require(msa, quietly=TRUE)) { | if (! requireNamespace("msa", quietly=TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   BiocManager::install("msa") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
|   } |  | ||||||
|   biocLite("msa") |  | ||||||
|   library(msa) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help=msa)       # basic information | #  library(help=msa)       # basic information | ||||||
| @@ -115,7 +111,7 @@ help(package = "msa") | |||||||
| # of sequence. | # of sequence. | ||||||
|  |  | ||||||
| sel <- grep("MBP1", myDB$protein$name) | sel <- grep("MBP1", myDB$protein$name) | ||||||
| MBP1set <- AAStringSet(myDB$protein$sequence[sel]) | MBP1set <- Biostrings::AAStringSet(myDB$protein$sequence[sel]) | ||||||
|  |  | ||||||
| # To help us make sense of the alignment we need to add the names for | # To help us make sense of the alignment we need to add the names for | ||||||
| # the sequences. Names for a seqSet object are held in the ranges slot... | # the sequences. Names for a seqSet object are held in the ranges slot... | ||||||
| @@ -142,10 +138,10 @@ MBP1set | |||||||
|  |  | ||||||
|  |  | ||||||
| # Let's run an alignment with "Muscle" | # Let's run an alignment with "Muscle" | ||||||
| (msaM <-  msaMuscle( MBP1set, order = "aligned")) | (msaM <-  msa::msaMuscle( MBP1set, order = "aligned")) | ||||||
|  |  | ||||||
| # ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method): | # ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method): | ||||||
| print(msaM, show=c("alignment", "complete"), showConsensus=FALSE) | msa::print(msaM, show=c("alignment", "complete"), showConsensus=FALSE) | ||||||
|  |  | ||||||
|  |  | ||||||
| # You see that the alignment object has sequence strings with hyphens as | # You see that the alignment object has sequence strings with hyphens as | ||||||
| @@ -173,7 +169,7 @@ print(msaM, show=c("alignment", "complete"), showConsensus=FALSE) | |||||||
|  |  | ||||||
| data("BLOSUM62")  # fetch the BLOSUM62 matrix from the Biostrings package | data("BLOSUM62")  # fetch the BLOSUM62 matrix from the Biostrings package | ||||||
|  |  | ||||||
| msaMScores <- msaConservationScore(msaM, substitutionMatrix = BLOSUM62) | msaMScores <- msa::msaConservationScore(msaM, substitutionMatrix = BLOSUM62) | ||||||
| plot(msaMScores, type = "l", col = "#205C5E", xlab = "Alignment Position") | plot(msaMScores, type = "l", col = "#205C5E", xlab = "Alignment Position") | ||||||
|  |  | ||||||
| # That plot shows the well-aligned regions (domains ?) of the sequence, but it | # That plot shows the well-aligned regions (domains ?) of the sequence, but it | ||||||
| @@ -246,17 +242,17 @@ for (i in seq_along(highScoringRanges$lengths)) { | |||||||
| # ===   4.1.1  importing an .aln file                    | # ===   4.1.1  importing an .aln file                    | ||||||
|  |  | ||||||
| # The seqinr package has a function to read CLUSTAL W formatted .aln files ... | # The seqinr package has a function to read CLUSTAL W formatted .aln files ... | ||||||
| if (! require(seqinr, quietly=TRUE)) { | if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||||
|   install.packages(seqinr) |   install.packages("seqinr") | ||||||
|   library(seqinr) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help=seqinr)       # basic information | #  library(help=seqinr)       # basic information | ||||||
| #  browseVignettes("seqinr")  # available vignettes | #  browseVignettes("seqinr")  # available vignettes | ||||||
| #  data(package = "seqinr")   # available datasets | #  data(package = "seqinr")   # available datasets | ||||||
|  |  | ||||||
| # read the donwloaded file | # read the T-coffee aligned file that you downloaded from the EBI MSA tools | ||||||
| tmp <- read.alignment("msaT.aln", format = "clustal") | # (cf. http://steipe.biochemistry.utoronto.ca/abc/index.php/BIN-ALI-MSA) | ||||||
|  | tmp <- seqinr::read.alignment("msaT.aln", format = "clustal") | ||||||
|  |  | ||||||
| # read.alignment() returns a list. $seq is a list of strings, one for each | # read.alignment() returns a list. $seq is a list of strings, one for each | ||||||
| # complete alignment. However, they are converted to lower case. | # complete alignment. However, they are converted to lower case. | ||||||
| @@ -278,12 +274,12 @@ for (i in seq_along(x)) { | |||||||
|  |  | ||||||
| # MsaAAMultipleAlignment objects are S4 objects that contain AAStringSet objects | # MsaAAMultipleAlignment objects are S4 objects that contain AAStringSet objects | ||||||
| # in their @unmasked slot, and a few additional items. Rather than build the | # in their @unmasked slot, and a few additional items. Rather than build the | ||||||
| # object from scratch, we copy an axisting object, and overwrite the dta in its | # object from scratch, we copy an existing object, and overwrite the data in its | ||||||
| # slots with what we need. Our goal is pragmatic, we want an object that msa's | # slots with what we need. Our goal is pragmatic, we want an object that msa's | ||||||
| # functions will accept as input. | # functions will accept as input. | ||||||
|  |  | ||||||
| # First: convert our named char vector into an AAStringSet | # First: convert our named char vector into an AAStringSet | ||||||
| x <- AAStringSet(x) | x <- Biostrings::AAStringSet(x) | ||||||
|  |  | ||||||
| # Then: create a new MsaAAMultipleAlignment S4 object. The msa package has | # Then: create a new MsaAAMultipleAlignment S4 object. The msa package has | ||||||
| # defined what such an object should look like, with the setClass() function. To | # defined what such an object should look like, with the setClass() function. To | ||||||
| @@ -294,8 +290,22 @@ x <- AAStringSet(x) | |||||||
|  |  | ||||||
| str(msaM) | str(msaM) | ||||||
|  |  | ||||||
|  | # There is a catch however in the way R makes such operations specific to | ||||||
|  | # the packages that need them: the function that creates the class is | ||||||
|  | # defined as a "generic", and when it is called, R looks in the package | ||||||
|  | # namespace for a more specific function with precise instructions what | ||||||
|  | # to do. However, we have not loaded the package namespace - we access all | ||||||
|  | # of the functions directly with the msa:: prefix. This method breaks down | ||||||
|  | # when generic functions are involved. I.e. - we could make it work, but | ||||||
|  | # the amount of code we need then is unreasonable. The straightforward | ||||||
|  | # way is to load the package. We can still use the prefix notation for | ||||||
|  | # its functions, just to emphasize where the function comes from. But since | ||||||
|  | # the namespace then exists, we ensure that generics are properly dispatched. | ||||||
|  |  | ||||||
|  | library(msa)  # load the msa package namespace | ||||||
|  |  | ||||||
| msaT <- new("MsaAAMultipleAlignment", # create new MsaAAMultipleAlignment object | msaT <- new("MsaAAMultipleAlignment", # create new MsaAAMultipleAlignment object | ||||||
|             unmasked = x,             # "unmasked" slot takes an AASringSet |             unmasked = x,             # "unmasked" slot takes an AAStringSet | ||||||
|             version = "T-Coffee",     # "version" slot takes a string |             version = "T-Coffee",     # "version" slot takes a string | ||||||
|             params = list(),          # "params" takes a list(), we leave the |             params = list(),          # "params" takes a list(), we leave the | ||||||
|                                       #   list empty, but we could add the |                                       #   list empty, but we could add the | ||||||
| @@ -309,18 +319,18 @@ str(msaT) | |||||||
| msaT # Now we have fabricated an MsaAAMultipleAlignment object, and we can | msaT # Now we have fabricated an MsaAAMultipleAlignment object, and we can | ||||||
|      # use the msa package functions on it |      # use the msa package functions on it | ||||||
|  |  | ||||||
| msaTScores <- msaConservationScore(msaT, substitutionMatrix = BLOSUM62) | msaTScores <- msa::msaConservationScore(msaT, substitutionMatrix = BLOSUM62) | ||||||
|  |  | ||||||
| # ==   4.2  More alignments  =================================================== | # ==   4.2  More alignments  =================================================== | ||||||
|  |  | ||||||
| # Next, we calculate alignments with msa's two other alignment options: | # Next, we calculate alignments with msa's two other alignment options: | ||||||
| # CLUSTAL Omega | # CLUSTAL Omega | ||||||
| (msaO <- msaClustalOmega( MBP1set, order = "aligned")) | (msaO <- msa::msaClustalOmega( MBP1set, order = "aligned")) | ||||||
| msaOScores <- msaConservationScore(msaO, substitutionMatrix = BLOSUM62) | msaOScores <- msa::msaConservationScore(msaO, substitutionMatrix = BLOSUM62) | ||||||
|  |  | ||||||
| # CLUSTAL W | # CLUSTAL W | ||||||
| (msaW <- msaClustalW( MBP1set, order = "aligned")) | (msaW <- msa::msaClustalW( MBP1set, order = "aligned")) | ||||||
| msaWScores <- msaConservationScore(msaW, substitutionMatrix = BLOSUM62) | msaWScores <- msa::msaConservationScore(msaW, substitutionMatrix = BLOSUM62) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   4.3  Computing comparison metrics  ====================================== | # ==   4.3  Computing comparison metrics  ====================================== | ||||||
| @@ -454,7 +464,7 @@ legend("bottomright", | |||||||
|  |  | ||||||
| # Your alignment is going to be different from mine, due to the inclusion of | # Your alignment is going to be different from mine, due to the inclusion of | ||||||
| # MYSPE - but what I see is that MUSCLE gives the highest score overall, and | # MYSPE - but what I see is that MUSCLE gives the highest score overall, and | ||||||
| # achieves this with fewer indels then most, and the lowest number of gaps of | # achieves this with fewer indels than most, and the lowest number of gaps of | ||||||
| # all algorithms. | # all algorithms. | ||||||
|  |  | ||||||
| # To actually compare regions of alignments, we need to align alignments. | # To actually compare regions of alignments, we need to align alignments. | ||||||
| @@ -470,12 +480,8 @@ legend("bottomright", | |||||||
| # to compare two MSAs with each other, by aligning them. The algorithm is | # to compare two MSAs with each other, by aligning them. The algorithm is | ||||||
| # provided by the DECIPHER package. | # provided by the DECIPHER package. | ||||||
|  |  | ||||||
| if (! require(DECIPHER, quietly=TRUE)) { | if (! requireNamespace("DECIPHER", quietly=TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   BiocManager::install("DECIPHER") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
|   } |  | ||||||
|   biocLite("DECIPHER") |  | ||||||
|   library(DECIPHER) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = DECIPHER)       # basic information | #  library(help = DECIPHER)       # basic information | ||||||
| @@ -484,14 +490,14 @@ if (! require(DECIPHER, quietly=TRUE)) { | |||||||
|  |  | ||||||
| # AlignProfiles() takes two AAStringSets as input. Let's compare the MUSCLE and | # AlignProfiles() takes two AAStringSets as input. Let's compare the MUSCLE and | ||||||
| # CLUSTAL W alignments: we could do this directly ... | # CLUSTAL W alignments: we could do this directly ... | ||||||
| AlignProfiles(msaW@unmasked, msaM@unmasked) | DECIPHER::AlignProfiles(msaW@unmasked, msaM@unmasked) | ||||||
|  |  | ||||||
| # But for ease of comparison, we'll reorder the sequences of the CLUSTAL W | # But for ease of comparison, we'll reorder the sequences of the CLUSTAL W | ||||||
| # alignment into the same order as the MUSCLE alignment: | # alignment into the same order as the MUSCLE alignment: | ||||||
| m <- as.character(msaM) | m <- as.character(msaM) | ||||||
| w <- as.character(msaW)[names(m)] | w <- as.character(msaW)[names(m)] | ||||||
|  |  | ||||||
| (ppa <- AlignProfiles(AAStringSet(w), AAStringSet(m))) | (ppa <- DECIPHER::AlignProfiles(Biostrings::AAStringSet(w), Biostrings::AAStringSet(m))) | ||||||
|  |  | ||||||
| # Conveniently, AlignProfiles() returns an AAStringSet, so we can use our | # Conveniently, AlignProfiles() returns an AAStringSet, so we can use our | ||||||
| # writeALN function to show it. Here is an arbitrary block, from somewhere in | # writeALN function to show it. Here is an arbitrary block, from somewhere in | ||||||
| @@ -533,8 +539,8 @@ writeALN(ppa2, range = c(800, 960)) | |||||||
| # Again, go explore, and get a sense of what's going on. You may find that | # Again, go explore, and get a sense of what's going on. You may find that | ||||||
| # CLUSTAL W has a tendency to insert short gaps all over the alignment, whereas | # CLUSTAL W has a tendency to insert short gaps all over the alignment, whereas | ||||||
| # MUSCLE keeps indels in blocks. CLUSTAL's behaviour is exactly what I would | # MUSCLE keeps indels in blocks. CLUSTAL's behaviour is exactly what I would | ||||||
| # expect from an algorithm that builds alignments from pairwise local | # expect from an algorithm that builds alignments incrementally from pairwise | ||||||
| # alignments, without global refinement. | # local alignments, without global refinement. | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    6  Sequence Logos  ====================================================== | # =    6  Sequence Logos  ====================================================== | ||||||
| @@ -602,16 +608,15 @@ writeALN(fetchMSAmotif(msaM, wing)) | |||||||
| # ggseqlogo written by Omar Wagih, a former UofT BCB student who is now at | # ggseqlogo written by Omar Wagih, a former UofT BCB student who is now at | ||||||
| # the EBI. | # the EBI. | ||||||
|  |  | ||||||
| if (! require(ggseqlogo, quietly=TRUE)) { | if (! requireNamespace("ggseqlogo", quietly=TRUE)) { | ||||||
|   install.packages(("ggseqlogo")) |   install.packages(("ggseqlogo")) | ||||||
|   library(ggseqlogo) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help=ggseqlogo)       # basic information | #  library(help=ggseqlogo)       # basic information | ||||||
| #  browseVignettes("ggseqlogo")  # available vignettes | #  browseVignettes("ggseqlogo")  # available vignettes | ||||||
| #  data(package = "ggseqlogo")   # available datasets | #  data(package = "ggseqlogo")   # available datasets | ||||||
|  |  | ||||||
| ggseqlogo(as.character(motifAli)) | ggseqlogo::ggseqlogo(as.character(motifAli)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit. | #              R code accompanying the BIN-ALI-Optimal_sequence_alignment unit. | ||||||
| # | # | ||||||
| # Version:  1.4 | # Version:  1.5 | ||||||
| # | # | ||||||
| # Date:     2017  09   -   2017  11 | # Date:     2017  09   -   2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.5    Change from require() to requireNamespace(), | ||||||
|  | #                    use <package>::<function>() idiom throughout | ||||||
| #           1.4    Pull s2c() from seqinr package, rather than loading the | #           1.4    Pull s2c() from seqinr package, rather than loading the | ||||||
| #                    entire library. | #                    entire library. | ||||||
| #           1.3    Updated confirmation task with correct logic | #           1.3    Updated confirmation task with correct logic | ||||||
| @@ -34,27 +36,30 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                                      Line | #TOC>   Section  Title                                                      Line | ||||||
| #TOC> -------------------------------------------------------------------- | #TOC> -------------------------------------------------------------------------- | ||||||
| #TOC>   1        Prepare                                                52 | #TOC>   1        Prepare                                                      54 | ||||||
| #TOC>   2        Biostrings Pairwise Alignment                          66 | #TOC>   2        Biostrings Pairwise Alignment                                71 | ||||||
| #TOC>   2.1      Optimal global alignment                               84 | #TOC>   2.1        Optimal global alignment                                   89 | ||||||
| #TOC>   2.2      Optimal local alignment                               147 | #TOC>   2.2        Optimal local alignment                                   152 | ||||||
| #TOC>   3        APSES Domain annotation by alignment                  171 | #TOC>   3        APSES Domain annotation by alignment                        176 | ||||||
| #TOC>   4        Update your database script                           252 | #TOC>   4        Update your database script                                 257 | ||||||
| #TOC>   4.1      Preparing an annotation file ...                      258 | #TOC>   4.1        Preparing an annotation file ...                          263 | ||||||
| #TOC>   4.1.1    If you HAVE NOT done the BIN-FUNC-Annotation unit     260 | #TOC>   4.1.1          If you HAVE NOT done the BIN-FUNC-Annotation unit     265 | ||||||
| #TOC>   4.1.2    If you HAVE done the BIN-FUNC-Annotation unit         303 | #TOC>   4.1.2          If you HAVE done the BIN-FUNC-Annotation unit         308 | ||||||
| #TOC>   4.2      Execute and Validate                                  327 | #TOC>   4.2        Execute and Validate                                      332 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Prepare  ============================================================= | # =    1  Prepare  ============================================================= | ||||||
|  |  | ||||||
| # To simplify code, we pull the function s2c(x) from the seqinr package, | if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||||
| # rather than using the lengthier idiom unlist(strsplit(x, ""). |   install.packages("seqinr") | ||||||
| # This assumes that the seqinr package has been installed previously. | } | ||||||
| s2c <- seqinr::s2c | # You can get package information with the following commands: | ||||||
|  | # library(help = seqinr)       # basic information | ||||||
|  | # browseVignettes("seqinr")    # available vignettes | ||||||
|  | # data(package = "seqinr")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| # You need to recreate the protein database that you have constructed in the | # You need to recreate the protein database that you have constructed in the | ||||||
| @@ -66,13 +71,13 @@ source("makeProteinDB.R") | |||||||
| # =    2  Biostrings Pairwise Alignment  ======================================= | # =    2  Biostrings Pairwise Alignment  ======================================= | ||||||
|  |  | ||||||
|  |  | ||||||
| if (!require(Biostrings, quietly=TRUE)) { | if (!requireNamespace("BiocManager", quietly=TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   install.packages("BiocManager") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
| } | } | ||||||
|   biocLite("Biostrings") | if (!requireNamespace("Biostrings", quietly=TRUE)) { | ||||||
|   library(Biostrings) |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
|  | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| #  browseVignettes("Biostrings")    # available vignettes | #  browseVignettes("Biostrings")    # available vignettes | ||||||
| #  data(package = "Biostrings")     # available datasets | #  data(package = "Biostrings")     # available datasets | ||||||
| @@ -88,15 +93,15 @@ if (!require(Biostrings, quietly=TRUE)) { | |||||||
|  |  | ||||||
| # First: make AAString objects ... | # First: make AAString objects ... | ||||||
| sel <- myDB$protein$name == "MBP1_SACCE" | sel <- myDB$protein$name == "MBP1_SACCE" | ||||||
| aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel]) | aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel]) | ||||||
|  |  | ||||||
| sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "") | ||||||
| aaMBP1_MYSPE <-   AAString(myDB$protein$sequence[sel]) | aaMBP1_MYSPE <-   Biostrings::AAString(myDB$protein$sequence[sel]) | ||||||
|  |  | ||||||
| ?pairwiseAlignment | ?pairwiseAlignment | ||||||
| # ... and align. | # ... and align. | ||||||
| # Global optimal alignment with end-gap penalties is default. | # Global optimal alignment with end-gap penalties is default. | ||||||
| ali1 <-  pairwiseAlignment( | ali1 <-  Biostrings::pairwiseAlignment( | ||||||
|   aaMBP1_SACCE, |   aaMBP1_SACCE, | ||||||
|   aaMBP1_MYSPE, |   aaMBP1_MYSPE, | ||||||
|   substitutionMatrix = "BLOSUM62", |   substitutionMatrix = "BLOSUM62", | ||||||
| @@ -108,7 +113,7 @@ str(ali1)  # ... it's complicated | |||||||
| # This is a Biostrings alignment object. But we can use Biostrings functions to | # This is a Biostrings alignment object. But we can use Biostrings functions to | ||||||
| # tame it: | # tame it: | ||||||
| ali1 | ali1 | ||||||
| writePairwiseAlignments(ali1)   # That should look familiar | Biostrings::writePairwiseAlignments(ali1)   # That should look familiar | ||||||
|  |  | ||||||
| # And we can make the internal structure work for us  (@ is for classes as | # And we can make the internal structure work for us  (@ is for classes as | ||||||
| # $ is for lists ...) | # $ is for lists ...) | ||||||
| @@ -147,7 +152,7 @@ percentID(ali1) | |||||||
| # ==   2.2  Optimal local alignment  =========================================== | # ==   2.2  Optimal local alignment  =========================================== | ||||||
|  |  | ||||||
| # Compare with local optimal alignment (like EMBOSS Water) | # Compare with local optimal alignment (like EMBOSS Water) | ||||||
| ali2 <-  pairwiseAlignment( | ali2 <-  Biostrings::pairwiseAlignment( | ||||||
|   aaMBP1_SACCE, |   aaMBP1_SACCE, | ||||||
|   aaMBP1_MYSPE, |   aaMBP1_MYSPE, | ||||||
|   type = "local", |   type = "local", | ||||||
| @@ -155,9 +160,9 @@ ali2 <-  pairwiseAlignment( | |||||||
|   gapOpening = 50, |   gapOpening = 50, | ||||||
|   gapExtension = 10) |   gapExtension = 10) | ||||||
|  |  | ||||||
| writePairwiseAlignments(ali2)   # This has probably only aligned the N-terminal | Biostrings::writePairwiseAlignments(ali2) | ||||||
|                                 # DNA binding domain - but that one has quite | # This has probably only aligned the N-terminal DNA binding domain - but that | ||||||
|                                 # high sequence identity: | # one has quite high sequence identity: | ||||||
| percentID(ali2) | percentID(ali2) | ||||||
|  |  | ||||||
| # == TASK: == | # == TASK: == | ||||||
| @@ -209,14 +214,14 @@ myDB$annotation[myDB$annotation$ID == proID & | |||||||
| # the sequence, and used the start and end coordinates to extract a substring. | # the sequence, and used the start and end coordinates to extract a substring. | ||||||
|  |  | ||||||
| # Let's convert this to an AAstring and assign it: | # Let's convert this to an AAstring and assign it: | ||||||
| aaMB1_SACCE_APSES <- AAString(apses) | aaMB1_SACCE_APSES <- Biostrings::AAString(apses) | ||||||
|  |  | ||||||
| # Now let's align these two sequences of very different length without end-gap | # Now let's align these two sequences of very different length without end-gap | ||||||
| # penalties using the "overlap" type. "overlap" turns the | # penalties using the "overlap" type. "overlap" turns the | ||||||
| # end-gap penalties off and that is crucially important since | # end-gap penalties off and that is crucially important since | ||||||
| # the sequences have very different length. | # the sequences have very different length. | ||||||
|  |  | ||||||
| aliApses <-  pairwiseAlignment( | aliApses <-  Biostrings::pairwiseAlignment( | ||||||
|   aaMB1_SACCE_APSES, |   aaMB1_SACCE_APSES, | ||||||
|   aaMBP1_MYSPE, |   aaMBP1_MYSPE, | ||||||
|   type = "overlap", |   type = "overlap", | ||||||
| @@ -228,7 +233,7 @@ aliApses <-  pairwiseAlignment( | |||||||
| # homologous, and have (almost) no indels. The entire "pattern" | # homologous, and have (almost) no indels. The entire "pattern" | ||||||
| # sequence from QIYSAR ... to ... KPLFDF  should be matched | # sequence from QIYSAR ... to ... KPLFDF  should be matched | ||||||
| # with the "query". Is this correct? | # with the "query". Is this correct? | ||||||
| writePairwiseAlignments(aliApses) | Biostrings::writePairwiseAlignments(aliApses) | ||||||
|  |  | ||||||
| # If this is correct, you can extract the matched sequence from | # If this is correct, you can extract the matched sequence from | ||||||
| # the alignment object. The syntax is a bit different from what | # the alignment object. The syntax is a bit different from what | ||||||
|   | |||||||
| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-ALI-Similarity unit. | #              R code accompanying the BIN-ALI-Similarity unit. | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10  20 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0    Refactored for 2017; add aaindex, ternary plot. | #           1.0    Refactored for 2017; add aaindex, ternary plot. | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| @@ -28,10 +30,10 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                          Line | #TOC>   Section  Title                          Line | ||||||
| #TOC> ---------------------------------------- | #TOC> ---------------------------------------------- | ||||||
| #TOC>   1        Amino Acid Properties      43 | #TOC>   1        Amino Acid Properties            41 | ||||||
| #TOC>   2        Mutation Data matrix      163 | #TOC>   2        Mutation Data matrix            158 | ||||||
| #TOC>   3        Background score          205 | #TOC>   3        Background score                199 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -41,9 +43,8 @@ | |||||||
| # A large collection of amino acid property tables is available via the seqinr | # A large collection of amino acid property tables is available via the seqinr | ||||||
| # package: | # package: | ||||||
|  |  | ||||||
| if (!require(seqinr)) { | if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||||
|   install.packages("seqinr") |   install.packages("seqinr") | ||||||
|   library(seqinr) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = seqinr)       # basic information | #  library(help = seqinr)       # basic information | ||||||
| @@ -127,9 +128,8 @@ text(Y$I, K$I, names(Y$I)) | |||||||
| # plots are in general unintuitive and hard to interpret. One alternative is a | # plots are in general unintuitive and hard to interpret. One alternative is a | ||||||
| # so-called "ternary plot": | # so-called "ternary plot": | ||||||
|  |  | ||||||
| if (!require(ggtern)) { | if (! requireNamespace("ggtern", quietly=TRUE)) { | ||||||
|   install.packages("ggtern") |   install.packages("ggtern") | ||||||
|   library(ggtern) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = ggtern)       # basic information | #  library(help = ggtern)       # basic information | ||||||
| @@ -145,12 +145,11 @@ myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05, | |||||||
|                     stringsAsFactors = FALSE) |                     stringsAsFactors = FALSE) | ||||||
| rownames(myDat) <- names(Y$I) | rownames(myDat) <- names(Y$I) | ||||||
|  |  | ||||||
| ggtern(data = myDat, | ggtern::ggtern(data = myDat, | ||||||
|        aes(x = vol, |                ggplot2::aes(x = vol, | ||||||
|                    y = phi, |                    y = phi, | ||||||
|                    z = pK, |                    z = pK, | ||||||
|            label = rownames(myDat))) + |                    label = rownames(myDat))) + ggplot2::geom_text() | ||||||
|   geom_text() |  | ||||||
|  |  | ||||||
| # This results in a mapping of amino acids relative to each other that is | # This results in a mapping of amino acids relative to each other that is | ||||||
| # similar to the Venn diagram you have seen in the notes. | # similar to the Venn diagram you have seen in the notes. | ||||||
| @@ -162,12 +161,11 @@ ggtern(data = myDat, | |||||||
|  |  | ||||||
| # The Biostrings package contains the most common mutation data matrices. | # The Biostrings package contains the most common mutation data matrices. | ||||||
|  |  | ||||||
| if (!require(Biostrings, quietly=TRUE)) { | if (! requireNamespace("BiocManager", quietly=TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   install.packages("BiocManager") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
| } | } | ||||||
|   biocLite("Biostrings") | if (! requireNamespace("Biostrings", quietly=TRUE)) { | ||||||
|   library(Biostrings) |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help=Biostrings)       # basic information | #  library(help=Biostrings)       # basic information | ||||||
| @@ -200,7 +198,9 @@ BLOSUM62["W", "R"] | |||||||
|  |  | ||||||
| # =    3  Background score  ==================================================== | # =    3  Background score  ==================================================== | ||||||
|  |  | ||||||
| # The mutation data matrix is designed to give high scores to homologous sequences, low scores to non-homologous sequences. What score on average should we expect for a random sequence? | # The mutation data matrix is designed to give high scores to homologous | ||||||
|  | # sequences, low scores to non-homologous sequences. What score on average | ||||||
|  | # should we expect for a random sequence? | ||||||
|  |  | ||||||
| # If we sample amino acid pairs at random, we will get a score that is the | # If we sample amino acid pairs at random, we will get a score that is the | ||||||
| # average of the individual pairscores in the matrix. Omitting the ambiguity | # average of the individual pairscores in the matrix. Omitting the ambiguity | ||||||
| @@ -219,12 +219,12 @@ sum(BLOSUM62[1:20, 1:20])/400 | |||||||
| # PDB ID 3FG7 - a villin headpiece structure with a large amount of | # PDB ID 3FG7 - a villin headpiece structure with a large amount of | ||||||
| # low-complexity amino acid sequence ... | # low-complexity amino acid sequence ... | ||||||
|  |  | ||||||
| aa3FG7 <- readAAStringSet("./data/3FG7.fa")[[1]] | aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]] | ||||||
|  |  | ||||||
| # ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C) | # ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C) | ||||||
| # with an exceptionally high percentage of hydrophobic residues. | # with an exceptionally high percentage of hydrophobic residues. | ||||||
|  |  | ||||||
| aa2F1C <- readAAStringSet("./data/2F1C.fa")[[1]] | aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]] | ||||||
|  |  | ||||||
| # Here is a function that takes two sequences and | # Here is a function that takes two sequences and | ||||||
| # returns their average pairscore. | # returns their average pairscore. | ||||||
|   | |||||||
| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-Data_integration unit. | #              R code accompanying the BIN-Data_integration unit. | ||||||
| # | # | ||||||
| # Version:  1.0.1 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2018  10  30 | # Date:     2018  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0.1  Bugfix: UniProt ID Mapping service API change | #           1.0.1  Bugfix: UniProt ID Mapping service API change | ||||||
| #           1.0    First live version | #           1.0    First live version | ||||||
| # | # | ||||||
| @@ -31,8 +33,8 @@ | |||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                             Line | #TOC>   Section  Title                             Line | ||||||
| #TOC> ------------------------------------------------- | #TOC> ------------------------------------------------- | ||||||
| #TOC>   1        Identifier mapping                  40 | #TOC>   1        Identifier mapping                  42 | ||||||
| #TOC>   2        Cross-referencing tables           164 | #TOC>   2        Cross-referencing tables           165 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -54,9 +56,8 @@ | |||||||
|  |  | ||||||
| # To begin, we load  httr, which supports sending and receiving data via the | # To begin, we load  httr, which supports sending and receiving data via the | ||||||
| # http protocol, just like a Web browser. | # http protocol, just like a Web browser. | ||||||
| if (!require(httr, quietly=TRUE)) { | if (! requireNamespace("httr", quietly=TRUE)) { | ||||||
|   install.packages("httr") |   install.packages("httr") | ||||||
|   library(httr) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = httr)       # basic information | #  library(help = httr)       # basic information | ||||||
| @@ -75,22 +76,22 @@ myQueryIDs <- "NP_010227 NP_00000 NP_011036" | |||||||
| # of the request. GET() and POST() are functions from httr. | # of the request. GET() and POST() are functions from httr. | ||||||
|  |  | ||||||
| URL <- "https://www.uniprot.org/mapping/" | URL <- "https://www.uniprot.org/mapping/" | ||||||
| response <- POST(URL, | response <- httr::POST(URL, | ||||||
|                        body = list(from = "P_REFSEQ_AC",   # Refseq Protein |                        body = list(from = "P_REFSEQ_AC",   # Refseq Protein | ||||||
|                                    to = "ACC",             # UniProt ID |                                    to = "ACC",             # UniProt ID | ||||||
|                                    format = "tab", |                                    format = "tab", | ||||||
|                                    query = myQueryIDs)) |                                    query = myQueryIDs)) | ||||||
|  |  | ||||||
| cat(content(response)) | cat(httr::content(response)) | ||||||
|  |  | ||||||
| # We need to check the status code - if it is not 200, an error occurred and we | # We need to check the status code - if it is not 200, an error occurred and we | ||||||
| # can't process the result: | # can't process the result: | ||||||
| status_code(response) | httr::status_code(response) | ||||||
|  |  | ||||||
| # If the query is successful, tabbed text is returned. We can assign that to a | # If the query is successful, tabbed text is returned. We can assign that to a | ||||||
| # data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument. | # data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument. | ||||||
|  |  | ||||||
| myMappedIDs <- read.delim(file = textConnection(content(response)), | myMappedIDs <- read.delim(file = textConnection(httr::content(response)), | ||||||
|                           sep = "\t", |                           sep = "\t", | ||||||
|                           stringsAsFactors = FALSE) |                           stringsAsFactors = FALSE) | ||||||
| myMappedIDs | myMappedIDs | ||||||
| @@ -132,14 +133,14 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { | |||||||
|   #    for IDs that are not mapped. |   #    for IDs that are not mapped. | ||||||
|  |  | ||||||
|   URL <- "https://www.uniprot.org/uploadlists/" |   URL <- "https://www.uniprot.org/uploadlists/" | ||||||
|   response <- POST(URL, |   response <- httr::POST(URL, | ||||||
|                          body = list(from = mapFrom, |                          body = list(from = mapFrom, | ||||||
|                                      to = mapTo, |                                      to = mapTo, | ||||||
|                                      format = "tab", |                                      format = "tab", | ||||||
|                                      query = s)) |                                      query = s)) | ||||||
|  |  | ||||||
|   if (status_code(response) == 200) { # 200: oK |   if (httr::status_code(response) == 200) { # 200: oK | ||||||
|     myMap <- read.delim(file = textConnection(content(response)), |     myMap <- read.delim(file = textConnection(httr::content(response)), | ||||||
|                         sep = "\t", |                         sep = "\t", | ||||||
|                         stringsAsFactors = FALSE) |                         stringsAsFactors = FALSE) | ||||||
|     myMap <- myMap[ , c(1,3)] |     myMap <- myMap[ , c(1,3)] | ||||||
| @@ -148,7 +149,7 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { | |||||||
|     myMap <- data.frame() |     myMap <- data.frame() | ||||||
|     warning(paste("No uniProt ID mapping returned:", |     warning(paste("No uniProt ID mapping returned:", | ||||||
|                   "server sent status", |                   "server sent status", | ||||||
|                   status_code(response))) |                   httr::status_code(response))) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   return(myMap) |   return(myMap) | ||||||
| @@ -168,7 +169,8 @@ myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165") | |||||||
| # Nomenclature commission. How do we map one set of identifiers to another one? | # Nomenclature commission. How do we map one set of identifiers to another one? | ||||||
|  |  | ||||||
| # The function to use is match(). | # The function to use is match(). | ||||||
| # Here is a tiny set of identifiers taken from a much larger table to illustrate the principle: | # Here is a tiny set of identifiers taken from a much larger table to | ||||||
|  | # illustrate the principle: | ||||||
| # | # | ||||||
|  |  | ||||||
| myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747", | myIDs <- data.frame(uID =   c("P38903", "P31383", "P47177", "P47096", "Q07747", | ||||||
|   | |||||||
| @@ -3,12 +3,15 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-FUNC_Semantic_similarity unit. | #              R code accompanying the BIN-FUNC_Semantic_similarity unit. | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  11  12 | # Date:     2017  11  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout, | ||||||
|  | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0    New code. | #           1.0    New code. | ||||||
| # | # | ||||||
| # | # | ||||||
| @@ -27,59 +30,65 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                                Line | #TOC>   Section  Title                                                Line | ||||||
| #TOC> -------------------------------------------------------------- | #TOC> -------------------------------------------------------------------- | ||||||
| #TOC>   1        Preparations: Packages, AnnotationDB, Setup      39 | #TOC>   1        Preparations: Packages, AnnotationDB, Setup            42 | ||||||
| #TOC>   2        Fetch GO Annotations                             89 | #TOC>   2        Fetch GO Annotations                                   98 | ||||||
| #TOC>   3        Semantic Similarities                            98 | #TOC>   3        Semantic Similarities                                 107 | ||||||
| #TOC>   4        GO Term Enrichment in Gene Sets                 116 | #TOC>   4        GO Term Enrichment in Gene Sets                       125 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Preparations: Packages, AnnotationDB, Setup  ========================= | # =    1  Preparations: Packages, AnnotationDB, Setup  ========================= | ||||||
|  |  | ||||||
|  | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|  |   install.packages("BiocManager") | ||||||
|  | } | ||||||
|  |  | ||||||
| # GOSim is an R-package in the Bioconductor project. | # GOSim is an R-package in the Bioconductor project. | ||||||
| if (! require(GOSim, quietly=TRUE)) { | if (! requireNamespace("GOSim", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   BiocManager::install("GOSim") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
|   } |  | ||||||
|   biocLite("GOSim") |  | ||||||
|   library(GOSim) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = GOSim)       # basic information | #  library(help = GOSim)       # basic information | ||||||
| #  browseVignettes("GOSim")    # available vignettes | #  browseVignettes("GOSim")    # available vignettes | ||||||
| #  data(package = "GOSim")     # available datasets | #  data(package = "GOSim")     # available datasets | ||||||
|  |  | ||||||
|  | # GOSim makes extensive assumptions about loaded packages, and many base | ||||||
|  | # methods are masked. We will thus use library(GOSim) to load it | ||||||
|  | # in its entirety and with all packages it depends on. We will still use | ||||||
|  | # the <package>::<function>() syntax in the code below, but this now serves | ||||||
|  | # more of a didactic purpose, rather than actual syntax requirements. | ||||||
|  |  | ||||||
|  | library(GOSim) | ||||||
|  |  | ||||||
| # GOSim loads human annotations by default. We load yeast annotations instead... | # GOSim loads human annotations by default. We load yeast annotations instead... | ||||||
| if (!require(org.Sc.sgd.db, quietly=TRUE)) { | if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   BiocManager::install("org.Sc.sgd.db") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
| } | } | ||||||
|   biocLite("org.Sc.sgd.db") |  | ||||||
|  | # Bioconductor annotation packages won't work stably unless we actually load | ||||||
|  | # them: | ||||||
| library(org.Sc.sgd.db) | library(org.Sc.sgd.db) | ||||||
| } |  | ||||||
|  |  | ||||||
| # org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such | # org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such | ||||||
| # databases exist for all model organisms. It's a kind of a fancy data frame | # databases exist for all model organisms. It's a kind of a fancy data frame | ||||||
| # from which we can get annotations by rows (genes) with the keys() function ... | # from which we can get annotations by rows (genes) with the keys() function ... | ||||||
| keys(org.Sc.sgd.db)[1500:1510] | AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510] | ||||||
|  |  | ||||||
| # ... and the types of available annotations with the columns() function | # ... and the types of available annotations with the columns() function | ||||||
| columns(org.Sc.sgd.db) | AnnotationDbi::columns(org.Sc.sgd.db) | ||||||
|  |  | ||||||
| # Note that one of the columns is "GO" ... and we load that into the | # Note that one of the columns is "GO" ... and we load that into the | ||||||
| # datastructures used by GOSim: | # datastructures used by GOSim: | ||||||
|  |  | ||||||
| # Choose GOterms to use | # Choose GOterms to use | ||||||
| setEvidenceLevel(evidences="all", | GOSim::setEvidenceLevel(evidences = "all", | ||||||
|                         organism = org.Sc.sgdORGANISM, |                         organism = org.Sc.sgdORGANISM, | ||||||
|                         gomap = org.Sc.sgdGO) |                         gomap = org.Sc.sgdGO) | ||||||
|  |  | ||||||
| # Use Biological Process ontology | # Use Biological Process ontology | ||||||
| setOntology("BP", loadIC=FALSE) | GOSim::setOntology("BP", loadIC = FALSE) | ||||||
|  |  | ||||||
| # confirm that we loaded the correct ontology | # confirm that we loaded the correct ontology | ||||||
| head(get("gomap", envir = GOSimEnv)) | head(get("gomap", envir = GOSimEnv)) | ||||||
| @@ -92,7 +101,7 @@ head(get("gomap", envir=GOSimEnv)) | |||||||
| # All keys being used here are yeast systematic names. | # All keys being used here are yeast systematic names. | ||||||
|  |  | ||||||
| # Get one set of annotations | # Get one set of annotations | ||||||
| getGOInfo(c("YDL056W"))  # Mbp1 | GOSim::getGOInfo(c("YDL056W"))  # Mbp1 | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Semantic Similarities  =============================================== | # =    3  Semantic Similarities  =============================================== | ||||||
| @@ -105,30 +114,30 @@ getGOInfo(c("YDL056W"))  # Mbp1 | |||||||
| # in this package. | # in this package. | ||||||
|  |  | ||||||
|                                                          # Mbp1 and... |                                                          # Mbp1 and... | ||||||
| getGeneSim("YDL056W", "YLR182W", similarity = "OA")  # Swi6 - MCB complex | GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex | ||||||
| getGeneSim("YDL056W", "YER111C", similarity = "OA")  # Swi4 - collaborators | GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators | ||||||
| getGeneSim("YDL056W", "YBR160W", similarity = "OA")  # Cdc28 - mediator | GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator | ||||||
| getGeneSim("YDL056W", "YGR108W", similarity = "OA")  # Clb1 - antagonist | GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist | ||||||
| getGeneSim("YDL056W", "YLR079W", similarity = "OA")  # Sic1 - antagonist | GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist | ||||||
| getGeneSim("YDL056W", "YJL130C", similarity = "OA")  # Pgk1 - Gluconeogenesis | GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  GO Term Enrichment in Gene Sets  ===================================== | # =    4  GO Term Enrichment in Gene Sets  ===================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # Calculating GO term enrichment in gene sets is done with the topGO package. | # Calculating GO term enrichment in gene sets is done with the Bioconductor | ||||||
| if (! require(topGO, quietly=TRUE)) { | # topGO package. | ||||||
|   if (! exists("biocLite")) { | if (! requireNamespace("topGO", quietly = TRUE)) { | ||||||
|     source("https://bioconductor.org/biocLite.R") |   BiocManager::install("topGO") | ||||||
|   } |  | ||||||
|   biocLite("topGO") |  | ||||||
|   library(topGO) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = topGO)       # basic information | #  library(help = topGO)       # basic information | ||||||
| #  browseVignettes("topGO")    # available vignettes | #  browseVignettes("topGO")    # available vignettes | ||||||
| #  data(package = "topGO")     # available datasets | #  data(package = "topGO")     # available datasets | ||||||
|  |  | ||||||
|  | # Once again - assumptions are made by GOSim that require us to load the | ||||||
|  | # topGO package wholesale: | ||||||
|  | library(topGO) | ||||||
|  |  | ||||||
| # Let's define a gene set: GOterm enrichment for G1/S switch activators: | # Let's define a gene set: GOterm enrichment for G1/S switch activators: | ||||||
| mySet <- c("YFR028C", # Cdc14 | mySet <- c("YFR028C", # Cdc14 | ||||||
| @@ -141,7 +150,7 @@ mySet <- c("YFR028C", # Cdc14 | |||||||
|            "YPL256C", # Cln2 |            "YPL256C", # Cln2 | ||||||
|            "YAL040C") # Cln3 |            "YAL040C") # Cln3 | ||||||
|  |  | ||||||
| allGenes <- keys(org.Sc.sgd.db) | allGenes <- AnnotationDbi::keys(org.Sc.sgd.db) | ||||||
| allGenes <- allGenes[grep("^Y", allGenes)]  # This is the context against which | allGenes <- allGenes[grep("^Y", allGenes)]  # This is the context against which | ||||||
|                                             # we define enrichment |                                             # we define enrichment | ||||||
|  |  | ||||||
| @@ -164,7 +173,7 @@ setdiff(fullSet, mySet)   # These are annotated to that term but not in mySet. | |||||||
|  |  | ||||||
| # What are these genes? | # What are these genes? | ||||||
| # Select annotations from the annotation database: | # Select annotations from the annotation database: | ||||||
| select(org.Sc.sgd.db, | AnnotationDbi::select(org.Sc.sgd.db, | ||||||
|                       keys = setdiff(fullSet, mySet), |                       keys = setdiff(fullSet, mySet), | ||||||
|                       columns = c("COMMON", "DESCRIPTION")) |                       columns = c("COMMON", "DESCRIPTION")) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,12 +3,15 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-PHYLO-Data_preparation unit. | #              R code accompanying the BIN-PHYLO-Data_preparation unit. | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10  31 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout, | ||||||
|  | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0    First 2017 version | #           1.0    First 2017 version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| @@ -28,12 +31,12 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                     Line | #TOC>   Section  Title                                     Line | ||||||
| #TOC> --------------------------------------------------- | #TOC> --------------------------------------------------------- | ||||||
| #TOC>   1        Preparations                          41 | #TOC>   1        Preparations                                44 | ||||||
| #TOC>   2        Fetching sequences                    78 | #TOC>   2        Fetching sequences                          76 | ||||||
| #TOC>   3        Multiple Sequence Alignment          119 | #TOC>   3        Multiple Sequence Alignment                117 | ||||||
| #TOC>   4        Reviewing and Editing Alignments     138 | #TOC>   4        Reviewing and Editing Alignments           136 | ||||||
| #TOC>   4.1      Masking workflow                     154 | #TOC>   4.1        Masking workflow                         152 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -49,12 +52,11 @@ source("makeProteinDB.R") | |||||||
|  |  | ||||||
| # Load packages we need | # Load packages we need | ||||||
|  |  | ||||||
| if (! require(Biostrings, quietly=TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   install.packages("BiocManager") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
| } | } | ||||||
|   biocLite("Biostrings") | if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||||
|   library(Biostrings) |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| @@ -62,12 +64,8 @@ if (! require(Biostrings, quietly=TRUE)) { | |||||||
| #  data(package = "Biostrings")     # available datasets | #  data(package = "Biostrings")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! require(msa, quietly=TRUE)) { | if (! requireNamespace("msa", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   BiocManager::install("msa") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
|   } |  | ||||||
|   biocLite("msa") |  | ||||||
|   library(msa) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = msa)       # basic information | #  library(help = msa)       # basic information | ||||||
| @@ -123,8 +121,8 @@ tail(APSI) | |||||||
| # the MSA algorithms in Biostrings. | # the MSA algorithms in Biostrings. | ||||||
| # | # | ||||||
|  |  | ||||||
| APSESSet <- AAStringSet(APSI) | APSESSet <- Biostrings::AAStringSet(APSI) | ||||||
| APSESMsa <- msaMuscle(APSESSet, order = "aligned") | APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned") | ||||||
|  |  | ||||||
| # Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If | # Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If | ||||||
| # that happens in your case, just use msaClustalOmega() instead. | # that happens in your case, just use msaClustalOmega() instead. | ||||||
|   | |||||||
| @@ -3,12 +3,15 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-PHYLO-Tree_analysis unit. | #              R code accompanying the BIN-PHYLO-Tree_analysis unit. | ||||||
| # | # | ||||||
| # Version:  1.0.2 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10  31 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout, | ||||||
|  | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0.2  Typo in variable name, style changes | #           1.0.2  Typo in variable name, style changes | ||||||
| #           1.0.1  Wrong section heading | #           1.0.1  Wrong section heading | ||||||
| #           1.0    First 2017 version | #           1.0    First 2017 version | ||||||
| @@ -31,11 +34,11 @@ | |||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                              Line | #TOC>   Section  Title                              Line | ||||||
| #TOC> -------------------------------------------------- | #TOC> -------------------------------------------------- | ||||||
| #TOC>   1        Preparation and Tree Plot            43 | #TOC>   1        Preparation and Tree Plot            46 | ||||||
| #TOC>   2        Tree Analysis                        82 | #TOC>   2        Tree Analysis                        86 | ||||||
| #TOC>   2.1        Rooting Trees                     141 | #TOC>   2.1        Rooting Trees                     145 | ||||||
| #TOC>   2.2        Rotating Clades                   187 | #TOC>   2.2        Rotating Clades                   190 | ||||||
| #TOC>   2.3        Computing tree distances          234 | #TOC>   2.3        Computing tree distances          241 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -43,19 +46,17 @@ | |||||||
| # =    1  Preparation and Tree Plot  =========================================== | # =    1  Preparation and Tree Plot  =========================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| if (!require(Rphylip, quietly=TRUE)) { | if (! requireNamespace("ape", quietly = TRUE)) { | ||||||
|   install.packages("Rphylip") |   install.packages("ape") | ||||||
|   library(Rphylip) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Rphylip)       # basic information | #  library(help = ape)       # basic information | ||||||
| #  browseVignettes("Rphylip")    # available vignettes | #  browseVignettes("ape")    # available vignettes | ||||||
| #  data(package = "Rphylip")     # available datasets | #  data(package = "ape")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Read the species tree that you have created at the phyloT Website: | # Read the species tree that you have created at the phyloT Website: | ||||||
| fungiTree <- read.tree("fungiTree.txt") | fungiTree <- ape::read.tree("fungiTree.txt") | ||||||
|  |  | ||||||
| plot(fungiTree) | plot(fungiTree) | ||||||
|  |  | ||||||
| @@ -73,7 +74,10 @@ for (i in seq_along(fungiTree$tip.label)) { | |||||||
|  |  | ||||||
| # Plot the tree | # Plot the tree | ||||||
| plot(fungiTree, cex = 1.0, root.edge = TRUE, no.margin = TRUE) | plot(fungiTree, cex = 1.0, root.edge = TRUE, no.margin = TRUE) | ||||||
| nodelabels(text = fungiTree$node.label, cex = 0.6, adj = 0.2, bg = "#D4F2DA") | ape::nodelabels(text = fungiTree$node.label, | ||||||
|  |                 cex = 0.6, | ||||||
|  |                 adj = 0.2, | ||||||
|  |                 bg = "#D4F2DA") | ||||||
| # Note that you can use the arrow buttons in the menu above the plot to scroll | # Note that you can use the arrow buttons in the menu above the plot to scroll | ||||||
| # back to plots you have created earlier - so you can reference back to the | # back to plots you have created earlier - so you can reference back to the | ||||||
| # species tree. | # species tree. | ||||||
| @@ -91,10 +95,10 @@ nodelabels(text = fungiTree$node.label, cex = 0.6, adj = 0.2, bg = "#D4F2DA") | |||||||
| # trees in Newick format and visualize them elsewhere. | # trees in Newick format and visualize them elsewhere. | ||||||
|  |  | ||||||
| # The "phylo" class object is one of R's "S3" objects and methods to plot and | # The "phylo" class object is one of R's "S3" objects and methods to plot and | ||||||
| # print it have been defined with the Rphylip package, and the package ape that | # print it have been defined with the Rphylip package, and in ape. You can | ||||||
| # Rphylip has loaded. You can simply call plot(<your-tree>) and R knows what to | # simply call plot(<your-tree>) and R knows what to do with <your-tree> and how | ||||||
| # do with <your-tree> and how to plot it. The underlying function is | # to plot it. The underlying function is plot.phylo(), and documentation for its | ||||||
| # plot.phylo(), and documentation for its many options can be found by typing: | # many options can be found by typing: | ||||||
|  |  | ||||||
| ?plot.phylo | ?plot.phylo | ||||||
|  |  | ||||||
| @@ -125,40 +129,39 @@ apsTree$edge.length | |||||||
|  |  | ||||||
| # show the node / edge and tip labels on a plot | # show the node / edge and tip labels on a plot | ||||||
| plot(apsTree) | plot(apsTree) | ||||||
| nodelabels() | ape::nodelabels() | ||||||
| edgelabels() | ape::edgelabels() | ||||||
| tiplabels() | ape::tiplabels() | ||||||
|  |  | ||||||
| # show the number of nodes, edges and tips | # show the number of nodes, edges and tips | ||||||
| Nnode(apsTree) | ape::Nnode(apsTree) | ||||||
| Nedge(apsTree) | ape::Nedge(apsTree) | ||||||
| Ntip(apsTree) | ape::Ntip(apsTree) | ||||||
|  |  | ||||||
|  |  | ||||||
| # Finally, write the tree to console in Newick format | # Finally, write the tree to console in Newick format | ||||||
| write.tree(apsTree) | ape::write.tree(apsTree) | ||||||
|  |  | ||||||
| # ==   2.1  Rooting Trees  ===================================================== | # ==   2.1  Rooting Trees  ===================================================== | ||||||
|  |  | ||||||
| # In order to analyse the tree, it is helpful to root it first and reorder its | # In order to analyse the tree, it is helpful to root it first and reorder its | ||||||
| # clades. Contrary to documentation, Rproml() returns an unrooted tree. | # clades. Contrary to documentation, Rproml() returns an unrooted tree. | ||||||
|  |  | ||||||
| is.rooted(apsTree) | ape::is.rooted(apsTree) | ||||||
|  |  | ||||||
| # You can root the tree with the command root() from the "ape" package. ape is | # You can root the tree with the command root() from the "ape" package. | ||||||
| # automatically installed and loaded with Rphylip. |  | ||||||
|  |  | ||||||
| plot(apsTree) | plot(apsTree) | ||||||
|  |  | ||||||
| # add labels for internal nodes and tips | # add labels for internal nodes and tips | ||||||
| nodelabels(cex = 0.5, frame = "circle") | ape::nodelabels(cex = 0.5, frame = "circle") | ||||||
| tiplabels(cex = 0.5, frame = "rect") | ape::tiplabels(cex = 0.5, frame = "rect") | ||||||
|  |  | ||||||
| # The outgroup of the tree is tip "11" in my sample tree, it may be a different | # The outgroup of the tree is tip "11" in my sample tree, it may be a different | ||||||
| # number in yours. Substitute the correct node number below for "outgroup". | # number in yours. Substitute the correct node number below for "outgroup". | ||||||
| apsTree <- root(apsTree, outgroup = 11, resolve.root = TRUE) | apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE) | ||||||
| plot(apsTree) | plot(apsTree) | ||||||
| is.rooted(apsTree) | ape::is.rooted(apsTree) | ||||||
|  |  | ||||||
| # This tree _looks_ unchanged, because when the root trifurcation was resolved, | # This tree _looks_ unchanged, because when the root trifurcation was resolved, | ||||||
| # an edge of length zero was added to connect the MRCA (Most Recent Common | # an edge of length zero was added to connect the MRCA (Most Recent Common | ||||||
| @@ -172,7 +175,7 @@ apsTree$edge.length | |||||||
| # overlap. | # overlap. | ||||||
| apsTree$edge.length[1] <- 0.1 | apsTree$edge.length[1] <- 0.1 | ||||||
| plot(apsTree, cex = 0.7) | plot(apsTree, cex = 0.7) | ||||||
| nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866") | ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866") | ||||||
|  |  | ||||||
|  |  | ||||||
| # This procedure does however not assign an actual length to a root edge, and | # This procedure does however not assign an actual length to a root edge, and | ||||||
| @@ -181,7 +184,7 @@ nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866") | |||||||
|  |  | ||||||
| apsTree$root.edge <- mean(apsTree$edge.length) * 1.5 | apsTree$root.edge <- mean(apsTree$edge.length) * 1.5 | ||||||
| plot(apsTree, cex = 0.7, root.edge = TRUE) | plot(apsTree, cex = 0.7, root.edge = TRUE) | ||||||
| nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") | ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   2.2  Rotating Clades  =================================================== | # ==   2.2  Rotating Clades  =================================================== | ||||||
| @@ -192,9 +195,9 @@ nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866") | |||||||
| # We can either rotate around individual internal nodes ... | # We can either rotate around individual internal nodes ... | ||||||
| layout(matrix(1:2, 1, 2)) | layout(matrix(1:2, 1, 2)) | ||||||
| plot(apsTree, no.margin = TRUE, root.edge = TRUE) | plot(apsTree, no.margin = TRUE, root.edge = TRUE) | ||||||
| nodelabels(node = 17, cex = 0.7, bg = "#ff8866") | ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866") | ||||||
| plot(rotate(apsTree, node = 17), no.margin = TRUE, root.edge = TRUE) | plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE) | ||||||
| nodelabels(node = 17, cex = 0.7, bg = "#88ff66") | ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66") | ||||||
| # Note that the species at the bottom of the clade descending from node | # Note that the species at the bottom of the clade descending from node | ||||||
| # 17 is now plotted at the top. | # 17 is now plotted at the top. | ||||||
| layout(matrix(1), widths = 1.0, heights = 1.0) | layout(matrix(1), widths = 1.0, heights = 1.0) | ||||||
| @@ -211,11 +214,15 @@ nOrg <- length(apsTree$tip.label) | |||||||
| layout(matrix(1:2, 1, 2)) | layout(matrix(1:2, 1, 2)) | ||||||
| plot(fungiTree, | plot(fungiTree, | ||||||
|      no.margin = TRUE, root.edge = TRUE) |      no.margin = TRUE, root.edge = TRUE) | ||||||
| nodelabels(text = fungiTree$node.label, cex = 0.5, adj = 0.2, bg = "#D4F2DA") | ape::nodelabels(text = fungiTree$node.label, | ||||||
|  |                 cex = 0.5, | ||||||
|  |                 adj = 0.2, | ||||||
|  |                 bg = "#D4F2DA") | ||||||
|  |  | ||||||
| plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]), | plot(ape::rotateConstr(apsTree, apsTree$tip.label[nOrg:1]), | ||||||
|      no.margin = TRUE, root.edge = TRUE) |      no.margin = TRUE, | ||||||
| add.scale.bar(length = 0.5) |      root.edge = TRUE) | ||||||
|  | ape::add.scale.bar(length = 0.5) | ||||||
| layout(matrix(1), widths = 1.0, heights = 1.0) | layout(matrix(1), widths = 1.0, heights = 1.0) | ||||||
|  |  | ||||||
| # Task: Study the two trees and consider their similarities and differences. | # Task: Study the two trees and consider their similarities and differences. | ||||||
| @@ -236,9 +243,8 @@ layout(matrix(1), widths = 1.0, heights = 1.0) | |||||||
|  |  | ||||||
| # Many superb phylogeny tools are contributed by the phangorn package. | # Many superb phylogeny tools are contributed by the phangorn package. | ||||||
|  |  | ||||||
| if (!require(phangorn, quietly=TRUE)) { | if (! requireNamespace("phangorn", quietly = TRUE)) { | ||||||
|   install.packages("phangorn") |   install.packages("phangorn") | ||||||
|   library(phangorn) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = phangorn)       # basic information | #  library(help = phangorn)       # basic information | ||||||
| @@ -253,14 +259,14 @@ apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label) | |||||||
| # phangorn provides several functions to compute tree-differences (and there | # phangorn provides several functions to compute tree-differences (and there | ||||||
| # is a _whole_ lot of theory on how to compare trees). treedist() returns the | # is a _whole_ lot of theory on how to compare trees). treedist() returns the | ||||||
| # "symmetric difference" | # "symmetric difference" | ||||||
| treedist(fungiTree, apsTree2, check.labels = TRUE) | phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE) | ||||||
|  |  | ||||||
| # Numbers. What do they mean? How much more similar is our apsTree to the | # Numbers. What do they mean? How much more similar is our apsTree to the | ||||||
| # (presumably) ground truth of fungiTree than a random tree would be? | # (presumably) ground truth of fungiTree than a random tree would be? | ||||||
| # The ape package (which was loaded with RPhylip) provides the function rtree() | # The ape package provides the function rtree() | ||||||
| # to compute random trees. | # to compute random trees. | ||||||
|  |  | ||||||
| rtree(n = length(apsTree2$tip.label),  # number of tips | ape::rtree(n = length(apsTree2$tip.label), # number of tips | ||||||
|           rooted = TRUE,                   # we rooted the tree above, |           rooted = TRUE,                   # we rooted the tree above, | ||||||
|                                            #  and fungiTree is rooted anyway |                                            #  and fungiTree is rooted anyway | ||||||
|           tip.label = apsTree2$tip.label,  # use the apsTree2 labels |           tip.label = apsTree2$tip.label,  # use the apsTree2 labels | ||||||
| @@ -278,17 +284,17 @@ colnames(myTreeDistances) <- c("symm", "path") | |||||||
|  |  | ||||||
| set.seed(112358) | set.seed(112358) | ||||||
| for (i in 1:N) { | for (i in 1:N) { | ||||||
|   xTree <- rtree(n = length(apsTree2$tip.label), |   xTree <- ape::rtree(n = length(apsTree2$tip.label), | ||||||
|                       rooted = TRUE, |                       rooted = TRUE, | ||||||
|                       tip.label = apsTree2$tip.label, |                       tip.label = apsTree2$tip.label, | ||||||
|                       br = NULL) |                       br = NULL) | ||||||
|   myTreeDistances[i, ] <- treedist(fungiTree, xTree) |   myTreeDistances[i, ] <- phangorn::treedist(fungiTree, xTree) | ||||||
| } | } | ||||||
| set.seed(NULL)                      # reset the random number generator | set.seed(NULL)                      # reset the random number generator | ||||||
|  |  | ||||||
| table(myTreeDistances[, "symm"]) | table(myTreeDistances[, "symm"]) | ||||||
|  |  | ||||||
| (symmObs <- treedist(fungiTree, apsTree2)[1]) | (symmObs <- phangorn::treedist(fungiTree, apsTree2)[1]) | ||||||
|  |  | ||||||
| # Random events less-or-equal to observation, divided by total number of | # Random events less-or-equal to observation, divided by total number of | ||||||
| # events gives us the empirical p-value. | # events gives us the empirical p-value. | ||||||
| @@ -298,7 +304,7 @@ cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n | |||||||
| hist(myTreeDistances[, "path"], | hist(myTreeDistances[, "path"], | ||||||
|      col = "aliceblue", |      col = "aliceblue", | ||||||
|      main = "Distances of random Trees to fungiTree") |      main = "Distances of random Trees to fungiTree") | ||||||
| (pathObs <- treedist(fungiTree, apsTree2)[2]) | (pathObs <- phangorn::treedist(fungiTree, apsTree2)[2]) | ||||||
| abline(v = pathObs, col = "chartreuse") | abline(v = pathObs, col = "chartreuse") | ||||||
|  |  | ||||||
| # Random events less-or-equal to observation, divided by total number of | # Random events less-or-equal to observation, divided by total number of | ||||||
|   | |||||||
| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-PHYLO-Tree_building unit. | #              R code accompanying the BIN-PHYLO-Tree_building unit. | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10.  31 | # Date:     2017  10.  31 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0    First 2017 version | #           1.0    First 2017 version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| @@ -29,14 +31,14 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                       Line | #TOC>   Section  Title                                       Line | ||||||
| #TOC> ------------------------------------------------------- | #TOC> ----------------------------------------------------------- | ||||||
| #TOC>   1        Calculating Trees                         43 | #TOC>   1        Calculating Trees                             46 | ||||||
| #TOC>   1.1      PROMLPATH ...                             64 | #TOC>   1.1        PROMLPATH ...                               66 | ||||||
| #TOC>   1.1.1    ... on the Mac                            69 | #TOC>   1.1.1          ... on the Mac                          71 | ||||||
| #TOC>   1.1.2    ... on Windows                            80 | #TOC>   1.1.2          ... on Windows                          82 | ||||||
| #TOC>   1.1.3    ... on Linux                              94 | #TOC>   1.1.3          ... on Linux                            96 | ||||||
| #TOC>   1.1.4    Confirming PROMLPATH                      99 | #TOC>   1.1.4          Confirming PROMLPATH                   101 | ||||||
| #TOC>   1.2      Building a maximum likelihood tree       108 | #TOC>   1.2        Building a maximum likelihood tree         110 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -50,9 +52,8 @@ | |||||||
| # After you have installed Phylip on your computer, install the R package that | # After you have installed Phylip on your computer, install the R package that | ||||||
| # provides an interface to the Phylip functions. | # provides an interface to the Phylip functions. | ||||||
|  |  | ||||||
| if (!require(Rphylip, quietly=TRUE)) { | if (! requireNamespace("Rphylip", quietly = TRUE)) { | ||||||
|   install.packages("Rphylip") |   install.packages("Rphylip") | ||||||
|   library(Rphylip) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Rphylip)       # basic information | #  library(help = Rphylip)       # basic information | ||||||
| @@ -110,7 +111,7 @@ list.files(PROMLPATH)    # lists the files [1] "proml"   "proml.command" | |||||||
| # Now read the mfa file you have saved in the BIN-PHYLO-Data_preparation unit, | # Now read the mfa file you have saved in the BIN-PHYLO-Data_preparation unit, | ||||||
| # as a "proseq" object with the read.protein() function of the Rphylip package: | # as a "proseq" object with the read.protein() function of the Rphylip package: | ||||||
|  |  | ||||||
| apsIn <- read.protein("APSESphyloSet.mfa") | apsIn <- Rphylip::read.protein("APSESphyloSet.mfa") | ||||||
|  |  | ||||||
| # ... and you are ready to build a tree. | # ... and you are ready to build a tree. | ||||||
|  |  | ||||||
| @@ -125,7 +126,7 @@ apsIn <- read.protein("APSESphyloSet.mfa") | |||||||
| # process will take us about 5 to 10 minutes. Run this, and enjoy a good cup | # process will take us about 5 to 10 minutes. Run this, and enjoy a good cup | ||||||
| # of coffee while you are waiting. | # of coffee while you are waiting. | ||||||
|  |  | ||||||
| apsTree <- Rproml(apsIn, path=PROMLPATH) | apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH) | ||||||
|  |  | ||||||
| # A quick first look: | # A quick first look: | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,12 +3,15 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-PPI-Analysis unit. | #              R code accompanying the BIN-PPI-Analysis unit. | ||||||
| # | # | ||||||
| # Version:   1.0 | # Version:   1.1 | ||||||
| # | # | ||||||
| # Date:     2017  08  - 2017 11 | # Date:     2017  08  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout, | ||||||
|  | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0    First live version | #           1.0    First live version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| @@ -29,13 +32,13 @@ | |||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                           Line | #TOC>   Section  Title                                           Line | ||||||
| #TOC> --------------------------------------------------------------- | #TOC> --------------------------------------------------------------- | ||||||
| #TOC>   1        Setup and data                                    43 | #TOC>   1        Setup and data                                    46 | ||||||
| #TOC>   2        Functional Edges in the Human Proteome            80 | #TOC>   2        Functional Edges in the Human Proteome            82 | ||||||
| #TOC>   2.1        Cliques                                        123 | #TOC>   2.1        Cliques                                        125 | ||||||
| #TOC>   2.2        Communities                                    164 | #TOC>   2.2        Communities                                    166 | ||||||
| #TOC>   2.3        Betweenness Centrality                         178 | #TOC>   2.3        Betweenness Centrality                         180 | ||||||
| #TOC>   3        biomaRt                                          224 | #TOC>   3        biomaRt                                          226 | ||||||
| #TOC>   4        Task for submission                              295 | #TOC>   4        Task for submission                              296 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -45,9 +48,8 @@ | |||||||
|  |  | ||||||
| # Not surprisingly, the analysis of PPI networks needs igraph: | # Not surprisingly, the analysis of PPI networks needs igraph: | ||||||
|  |  | ||||||
| if (!require(igraph, quietly=TRUE)) { | if (! requireNamespace("igraph", quietly = TRUE)) { | ||||||
|   install.packages("igraph") |   install.packages("igraph") | ||||||
|   library(igraph) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = igraph)       # basic information | #  library(help = igraph)       # basic information | ||||||
| @@ -88,9 +90,9 @@ head(STRINGedges) | |||||||
|  |  | ||||||
|  |  | ||||||
| # Make a graph from this dataframe | # Make a graph from this dataframe | ||||||
| ?graph_from_data_frame | ?igraph::graph_from_data_frame | ||||||
|  |  | ||||||
| gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE) | gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE) | ||||||
|  |  | ||||||
| # CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges - | # CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges - | ||||||
| # layout of such large graphs is possible, but requires specialized code. Google | # layout of such large graphs is possible, but requires specialized code. Google | ||||||
| @@ -99,13 +101,13 @@ gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE) | |||||||
|  |  | ||||||
| # Of course simple computations on this graph are reasonably fast: | # Of course simple computations on this graph are reasonably fast: | ||||||
|  |  | ||||||
| compSTR <- components(gSTR) | compSTR <- igraph::components(gSTR) | ||||||
| summary(compSTR) # our graph is fully connected! | summary(compSTR) # our graph is fully connected! | ||||||
|  |  | ||||||
| hist(log(degree(gSTR)), col="#FEE0AF") | hist(log(igraph::degree(gSTR)), col="#FEE0AF") | ||||||
| # this actually does look rather scale-free | # this actually does look rather scale-free | ||||||
|  |  | ||||||
| (freqRank <- table(degree(gSTR))) | (freqRank <- table(igraph::degree(gSTR))) | ||||||
| plot(log10(as.numeric(names(freqRank)) + 1), | plot(log10(as.numeric(names(freqRank)) + 1), | ||||||
|      log10(as.numeric(freqRank)), type = "b", |      log10(as.numeric(freqRank)), type = "b", | ||||||
|      pch = 21, bg = "#FEE0AF", |      pch = 21, bg = "#FEE0AF", | ||||||
| @@ -126,29 +128,29 @@ abline(regressionLine, col = "firebrick") | |||||||
| # subgraph, i.e. a subgraph in which every node is connected to every other. | # subgraph, i.e. a subgraph in which every node is connected to every other. | ||||||
| # Biological complexes often appear as cliques in interaction graphs. | # Biological complexes often appear as cliques in interaction graphs. | ||||||
|  |  | ||||||
| clique_num(gSTR) | igraph::clique_num(gSTR) | ||||||
| # The largest clique has 63 members. | # The largest clique has 63 members. | ||||||
|  |  | ||||||
| (C <- largest_cliques(gSTR)[[1]]) | (C <- igraph::largest_cliques(gSTR)[[1]]) | ||||||
|  |  | ||||||
| # Pick one of the proteins and find out what this fully connected cluster of 63 | # Pick one of the proteins and find out what this fully connected cluster of 63 | ||||||
| # proteins is (you can simply Google for any of the IDs). Is this expected? | # proteins is (you can simply Google for any of the IDs). Is this expected? | ||||||
|  |  | ||||||
| # Plot this ... | # Plot this ... | ||||||
| R <- induced_subgraph(gSTR, C) # makes a graph from a selected set of vertices | R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices | ||||||
|  |  | ||||||
| # color the vertices along a color spectrum | # color the vertices along a color spectrum | ||||||
| vCol <- rainbow(gorder(R)) # gorder(): order of a graph = number of nodes | vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes | ||||||
|  |  | ||||||
| # color the edges to have the same color as the originating node | # color the edges to have the same color as the originating node | ||||||
| eCol <- character() | eCol <- character() | ||||||
| for (i in seq_along(vCol)) { | for (i in seq_along(vCol)) { | ||||||
|   eCol <- c(eCol, rep(vCol[i], gorder(R))) |   eCol <- c(eCol, rep(vCol[i], igraph::gorder(R))) | ||||||
| } | } | ||||||
|  |  | ||||||
| oPar <- par(mar= rep(0,4)) # Turn margins off | oPar <- par(mar= rep(0,4)) # Turn margins off | ||||||
| plot(R, | plot(R, | ||||||
|      layout = layout_in_circle(R), |      layout = igraph::layout_in_circle(R), | ||||||
|      vertex.size = 3, |      vertex.size = 3, | ||||||
|      vertex.color = vCol, |      vertex.color = vCol, | ||||||
|      edge.color = eCol, |      edge.color = eCol, | ||||||
| @@ -164,14 +166,14 @@ par(oPar) | |||||||
| # ==   2.2  Communities  ======================================================= | # ==   2.2  Communities  ======================================================= | ||||||
|  |  | ||||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | set.seed(112358)                       # set RNG seed for repeatable randomness | ||||||
| gSTRclusters <- cluster_infomap(gSTR) | gSTRclusters <- igraph::cluster_infomap(gSTR) | ||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
| modularity(gSTRclusters) # ... measures how separated the different membership | igraph::modularity(gSTRclusters) # ... measures how separated the different | ||||||
|                          # types are from each other |                                  # membership types are from each other | ||||||
| tMem <- table(membership(gSTRclusters)) | tMem <- table(igraph::membership(gSTRclusters)) | ||||||
| length(tMem)  # More than 2000 communities identified | length(tMem)  # More than 2000 communities identified | ||||||
| hist(tMem, breaks = 50)  # most clusters are small ... | hist(tMem, breaks = 50, col = "skyblue")  # most clusters are small ... | ||||||
| range(tMem) # ... but one has > 100 members | range(tMem) # ... but one has > 100 members | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -179,14 +181,14 @@ range(tMem) # ... but one has > 100 members | |||||||
|  |  | ||||||
| # Let's find the nodes with the ten highest betweenness centralities. | # Let's find the nodes with the ten highest betweenness centralities. | ||||||
| # | # | ||||||
| BC <- centr_betw(gSTR) | BC <- igraph::centr_betw(gSTR) | ||||||
|  |  | ||||||
| # remember: BC$res contains the results | # remember: BC$res contains the results | ||||||
| head(BC$res) | head(BC$res) | ||||||
|  |  | ||||||
| BC$res[1]   # betweenness centrality of node 1 in the graph ... | BC$res[1]   # betweenness centrality of node 1 in the graph ... | ||||||
| # ... which one is node 1? | # ... which one is node 1? | ||||||
| V(gSTR)[1] | igraph::V(gSTR)[1] | ||||||
|  |  | ||||||
| # to get the ten-highest nodes, we simply label the elements of BC with their | # to get the ten-highest nodes, we simply label the elements of BC with their | ||||||
| # index ... | # index ... | ||||||
| @@ -203,7 +205,7 @@ head(sBC) | |||||||
|  |  | ||||||
| # We can use the first ten labels to subset the nodes in gSTR and fetch the | # We can use the first ten labels to subset the nodes in gSTR and fetch the | ||||||
| # IDs... | # IDs... | ||||||
| (ENSPsel <- names(V(gSTR)[BCsel])) | (ENSPsel <- names(igraph::V(gSTR)[BCsel])) | ||||||
|  |  | ||||||
| # We are going to use these IDs to produce some output for a submitted task: | # We are going to use these IDs to produce some output for a submitted task: | ||||||
| # so I need you to personalize ENSPsel with the following | # so I need you to personalize ENSPsel with the following | ||||||
| @@ -231,12 +233,11 @@ set.seed(NULL)                      # reset the random number generator | |||||||
| # day), simply a few lines of sample code to get you started on the specific use | # day), simply a few lines of sample code to get you started on the specific use | ||||||
| # case of retrieving descriptions for ensembl protein IDs. | # case of retrieving descriptions for ensembl protein IDs. | ||||||
|  |  | ||||||
| if (!require(biomaRt, quietly=TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   install.packages("BiocManager") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
| } | } | ||||||
|   biocLite("biomaRt") | if (! requireNamespace("biomaRt", quietly = TRUE)) { | ||||||
|   library(biomaRt) |   BiocManager::install("biomaRt") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = biomaRt)       # basic information | #  library(help = biomaRt)       # basic information | ||||||
| @@ -244,14 +245,14 @@ if (!require(biomaRt, quietly=TRUE)) { | |||||||
| #  data(package = "biomaRt")     # available datasets | #  data(package = "biomaRt")     # available datasets | ||||||
|  |  | ||||||
| # define which dataset to use ... | # define which dataset to use ... | ||||||
| myMart <- useMart("ensembl", dataset="hsapiens_gene_ensembl") | myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl") | ||||||
|  |  | ||||||
| # what filters are defined? | # what filters are defined? | ||||||
| (filters <- listFilters(myMart)) | (filters <- biomaRt::listFilters(myMart)) | ||||||
|  |  | ||||||
|  |  | ||||||
| # and what attributes can we filter for? | # and what attributes can we filter for? | ||||||
| (attributes <- listAttributes(myMart)) | (attributes <- biomaRt::listAttributes(myMart)) | ||||||
|  |  | ||||||
|  |  | ||||||
| # Soooo many options - let's look for the correct name of filters that are | # Soooo many options - let's look for the correct name of filters that are | ||||||
| @@ -264,7 +265,7 @@ attributes[grep("description", attributes$description, ignore.case=TRUE), ] | |||||||
|  |  | ||||||
|  |  | ||||||
| # ... so we can put this together: here is a syntax example: | # ... so we can put this together: here is a syntax example: | ||||||
| getBM(filters = "ensembl_peptide_id", | biomaRt::getBM(filters = "ensembl_peptide_id", | ||||||
|                attributes = c("hgnc_symbol", |                attributes = c("hgnc_symbol", | ||||||
|                               "wikigene_description", |                               "wikigene_description", | ||||||
|                               "interpro_description", |                               "interpro_description", | ||||||
| @@ -279,7 +280,7 @@ CPdefs <- list()  # Since we don't know how many matches one of our queries | |||||||
| # will return, we'll put the result dataframes into a list. | # will return, we'll put the result dataframes into a list. | ||||||
|  |  | ||||||
| for (ID in ENSPsel) { | for (ID in ENSPsel) { | ||||||
|   CPdefs[[ID]] <- getBM(filters = "ensembl_peptide_id", |   CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id", | ||||||
|                                  attributes = c("hgnc_symbol", |                                  attributes = c("hgnc_symbol", | ||||||
|                                                 "wikigene_description", |                                                 "wikigene_description", | ||||||
|                                                 "interpro_description", |                                                 "interpro_description", | ||||||
|   | |||||||
| @@ -3,13 +3,17 @@ | |||||||
| # Purpose: A Bioinformatics Course: | # Purpose: A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-SEQA-Comparison unit | #              R code accompanying the BIN-SEQA-Comparison unit | ||||||
| # | # | ||||||
| # Version: 1.0 | # Version: 1.1 | ||||||
| # | # | ||||||
| # Date:    2017  11  17 | # Date:    2017  11  -  2019  01 | ||||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # V 1.0    First live version 2017 | #           1.1    Change from require() to requireNamespace(), | ||||||
| # V 0.1    First code copied from BCH441_A03_makeYFOlist.R | #                      use <package>::<function>() idiom throughout, | ||||||
|  | #                      use BiocManager:: not biocLite() | ||||||
|  | # Versions: | ||||||
|  | #           1.0    First live version 2017 | ||||||
|  | #           0.1    First code copied from BCH441_A03_makeYFOlist.R | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| @@ -27,24 +31,23 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                      Line | #TOC>   Section  Title                                      Line | ||||||
| #TOC> ---------------------------------------------------- | #TOC> ---------------------------------------------------------- | ||||||
| #TOC>   1        Preparation                            41 | #TOC>   1        Preparation                                  47 | ||||||
| #TOC>   2        Aggregate properties                   63 | #TOC>   2        Aggregate properties                         68 | ||||||
| #TOC>   3        Sequence Composition Enrichment       106 | #TOC>   3        Sequence Composition Enrichment             111 | ||||||
| #TOC>   3.1      Barplot, and side-by-side barplot     129 | #TOC>   3.1        Barplot, and side-by-side barplot         134 | ||||||
| #TOC>   3.2      Plotting ratios                       164 | #TOC>   3.2        Plotting ratios                           169 | ||||||
| #TOC>   3.3      Plotting log ratios                   180 | #TOC>   3.3        Plotting log ratios                       185 | ||||||
| #TOC>   3.4      Sort by frequency                     195 | #TOC>   3.4        Sort by frequency                         200 | ||||||
| #TOC>   3.5      Color by amino acid type              210 | #TOC>   3.5        Color by amino acid type                  215 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    1  Preparation  ========================================================= | # =    1  Preparation  ========================================================= | ||||||
|  |  | ||||||
| if (!require(seqinr, quietly=TRUE)) { | if (! requireNamespace("seqinr", quietly = TRUE)) { | ||||||
|   install.packages("seqinr") |   install.packages("seqinr") | ||||||
|   library(seqinr) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = seqinr)       # basic information | #  library(help = seqinr)       # basic information | ||||||
| @@ -66,7 +69,7 @@ if (!require(seqinr, quietly=TRUE)) { | |||||||
|  |  | ||||||
|  |  | ||||||
| # Let's try a simple function from seqinr: computing the pI of the sequence | # Let's try a simple function from seqinr: computing the pI of the sequence | ||||||
| ?computePI | ?seqinr::computePI | ||||||
|  |  | ||||||
| # This takes as input a vector of upper-case AA codes | # This takes as input a vector of upper-case AA codes | ||||||
|  |  | ||||||
| @@ -82,12 +85,12 @@ s <- unlist(s)             # strsplit() returns a list! Why? | |||||||
| # the function s2c() to convert strings into | # the function s2c() to convert strings into | ||||||
| # character vectors (and c2s to convert them back). | # character vectors (and c2s to convert them back). | ||||||
|  |  | ||||||
| s2c(mySeq) | seqinr::s2c(mySeq) | ||||||
|  |  | ||||||
|  |  | ||||||
| computePI(s2c(mySeq))  # isoelectric point | seqinr::computePI(s2c(mySeq))  # isoelectric point | ||||||
| pmw(s2c(mySeq))        # molecular weight | seqinr::pmw(s2c(mySeq))        # molecular weight | ||||||
| AAstat(s2c(mySeq))     # This also plots the distribution of | seqinr::AAstat(s2c(mySeq))     # This also plots the distribution of | ||||||
|                                # values along the sequence |                                # values along the sequence | ||||||
|  |  | ||||||
| # A true Labor of Love has gone into the | # A true Labor of Love has gone into the | ||||||
| @@ -117,7 +120,7 @@ aaindex[[459]]$D | |||||||
| # with the amino acid counts in our sequence. | # with the amino acid counts in our sequence. | ||||||
|  |  | ||||||
| (refData <- aaindex[[459]]$I)                # reference frequencies in % | (refData <- aaindex[[459]]$I)                # reference frequencies in % | ||||||
| names(refData) <- a(names(refData))  # change names to single-letter | names(refData) <- seqinr::a(names(refData))  # change names to single-letter | ||||||
|                                              # code using seqinr's "a()" function |                                              # code using seqinr's "a()" function | ||||||
| sum(refData) | sum(refData) | ||||||
| refData        # ... in % | refData        # ... in % | ||||||
|   | |||||||
| @@ -3,12 +3,15 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the BIN-Sequence unit. | #              R code accompanying the BIN-Sequence unit. | ||||||
| # | # | ||||||
| # Version:  1.3 | # Version:  1.4 | ||||||
| # | # | ||||||
| # Date:     2017  09  - 2019  01 | # Date:     2017  09  - 2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.4    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout, | ||||||
|  | #                      use BiocManager:: not biocLite() | ||||||
| #           1.3    Update set.seed() usage | #           1.3    Update set.seed() usage | ||||||
| #           1.2    Removed irrelevant task. How did that even get in there? smh | #           1.2    Removed irrelevant task. How did that even get in there? smh | ||||||
| #           1.1    Add chartr() | #           1.1    Add chartr() | ||||||
| @@ -30,23 +33,23 @@ | |||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                Line | #TOC>   Section  Title                                Line | ||||||
| #TOC> ---------------------------------------------------- | #TOC> ---------------------------------------------------- | ||||||
| #TOC>   1        Prepare                                60 | #TOC>   1        Prepare                                63 | ||||||
| #TOC>   2        Storing Sequence                       78 | #TOC>   2        Storing Sequence                       80 | ||||||
| #TOC>   3        String properties                     107 | #TOC>   3        String properties                     109 | ||||||
| #TOC>   4        Substrings                            114 | #TOC>   4        Substrings                            116 | ||||||
| #TOC>   5        Creating strings: sprintf()           135 | #TOC>   5        Creating strings: sprintf()           137 | ||||||
| #TOC>   6        Changing strings                      170 | #TOC>   6        Changing strings                      172 | ||||||
| #TOC>   6.1.1          Changing case                   172 | #TOC>   6.1.1          Changing case                   174 | ||||||
| #TOC>   6.1.2          Reverse                         177 | #TOC>   6.1.2          Reverse                         179 | ||||||
| #TOC>   6.1.3          Change characters               181 | #TOC>   6.1.3          Change characters               183 | ||||||
| #TOC>   6.1.4          Substitute characters           209 | #TOC>   6.1.4          Substitute characters           211 | ||||||
| #TOC>   6.2        stringi and stringr                 229 | #TOC>   6.2        stringi and stringr                 231 | ||||||
| #TOC>   6.3        dbSanitizeSequence()                239 | #TOC>   6.3        dbSanitizeSequence()                241 | ||||||
| #TOC>   7        Permuting and sampling                251 | #TOC>   7        Permuting and sampling                253 | ||||||
| #TOC>   7.1        Permutations                        258 | #TOC>   7.1        Permutations                        260 | ||||||
| #TOC>   7.2        Sampling                            304 | #TOC>   7.2        Sampling                            306 | ||||||
| #TOC>   7.2.1          Equiprobable characters         306 | #TOC>   7.2.1          Equiprobable characters         308 | ||||||
| #TOC>   7.2.2          Defined probability vector      348 | #TOC>   7.2.2          Defined probability vector      350 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -62,12 +65,11 @@ | |||||||
| # Much basic sequence handling is supported by the Bioconductor package | # Much basic sequence handling is supported by the Bioconductor package | ||||||
| # Biostrings. | # Biostrings. | ||||||
|  |  | ||||||
| if (! require(Biostrings, quietly=TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   install.packages("BiocManager") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
| } | } | ||||||
|   biocLite("Biostrings") | if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||||
|   library(Biostrings) |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| @@ -86,7 +88,7 @@ if (! require(Biostrings, quietly=TRUE)) { | |||||||
|  |  | ||||||
| # ... or as more complex objects with rich metadata e.g. as a Biostrings | # ... or as more complex objects with rich metadata e.g. as a Biostrings | ||||||
| # DNAString, RNAString, AAString, etc. | # DNAString, RNAString, AAString, etc. | ||||||
| (a <- AAString("DIVMTQ")) | (a <- Biostrings::AAString("DIVMTQ")) | ||||||
|  |  | ||||||
| # ... and all of these representations can be interconverted: | # ... and all of these representations can be interconverted: | ||||||
|  |  | ||||||
| @@ -314,6 +316,7 @@ N <- 100 | |||||||
| set.seed(16818)                        # set RNG seed for repeatable randomness | set.seed(16818)                        # set RNG seed for repeatable randomness | ||||||
| v <- sample(nuc, N, replace = TRUE) | v <- sample(nuc, N, replace = TRUE) | ||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
| (mySeq <- paste(v, collapse = "")) | (mySeq <- paste(v, collapse = "")) | ||||||
|  |  | ||||||
| # What's the GC content? | # What's the GC content? | ||||||
| @@ -323,9 +326,8 @@ sum(table(v)[c("G", "C")]) # 51 is close to expected | |||||||
| # What's the number of CpG motifs? Easy to check with the stringi | # What's the number of CpG motifs? Easy to check with the stringi | ||||||
| # stri_match_all() function | # stri_match_all() function | ||||||
|  |  | ||||||
| if (! require(stringi, quietly=TRUE)) { | if (! requireNamespace("stringi", quietly = TRUE)) { | ||||||
|   install.packages("stringi") |   install.packages("stringi") | ||||||
|   library(stringi) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = stringi)       # basic information | #  library(help = stringi)       # basic information | ||||||
|   | |||||||
| @@ -29,27 +29,27 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                                   Line | #TOC>   Section  Title                                                   Line | ||||||
| #TOC> ----------------------------------------------------------------- | #TOC> ----------------------------------------------------------------------- | ||||||
| #TOC>   1        A Relational Datamodel in R: review                 62 | #TOC>   1        A Relational Datamodel in R: review                       57 | ||||||
| #TOC>   1.1      Building a sample database structure               102 | #TOC>   1.1        Building a sample database structure                    97 | ||||||
| #TOC>   1.1.1    completing the database                            213 | #TOC>   1.1.1          completing the database                            208 | ||||||
| #TOC>   1.2      Querying the database                              248 | #TOC>   1.2        Querying the database                                  243 | ||||||
| #TOC>   1.3      Task: submit for credit (part 1/2)                 277 | #TOC>   1.3        Task: submit for credit (part 1/2)                     272 | ||||||
| #TOC>   2        Implementing the protein datamodel                 289 | #TOC>   2        Implementing the protein datamodel                       284 | ||||||
| #TOC>   2.1      JSON formatted source data                         315 | #TOC>   2.1        JSON formatted source data                             310 | ||||||
| #TOC>   2.2      "Sanitizing" sequence data                         355 | #TOC>   2.2        "Sanitizing" sequence data                             350 | ||||||
| #TOC>   2.3      Create a protein table for our data model          375 | #TOC>   2.3        Create a protein table for our data model              370 | ||||||
| #TOC>   2.3.1    Initialize the database                            377 | #TOC>   2.3.1          Initialize the database                            372 | ||||||
| #TOC>   2.3.2    Add data                                           389 | #TOC>   2.3.2          Add data                                           384 | ||||||
| #TOC>   2.4      Complete the database                              409 | #TOC>   2.4        Complete the database                                  404 | ||||||
| #TOC>   2.4.1    Examples of navigating the database                436 | #TOC>   2.4.1          Examples of navigating the database                431 | ||||||
| #TOC>   2.5      Updating the database                              468 | #TOC>   2.5        Updating the database                                  463 | ||||||
| #TOC>   3        Add your own data                                  480 | #TOC>   3        Add your own data                                        475 | ||||||
| #TOC>   3.1      Find a protein                                     488 | #TOC>   3.1        Find a protein                                         483 | ||||||
| #TOC>   3.2      Put the information into JSON files                517 | #TOC>   3.2        Put the information into JSON files                    512 | ||||||
| #TOC>   3.3      Create an R script to create your own database     540 | #TOC>   3.3        Create an R script to create your own database         535 | ||||||
| #TOC>   3.3.1    Check and validate                                 560 | #TOC>   3.3.1          Check and validate                                 555 | ||||||
| #TOC>   3.4      Task: submit for credit (part 2/2)                 601 | #TOC>   3.4        Task: submit for credit (part 2/2)                     596 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -328,11 +328,11 @@ file.show("./data/MBP1_SACCE.json") | |||||||
| #     sanitize the sequence at some point. But since we need to do that | #     sanitize the sequence at some point. But since we need to do that | ||||||
| #     anyway, it is easier to see the whole sequence if we store it in chunks. | #     anyway, it is easier to see the whole sequence if we store it in chunks. | ||||||
|  |  | ||||||
| # Let's load the "jsonlite" package and have a look at how it reads this data. | # Let's make sure the "jsonlite" package exists on your computer, then we'll | ||||||
|  | # explore how it reads this data. | ||||||
|  |  | ||||||
| if (! require(jsonlite, quietly=TRUE)) { | if (! requireNamespace("jsonlite", quietly = TRUE)) { | ||||||
|   install.packages("jsonlite") |   install.packages("jsonlite") | ||||||
|   library(jsonlite) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = jsonlite)       # basic information | #  library(help = jsonlite)       # basic information | ||||||
| @@ -340,7 +340,7 @@ if (! require(jsonlite, quietly=TRUE)) { | |||||||
| #  data(package = "jsonlite")     # available datasets | #  data(package = "jsonlite")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| x <- fromJSON("./data/MBP1_SACCE.json") | x <- jsonlite::fromJSON("./data/MBP1_SACCE.json") | ||||||
| str(x) | str(x) | ||||||
|  |  | ||||||
| x$name | x$name | ||||||
| @@ -389,7 +389,7 @@ str(myDB) | |||||||
|  |  | ||||||
| dbAddProtein | dbAddProtein | ||||||
|  |  | ||||||
| myDB <- dbAddProtein(myDB, fromJSON("./data/MBP1_SACCE.json")) | myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json")) | ||||||
| str(myDB) | str(myDB) | ||||||
|  |  | ||||||
| # Let's check that the 833 amino acids of the yeast MBP1 sequence have | # Let's check that the 833 amino acids of the yeast MBP1 sequence have | ||||||
|   | |||||||
| @@ -3,12 +3,15 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the FND-Genetic_code unit. | #              R code accompanying the FND-Genetic_code unit. | ||||||
| # | # | ||||||
| # Version:  1.0.1 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10  12 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout, | ||||||
|  | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0.1  Comment on "incomplete final line" warning in FASTA | #           1.0.1  Comment on "incomplete final line" warning in FASTA | ||||||
| #           1.0    First live version | #           1.0    First live version | ||||||
| # | # | ||||||
| @@ -27,14 +30,14 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                            Line | #TOC>   Section  Title                                            Line | ||||||
| #TOC> ---------------------------------------------------------- | #TOC> ---------------------------------------------------------------- | ||||||
| #TOC>   1        Storing the genetic code                     47 | #TOC>   1        Storing the genetic code                           45 | ||||||
| #TOC>   1.1      Genetic code in Biostrings                   65 | #TOC>   1.1        Genetic code in Biostrings                       63 | ||||||
| #TOC>   2        Working with the genetic code                97 | #TOC>   2        Working with the genetic code                      94 | ||||||
| #TOC>   2.1      Translate a sequence.                       126 | #TOC>   2.1        Translate a sequence.                           129 | ||||||
| #TOC>   3        An alternative representation: 3D array     208 | #TOC>   3        An alternative representation: 3D array           212 | ||||||
| #TOC>   3.1      Print a Genetic code table                  241 | #TOC>   3.1        Print a Genetic code table                      246 | ||||||
| #TOC>   4        Tasks                                       267 | #TOC>   4        Tasks                                             272 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -63,12 +66,11 @@ x["TAA"] | |||||||
| # available in the Bioconductor "Biostrings" package: | # available in the Bioconductor "Biostrings" package: | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! require(Biostrings, quietly=TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   install.packages("BiocManager") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
| } | } | ||||||
|   biocLite("Biostrings") | if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||||
|   library(Biostrings) |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| @@ -77,45 +79,51 @@ if (! require(Biostrings, quietly=TRUE)) { | |||||||
|  |  | ||||||
|  |  | ||||||
| # The standard genetic code vector | # The standard genetic code vector | ||||||
| GENETIC_CODE | Biostrings::GENETIC_CODE | ||||||
|  |  | ||||||
| # The table of genetic codes. This information corresponds to this page | # The table of genetic codes. This information corresponds to this page | ||||||
| # at the NCBI: | # at the NCBI: | ||||||
| # https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes | # https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes | ||||||
| GENETIC_CODE_TABLE | Biostrings::GENETIC_CODE_TABLE | ||||||
|  |  | ||||||
| # Most of the alternative codes are mitochondrial codes. The id of the | # Most of the alternative codes are mitochondrial codes. The id of the | ||||||
| # Alternative Yeast Nuclear code is "12" | # Alternative Yeast Nuclear code is "12" | ||||||
| getGeneticCode("12")  # Alternative Yeast Nuclear | Biostrings::getGeneticCode("12")  # Alternative Yeast Nuclear | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Working with the genetic code  ======================================= | # =    2  Working with the genetic code  ======================================= | ||||||
|  |  | ||||||
| # GENETIC_CODE is a "named vector" | # We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it | ||||||
|  | # to a "local" variable, rather than retrieving it from the package all the | ||||||
|  | # time. | ||||||
|  |  | ||||||
| str(GENETIC_CODE) | genCode <- Biostrings::GENETIC_CODE | ||||||
|  |  | ||||||
|  | # This is a named vector of characters ... | ||||||
|  |  | ||||||
|  | str(genCode) | ||||||
|  |  | ||||||
| # ... which also stores the alternative initiation codons TTG and CTG in | # ... which also stores the alternative initiation codons TTG and CTG in | ||||||
| # an attribute of the vector. (Alternative initiation codons sometimes are | # an attribute of the vector. (Alternative initiation codons sometimes are | ||||||
| # used instead of ATG to initiate translation; if not ATG they are translated | # used instead of ATG to initiate translation; if not ATG they are translated | ||||||
| # with fMet.) | # with fMet.) | ||||||
|  |  | ||||||
| attr(GENETIC_CODE, "alt_init_codons") | attr(genCode, "alt_init_codons") | ||||||
|  |  | ||||||
| # But the key to use this vector is in the "names" which we use for subsetting | # But the key to use this vector is in the "names" which we use for subsetting | ||||||
| # the list of amino acids in whatever way we need. | # the list of amino acids in whatever way we need. | ||||||
| names(GENETIC_CODE) | names(genCode) | ||||||
|  |  | ||||||
| # The translation of "TGG" ... | # The translation of "TGG" ... | ||||||
| GENETIC_CODE["TGG"] | genCode["TGG"] | ||||||
|  |  | ||||||
| # All stop codons | # All stop codons | ||||||
| names(GENETIC_CODE)[GENETIC_CODE == "*"] | names(genCode)[genCode == "*"] | ||||||
|  |  | ||||||
| # All start codons | # All start codons | ||||||
| names(GENETIC_CODE)[GENETIC_CODE == "M"] # ... or | names(genCode)[genCode == "M"] # ... or | ||||||
| c(names(GENETIC_CODE)[GENETIC_CODE == "M"], | c(names(genCode)[genCode == "M"], | ||||||
|   attr(GENETIC_CODE, "alt_init_codons")) |   attr(genCode, "alt_init_codons")) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   2.1  Translate a sequence.  ============================================= | # ==   2.1  Translate a sequence.  ============================================= | ||||||
| @@ -165,7 +173,7 @@ nchar(mbp1)/3 | |||||||
| # attributes that are useful for Biostrings. Thus we convert the sequence first | # attributes that are useful for Biostrings. Thus we convert the sequence first | ||||||
| # with DNAString(), then split it up, then convert it into a plain | # with DNAString(), then split it up, then convert it into a plain | ||||||
| # character vector. | # character vector. | ||||||
| mbp1Codons <- as.character(codons(DNAString(mbp1))) | mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1))) | ||||||
|  |  | ||||||
| head(mbp1Codons) | head(mbp1Codons) | ||||||
|  |  | ||||||
| @@ -173,7 +181,7 @@ head(mbp1Codons) | |||||||
|  |  | ||||||
| mbp1AA <- character(834) | mbp1AA <- character(834) | ||||||
| for (i in seq_along(mbp1Codons)) { | for (i in seq_along(mbp1Codons)) { | ||||||
|   mbp1AA[i] <- GENETIC_CODE[mbp1Codons[i]] |   mbp1AA[i] <- genCode[mbp1Codons[i]] | ||||||
| } | } | ||||||
|  |  | ||||||
| head(mbp1Codons) | head(mbp1Codons) | ||||||
| @@ -196,7 +204,8 @@ sort(table(mbp1AA), decreasing = TRUE) | |||||||
| mbp1AA <- mbp1AA[-(length(mbp1AA))] | mbp1AA <- mbp1AA[-(length(mbp1AA))] | ||||||
| tail(mbp1AA) # Note the stop is gone! | tail(mbp1AA) # Note the stop is gone! | ||||||
|  |  | ||||||
| # paste it together, collapsing the elements without separation-character | # paste it together, collapsing the elements using an empty string as the | ||||||
|  | # separation-character (i.e.: nothing) | ||||||
| (Mbp1 <- paste(mbp1AA, sep = "", collapse = "")) | (Mbp1 <- paste(mbp1AA, sep = "", collapse = "")) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -204,14 +213,15 @@ tail(mbp1AA) # Note the stop is gone! | |||||||
|  |  | ||||||
|  |  | ||||||
| # We don't use 3D arrays often - usually just 2D tables and data frames, so | # We don't use 3D arrays often - usually just 2D tables and data frames, so | ||||||
| # here is a good opportunity to review the syntax with a genetic code cube: | # here is a good opportunity to review the syntax of 3D arrays with a | ||||||
|  | # genetic code cube: | ||||||
|  |  | ||||||
| # Initialize, using A C G T as the names of the elements in each dimension | # Initialize, using A G C T as the names of the elements in each dimension | ||||||
| cCube <- array(data     = character(64), | cCube <- array(data     = character(64), | ||||||
|                dim      = c(4, 4, 4), |                dim      = c(4, 4, 4), | ||||||
|                dimnames = list(c("A", "C", "G", "T"), |                dimnames = list(c("A", "G", "C", "T"), | ||||||
|                                c("A", "C", "G", "T"), |                                c("A", "G", "C", "T"), | ||||||
|                                c("A", "C", "G", "T"))) |                                c("A", "G", "C", "T"))) | ||||||
|  |  | ||||||
| # fill it with amino acid codes using three nested loops | # fill it with amino acid codes using three nested loops | ||||||
| for (i in 1:4) { | for (i in 1:4) { | ||||||
| @@ -222,7 +232,7 @@ for (i in 1:4) { | |||||||
|                        dimnames(cCube)[[3]][k], |                        dimnames(cCube)[[3]][k], | ||||||
|                        sep = "", |                        sep = "", | ||||||
|                        collapse = "") |                        collapse = "") | ||||||
|       cCube[i, j, k] <- GENETIC_CODE[myCodon] |       cCube[i, j, k] <- genCode[myCodon] | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| } | } | ||||||
| @@ -291,14 +301,14 @@ for (i in nuc) { | |||||||
| # Solution: | # Solution: | ||||||
|  |  | ||||||
|     # Fetch the code |     # Fetch the code | ||||||
|           GENETIC_CODE_TABLE |     Biostrings::GENETIC_CODE_TABLE | ||||||
|           GENETIC_CODE_TABLE$name[GENETIC_CODE_TABLE$id == "12"] |     Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"] | ||||||
|           altYcode <- getGeneticCode("12") |     altYcode <- Biostrings::getGeneticCode("12") | ||||||
|  |  | ||||||
|     # what's the difference? |     # what's the difference? | ||||||
|           (delta <- which(GENETIC_CODE != altYcode)) |     (delta <- which(Biostrings::GENETIC_CODE != altYcode)) | ||||||
|  |  | ||||||
|           GENETIC_CODE[delta] |     Biostrings::GENETIC_CODE[delta] | ||||||
|     altYcode[delta] |     altYcode[delta] | ||||||
|  |  | ||||||
|     # translate |     # translate | ||||||
| @@ -319,7 +329,7 @@ for (i in nuc) { | |||||||
| # | # | ||||||
| # | # | ||||||
| # Solution: | # Solution: | ||||||
| table(table(GENETIC_CODE)) | table(table(Biostrings::GENETIC_CODE)) | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the FND-MAT-Graphs_and_networks unit. | #              R code accompanying the FND-MAT-Graphs_and_networks unit. | ||||||
| # | # | ||||||
| # Version:  1.1 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017  10  -  2019  01 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.2    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.1    Update set.seed() usage | #           1.1    Update set.seed() usage | ||||||
| #           1.0    First final version for learning units. | #           1.0    First final version for learning units. | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| @@ -30,17 +32,17 @@ | |||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                        Line | #TOC>   Section  Title                                        Line | ||||||
| #TOC> ------------------------------------------------------------ | #TOC> ------------------------------------------------------------ | ||||||
| #TOC>   1        Review                                         48 | #TOC>   1        Review                                         50 | ||||||
| #TOC>   2        DEGREE DISTRIBUTIONS                          201 | #TOC>   2        DEGREE DISTRIBUTIONS                          204 | ||||||
| #TOC>   2.1        Random graph                                207 | #TOC>   2.1        Random graph                                210 | ||||||
| #TOC>   2.2        scale-free graph (Barabasi-Albert)          255 | #TOC>   2.2        scale-free graph (Barabasi-Albert)          258 | ||||||
| #TOC>   2.3        Random geometric graph                      320 | #TOC>   2.3        Random geometric graph                      323 | ||||||
| #TOC>   3        A CLOSER LOOK AT THE igraph PACKAGE           442 | #TOC>   3        A CLOSER LOOK AT THE igraph PACKAGE           445 | ||||||
| #TOC>   3.1        Basics                                      445 | #TOC>   3.1        Basics                                      448 | ||||||
| #TOC>   3.2        Components                                  517 | #TOC>   3.2        Components                                  520 | ||||||
| #TOC>   4        RANDOM GRAPHS AND GRAPH METRICS               536 | #TOC>   4        RANDOM GRAPHS AND GRAPH METRICS               539 | ||||||
| #TOC>   4.1        Diameter                                    573 | #TOC>   4.1        Diameter                                    576 | ||||||
| #TOC>   5        GRAPH CLUSTERING                              641 | #TOC>   5        GRAPH CLUSTERING                              645 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -123,9 +125,8 @@ set.seed(NULL)                         # reset the RNG | |||||||
| # standard package for work with graphs in R is "igraph". We'll go into more | # standard package for work with graphs in R is "igraph". We'll go into more | ||||||
| # details of the igraph package a bit later, for now we just use it to plot: | # details of the igraph package a bit later, for now we just use it to plot: | ||||||
|  |  | ||||||
| if (! require(igraph, quietly=TRUE)) { | if (! requireNamespace("igraph", quietly = TRUE)) { | ||||||
|   install.packages("igraph") |   install.packages("igraph") | ||||||
|   library(igraph) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = igraph)       # basic information | #  library(help = igraph)       # basic information | ||||||
| @@ -133,10 +134,12 @@ if (! require(igraph, quietly=TRUE)) { | |||||||
| #  data(package = "igraph")     # available datasets | #  data(package = "igraph")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected") | myG <- igraph::graph_from_adjacency_matrix(myRandAM, mode = "undirected") | ||||||
|  |  | ||||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | set.seed(112358)                       # set RNG seed for repeatable randomness | ||||||
| myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates |                                        # calculate layout coordinates | ||||||
|  | myGxy <- igraph::layout_with_graphopt(myG, | ||||||
|  |                                       charge=0.0012) | ||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -157,9 +160,9 @@ plot(myG, | |||||||
|      rescale = FALSE, |      rescale = FALSE, | ||||||
|      xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), |      xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), | ||||||
|      ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), |      ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), | ||||||
|      vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1], |      vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1], | ||||||
|      vertex.size = 1600 + (300 * degree(myG)), |      vertex.size = 1600 + (300 * igraph::degree(myG)), | ||||||
|      vertex.label = sprintf("%s(%i)", names(V(myG)), degree(myG)), |      vertex.label = sprintf("%s(%i)", names(igraph::V(myG)), igraph::degree(myG)), | ||||||
|      vertex.label.family = "sans", |      vertex.label.family = "sans", | ||||||
|      vertex.label.cex = 0.7) |      vertex.label.cex = 0.7) | ||||||
| par(oPar)  # reset plot window | par(oPar)  # reset plot window | ||||||
| @@ -179,10 +182,10 @@ sum(myRandAM) | |||||||
| rowSums(myRandAM) +  colSums(myRandAM) # check this against the plot! | rowSums(myRandAM) +  colSums(myRandAM) # check this against the plot! | ||||||
|  |  | ||||||
| # The function degree() gives the same values | # The function degree() gives the same values | ||||||
| degree(myG) | igraph::degree(myG) | ||||||
|  |  | ||||||
| # Let's  plot the degree distribution in a histogram: | # Let's  plot the degree distribution in a histogram: | ||||||
| degG <- degree(myG) | degG <- igraph::degree(myG) | ||||||
| brk <- seq(min(degG)-0.5, max(degG)+0.5, by=1)  # define histogram breaks | brk <- seq(min(degG)-0.5, max(degG)+0.5, by=1)  # define histogram breaks | ||||||
| hist(degG, breaks=brk, col="#A5CCF5", | hist(degG, breaks=brk, col="#A5CCF5", | ||||||
|      xlim = c(-1,8), xaxt = "n", |      xlim = c(-1,8), xaxt = "n", | ||||||
| @@ -212,8 +215,8 @@ set.seed(31415927)                     # set RNG seed for repeatable randomness | |||||||
| my200AM <- makeRandomAM(as.character(1:N), p = 0.015) | my200AM <- makeRandomAM(as.character(1:N), p = 0.015) | ||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
| myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected") | myG200 <- igraph::graph_from_adjacency_matrix(my200AM, mode = "undirected") | ||||||
| myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout | myGxy <- igraph::layout_with_graphopt(myG200, charge=0.0001) # calculate layout | ||||||
|                                                              # coordinates |                                                              # coordinates | ||||||
|  |  | ||||||
| oPar <- par(mar= rep(0,4))             # Turn margins off, save graphics state | oPar <- par(mar= rep(0,4))             # Turn margins off, save graphics state | ||||||
| @@ -222,8 +225,8 @@ plot(myG200, | |||||||
|      rescale = FALSE, |      rescale = FALSE, | ||||||
|      xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), |      xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), | ||||||
|      ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), |      ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), | ||||||
|      vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1], |      vertex.color=heat.colors(max(igraph::degree(myG200)+1))[igraph::degree(myG200)+1], | ||||||
|      vertex.size = 150 + (60 * degree(myG200)), |      vertex.size = 150 + (60 * igraph::degree(myG200)), | ||||||
|      vertex.label = NA) |      vertex.label = NA) | ||||||
| par(oPar)                              # restore graphics state | par(oPar)                              # restore graphics state | ||||||
|  |  | ||||||
| @@ -231,7 +234,7 @@ par(oPar)                              # restore graphics state | |||||||
| # biological graphs look approximately like this. | # biological graphs look approximately like this. | ||||||
|  |  | ||||||
| # Calculate degree distributions | # Calculate degree distributions | ||||||
| dg <- degree(myG200) | dg <- igraph::degree(myG200) | ||||||
| brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1) | brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1) | ||||||
| hist(dg, breaks=brk, col="#A5F5CC", | hist(dg, breaks=brk, col="#A5F5CC", | ||||||
|      xlim = c(-1,11), xaxt = "n", |      xlim = c(-1,11), xaxt = "n", | ||||||
| @@ -263,10 +266,10 @@ plot(log10(as.numeric(names(freqRank)) + 1), | |||||||
| N <- 200 | N <- 200 | ||||||
|  |  | ||||||
| set.seed(31415927)                     # set RNG seed for repeatable randomness | set.seed(31415927)                     # set RNG seed for repeatable randomness | ||||||
| GBA <- sample_pa(N, power = 0.8, directed = FALSE) | GBA <- igraph::sample_pa(N, power = 0.8, directed = FALSE) | ||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
| GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates | GBAxy <- igraph::layout_with_graphopt(GBA, charge=0.0001) | ||||||
|  |  | ||||||
| oPar <- par(mar= rep(0,4))             # Turn margins off, save graphics state | oPar <- par(mar= rep(0,4))             # Turn margins off, save graphics state | ||||||
| plot(GBA, | plot(GBA, | ||||||
| @@ -274,8 +277,8 @@ plot(GBA, | |||||||
|      rescale = FALSE, |      rescale = FALSE, | ||||||
|      xlim = c(min(GBAxy[,1]) * 0.99, max(GBAxy[,1]) * 1.01), |      xlim = c(min(GBAxy[,1]) * 0.99, max(GBAxy[,1]) * 1.01), | ||||||
|      ylim = c(min(GBAxy[,2]) * 0.99, max(GBAxy[,2]) * 1.01), |      ylim = c(min(GBAxy[,2]) * 0.99, max(GBAxy[,2]) * 1.01), | ||||||
|      vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1], |      vertex.color=heat.colors(max(igraph::degree(GBA)+1))[igraph::degree(GBA)+1], | ||||||
|      vertex.size = 200 + (30 * degree(GBA)), |      vertex.size = 200 + (30 * igraph::degree(GBA)), | ||||||
|      vertex.label = NA) |      vertex.label = NA) | ||||||
| par(oPar)                              # restore graphics state | par(oPar)                              # restore graphics state | ||||||
|  |  | ||||||
| @@ -287,7 +290,7 @@ par(oPar)                              # restore grphics state | |||||||
| # singletons. | # singletons. | ||||||
|  |  | ||||||
| # What's the degree distribution of this graph? | # What's the degree distribution of this graph? | ||||||
| (dg <- degree(GBA)) | (dg <- igraph::degree(GBA)) | ||||||
| brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1) | brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1) | ||||||
| hist(dg, breaks=brk, col="#DCF5B5", | hist(dg, breaks=brk, col="#DCF5B5", | ||||||
|      xlim = c(0,max(dg)+1), xaxt = "n", |      xlim = c(0,max(dg)+1), xaxt = "n", | ||||||
| @@ -307,8 +310,8 @@ plot(log10(as.numeric(names(freqRank)) + 1), | |||||||
| # Sort-of linear, but many of the higher ranked nodes have a frequency of only | # Sort-of linear, but many of the higher ranked nodes have a frequency of only | ||||||
| # one. That behaviour smooths out in larger graphs: | # one. That behaviour smooths out in larger graphs: | ||||||
| # | # | ||||||
| X <- sample_pa(100000, power = 0.8, directed = FALSE)  # 100,000 nodes | X <- igraph::sample_pa(1e5, power = 0.8, directed = FALSE)  # 100,000 nodes | ||||||
| freqRank <- table(degree(X)) | freqRank <- table(igraph::degree(X)) | ||||||
| plot(log10(as.numeric(names(freqRank)) + 1), | plot(log10(as.numeric(names(freqRank)) + 1), | ||||||
|      log10(as.numeric(freqRank)), type = "b", |      log10(as.numeric(freqRank)), type = "b", | ||||||
|      xlab = "log(Rank)", ylab = "log(frequency)", |      xlab = "log(Rank)", ylab = "log(frequency)", | ||||||
| @@ -404,7 +407,7 @@ rGAM <- makeRandomGeometricAM(as.character(1:N), t = 0.4) | |||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
|  |  | ||||||
| myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected") | myGRG <- igraph::graph_from_adjacency_matrix(rGAM$mat, mode = "undirected") | ||||||
|  |  | ||||||
| oPar <- par(mar= rep(0,4)) # Turn margins off | oPar <- par(mar= rep(0,4)) # Turn margins off | ||||||
| plot(myGRG, | plot(myGRG, | ||||||
| @@ -412,13 +415,13 @@ plot(myGRG, | |||||||
|      rescale = FALSE, |      rescale = FALSE, | ||||||
|      xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1), |      xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1), | ||||||
|      ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1), |      ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1), | ||||||
|      vertex.color=heat.colors(max(degree(myGRG)+1))[degree(myGRG)+1], |      vertex.color=heat.colors(max(igraph::degree(myGRG)+1))[igraph::degree(myGRG)+1], | ||||||
|      vertex.size = 0.1 + (0.2 * degree(myGRG)), |      vertex.size = 0.1 + (0.2 * igraph::degree(myGRG)), | ||||||
|      vertex.label = NA) |      vertex.label = NA) | ||||||
| par(oPar) | par(oPar) | ||||||
|  |  | ||||||
| # degree distribution: | # degree distribution: | ||||||
| (dg <- degree(myGRG)) | (dg <- igraph::degree(myGRG)) | ||||||
| brk <- seq(min(dg) - 0.5, max(dg) + 0.5, by = 1) | brk <- seq(min(dg) - 0.5, max(dg) + 0.5, by = 1) | ||||||
| hist(dg, breaks = brk, col = "#FCC6D2", | hist(dg, breaks = brk, col = "#FCC6D2", | ||||||
|      xlim = c(0, 25), xaxt = "n", |      xlim = c(0, 25), xaxt = "n", | ||||||
| @@ -450,7 +453,7 @@ summary(myG) | |||||||
|  |  | ||||||
| # This output means: this is an IGRAPH graph, with U = UN-directed edges | # This output means: this is an IGRAPH graph, with U = UN-directed edges | ||||||
| #  and N = named nodes, that has 20 nodes and 20 edges. For details, see | #  and N = named nodes, that has 20 nodes and 20 edges. For details, see | ||||||
| ?print.igraph | ?igraph::print.igraph | ||||||
|  |  | ||||||
| mode(myG) | mode(myG) | ||||||
| class(myG) | class(myG) | ||||||
| @@ -463,11 +466,11 @@ class(myG) | |||||||
| # recipes, called _games_ in this package. | # recipes, called _games_ in this package. | ||||||
|  |  | ||||||
| # Two basic functions retrieve nodes "Vertices", and "Edges": | # Two basic functions retrieve nodes "Vertices", and "Edges": | ||||||
| V(myG) | igraph::V(myG) | ||||||
| E(myG) | igraph::E(myG) | ||||||
|  |  | ||||||
| # additional properties can be retrieved from the Vertices ... | # additional properties can be retrieved from the Vertices ... | ||||||
| V(myG)$name | igraph::V(myG)$name | ||||||
|  |  | ||||||
|  |  | ||||||
| # As with many R objects, loading the package provides special functions that | # As with many R objects, loading the package provides special functions that | ||||||
| @@ -487,12 +490,12 @@ plot(myG)  # this is the result of default plot parameters | |||||||
| # Plot with some customizing parameters | # Plot with some customizing parameters | ||||||
| oPar <- par(mar= rep(0,4)) # Turn margins off | oPar <- par(mar= rep(0,4)) # Turn margins off | ||||||
| plot(myG, | plot(myG, | ||||||
|      layout = layout_with_fr(myG), |      layout = igraph::layout_with_fr(myG), | ||||||
|      vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1], |      vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1], | ||||||
|      vertex.size = 9 + (2 * degree(myG)), |      vertex.size = 9 + (2 * igraph::degree(myG)), | ||||||
|      vertex.label.cex = 0.5 + (0.05 * degree(myG)), |      vertex.label.cex = 0.5 + (0.05 * igraph::degree(myG)), | ||||||
|      edge.width = 2, |      edge.width = 2, | ||||||
|      vertex.label = V(myG)$name, |      vertex.label = igraph::V(myG)$name, | ||||||
|      vertex.label.family = "sans", |      vertex.label.family = "sans", | ||||||
|      vertex.label.cex = 0.9) |      vertex.label.cex = 0.9) | ||||||
| par(oPar) | par(oPar) | ||||||
| @@ -500,12 +503,12 @@ par(oPar) | |||||||
| # ... or with a different layout: | # ... or with a different layout: | ||||||
| oPar <- par(mar= rep(0,4)) # Turn margins off | oPar <- par(mar= rep(0,4)) # Turn margins off | ||||||
| plot(myG, | plot(myG, | ||||||
|      layout = layout_in_circle(myG), |      layout = igraph::layout_in_circle(myG), | ||||||
|      vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1], |      vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1], | ||||||
|      vertex.size = 9 + (2 * degree(myG)), |      vertex.size = 9 + (2 * igraph::degree(myG)), | ||||||
|      vertex.label.cex = 0.5 + (0.05 * degree(myG)), |      vertex.label.cex = 0.5 + (0.05 * igraph::degree(myG)), | ||||||
|      edge.width = 2, |      edge.width = 2, | ||||||
|      vertex.label = V(myG)$name, |      vertex.label = igraph::V(myG)$name, | ||||||
|      vertex.label.family = "sans", |      vertex.label.family = "sans", | ||||||
|      vertex.label.cex = 0.9) |      vertex.label.cex = 0.9) | ||||||
| par(oPar) | par(oPar) | ||||||
| @@ -518,18 +521,18 @@ par(oPar) | |||||||
|  |  | ||||||
| # The igraph function components() tells us whether there are components of the | # The igraph function components() tells us whether there are components of the | ||||||
| # graph in which there is no path to other components. | # graph in which there is no path to other components. | ||||||
| components(myG) | igraph::components(myG) | ||||||
|  |  | ||||||
| # In the _membership_ vector, nodes are annotated with the index of the | # In the _membership_ vector, nodes are annotated with the index of the | ||||||
| # component they are part of. Sui7 is the only node of component 2, Cyj1 is in | # component they are part of. Sui7 is the only node of component 2, Cyj1 is in | ||||||
| # the third component etc. This is perhaps more clear if we sort by component | # the third component etc. This is perhaps more clear if we sort by component | ||||||
| # index | # index | ||||||
| sort(components(myG)$membership, decreasing = TRUE) | sort(igraph::components(myG)$membership, decreasing = TRUE) | ||||||
|  |  | ||||||
| # Retrieving e.g. the members of the first component from the list can be done by subsetting: | # Retrieving e.g. the members of the first component from the list can be done by subsetting: | ||||||
|  |  | ||||||
| (sel <- components(myG)$membership == 1)  # boolean vector .. | (sel <- igraph::components(myG)$membership == 1)  # boolean vector .. | ||||||
| (c1 <- components(myG)$membership[sel]) | (c1 <- igraph::components(myG)$membership[sel]) | ||||||
| names(c1) | names(c1) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -542,9 +545,9 @@ names(c1) | |||||||
| # preferential-attachment ... but igraph has ways to simulate the basic ones | # preferential-attachment ... but igraph has ways to simulate the basic ones | ||||||
| # (and we could easily simulate our own). Look at the following help pages: | # (and we could easily simulate our own). Look at the following help pages: | ||||||
|  |  | ||||||
| ?sample_gnm                # see also sample_gnp for the Erdös-Rényi models | ?igraph::sample_gnm             # see also sample_gnp for the Erdös-Rényi models | ||||||
| ?sample_smallworld         # for the Watts & Strogatz model | ?igraph::sample_smallworld      # for the Watts & Strogatz model | ||||||
| ?sample_pa                 # for the Barabasi-Albert model | ?igraph::sample_pa              # for the Barabasi-Albert model | ||||||
|  |  | ||||||
| # But note that there are many more sample_ functions. Check out the docs! | # But note that there are many more sample_ functions. Check out the docs! | ||||||
|  |  | ||||||
| @@ -554,7 +557,7 @@ names(c1) | |||||||
| # layout draws them, obviously. | # layout draws them, obviously. | ||||||
|  |  | ||||||
| set.seed(112358)                       # set RNG seed for repeatable randomness | set.seed(112358)                       # set RNG seed for repeatable randomness | ||||||
| myGxy <- layout_with_fr(myG)           # calculate layout coordinates | myGxy <- igraph::layout_with_fr(myG)   # calculate layout coordinates | ||||||
| set.seed(NULL)                         # reset the RNG | set.seed(NULL)                         # reset the RNG | ||||||
|  |  | ||||||
| oPar <- par(mar = rep(0, 4))           # turn margins off, save graphics state | oPar <- par(mar = rep(0, 4))           # turn margins off, save graphics state | ||||||
| @@ -563,30 +566,31 @@ plot(myG, | |||||||
|      rescale = FALSE, |      rescale = FALSE, | ||||||
|      xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), |      xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), | ||||||
|      ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), |      ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), | ||||||
|      vertex.color=heat.colors(max(degree(myG) + 1))[degree(myG) + 1], |      vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1], | ||||||
|      vertex.size = 20 + (10 * degree(myG)), |      vertex.size = 20 + (10 * igraph::degree(myG)), | ||||||
|      vertex.label = V(myG)$name, |      vertex.label = igraph::V(myG)$name, | ||||||
|      vertex.label.family = "sans", |      vertex.label.family = "sans", | ||||||
|      vertex.label.cex = 0.8) |      vertex.label.cex = 0.8) | ||||||
| par(oPar)                              # restore graphics state | par(oPar)                              # restore graphics state | ||||||
|  |  | ||||||
| # ==   4.1  Diameter  ========================================================== | # ==   4.1  Diameter  ========================================================== | ||||||
|  |  | ||||||
| diameter(myG)  # The diameter of a graph is its maximum length shortest path. | igraph::diameter(myG)  # The diameter of a graph is its maximum length | ||||||
|  |                        # shortest path. | ||||||
|  |  | ||||||
| # let's plot this path: here are the nodes ... | # let's plot this path: here are the nodes ... | ||||||
| get_diameter(myG) | igraph::get_diameter(myG) | ||||||
|  |  | ||||||
| # ... and we can get the x, y coordinates from myGxy by subsetting with the node | # ... and we can get the x, y coordinates from myGxy by subsetting with the node | ||||||
| # names. Then we draw the diameter-path with a transparent, thick pink line: | # names. Then we draw the diameter-path with a transparent, thick pink line: | ||||||
| lines(myGxy[get_diameter(myG),], lwd=10, col="#ff63a788") | lines(myGxy[igraph::get_diameter(myG),], lwd=10, col="#ff63a788") | ||||||
|  |  | ||||||
| # == Centralization scores | # == Centralization scores | ||||||
|  |  | ||||||
| ?centralize | ?igraph::centralize | ||||||
| # replot our graph, and color by log_betweenness: | # replot our graph, and color by log_betweenness: | ||||||
|  |  | ||||||
| bC <- centr_betw(myG)  # calculate betweenness centrality | bC <- igraph::centr_betw(myG)  # calculate betweenness centrality | ||||||
| nodeBetw <- bC$res | nodeBetw <- bC$res | ||||||
| nodeBetw <- round(log(nodeBetw +1)) + 1 | nodeBetw <- round(log(nodeBetw +1)) + 1 | ||||||
|  |  | ||||||
| @@ -597,8 +601,8 @@ plot(myG, | |||||||
|      xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), |      xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01), | ||||||
|      ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), |      ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01), | ||||||
|      vertex.color=heat.colors(max(nodeBetw))[nodeBetw], |      vertex.color=heat.colors(max(nodeBetw))[nodeBetw], | ||||||
|      vertex.size = 20 + (10 * degree(myG)), |      vertex.size = 20 + (10 * igraph::degree(myG)), | ||||||
|      vertex.label = V(myG)$name, |      vertex.label = igraph::V(myG)$name, | ||||||
|      vertex.label.family = "sans", |      vertex.label.family = "sans", | ||||||
|      vertex.label.cex = 0.7) |      vertex.label.cex = 0.7) | ||||||
| par(oPar) | par(oPar) | ||||||
| @@ -613,7 +617,7 @@ par(oPar) | |||||||
| # | # | ||||||
| # Let's plot betweenness centrality for our random geometric graph: | # Let's plot betweenness centrality for our random geometric graph: | ||||||
|  |  | ||||||
| bCmyGRG <- centr_betw(myGRG)  # calculate betweenness centrality | bCmyGRG <- igraph::centr_betw(myGRG)  # calculate betweenness centrality | ||||||
|  |  | ||||||
| nodeBetw <- bCmyGRG$res | nodeBetw <- bCmyGRG$res | ||||||
| nodeBetw <- round((log(nodeBetw +1))^2.5) + 1 | nodeBetw <- round((log(nodeBetw +1))^2.5) + 1 | ||||||
| @@ -630,9 +634,9 @@ plot(myGRG, | |||||||
|      vertex.label = NA) |      vertex.label = NA) | ||||||
| par(oPar) | par(oPar) | ||||||
|  |  | ||||||
| diameter(myGRG) | igraph::diameter(myGRG) | ||||||
| lines(rGAM$x[get_diameter(myGRG)], | lines(rGAM$x[igraph::get_diameter(myGRG)], | ||||||
|       rGAM$y[get_diameter(myGRG)], |       rGAM$y[igraph::get_diameter(myGRG)], | ||||||
|       lwd = 10, |       lwd = 10, | ||||||
|       col = "#ff335533") |       col = "#ff335533") | ||||||
|  |  | ||||||
| @@ -648,11 +652,11 @@ lines(rGAM$x[get_diameter(myGRG)], | |||||||
| # http://www.ncbi.nlm.nih.gov/pubmed/18216267 and http://www.mapequation.org | # http://www.ncbi.nlm.nih.gov/pubmed/18216267 and http://www.mapequation.org | ||||||
|  |  | ||||||
|  |  | ||||||
| myGRGclusters <- cluster_infomap(myGRG) | myGRGclusters <- igraph::cluster_infomap(myGRG) | ||||||
| modularity(myGRGclusters) # ... measures how separated the different membership | igraph::modularity(myGRGclusters)  # ... measures how separated the different | ||||||
|                          # types are from each other |                                    # membership types are from each other | ||||||
| membership(myGRGclusters) # which nodes are in what cluster? | igraph::membership(myGRGclusters)         # which nodes are in what cluster? | ||||||
| table(membership(myGRGclusters))  # how large are the clusters? | table(igraph::membership(myGRGclusters))  # how large are the clusters? | ||||||
|  |  | ||||||
| # The largest cluster has 48 members, the second largest has 25, etc. | # The largest cluster has 48 members, the second largest has 25, etc. | ||||||
|  |  | ||||||
| @@ -661,7 +665,7 @@ table(membership(myGRGclusters))  # how large are the clusters? | |||||||
| # their cluster membership: | # their cluster membership: | ||||||
|  |  | ||||||
| # first, make a vector with as many grey colors as we have communities ... | # first, make a vector with as many grey colors as we have communities ... | ||||||
| commColors <- rep("#f1eef6", max(membership(myGRGclusters))) | commColors <- rep("#f1eef6", max(igraph::membership(myGRGclusters))) | ||||||
| # ... then overwrite the first five with "real colors" - something like rust, | # ... then overwrite the first five with "real colors" - something like rust, | ||||||
| # lilac, pink, and mauve or so. | # lilac, pink, and mauve or so. | ||||||
| commColors[1:5] <- c("#980043", "#dd1c77", "#df65b0", "#c994c7", "#d4b9da") | commColors[1:5] <- c("#980043", "#dd1c77", "#df65b0", "#c994c7", "#d4b9da") | ||||||
| @@ -673,8 +677,8 @@ plot(myGRG, | |||||||
|      rescale = FALSE, |      rescale = FALSE, | ||||||
|      xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1), |      xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1), | ||||||
|      ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1), |      ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1), | ||||||
|      vertex.color=commColors[membership(myGRGclusters)], |      vertex.color=commColors[igraph::membership(myGRGclusters)], | ||||||
|      vertex.size = 0.1 + (0.1 * degree(myGRG)), |      vertex.size = 0.1 + (0.1 * igraph::degree(myGRG)), | ||||||
|      vertex.label = NA) |      vertex.label = NA) | ||||||
| par(oPar) | par(oPar) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the FND-STA-Probability_distribution unit. | #              R code accompanying the FND-STA-Probability_distribution unit. | ||||||
| # | # | ||||||
| # Version:  1.2 | # Version:  1.3 | ||||||
| # | # | ||||||
| # Date:     2017  10  -  2019  01 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.3    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout, | ||||||
| #           1.2    Update set.seed() usage | #           1.2    Update set.seed() usage | ||||||
| #           1.1    Corrected empirical p-value | #           1.1    Corrected empirical p-value | ||||||
| #           1.0    First code live version | #           1.0    First code live version | ||||||
| @@ -28,21 +30,21 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                                         Line | #TOC>   Section  Title                                                         Line | ||||||
| #TOC> ------------------------------------------------------------------------- | #TOC> ----------------------------------------------------------------------------- | ||||||
| #TOC>   1        Introduction                                                50 | #TOC>   1        Introduction                                                    52 | ||||||
| #TOC>   2        Three fundamental distributions                            113 | #TOC>   2        Three fundamental distributions                                115 | ||||||
| #TOC>   2.1        The Poisson Distribution                                 116 | #TOC>   2.1        The Poisson Distribution                                     118 | ||||||
| #TOC>   2.2        The uniform distribution                                 170 | #TOC>   2.2        The uniform distribution                                     172 | ||||||
| #TOC>   2.3        The Normal Distribution                                  190 | #TOC>   2.3        The Normal Distribution                                      192 | ||||||
| #TOC>   3        quantile-quantile comparison                               231 | #TOC>   3        quantile-quantile comparison                                   233 | ||||||
| #TOC>   3.1        qqnorm()                                                 241 | #TOC>   3.1        qqnorm()                                                     243 | ||||||
| #TOC>   3.2        qqplot()                                                 307 | #TOC>   3.2        qqplot()                                                     309 | ||||||
| #TOC>   4        Quantifying the difference                                 324 | #TOC>   4        Quantifying the difference                                     326 | ||||||
| #TOC>   4.1        Chi2 test for discrete distributions                     359 | #TOC>   4.1        Chi2 test for discrete distributions                         361 | ||||||
| #TOC>   4.2        Kullback-Leibler divergence                              451 | #TOC>   4.2        Kullback-Leibler divergence                                  452 | ||||||
| #TOC>   4.2.1          An example from tossing dice                         462 | #TOC>   4.2.1          An example from tossing dice                             463 | ||||||
| #TOC>   4.2.2          An example from lognormal distributions              585 | #TOC>   4.2.2          An example from lognormal distributions                  586 | ||||||
| #TOC>   4.3        Kolmogorov-Smirnov test for continuous distributions     628 | #TOC>   4.3        Kolmogorov-Smirnov test for continuous distributions         629 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -385,9 +387,8 @@ hist(rG1.5, breaks = myBreaks, col = myCols[4]) | |||||||
| # package information - plotrix has _many_ useful utilities to enhance | # package information - plotrix has _many_ useful utilities to enhance | ||||||
| # plots or produce informative visualizations. | # plots or produce informative visualizations. | ||||||
|  |  | ||||||
| if (! require(plotrix, quietly=TRUE)) { | if (! requireNamespace("plotrix", quietly = TRUE)) { | ||||||
|   install.packages("plotrix") |   install.packages("plotrix") | ||||||
|   library(plotrix) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = plotrix)       # basic information | #  library(help = plotrix)       # basic information | ||||||
| @@ -395,7 +396,7 @@ if (! require(plotrix, quietly=TRUE)) { | |||||||
| #  data(package = "plotrix")     # available datasets | #  data(package = "plotrix")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| h <- multhist(list(rL1, rL2, rG1.2, rG1.5, rG1.9 ), | h <- plotrix::multhist(list(rL1, rL2, rG1.2, rG1.5, rG1.9 ), | ||||||
|                        breaks = myBreaks, |                        breaks = myBreaks, | ||||||
|                        col = myCols) |                        col = myCols) | ||||||
| legend("topright", | legend("topright", | ||||||
|   | |||||||
							
								
								
									
										151
									
								
								RPR-Biostrings.R
									
									
									
									
									
								
							
							
						
						
									
										151
									
								
								RPR-Biostrings.R
									
									
									
									
									
								
							| @@ -3,12 +3,15 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Biostrings unit. | #              R code accompanying the RPR-Biostrings unit. | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10  20 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout, | ||||||
|  | #                      use BiocManager:: not biocLite() | ||||||
| #           1.0    2017 Revisions | #           1.0    2017 Revisions | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| @@ -28,19 +31,19 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC> | #TOC> | ||||||
| #TOC>   Section  Title                                           Line | #TOC>   Section  Title                                           Line | ||||||
| #TOC> --------------------------------------------------------- | #TOC> --------------------------------------------------------------- | ||||||
| #TOC>   1        The Biostrings Package                      52 | #TOC>   1        The Biostrings Package                            55 | ||||||
| #TOC>   2        Getting Data into Biostrings Objects        85 | #TOC>   2        Getting Data into Biostrings Objects              86 | ||||||
| #TOC>   3        Working with Biostrings Objects            106 | #TOC>   3        Working with Biostrings Objects                  108 | ||||||
| #TOC>   3.1      Properties                                 109 | #TOC>   3.1        Properties                                     125 | ||||||
| #TOC>   3.2      Subsetting                                 146 | #TOC>   3.2        Subsetting                                     163 | ||||||
| #TOC>   3.3      Operators                                  158 | #TOC>   3.3        Operators                                      175 | ||||||
| #TOC>   3.4      Transformations                            165 | #TOC>   3.4        Transformations                                182 | ||||||
| #TOC>   4        Getting Data out of Biostrings Objects     172 | #TOC>   4        Getting Data out of Biostrings Objects           189 | ||||||
| #TOC>   5        More                                       181 | #TOC>   5        More                                             198 | ||||||
| #TOC>   5.1      Views                                      183 | #TOC>   5.1        Views                                          200 | ||||||
| #TOC>   5.2      Iranges                                    195 | #TOC>   5.2        Iranges                                        214 | ||||||
| #TOC>   5.3      StringSets                                 201 | #TOC>   5.3        StringSets                                     220 | ||||||
| #TOC> | #TOC> | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -54,14 +57,12 @@ | |||||||
|  |  | ||||||
| # First, we install and load the Biostrings package from bioconductor | # First, we install and load the Biostrings package from bioconductor | ||||||
|  |  | ||||||
| if (! require(Biostrings, quietly=TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   install.packages("BiocManager") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
| } | } | ||||||
|   biocLite("Biostrings") | if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||||
|   library(Biostrings) |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
|  |  | ||||||
| # Examine the package information: | # Examine the package information: | ||||||
| library(help = Biostrings)       # basic information | library(help = Biostrings)       # basic information | ||||||
| browseVignettes("Biostrings")    # available vignettes | browseVignettes("Biostrings")    # available vignettes | ||||||
| @@ -72,72 +73,88 @@ data(package = "Biostrings")     # available datasets | |||||||
| # of a "class" in R as a special kind of list), that can take on particular | # of a "class" in R as a special kind of list), that can take on particular | ||||||
| # flavours for RNA, DNA or amino acid sequence information. | # flavours for RNA, DNA or amino acid sequence information. | ||||||
|  |  | ||||||
| class(RNAString("AUG")) | class(Biostrings::RNAString("AUG")) | ||||||
| class(DNAString("ATG")) | class(Biostrings::DNAString("ATG")) | ||||||
| class(AAString("M")) | class(Biostrings::AAString("M")) | ||||||
|  |  | ||||||
| # An essential property of Biostrings objects is that they only allow letters | # An essential property of Biostrings objects is that they only allow letters | ||||||
| # from the applicable IUPAC alphabet: | # from the applicable IUPAC alphabet: | ||||||
| RNAString("AUG") | Biostrings::RNAString("AUG") | ||||||
| DNAString("AUG")  # Error! No "U" in IUPAC DNA codes | Biostrings::DNAString("AUG")  # Error! No "U" in IUPAC DNA codes | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    2  Getting Data into Biostrings Objects  ================================ | # =    2  Getting Data into Biostrings Objects  ================================ | ||||||
|  |  | ||||||
|  |  | ||||||
| # Example: read FASTA. Extract sequence. Convert to DNAString object. | # Example: read FASTA. Extract sequence. Convert to DNAString object. | ||||||
| x <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa") | ||||||
| x <- dbSanitizeSequence(x) | rawSeq <- dbSanitizeSequence(rawSeq) | ||||||
| myDNAseq <- DNAString(x)   # takes the nucleotide sequence and converts into a | biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence | ||||||
| # object of class DNAString |                                             # into an object of class DNAString | ||||||
|  |  | ||||||
| # Multi FASTA files can be read directly as an "XStringSet" ... | # Multi FASTA files can be read directly as an "XStringSet" ... | ||||||
| (myDNASet <- readDNAStringSet("./data/S288C_YDL056W_MBP1_coding.fsa")) | rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa" | ||||||
|  | (biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile)) | ||||||
|  |  | ||||||
| # ... and if you subset one sequence from the set, you get an XString object | # ... and if you subset one sequence from the set, you get an XString object | ||||||
| # back again. | # back again. | ||||||
| (Xseq <- myDNASet[[1]]) | (Xseq <- biosDNASet[[1]]) | ||||||
|  |  | ||||||
| myDNAseq == Xseq           # the comparison evaluates to TRUE ... | biosDNAseq == Xseq           # the comparison evaluates to TRUE ... | ||||||
| identical(myDNAseq, Xseq)  # ... and indeed the objects are deemed identical. | identical(biosDNAseq, Xseq)  # ... and indeed the objects are deemed identical. | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Working with Biostrings Objects  ===================================== | # =    3  Working with Biostrings Objects  ===================================== | ||||||
|  |  | ||||||
|  | # Biostrings is a highly engineered package that is tightly integrated into | ||||||
|  | # the Bioconductor world - unfortunately that brings with it a somewhat | ||||||
|  | # undesirable level of computational overhead and dependencies. Using the | ||||||
|  | # package as we normally do - i.e. calling required functions with their | ||||||
|  | # explicit package prefix - is therefore not advisable. There are generics | ||||||
|  | # that won't be properly dispatched. If you only need a small number of | ||||||
|  | # functions for a very specific context, you will probably get away with | ||||||
|  | # Biostrings::<function>() - but even in the demonstration code of this script | ||||||
|  | # not everything works out of the box. We'll therefore load the library, | ||||||
|  | # but we'll (redundantly) use the prefix anyway so as to emphasize where | ||||||
|  | # the functions come from. | ||||||
|  |  | ||||||
|  | library(Biostrings) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.1  Properties  ======================================================== | # ==   3.1  Properties  ======================================================== | ||||||
| str(myDNAseq) | str(rawSeq) | ||||||
| length(myDNAseq)  # This gives you the _number of nucleotides_! | str(biosDNAseq) | ||||||
| # By comparison ... |  | ||||||
| length(x)         # ... is 1: one string only. To get the number of |  | ||||||
| # characters in a string, you need nchar(). |  | ||||||
| nchar(x)          # However ... |  | ||||||
| nchar(myDNAseq)   # ... also works. |  | ||||||
|  |  | ||||||
| uniqueLetters(myDNAseq) | length(rawSeq)       # ... is 1: one string only. To get the number of | ||||||
|  |                      # characters in a string, you need nchar(). | ||||||
|  | length(biosDNAseq)   # but the length of a "Bstring" is the number of elements | ||||||
|  | nchar(rawSeq) | ||||||
|  | nchar(biosDNAseq)    # ... but nchar() works too. | ||||||
|  |  | ||||||
|  | (uL <- Biostrings::uniqueLetters(biosDNAseq)) | ||||||
|  |  | ||||||
| # Count frequencies - with strings, you would strsplit() into a character | # Count frequencies - with strings, you would strsplit() into a character | ||||||
| # vector and then use table(). Biostrings provides alphabetFrequency(): | # vector and then use table(). Biostrings provides alphabetFrequency(): | ||||||
| alphabetFrequency(myDNAseq) | Biostrings::alphabetFrequency(biosDNAseq) | ||||||
|  |  | ||||||
| # letterFrequency() works with a defined alphabet - such as what uniqueLetters() | # letterFrequency() works with a defined alphabet - such as what uniqueLetters() | ||||||
| # returns. | # returns. | ||||||
| letterFrequency(myDNAseq, uniqueLetters(myDNAseq)) | Biostrings::letterFrequency(biosDNAseq, uL) | ||||||
|  | sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) / | ||||||
|  |   length(biosDNAseq) # GC contents | ||||||
|  |  | ||||||
| sum(letterFrequency(myDNAseq, c("G", "C"))) / length(myDNAseq) # GC contents | Biostrings::dinucleotideFrequency(biosDNAseq) | ||||||
|  | barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5) | ||||||
|  |  | ||||||
| dinucleotideFrequency(myDNAseq) | (triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq)) | ||||||
| barplot(sort(dinucleotideFrequency(myDNAseq)), cex.names = 0.5) |  | ||||||
|  |  | ||||||
| (triNuc <- trinucleotideFrequency(myDNAseq)) |  | ||||||
| barplot(sort(triNuc), col="#4499EE33") | barplot(sort(triNuc), col="#4499EE33") | ||||||
| triNuc[triNuc == max(triNuc)] | triNuc[triNuc == max(triNuc)] | ||||||
| triNuc[triNuc == min(triNuc)] | triNuc[triNuc == min(triNuc)] | ||||||
| max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT | max(triNuc) / min(triNuc)  # AAA is more than 13 times as frequent as CGT | ||||||
|  |  | ||||||
| # compare to a shuffled sequence: | # compare to a shuffled sequence: | ||||||
| (triNuc <- trinucleotideFrequency(sample(myDNAseq))) | (triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq))) | ||||||
| barplot(sort(triNuc), col="#EEEE4433", add = TRUE) | barplot(sort(triNuc), col="#EEEE4433", add = TRUE) | ||||||
|  |  | ||||||
| # Interpret this plot. | # Interpret this plot. | ||||||
| @@ -146,34 +163,34 @@ barplot(sort(triNuc), col="#EEEE4433", add = TRUE) | |||||||
| # ==   3.2  Subsetting  ======================================================== | # ==   3.2  Subsetting  ======================================================== | ||||||
|  |  | ||||||
| # Subsetting any XString object works as expected: | # Subsetting any XString object works as expected: | ||||||
| myDNAseq[4:15] | biosDNAseq[4:15] | ||||||
|  |  | ||||||
| # ... well - maybe not expected, because x[4:15] would not work. | # ... well - maybe not expected, because rawSeq[4:15] would not work. | ||||||
|  |  | ||||||
| # Alternatively to the "[" operator, use the subseq() function - especially for | # Alternatively to the "[" operator, use the subseq() function - especially for | ||||||
| # long sequences. This is far more efficient. | # long sequences. This is far more efficient. | ||||||
| subseq(myDNAseq, start = 1, end = 30) | Biostrings::subseq(biosDNAseq, start = 1, end = 30) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.3  Operators  ========================================================= | # ==   3.3  Operators  ========================================================= | ||||||
|  |  | ||||||
| # RNAString() and DNAString() objects compare U and T as equals! | # RNAString() and DNAString() objects compare U and T as equals! | ||||||
| RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") == |   Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") == | ||||||
|   DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT") |   Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT") | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.4  Transformations  =================================================== | # ==   3.4  Transformations  =================================================== | ||||||
|  |  | ||||||
| myDNAseq[4:15] | biosDNAseq[4:15] | ||||||
| reverseComplement(myDNAseq[4:15]) | Biostrings::reverseComplement(biosDNAseq[4:15]) | ||||||
| translate(myDNAseq[4:15]) | Biostrings::translate(biosDNAseq[4:15]) | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    4  Getting Data out of Biostrings Objects  ============================== | # =    4  Getting Data out of Biostrings Objects  ============================== | ||||||
|  |  | ||||||
| # If you need a character object, use toString(): | # If you need a character object, use toString(): | ||||||
|  |  | ||||||
| toString(myDNAseq[4:15]) | Biostrings::toString(biosDNAseq[4:15]) | ||||||
|  |  | ||||||
| # save() and load() work as they do on all other R objects. | # save() and load() work as they do on all other R objects. | ||||||
|  |  | ||||||
| @@ -185,7 +202,9 @@ toString(myDNAseq[4:15]) | |||||||
| # Biostring "Views" are objects that store multiple substrings of one | # Biostring "Views" are objects that store multiple substrings of one | ||||||
| # Biostring object. | # Biostring object. | ||||||
|  |  | ||||||
| (myView <- Views(myDNAseq, start = c(1, 19, 37), end = c(15, 30, 45))) | (myView <- Biostrings::Views(biosDNAseq, | ||||||
|  |                              start = c(1, 19, 37), | ||||||
|  |                              end = c(15, 30, 45))) | ||||||
|  |  | ||||||
| # Views are convenient to store feature annotations | # Views are convenient to store feature annotations | ||||||
| names(myView) <- c("Feature-A", "Feature-B", "Feature-C") | names(myView) <- c("Feature-A", "Feature-B", "Feature-C") | ||||||
| @@ -202,20 +221,20 @@ cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView )) | |||||||
|  |  | ||||||
| # Biostring "StringSets" store multiple sequences. | # Biostring "StringSets" store multiple sequences. | ||||||
| # | # | ||||||
| ompA <- AAString("MKKTAIAIAVALAGFATVAQA") | ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA") | ||||||
| sample(ompA) # sample can work directly on a Biostring object to shuffle it | sample(ompA) # sample can work directly on a Biostring object to shuffle it | ||||||
|  |  | ||||||
| x[1] <- toString(ompA) | x <- Biostrings::toString(ompA) | ||||||
| for (i in 2:10) { | for (i in 2:10) { | ||||||
|   x[i] <- toString(sample(ompA)) |   x[i] <- Biostrings::toString(sample(ompA)) | ||||||
| } | } | ||||||
| shuffledPeptideSet <- AAStringSet(x) | shuffledPeptideSet <- Biostrings::AAStringSet(x) | ||||||
| names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep="")) | names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep="")) | ||||||
| shuffledPeptideSet | shuffledPeptideSet | ||||||
|  |  | ||||||
| length(shuffledPeptideSet) | length(shuffledPeptideSet) | ||||||
| width(shuffledPeptideSet) | Biostrings::width(shuffledPeptideSet) | ||||||
| alphabetFrequency(shuffledPeptideSet) | Biostrings::alphabetFrequency(shuffledPeptideSet) | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
							
								
								
									
										167
									
								
								RPR-GEO2R.R
									
									
									
									
									
								
							
							
						
						
									
										167
									
								
								RPR-GEO2R.R
									
									
									
									
									
								
							| @@ -3,12 +3,15 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR_GEO2R unit. | #              R code accompanying the RPR_GEO2R unit. | ||||||
| # | # | ||||||
| # Version:  1.1 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017 09  -  2018 01 | # Date:     2017 09  -  2019 01 | ||||||
| # Author:   Boris Steipe <boris.steipe@utoronto.ca> | # Author:   Boris Steipe <boris.steipe@utoronto.ca> | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.2    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout, | ||||||
|  | #                      use BiocManager:: not biocLite() | ||||||
| #           1.1    Add section on GPL annotations | #           1.1    Add section on GPL annotations | ||||||
| #           1.0    Updates for BCH441 2017 | #           1.0    Updates for BCH441 2017 | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| @@ -33,19 +36,19 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                                      Line | #TOC>   Section  Title                                                      Line | ||||||
| #TOC> -------------------------------------------------------------------- | #TOC> -------------------------------------------------------------------------- | ||||||
| #TOC>   1        Preparations                                           53 | #TOC>   1        Preparations                                                 56 | ||||||
| #TOC>   2        Loading a GEO Dataset                                  84 | #TOC>   2        Loading a GEO Dataset                                        82 | ||||||
| #TOC>   3        Column wise analysis - time points                    154 | #TOC>   3        Column wise analysis - time points                          152 | ||||||
| #TOC>   3.1      Task - Comparison of experiments                      160 | #TOC>   3.1        Task - Comparison of experiments                          158 | ||||||
| #TOC>   3.2      Grouped Samples                                       207 | #TOC>   3.2        Grouped Samples                                           205 | ||||||
| #TOC>   4        Row-wise Analysis: Expression Profiles                242 | #TOC>   4        Row-wise Analysis: Expression Profiles                      240 | ||||||
| #TOC>   4.1      Task - Read a table of features                       277 | #TOC>   4.1        Task - Read a table of features                           275 | ||||||
| #TOC>   4.2      Selected Expression profiles                          325 | #TOC>   4.2        Selected Expression profiles                              323 | ||||||
| #TOC>   5        Differential Expression                               366 | #TOC>   5        Differential Expression                                     364 | ||||||
| #TOC>   5.1      Final task: Gene descriptions                         490 | #TOC>   5.1        Final task: Gene descriptions                             504 | ||||||
| #TOC>   6        Improving on Discovery by Differential Expression     495 | #TOC>   6        Improving on Discovery by Differential Expression           510 | ||||||
| #TOC>   7        Annotation data                                       577 | #TOC>   7        Annotation data                                             594 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -55,12 +58,11 @@ | |||||||
| # To load and analyze GEO datasets we use a number of Bioconductor packages: | # To load and analyze GEO datasets we use a number of Bioconductor packages: | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! require(Biobase, quietly=TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   install.packages("BiocManager") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
| } | } | ||||||
|   biocLite("Biobase") | if (! requireNamespace("Biobase", quietly = TRUE)) { | ||||||
|   library(Biobase) |   BiocManager::install("Biobase") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Biobase)       # basic information | #  library(help = Biobase)       # basic information | ||||||
| @@ -68,12 +70,8 @@ if (! require(Biobase, quietly=TRUE)) { | |||||||
| #  data(package = "Biobase")     # available datasets | #  data(package = "Biobase")     # available datasets | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! require(GEOquery, quietly=TRUE)) { | if (! requireNamespace("GEOquery", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   BiocManager::install("GEOquery") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
|   } |  | ||||||
|   biocLite("GEOquery") |  | ||||||
|   library(GEOquery) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = GEOquery)       # basic information | #  library(help = GEOquery)       # basic information | ||||||
| @@ -94,7 +92,7 @@ if (! require(GEOquery, quietly=TRUE)) { | |||||||
| # I have experienced outages over several hours. If the command below does | # I have experienced outages over several hours. If the command below does | ||||||
| # not work for you, skip ahead to the fallback procedure. | # not work for you, skip ahead to the fallback procedure. | ||||||
|  |  | ||||||
| GSE3635 <- getGEO("GSE3635", GSEMatrix =TRUE, getGPL=FALSE) | GSE3635 <- GEOquery::getGEO("GSE3635", GSEMatrix =TRUE, getGPL=FALSE) | ||||||
| # Note: GEO2R scripts call the expression data set | # Note: GEO2R scripts call the expression data set | ||||||
| #       "gset" throughout ... in this script I give | #       "gset" throughout ... in this script I give | ||||||
| #       it the name "GSE3635" for clarity. | #       it the name "GSE3635" for clarity. | ||||||
| @@ -136,14 +134,14 @@ help("ExpressionSet-class") | |||||||
| GSE3635 | GSE3635 | ||||||
|  |  | ||||||
| # Access contents via methods: | # Access contents via methods: | ||||||
| featureNames(GSE3635)[1:20]   # Rows. What are these features? | Biobase::featureNames(GSE3635)[1:20]   # Rows. What are these features? | ||||||
| sampleNames(GSE3635)[1:10]    # Columns. What are these columns? | Biobase::sampleNames(GSE3635)[1:10]    # Columns. What are these columns? | ||||||
|  |  | ||||||
| # Access contents by subsetting: | # Access contents by subsetting: | ||||||
| ( tmp <- GSE3635[12:17, 1:6] ) | ( tmp <- GSE3635[12:17, 1:6] ) | ||||||
|  |  | ||||||
| # Access data | # Access data | ||||||
| exprs(tmp)   # exprs() gives us the actual expression values. | Biobase::exprs(tmp)   # exprs() gives us the actual expression values. | ||||||
|  |  | ||||||
|  |  | ||||||
| #TASK> What are the data: | #TASK> What are the data: | ||||||
| @@ -160,9 +158,9 @@ exprs(tmp)   # exprs() gives us the actual expression values. | |||||||
| # ==   3.1  Task - Comparison of experiments  ================================== | # ==   3.1  Task - Comparison of experiments  ================================== | ||||||
|  |  | ||||||
| # Get an overview of the distribution of data values in individual columns | # Get an overview of the distribution of data values in individual columns | ||||||
| summary(exprs(GSE3635)[ , 1]) | summary(Biobase::exprs(GSE3635)[ , 1]) | ||||||
| summary(exprs(GSE3635)[ , 4]) | summary(Biobase::exprs(GSE3635)[ , 4]) | ||||||
| summary(exprs(GSE3635)[ , 7]) | summary(Biobase::exprs(GSE3635)[ , 7]) | ||||||
|  |  | ||||||
| # as a boxplot | # as a boxplot | ||||||
| cyclicPalette <- colorRampPalette(c("#00AAFF", | cyclicPalette <- colorRampPalette(c("#00AAFF", | ||||||
| @@ -173,7 +171,7 @@ cyclicPalette <- colorRampPalette(c("#00AAFF", | |||||||
|                                     "#FFAA00", |                                     "#FFAA00", | ||||||
|                                     "#00AAFF")) |                                     "#00AAFF")) | ||||||
| tCols <- cyclicPalette(13) | tCols <- cyclicPalette(13) | ||||||
| boxplot(exprs(GSE3635), col = tCols) | boxplot(Biobase::exprs(GSE3635), col = tCols) | ||||||
|  |  | ||||||
|  |  | ||||||
| #TASK>     Study this boxplot. What's going on? Are these expression values? | #TASK>     Study this boxplot. What's going on? Are these expression values? | ||||||
| @@ -181,11 +179,11 @@ boxplot(exprs(GSE3635), col = tCols) | |||||||
|  |  | ||||||
|  |  | ||||||
| # Let's plot the distributions of values in a more fine-grained manner: | # Let's plot the distributions of values in a more fine-grained manner: | ||||||
| hT0  <- hist(exprs(GSE3635)[ ,  1], breaks = 100) | hT0  <- hist(Biobase::exprs(GSE3635)[ ,  1], breaks = 100) | ||||||
| hT3  <- hist(exprs(GSE3635)[ ,  4], breaks = 100) | hT3  <- hist(Biobase::exprs(GSE3635)[ ,  4], breaks = 100) | ||||||
| hT6  <- hist(exprs(GSE3635)[ ,  7], breaks = 100) | hT6  <- hist(Biobase::exprs(GSE3635)[ ,  7], breaks = 100) | ||||||
| hT9  <- hist(exprs(GSE3635)[ , 10], breaks = 100) | hT9  <- hist(Biobase::exprs(GSE3635)[ , 10], breaks = 100) | ||||||
| hT12 <- hist(exprs(GSE3635)[ , 13], breaks = 100) | hT12 <- hist(Biobase::exprs(GSE3635)[ , 13], breaks = 100) | ||||||
|  |  | ||||||
|  |  | ||||||
| plot(  hT0$mids,  hT0$counts,  type = "l", col =  tCols[1], xlim = c(-0.5, 0.5)) | plot(  hT0$mids,  hT0$counts,  type = "l", col =  tCols[1], xlim = c(-0.5, 0.5)) | ||||||
| @@ -218,7 +216,7 @@ for (i in 1:nchar(gsms)) { | |||||||
| sml <- paste("G", sml, sep="")  # set group names | sml <- paste("G", sml, sep="")  # set group names | ||||||
|  |  | ||||||
| # order samples by group | # order samples by group | ||||||
| ex <- exprs(GSE3635)[ , order(sml)] | ex <- Biobase::exprs(GSE3635)[ , order(sml)] | ||||||
| sml <- sml[order(sml)] | sml <- sml[order(sml)] | ||||||
| fl <- as.factor(sml) | fl <- as.factor(sml) | ||||||
| labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we | labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we | ||||||
| @@ -231,8 +229,8 @@ labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we | |||||||
| GEOcols <- c("#dfeaf4", "#f4dfdf", "#f2cb98", "#dcdaa5", | GEOcols <- c("#dfeaf4", "#f4dfdf", "#f2cb98", "#dcdaa5", | ||||||
|              "#dff4e4", "#f4dff4",  "#AABBCC") |              "#dff4e4", "#f4dff4",  "#AABBCC") | ||||||
| dev.new(width = 4 + dim(GSE3635)[[2]] / 5, height = 6) # plot into a new window | dev.new(width = 4 + dim(GSE3635)[[2]] / 5, height = 6) # plot into a new window | ||||||
| par(mar = c(2 + round(max(nchar(sampleNames(GSE3635))) / 2), 4, 2, 1)) | par(mar = c(2 + round(max(nchar(Biobase::sampleNames(GSE3635))) / 2), 4, 2, 1)) | ||||||
| title <- paste ("GSE3635", '/', annotation(GSE3635), | title <- paste ("GSE3635", '/', Biobase::annotation(GSE3635), | ||||||
|                 " grouped samples", sep ='') |                 " grouped samples", sep ='') | ||||||
| boxplot(ex, boxwex = 0.6, notch = TRUE, main = title, outline=FALSE, | boxplot(ex, boxwex = 0.6, notch = TRUE, main = title, outline=FALSE, | ||||||
|         las = 2, col = GEOcols[fl]) |         las = 2, col = GEOcols[fl]) | ||||||
| @@ -331,7 +329,7 @@ gName <- "MBP1" | |||||||
| (iFeature <- which(SGD_features$name == gName)) | (iFeature <- which(SGD_features$name == gName)) | ||||||
| (iExprs   <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature])) | (iExprs   <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature])) | ||||||
| plot(seq(0, 120, by = 10), | plot(seq(0, 120, by = 10), | ||||||
|      exprs(GSE3635)[iExprs, ], |      Biobase::exprs(GSE3635)[iExprs, ], | ||||||
|      main = paste("Expression profile for", gName), |      main = paste("Expression profile for", gName), | ||||||
|      xlab = "time (min)", |      xlab = "time (min)", | ||||||
|      ylab = "expression", |      ylab = "expression", | ||||||
| @@ -368,12 +366,8 @@ SGD_features$description[iFeature] | |||||||
| # GEO2R discovers the top differentially expressed genes by | # GEO2R discovers the top differentially expressed genes by | ||||||
| # using functions in the Bioconductor limma package. | # using functions in the Bioconductor limma package. | ||||||
|  |  | ||||||
| if (! require(limma, quietly=TRUE)) { | if (! requireNamespace("limma", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   BiocManager::install("limma") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
|   } |  | ||||||
|   biocLite("limma") |  | ||||||
|   library(limma) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = limma)       # basic information | #  library(help = limma)       # basic information | ||||||
| @@ -392,6 +386,20 @@ if (! require(limma, quietly=TRUE)) { | |||||||
| #     the groups | #     the groups | ||||||
| # 4. Format results. | # 4. Format results. | ||||||
|  |  | ||||||
|  | # Biobase is a highly engineered package that is tightly integrated into | ||||||
|  | # the Bioconductor world - unfortunately that brings with it a somewhat | ||||||
|  | # undesirable level of computational overhead and dependencies. Using the | ||||||
|  | # package as we normally do - i.e. calling required functions with their | ||||||
|  | # explicit package prefix is therefore not advisable. There are generics | ||||||
|  | # that won't be properly dispatched. If you only need a small number of | ||||||
|  | # functions for a very specific context, you will probably get away with | ||||||
|  | # Biobase::<function>() - but even in the demonstration code of this script | ||||||
|  | # not everything works out of the box. We'll therefore load the library, | ||||||
|  | # but we'll (redundantly) use the prefix anyway so as to emphasize where | ||||||
|  | # the functions come from. | ||||||
|  |  | ||||||
|  | library(Biobase) | ||||||
|  |  | ||||||
| # We are recapitulating the experiment in which we assigned the 0, 10, 60 and | # We are recapitulating the experiment in which we assigned the 0, 10, 60 and | ||||||
| # 70 minute samples to one group, the 30, 40, 90 and 100 minute samples to | # 70 minute samples to one group, the 30, 40, 90 and 100 minute samples to | ||||||
| # another group, and calculated differential expression values between these | # another group, and calculated differential expression values between these | ||||||
| @@ -415,15 +423,15 @@ myDesign | |||||||
|  |  | ||||||
| # Now we can calculate the fit of all rows to a linear model that depends | # Now we can calculate the fit of all rows to a linear model that depends | ||||||
| # on the two groups as specified in the design: | # on the two groups as specified in the design: | ||||||
| myFit <- lmFit(mySet, myDesign) | myFit <- limma::lmFit(mySet, myDesign) | ||||||
|  |  | ||||||
| # Next we calculate the contrasts, given the fit ... | # Next we calculate the contrasts, given the fit ... | ||||||
| myCont.matrix <- makeContrasts(A - B, levels = myDesign) | myCont.matrix <- limma::makeContrasts(A - B, levels = myDesign) | ||||||
| myFit2 <- contrasts.fit(myFit, myCont.matrix) | myFit2 <- limma::contrasts.fit(myFit, myCont.matrix) | ||||||
|  |  | ||||||
| # ... compute appropriate probabilities from a modified t-test | # ... compute appropriate probabilities from a modified t-test | ||||||
| #     (empirical Bayes) ... | #     (empirical Bayes) ... | ||||||
| myFit2 <- eBayes(myFit2, 0.01) | myFit2 <- limma::eBayes(myFit2, 0.01) | ||||||
|  |  | ||||||
| # ... add the gene names to the fit - object ... | # ... add the gene names to the fit - object ... | ||||||
| myFit2$genes <- featureNames(mySet) | myFit2$genes <- featureNames(mySet) | ||||||
| @@ -433,7 +441,10 @@ myFit2$genes <- featureNames(mySet) | |||||||
| #     gave us only the top 250 genes, but we might as well do 1000, just so we | #     gave us only the top 250 genes, but we might as well do 1000, just so we | ||||||
| #     can be reasonably sure that our genes of interest are included. | #     can be reasonably sure that our genes of interest are included. | ||||||
| N <- 1000 | N <- 1000 | ||||||
| myTable <- topTable(myFit2, adjust.method = "fdr", sort.by = "B", number = N) | myTable <- limma::topTable(myFit2, | ||||||
|  |                            adjust.method = "fdr", | ||||||
|  |                            sort.by = "B", | ||||||
|  |                            number = N) | ||||||
|  |  | ||||||
| str(myTable) | str(myTable) | ||||||
| # The gene names are now in the $ID column | # The gene names are now in the $ID column | ||||||
| @@ -461,7 +472,7 @@ abline(h = 0, col = "#00000055") | |||||||
|  |  | ||||||
| for (i in 1:10) { | for (i in 1:10) { | ||||||
|   thisID <- myTable$ID[i] |   thisID <- myTable$ID[i] | ||||||
|   points(seq(0, 120, by = 10), exprs(GSE3635)[thisID, ], type = "b") |   points(seq(0, 120, by = 10), Biobase::exprs(GSE3635)[thisID, ], type = "b") | ||||||
| } | } | ||||||
|  |  | ||||||
| # Our guess that we might discover interesting genes by selecting groups A and B | # Our guess that we might discover interesting genes by selecting groups A and B | ||||||
| @@ -480,7 +491,10 @@ for (i in 1:10) { | |||||||
| myControls <- c("Cdc14", "Mbp1", "Swi6", "Swi4", "Whi5", "Cln1", "Cln2", "Cln3") | myControls <- c("Cdc14", "Mbp1", "Swi6", "Swi4", "Whi5", "Cln1", "Cln2", "Cln3") | ||||||
| for (name in toupper(myControls)) { | for (name in toupper(myControls)) { | ||||||
|   thisID <- SGD_features$sysName[which(SGD_features$name == name)] |   thisID <- SGD_features$sysName[which(SGD_features$name == name)] | ||||||
|   points(seq(0, 120, by=10), exprs(GSE3635)[thisID, ], type="b", col="#AA0000") |   points(seq(0, 120, by=10), | ||||||
|  |          Biobase::exprs(GSE3635)[thisID, ], | ||||||
|  |          type="b", | ||||||
|  |          col="#AA0000") | ||||||
| } | } | ||||||
|  |  | ||||||
| # Indeed, the discovered gene profiles look much "cleaner" than the real cycle | # Indeed, the discovered gene profiles look much "cleaner" than the real cycle | ||||||
| @@ -504,7 +518,7 @@ for (name in toupper(myControls)) { | |||||||
| gName <- "CLN2" | gName <- "CLN2" | ||||||
| (iFeature <- which(SGD_features$name == gName)) | (iFeature <- which(SGD_features$name == gName)) | ||||||
| (iExprs   <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature])) | (iExprs   <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature])) | ||||||
| Cln2Profile <- exprs(GSE3635)[iExprs, ] | Cln2Profile <- Biobase::exprs(GSE3635)[iExprs, ] | ||||||
| plot(seq(0, 120, by = 10), | plot(seq(0, 120, by = 10), | ||||||
|      Cln2Profile, |      Cln2Profile, | ||||||
|      ylim = c(-1, 1), |      ylim = c(-1, 1), | ||||||
| @@ -519,16 +533,16 @@ abline(v = 60, col = "#00000055") | |||||||
| # Set up a vector of correlation values | # Set up a vector of correlation values | ||||||
|  |  | ||||||
|  |  | ||||||
| myCorrelations <- numeric(nrow(exprs(GSE3635))) | myCorrelations <- numeric(nrow(Biobase::exprs(GSE3635))) | ||||||
| names(myCorrelations) <- featureNames(GSE3635) | names(myCorrelations) <- Biobase::featureNames(GSE3635) | ||||||
| for (i in 1:length(myCorrelations)) { | for (i in 1:length(myCorrelations)) { | ||||||
|   myCorrelations[i] <- cor(Cln2Profile, exprs(GSE3635)[i, ]) |   myCorrelations[i] <- cor(Cln2Profile, Biobase::exprs(GSE3635)[i, ]) | ||||||
| } | } | ||||||
|  |  | ||||||
| myTopC <- order(myCorrelations, decreasing = TRUE)[1:10]  # top ten | myTopC <- order(myCorrelations, decreasing = TRUE)[1:10]  # top ten | ||||||
|  |  | ||||||
| # Number 1 | # Number 1 | ||||||
| (ID <- featureNames(GSE3635)[myTopC[1]]) | (ID <- Biobase::featureNames(GSE3635)[myTopC[1]]) | ||||||
|  |  | ||||||
| # Get information | # Get information | ||||||
| SGD_features[which(SGD_features$sysName == ID), ] | SGD_features[which(SGD_features$sysName == ID), ] | ||||||
| @@ -537,12 +551,13 @@ SGD_features[which(SGD_features$sysName == ID), ] | |||||||
|  |  | ||||||
| # Let's plot the rest | # Let's plot the rest | ||||||
| for (i in 2:length(myTopC)) { | for (i in 2:length(myTopC)) { | ||||||
|   ID <- featureNames(GSE3635)[myTopC[i]] |   ID <- Biobase::featureNames(GSE3635)[myTopC[i]] | ||||||
|   points(seq(0, 120, by = 10), |   points(seq(0, 120, by = 10), | ||||||
|        exprs(GSE3635)[ID, ], |          Biobase::exprs(GSE3635)[ID, ], | ||||||
|        type = "b", |        type = "b", | ||||||
|        col= "chartreuse") |        col= "chartreuse") | ||||||
|   print(SGD_features[which(SGD_features$sysName == ID), c("name", "description")]) |   print(SGD_features[which(SGD_features$sysName == ID), | ||||||
|  |                      c("name", "description")]) | ||||||
| } | } | ||||||
|  |  | ||||||
| # Note that all of these genes are highly correlated with a known cell cycle | # Note that all of these genes are highly correlated with a known cell cycle | ||||||
| @@ -554,12 +569,13 @@ for (i in 2:length(myTopC)) { | |||||||
| # And we haven't even looked at the anticorrelated genes yet... | # And we haven't even looked at the anticorrelated genes yet... | ||||||
| myBottomC <- order(myCorrelations, decreasing = FALSE)[1:10]  # bottom ten | myBottomC <- order(myCorrelations, decreasing = FALSE)[1:10]  # bottom ten | ||||||
| for (i in 1:length(myBottomC)) { | for (i in 1:length(myBottomC)) { | ||||||
|   ID <- featureNames(GSE3635)[myBottomC[i]] |   ID <- Biobase::featureNames(GSE3635)[myBottomC[i]] | ||||||
|   points(seq(0, 120, by = 10), |   points(seq(0, 120, by = 10), | ||||||
|          exprs(GSE3635)[ID, ], |          Biobase::exprs(GSE3635)[ID, ], | ||||||
|          type = "b", |          type = "b", | ||||||
|          col= "coral") |          col= "coral") | ||||||
|   print(SGD_features[which(SGD_features$sysName == ID), c("name", "description")]) |   print(SGD_features[which(SGD_features$sysName == ID), | ||||||
|  |                      c("name", "description")]) | ||||||
| } | } | ||||||
| # ... which are very interesting in their own right. | # ... which are very interesting in their own right. | ||||||
|  |  | ||||||
| @@ -583,7 +599,7 @@ for (i in 1:length(myBottomC)) { | |||||||
| # we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL | # we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL | ||||||
| # annotations were not loaded. We could use getGPL = TRUE instead ... | # annotations were not loaded. We could use getGPL = TRUE instead ... | ||||||
|  |  | ||||||
| GSE3635annot <- getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE) | GSE3635annot <- GEOquery::getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE) | ||||||
| GSE3635annot <- GSE3635annot[[1]] | GSE3635annot <- GSE3635annot[[1]] | ||||||
|  |  | ||||||
| # ... and the feature data is then available in the GSE3635@featureData@data | # ... and the feature data is then available in the GSE3635@featureData@data | ||||||
| @@ -597,13 +613,8 @@ GSE3635annot@featureData@data[ 1:20 , ] | |||||||
| myAnnot <- GSE3635annot@featureData@data[ , c("SPOT_ID", "Gene")] | myAnnot <- GSE3635annot@featureData@data[ , c("SPOT_ID", "Gene")] | ||||||
| str(myAnnot) | str(myAnnot) | ||||||
|  |  | ||||||
| # ... Note that this is a data frame, but - oy veh - the gene symbols are | # ... Note that this is a data frame and it is easy to find things we | ||||||
| #     factors. Really, we need to fix this! To convert a factor into a string, | #  might be looking for ... | ||||||
| #     we need to cast it to character. |  | ||||||
|  |  | ||||||
| myAnnot$Gene <- as.character(myAnnot$Gene) |  | ||||||
|  |  | ||||||
| # ... whereupon we can find things we might be looking for ... |  | ||||||
| myAnnot[which(myAnnot$Gene == "MBP1"), ] | myAnnot[which(myAnnot$Gene == "MBP1"), ] | ||||||
|  |  | ||||||
| # ... or identify rows that might give us trouble, such as probes that | # ... or identify rows that might give us trouble, such as probes that | ||||||
| @@ -614,13 +625,11 @@ myAnnot[which(myAnnot$Gene == "MBP1"), ] | |||||||
| GSE3635@annotation   # "GPL1914" | GSE3635@annotation   # "GPL1914" | ||||||
|  |  | ||||||
| # ... and downloaded it directly from NCBI: | # ... and downloaded it directly from NCBI: | ||||||
| GPL1914 <- getGEO("GPL1914") | GPL1914 <- GEOquery::getGEO("GPL1914") | ||||||
| str(GPL1914) | str(GPL1914) | ||||||
|  |  | ||||||
| # ... from which we can get the data - which is however NOT necessarily | # ... from which we can get the data - which is however NOT necessarily | ||||||
| # matched to the rows of our expression dataset. Note that here too: the | # matched to the rows of our expression dataset. | ||||||
| # majority of data elements are factors and will likely have to be converted |  | ||||||
| # before use. |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -3,12 +3,15 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Genetic_code_optimality unit. | #              R code accompanying the RPR-Genetic_code_optimality unit. | ||||||
| # | # | ||||||
| # Version:  1.1 | # Version:  1.2 | ||||||
| # | # | ||||||
| # Date:     2017  10  -  2019  01 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.2    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout, | ||||||
|  | #                      use BiocManager:: not biocLite() | ||||||
| #           1.1      Update set.seed() usage | #           1.1      Update set.seed() usage | ||||||
| #           1.0.1    Fixed two bugs discovered by Suan Chin Yeo. | #           1.0.1    Fixed two bugs discovered by Suan Chin Yeo. | ||||||
| #           1.0      New material. | #           1.0      New material. | ||||||
| @@ -30,16 +33,16 @@ | |||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                          Line | #TOC>   Section  Title                                          Line | ||||||
| #TOC> -------------------------------------------------------------- | #TOC> -------------------------------------------------------------- | ||||||
| #TOC>   1        Designing a computational experiment             54 | #TOC>   1        Designing a computational experiment             57 | ||||||
| #TOC>   2        Setting up the tools                             70 | #TOC>   2        Setting up the tools                             73 | ||||||
| #TOC>   2.1        Natural and alternative genetic codes          73 | #TOC>   2.1        Natural and alternative genetic codes          76 | ||||||
| #TOC>   2.2        Effect of mutations                           132 | #TOC>   2.2        Effect of mutations                           134 | ||||||
| #TOC>   2.2.1          reverse-translate                         143 | #TOC>   2.2.1          reverse-translate                         145 | ||||||
| #TOC>   2.2.2          Randomly mutate                           168 | #TOC>   2.2.2          Randomly mutate                           170 | ||||||
| #TOC>   2.2.3          Forward- translate                        193 | #TOC>   2.2.3          Forward- translate                        195 | ||||||
| #TOC>   2.2.4          measure effect                            211 | #TOC>   2.2.4          measure effect                            213 | ||||||
| #TOC>   3        Run the experiment                              258 | #TOC>   3        Run the experiment                              260 | ||||||
| #TOC>   4        Task solutions                                  351 | #TOC>   4        Task solutions                                  356 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -73,12 +76,11 @@ | |||||||
| # ==   2.1  Natural and alternative genetic codes  ============================= | # ==   2.1  Natural and alternative genetic codes  ============================= | ||||||
|  |  | ||||||
| # Load genetic code tables from the Biostrings package | # Load genetic code tables from the Biostrings package | ||||||
| if (! require(Biostrings, quietly=TRUE)) { | if (! requireNamespace("BiocManager", quietly = TRUE)) { | ||||||
|   if (! exists("biocLite")) { |   install.packages("BiocManager") | ||||||
|     source("https://bioconductor.org/biocLite.R") |  | ||||||
| } | } | ||||||
|   biocLite("Biostrings") | if (! requireNamespace("Biostrings", quietly = TRUE)) { | ||||||
|   library(Biostrings) |   BiocManager::install("Biostrings") | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = Biostrings)       # basic information | #  library(help = Biostrings)       # basic information | ||||||
| @@ -257,52 +259,55 @@ evalMut <- function(nat, mut) { | |||||||
|  |  | ||||||
| # =    3  Run the experiment  ================================================== | # =    3  Run the experiment  ================================================== | ||||||
|  |  | ||||||
|  | # Fetch the standard Genetic code from Biostrings:: | ||||||
|  |  | ||||||
|  | stdCode <- Biostrings::GENETIC_CODE | ||||||
|  |  | ||||||
| # Fetch the nucleotide sequence for MBP1: | # Fetch the nucleotide sequence for MBP1: | ||||||
|  |  | ||||||
| myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1] | myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1] | ||||||
| myDNA <- paste0(myDNA, collapse = "") | myDNA <- paste0(myDNA, collapse = "") | ||||||
| myDNA <- as.character(codons(DNAString(myDNA))) | myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA))) | ||||||
| myDNA <- myDNA[-length(myDNA)]  # drop the stop codon | myDNA <- myDNA[-length(myDNA)]  # drop the stop codon | ||||||
|  |  | ||||||
| myAA <- traFor(myDNA, GENETIC_CODE) | myAA <- traFor(myDNA, stdCode) | ||||||
|  |  | ||||||
| # Mutate and evaluate | # Mutate and evaluate | ||||||
| set.seed(112358) | set.seed(112358) | ||||||
| x <- randMut(myDNA) | x <- randMut(myDNA) | ||||||
| set.seed(NULL) | set.seed(NULL) | ||||||
| x <- traFor(x, GENETIC_CODE) | x <- traFor(x, stdCode) | ||||||
| evalMut(myAA, x)  # 166.4 | evalMut(myAA, x)  # 166.4 | ||||||
|  |  | ||||||
| # Try this 200 times, and see how the values are distributed. | # Try this 200 times, and see how the values are distributed. | ||||||
| N <- 200 | N <- 200 | ||||||
| valUGC <- numeric(N) | valSTDC <- numeric(N) | ||||||
|  |  | ||||||
| set.seed(112358)                   # set RNG seed for repeatable randomness | set.seed(112358)                   # set RNG seed for repeatable randomness | ||||||
| for (i in 1:N) { | for (i in 1:N) { | ||||||
|   x <- randMut(myDNA)              # mutate |   x <- randMut(myDNA)              # mutate | ||||||
|   x <- traFor(x, GENETIC_CODE)     # translate |   x <- traFor(x, stdCode)     # translate | ||||||
|   valUGC[i] <- evalMut(myAA, x)    # evaluate |   valSTDC[i] <- evalMut(myAA, x)    # evaluate | ||||||
| } | } | ||||||
| set.seed(NULL)                     # reset the RNG | set.seed(NULL)                     # reset the RNG | ||||||
|  |  | ||||||
| hist(valUGC, | hist(valSTDC, | ||||||
|      breaks = 15, |      breaks = 15, | ||||||
|      col = "palegoldenrod", |      col = "palegoldenrod", | ||||||
|      xlim = c(0, 400), |      xlim = c(0, 400), | ||||||
|      ylim = c(0, N/4), |      ylim = c(0, N/4), | ||||||
|      main = "Universal vs. Synthetic Genetic Code", |      main = "Standard vs. Synthetic Genetic Code", | ||||||
|      xlab = "Mutation penalty") |      xlab = "Mutation penalty") | ||||||
|  |  | ||||||
| # This looks like a normal distribution. Let's assume the effect of mutations | # This looks like a normal distribution. Let's assume the effect of mutations | ||||||
| # under the universal genetic code is the mean of this distribution: | # under the standard genetic code is the mean of this distribution: | ||||||
| effectUGC <- mean(valUGC)  # 178.1 | effectSTDC <- mean(valSTDC)  # 178.1 | ||||||
|  |  | ||||||
| # Now we can look at the effects of alternate genetic codes: | # Now we can look at the effects of alternate genetic codes: | ||||||
|  |  | ||||||
| set.seed(112358) | set.seed(112358) | ||||||
| # choose a new code | # choose a new code | ||||||
| GC <- randomGC(GENETIC_CODE) | GC <- randomGC(stdCode) | ||||||
| set.seed(NULL) | set.seed(NULL) | ||||||
|  |  | ||||||
| # reverse translate hypothetical sequence according to the new code | # reverse translate hypothetical sequence according to the new code | ||||||
| @@ -321,7 +326,7 @@ valXGC <- numeric(N) | |||||||
|  |  | ||||||
| set.seed(1414214)                # set RNG seed for repeatable randomness | set.seed(1414214)                # set RNG seed for repeatable randomness | ||||||
| for (i in 1:N) { | for (i in 1:N) { | ||||||
|   GC <- randomGC(GENETIC_CODE)   # Choose code |   GC <- randomGC(stdCode)   # Choose code | ||||||
|   x <- traRev(myAA, GC)          # reverse translate |   x <- traRev(myAA, GC)          # reverse translate | ||||||
|   x <- randMut(x)                # mutate |   x <- randMut(x)                # mutate | ||||||
|   x <- traFor(x, GC)             # translate |   x <- traFor(x, GC)             # translate | ||||||
| @@ -355,7 +360,7 @@ valSGC <- numeric(N) | |||||||
|  |  | ||||||
| set.seed(2718282)                # set RNG seed for repeatable randomness | set.seed(2718282)                # set RNG seed for repeatable randomness | ||||||
| for (i in 1:N) { | for (i in 1:N) { | ||||||
|   GC <- swappedGC(GENETIC_CODE)  # Choose code |   GC <- swappedGC(stdCode)  # Choose code | ||||||
|   x <- traRev(myAA, GC)          # reverse translate |   x <- traRev(myAA, GC)          # reverse translate | ||||||
|   x <- randMut(x)                # mutate |   x <- randMut(x)                # mutate | ||||||
|   x <- traFor(x, GC)             # translate |   x <- traFor(x, GC)             # translate | ||||||
|   | |||||||
| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||||
| # | # | ||||||
| # Version:  1.0.1 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10  -  2018  12 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout, | ||||||
| #           1.0.1  Updates for slightly changed interfaces | #           1.0.1  Updates for slightly changed interfaces | ||||||
| #           1.0    First ABC units version | #           1.0    First ABC units version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| @@ -29,10 +31,10 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                                 Line | #TOC>   Section  Title                                                 Line | ||||||
| #TOC> --------------------------------------------------------------- | #TOC> --------------------------------------------------------------------- | ||||||
| #TOC>   1        Constructing a POST command from a Web query      44 | #TOC>   1        Constructing a POST command from a Web query            42 | ||||||
| #TOC>   1.1      Task - fetchPrositeFeatures() function           145 | #TOC>   1.1        Task - fetchPrositeFeatures() function               142 | ||||||
| #TOC>   2        Task solutions                                   153 | #TOC>   2        Task solutions                                         150 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -40,9 +42,8 @@ | |||||||
| # =    1  Constructing a POST command from a Web query  ======================== | # =    1  Constructing a POST command from a Web query  ======================== | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! require(httr, quietly=TRUE)) { | if (! requireNamespace("httr", quietly = TRUE)) { | ||||||
|   install.packages("httr") |   install.packages("httr") | ||||||
|   library(httr) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = httr)       # basic information | #  library(help = httr)       # basic information | ||||||
| @@ -60,7 +61,7 @@ UniProtID <- "P39678" | |||||||
|  |  | ||||||
| URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi" | URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi" | ||||||
|  |  | ||||||
| response <- POST(URL, | response <- httr::POST(URL, | ||||||
|                        body = list(meta = "opt1", |                        body = list(meta = "opt1", | ||||||
|                                    meta1_protein = "opt1", |                                    meta1_protein = "opt1", | ||||||
|                                    seq = UniProtID, |                                    seq = UniProtID, | ||||||
| @@ -70,14 +71,14 @@ response <- POST(URL, | |||||||
| # Send off this request, and you should have a response in a few | # Send off this request, and you should have a response in a few | ||||||
| # seconds. Let's check the status first: | # seconds. Let's check the status first: | ||||||
|  |  | ||||||
| status_code(response)  # If this is not 200, something went wrong and it | httr::status_code(response)  # If this is not 200, something went wrong and it | ||||||
|                              # makes no sense to continue. If this persists, ask |                              # makes no sense to continue. If this persists, ask | ||||||
|                              # on the mailing list what to do. |                              # on the mailing list what to do. | ||||||
|  |  | ||||||
|  |  | ||||||
| # The text contents of the response is available with the | # The text contents of the response is available with the | ||||||
| # content() function: | # content() function: | ||||||
| content(response, "text") | httr::content(response, "text") | ||||||
|  |  | ||||||
| # ... should show you the same as the page contents that | # ... should show you the same as the page contents that | ||||||
| # you have seen in the browser. To get the data we need, we now have to extract | # you have seen in the browser. To get the data we need, we now have to extract | ||||||
| @@ -86,7 +87,7 @@ content(response, "text") | |||||||
| # individual lines, since each of our data elements is on | # individual lines, since each of our data elements is on | ||||||
| # its own line. We simply split on the "\\n" newline character. | # its own line. We simply split on the "\\n" newline character. | ||||||
|  |  | ||||||
| lines <- unlist(strsplit(content(response, "text"), "\\n")) | lines <- unlist(strsplit(httr::content(response, "text"), "\\n")) | ||||||
| head(lines) | head(lines) | ||||||
|  |  | ||||||
| # Now we define a query pattern for the lines we want: | # Now we define a query pattern for the lines we want: | ||||||
|   | |||||||
							
								
								
									
										76
									
								
								RPR-SX-PDB.R
									
									
									
									
									
								
							
							
						
						
									
										76
									
								
								RPR-SX-PDB.R
									
									
									
									
									
								
							| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-SX-PDB unit. | #              R code accompanying the RPR-SX-PDB unit. | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10  19 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0    First live version, completely refactored 2016 code | #           1.0    First live version, completely refactored 2016 code | ||||||
| #                     with remarkable speed gains. Added section on x, y, z | #                     with remarkable speed gains. Added section on x, y, z | ||||||
| #                     (density) plots. | #                     (density) plots. | ||||||
| @@ -30,19 +32,19 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                      Line | #TOC>   Section  Title                                      Line | ||||||
| #TOC> ---------------------------------------------------- | #TOC> ---------------------------------------------------------- | ||||||
| #TOC>   1        Introduction to the bio3D package      63 | #TOC>   1        Introduction to the bio3D package            61 | ||||||
| #TOC>   2        A Ramachandran plot                   151 | #TOC>   2        A Ramachandran plot                         152 | ||||||
| #TOC>   3        Density plots                         227 | #TOC>   3        Density plots                               228 | ||||||
| #TOC>   3.1      Density-based colours                 241 | #TOC>   3.1        Density-based colours                     242 | ||||||
| #TOC>   3.2      Plotting with smoothScatter()         260 | #TOC>   3.2        Plotting with smoothScatter()             261 | ||||||
| #TOC>   3.3      Plotting hexbins                      275 | #TOC>   3.3        Plotting hexbins                          276 | ||||||
| #TOC>   3.4      Plotting density contours             299 | #TOC>   3.4        Plotting density contours                 304 | ||||||
| #TOC>   3.4.1    ... as overlay on a colored grid      333 | #TOC>   3.4.1          ... as overlay on a colored grid      337 | ||||||
| #TOC>   3.4.2    ... as filled countour                350 | #TOC>   3.4.2          ... as filled countour                354 | ||||||
| #TOC>   3.4.3    ... as a perspective plot             381 | #TOC>   3.4.3          ... as a perspective plot             385 | ||||||
| #TOC>   4        cis-peptide bonds                     399 | #TOC>   4        cis-peptide bonds                           403 | ||||||
| #TOC>   5        H-bond lengths                        414 | #TOC>   5        H-bond lengths                              418 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -59,9 +61,8 @@ | |||||||
| # =    1  Introduction to the bio3D package  =================================== | # =    1  Introduction to the bio3D package  =================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! require(bio3d, quietly=TRUE)) { | if (! requireNamespace("bio3d", quietly = TRUE)) { | ||||||
|   install.packages("bio3d") |   install.packages("bio3d") | ||||||
|   library(bio3d) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = bio3d)       # basic information | #  library(help = bio3d)       # basic information | ||||||
| @@ -89,8 +90,8 @@ file.show("./data/1BM8.pdb") | |||||||
| #          Are all atoms of the N-terminal residue present? | #          Are all atoms of the N-terminal residue present? | ||||||
| #          Are all atoms of the C-terminal residue present? | #          Are all atoms of the C-terminal residue present? | ||||||
|  |  | ||||||
| apses <- read.pdb("1bm8")  # load a molecule directly from the PDB via the | apses <- bio3d::read.pdb("1bm8")  # load a molecule directly from the PDB via | ||||||
|                            # Internet. (This is not your local version.) |                                   # the Internet. | ||||||
|  |  | ||||||
| # check what we have: | # check what we have: | ||||||
| apses | apses | ||||||
| @@ -121,10 +122,11 @@ apses$atom[apses$atom[,"resno"] == i, ] | |||||||
| apses$seqres[1:10]  # the "A"s here identify chain "A" | apses$seqres[1:10]  # the "A"s here identify chain "A" | ||||||
|  |  | ||||||
| # Can we convert this to one letter code? | # Can we convert this to one letter code? | ||||||
| aa321(apses$seqres[1:10]) | bio3d::aa321(apses$seqres[1:10]) | ||||||
|  |  | ||||||
| # Let's get the implicit sequence: | # Let's get the implicit sequence: | ||||||
| aa321((apses$atom$resid[apses$calpha])[1:10])  # Do you understand this code? | bio3d::aa321((apses$atom$resid[apses$calpha])[1:10]) | ||||||
|  | # Do you understand this code? | ||||||
|  |  | ||||||
| # Do explicit and implicit sequence have the same length? | # Do explicit and implicit sequence have the same length? | ||||||
| length(apses$seqres) | length(apses$seqres) | ||||||
| @@ -140,7 +142,10 @@ apses$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")] | |||||||
| # The introduction to bio3d tutorial at | # The introduction to bio3d tutorial at | ||||||
| #   http://thegrantlab.org/bio3d/tutorials/structure-analysis | #   http://thegrantlab.org/bio3d/tutorials/structure-analysis | ||||||
| # has the following example: | # has the following example: | ||||||
| plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor") | bio3d::plot.bio3d(apses$atom$b[apses$calpha], | ||||||
|  |                   sse=apses, | ||||||
|  |                   typ="l", | ||||||
|  |                   ylab="B-factor") | ||||||
|  |  | ||||||
| # Good for now. Let's do some real work. | # Good for now. Let's do some real work. | ||||||
|  |  | ||||||
| @@ -149,7 +154,7 @@ plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor") | |||||||
| # Calculate a Ramachandran plot for the structure. The torsion.pdb() function | # Calculate a Ramachandran plot for the structure. The torsion.pdb() function | ||||||
| # calculates all dihedral angles for backbone and sidechain bonds, NA where | # calculates all dihedral angles for backbone and sidechain bonds, NA where | ||||||
| # the bond does not exist in an amino acid. | # the bond does not exist in an amino acid. | ||||||
| tor <- torsion.pdb(apses) | tor <- bio3d::torsion.pdb(apses) | ||||||
| plot(tor$phi, tor$psi, | plot(tor$phi, tor$psi, | ||||||
|      xlim = c(-180, 180), ylim = c(-180, 180), |      xlim = c(-180, 180), ylim = c(-180, 180), | ||||||
|      main = "Ramachandran plot for 1BM8", |      main = "Ramachandran plot for 1BM8", | ||||||
| @@ -164,7 +169,7 @@ abline(v = 0, lwd = 0.5, col = "#00000044") | |||||||
| # color the points for glycine residues differently. First, we | # color the points for glycine residues differently. First, we | ||||||
| # get a vector of glycine residue indices in the structure: | # get a vector of glycine residue indices in the structure: | ||||||
|  |  | ||||||
| mySeq <- pdbseq(apses) | mySeq <- bio3d::pdbseq(apses) | ||||||
|  |  | ||||||
| # Explore the result object and extract the indices of GLY residues. | # Explore the result object and extract the indices of GLY residues. | ||||||
|               mySeq == "G" |               mySeq == "G" | ||||||
| @@ -210,7 +215,7 @@ for (i in 1:nrow(dat)) { | |||||||
|   points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000") |   points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000") | ||||||
|   text(dat$phi[i], |   text(dat$phi[i], | ||||||
|        dat$psi[i], |        dat$psi[i], | ||||||
|        labels = sprintf("%s%d", aa321(dat$resid[i]), dat$resno[i]), |        labels = sprintf("%s%d", bio3d::aa321(dat$resid[i]), dat$resno[i]), | ||||||
|        pos = 4, |        pos = 4, | ||||||
|        offset = 0.4, |        offset = 0.4, | ||||||
|        cex = 0.7) |        cex = 0.7) | ||||||
| @@ -272,9 +277,8 @@ abline(v = 0, lwd = 0.5, col = "#00000044") | |||||||
|  |  | ||||||
| # If we wish to approximate values in a histogram-like fashion, we can use | # If we wish to approximate values in a histogram-like fashion, we can use | ||||||
| # hexbin() | # hexbin() | ||||||
| if (! require(hexbin, quietly=TRUE)) { | if (! requireNamespace("hexbin", quietly = TRUE)) { | ||||||
|   install.packages("hexbin") |   install.packages("hexbin") | ||||||
|   library(hexbin) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = hexbin)       # basic information | #  library(help = hexbin)       # basic information | ||||||
| @@ -285,12 +289,17 @@ if (! require(hexbin, quietly=TRUE)) { | |||||||
| myColorRamp <- colorRampPalette(c("#EEEEEE", | myColorRamp <- colorRampPalette(c("#EEEEEE", | ||||||
|                                   "#3399CC", |                                   "#3399CC", | ||||||
|                                   "#2266DD")) |                                   "#2266DD")) | ||||||
| plot(hexbin(phi, psi, xbins = 10), | hexbin::gplot.hexbin(hexbin::hexbin(phi, psi, xbins = 10), | ||||||
|                      colramp = myColorRamp, |                      colramp = myColorRamp, | ||||||
|                      main = "phi-psi Density Bins for 1BM8", |                      main = "phi-psi Density Bins for 1BM8", | ||||||
|                      xlab = expression(phi), |                      xlab = expression(phi), | ||||||
|                      ylab = expression(psi)) |                      ylab = expression(psi)) | ||||||
|  |  | ||||||
|  | # Note: | ||||||
|  | # Had we loaded hexbin with library(hexbin), the plot function would have | ||||||
|  | # been dispatched by the plot() generic, and we could simply have written: | ||||||
|  | #   plot(hexbin(phi, psi, xbins = 10), ... | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   3.4  Plotting density contours  ========================================= | # ==   3.4  Plotting density contours  ========================================= | ||||||
|  |  | ||||||
| @@ -305,17 +314,16 @@ plot(hexbin(phi, psi, xbins = 10), | |||||||
| # distributions. But for 2D data like our phi-psi plots, we need a function from | # distributions. But for 2D data like our phi-psi plots, we need a function from | ||||||
| # the MASS package: kde2d() | # the MASS package: kde2d() | ||||||
|  |  | ||||||
| if (! require(MASS, quietly=TRUE)) { | if (! requireNamespace("MASS", quietly = TRUE)) { | ||||||
|   install.packages("MASS") |   install.packages("MASS") | ||||||
|   library(MASS) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = MASS)       # basic information | #  library(help = MASS)       # basic information | ||||||
| #  browseVignettes("MASS")    # available vignettes | #  browseVignettes("MASS")    # available vignettes | ||||||
| #  data(package = "MASS")     # available datasets | #  data(package = "MASS")     # available datasets | ||||||
|  |  | ||||||
| ?kde2d | ?MASS::kde2d | ||||||
| dPhiPsi <-kde2d(phi, psi, | dPhiPsi <-MASS::kde2d(phi, psi, | ||||||
|                       n = 60, |                       n = 60, | ||||||
|                       lims = c(-180, 180, -180, 180)) |                       lims = c(-180, 180, -180, 180)) | ||||||
|  |  | ||||||
| @@ -469,7 +477,7 @@ ssSelect <- function(PDB, myChain = "A", ssType, myElety) { | |||||||
|  |  | ||||||
|   # get id's from PDB |   # get id's from PDB | ||||||
|  |  | ||||||
|   x <- atom.select(PDB, |   x <- bio3d::atom.select(PDB, | ||||||
|                           string = "protein", |                           string = "protein", | ||||||
|                           type = "ATOM", |                           type = "ATOM", | ||||||
|                           chain = myChain, |                           chain = myChain, | ||||||
| @@ -506,7 +514,7 @@ pairDist <- function(PDB, a, b) { | |||||||
|  |  | ||||||
|   A <- PDB$atom[a, c("x", "y", "z")] |   A <- PDB$atom[a, c("x", "y", "z")] | ||||||
|   B <- PDB$atom[b, c("x", "y", "z")] |   B <- PDB$atom[b, c("x", "y", "z")] | ||||||
|   dMat <- dist.xyz(A, B) |   dMat <- bio3d::dist.xyz(A, B) | ||||||
|  |  | ||||||
|   } |   } | ||||||
|   return(dMat) |   return(dMat) | ||||||
|   | |||||||
| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10  05 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0    First ABC units version | #           1.0    First ABC units version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| @@ -28,10 +30,10 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                      Line | #TOC>   Section  Title                                      Line | ||||||
| #TOC> ---------------------------------------------------- | #TOC> ---------------------------------------------------------- | ||||||
| #TOC>   1        UniProt files via GET                  44 | #TOC>   1        UniProt files via GET                        41 | ||||||
| #TOC>   1.1      Task - fetchUniProtSeq() function     107 | #TOC>   1.1        Task - fetchUniProtSeq() function         103 | ||||||
| #TOC>   2        Task solutions                        114 | #TOC>   2        Task solutions                              110 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -48,9 +50,8 @@ | |||||||
| # a Web browser. Since this is a short and simple request, the GET verb is the | # a Web browser. Since this is a short and simple request, the GET verb is the | ||||||
| # right tool: | # right tool: | ||||||
|  |  | ||||||
| if (! require(httr, quietly=TRUE)) { | if (! requireNamespace("httr", quietly = TRUE)) { | ||||||
|   install.packages("httr") |   install.packages("httr") | ||||||
|   library(httr) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = httr)       # basic information | #  library(help = httr)       # basic information | ||||||
| @@ -69,7 +70,7 @@ UniProtID <- "P39678" | |||||||
| (URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)) | (URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)) | ||||||
|  |  | ||||||
| # the GET() function from httr will get the data. | # the GET() function from httr will get the data. | ||||||
| response <- GET(URL) | response <- httr::GET(URL) | ||||||
|  |  | ||||||
| str(response) # the response object is a bit complex ... | str(response) # the response object is a bit complex ... | ||||||
| as.character(response) # ... but it is easy to pull out the data. | as.character(response) # ... but it is easy to pull out the data. | ||||||
| @@ -82,21 +83,21 @@ dbSanitizeSequence(x) | |||||||
| # Simple. | # Simple. | ||||||
| # But what happens if there is an error, e.g. the uniprot ID does not exist? | # But what happens if there is an error, e.g. the uniprot ID does not exist? | ||||||
|  |  | ||||||
| response <- GET("http://www.uniprot.org/uniprot/X000000.fasta") | response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta") | ||||||
| as.character(response) | as.character(response) | ||||||
| # this is a large HTML page that tells us the URL was not found. So we need to | # this is a large HTML page that tells us the URL was not found. So we need to | ||||||
| # check for errors.  The Right way to do this is to evaluate the status code that | # check for errors.  The Right Way to do this is to evaluate the status code that | ||||||
| # every Web server returns for every transaction. | # every Web server returns for every transaction. | ||||||
| # | # | ||||||
| status_code(response)  # 404 == Page Not Found | httr::status_code(response)  # 404 == Page Not Found | ||||||
|  |  | ||||||
| # There are many possible codes, but the only code we will be happy with | # There are many possible codes, but the only code we will be happy with | ||||||
| # is 200 - OK. | # is 200 - OK. | ||||||
| # (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes ) | # (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes ) | ||||||
|  |  | ||||||
| URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID) | URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID) | ||||||
| response <- GET(URL) | response <- httr::GET(URL) | ||||||
| status_code(response) | httr::status_code(response) | ||||||
|  |  | ||||||
|  |  | ||||||
| # ==   1.1  Task - fetchUniProtSeq() function  ================================= | # ==   1.1  Task - fetchUniProtSeq() function  ================================= | ||||||
|   | |||||||
| @@ -3,12 +3,13 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Unit_testing unit. | #              R code accompanying the RPR-Unit_testing unit. | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10  15 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace() | ||||||
| #           1.0    New code | #           1.0    New code | ||||||
| # | # | ||||||
| # | # | ||||||
| @@ -27,10 +28,11 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                             Line | #TOC>   Section  Title                             Line | ||||||
| #TOC> ------------------------------------------- | #TOC> ------------------------------------------------- | ||||||
| #TOC>   1        Unit Tests with testthat      43 | #TOC>   1        Unit Tests with testthat            40 | ||||||
| #TOC>   2        Organizing your tests        156 | #TOC>   2        Organizing your tests              159 | ||||||
| #TOC>   3        Task solutions               181 | #TOC>   2.1        Testing scripts                  183 | ||||||
|  | #TOC>   3        Task solutions                     198 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -39,15 +41,22 @@ | |||||||
|  |  | ||||||
| # The testthat package supports writing and executing unit tests in many ways. | # The testthat package supports writing and executing unit tests in many ways. | ||||||
|  |  | ||||||
| if (! require(testthat, quietly=TRUE)) { | if (! requireNamespace("testthat", quietly = TRUE)) { | ||||||
|   install.packages("testthat") |   install.packages("testthat") | ||||||
|   library(testthat) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = testthat)       # basic information | #  library(help = testthat)       # basic information | ||||||
| #  browseVignettes("testthat")    # available vignettes | #  browseVignettes("testthat")    # available vignettes | ||||||
| #  data(package = "testthat")     # available datasets | #  data(package = "testthat")     # available datasets | ||||||
|  |  | ||||||
|  | # testthat is one of those packages that we either use A LOT in a script, | ||||||
|  | # or not at all. Therefore it's more reasonable to depart from our usual | ||||||
|  | # <package>::<function>() idiom, and load the entire library. In fact, if | ||||||
|  | # we author packages, it is common practice to load testthat in the part | ||||||
|  | # of the package that automates testing. | ||||||
|  |  | ||||||
|  | library(testthat) | ||||||
|  |  | ||||||
| # An atomic test consists of an expectation about the behaviour of a function or | # An atomic test consists of an expectation about the behaviour of a function or | ||||||
| # the existence of an object. testthat provides a number of useful expectations: | # the existence of an object. testthat provides a number of useful expectations: | ||||||
|  |  | ||||||
| @@ -171,6 +180,20 @@ test_file("./tests/test_biCode.R") | |||||||
| # .. or execute all the test files in the directory: | # .. or execute all the test files in the directory: | ||||||
| test_dir("./tests") | test_dir("./tests") | ||||||
|  |  | ||||||
|  | # ==   2.1  Testing scripts  =================================================== | ||||||
|  |  | ||||||
|  | # Scripts need special consideration since we do not necessarily source() them | ||||||
|  | # entirely. Therefore automated testing is not reasonable. What you can do | ||||||
|  | # instead is to place a conditional block at the end of your script, that | ||||||
|  | # never gets executed - then you can manually execute the code in the block | ||||||
|  | # whenever you wish to test your functions. For example: | ||||||
|  |  | ||||||
|  | if (FALSE) { | ||||||
|  |   # ... your tests go here | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # =    3  Task solutions  ====================================================== | # =    3  Task solutions  ====================================================== | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,12 +3,14 @@ | |||||||
| # Purpose:  A Bioinformatics Course: | # Purpose:  A Bioinformatics Course: | ||||||
| #              R code accompanying the RPR-Scripting_data_downloads unit. | #              R code accompanying the RPR-Scripting_data_downloads unit. | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10  05 | # Date:     2017  10  05 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0    First ABC units version | #           1.0    First ABC units version | ||||||
| #           0.1    First code copied from 2016 material. | #           0.1    First code copied from 2016 material. | ||||||
| # | # | ||||||
| @@ -28,10 +30,10 @@ | |||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                       Line | #TOC>   Section  Title                                       Line | ||||||
| #TOC> ----------------------------------------------------- | #TOC> ----------------------------------------------------------- | ||||||
| #TOC>   1        Working with NCBI eUtils                44 | #TOC>   1        Working with NCBI eUtils                      41 | ||||||
| #TOC>   1.1      Task - fetchNCBItaxData() function     162 | #TOC>   1.1        Task - fetchNCBItaxData() function         144 | ||||||
| #TOC>   2        Task solutions                         169 | #TOC>   2        Task solutions                               151 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
| @@ -40,26 +42,11 @@ | |||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # To begin, we load some libraries with functions | # To begin, we load the xml2 package that contains functions | ||||||
| # we need... | # we need to receive and parse html data. NCBI's eUtils send information in | ||||||
|  | # XML format so we need to be able to parse XML. | ||||||
| # ... the package httr, which sends and receives information via the http | if (! requireNamespace("xml2", quietly=TRUE)) { | ||||||
| # protocol, just like a Web browser. |  | ||||||
| if (! require(httr, quietly=TRUE)) { |  | ||||||
|   install.packages("httr") |  | ||||||
|   library(httr) |  | ||||||
| } |  | ||||||
| # Package information: |  | ||||||
| #  library(help = httr)       # basic information |  | ||||||
| #  browseVignettes("httr")    # available vignettes |  | ||||||
| #  data(package = "httr")     # available datasets |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # ...plus the package xml2: NCBI's eUtils send information in XML format so we |  | ||||||
| # need to be able to parse XML. |  | ||||||
| if (! require(xml2, quietly=TRUE)) { |  | ||||||
|   install.packages("xml2") |   install.packages("xml2") | ||||||
|   library(xml2) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = xml2)       # basic information | #  library(help = xml2)       # basic information | ||||||
| @@ -91,24 +78,23 @@ URL <- paste(eUtilsBase, | |||||||
| # what the response should look like. | # what the response should look like. | ||||||
| URL | URL | ||||||
|  |  | ||||||
| # To fetch a response in R, we use the function GET() from the httr package | # To fetch a response in R, we use the function read_xml() | ||||||
| # with our URL as its argument. | # with our URL as its argument. | ||||||
| myXML <- read_xml(URL) | (myXML <- xml2::read_xml(URL)) | ||||||
| myXML |  | ||||||
|  |  | ||||||
| # This is XML. We can take the response apart into | # This is XML. We can take the response apart into | ||||||
| # its individual components with the as_list() function. | # its individual components with the as_list() function. | ||||||
|  |  | ||||||
| as_list(myXML) | xml2::as_list(myXML) | ||||||
|  |  | ||||||
| # Note how the XML "tree" is represented as a list of | # Note how the XML "tree" is represented as a list of | ||||||
| # lists of lists ... | # lists of lists ... | ||||||
| # If we know exactly what element we are looking for, | # If we know exactly what element we are looking for, | ||||||
| # we can extract it from this structure: | # we can extract it from this structure: | ||||||
| as_list(myXML)[["IdList"]][["Id"]][[1]] | xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]] | ||||||
|  |  | ||||||
| # But this is not very robust, it would break with the | # But this is not very robust, it would break with the | ||||||
| # slightest change that the NCBI makes to their response | # slightest change that the NCBI makes to their data format - | ||||||
| # and the NCBI changes things A LOT! | # and the NCBI changes things A LOT! | ||||||
|  |  | ||||||
| # Somewhat more robust is to specify the type of element | # Somewhat more robust is to specify the type of element | ||||||
| @@ -116,11 +102,12 @@ as_list(myXML)[["IdList"]][["Id"]][[1]] | |||||||
| # element, and use the XPath XML parsing language to | # element, and use the XPath XML parsing language to | ||||||
| # retrieve it. | # retrieve it. | ||||||
|  |  | ||||||
| xml_find_all(myXML, "//Id") # returns a "node set" | xml2::xml_find_all(myXML, "//Id") # returns a "node set" | ||||||
|  |  | ||||||
| xml_text(xml_find_all(myXML, "//Id")) # returns the contents of the node set | xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents | ||||||
|  |                                                   # of the node set | ||||||
|  |  | ||||||
| # We will need doing this a lot, so we write a function | # We will need to do this more than once, so we write a function | ||||||
| # for it... | # for it... | ||||||
| node2text <- function(doc, tag) { | node2text <- function(doc, tag) { | ||||||
|   # an extractor function for the contents of elements |   # an extractor function for the contents of elements | ||||||
| @@ -128,8 +115,8 @@ node2text <- function(doc, tag) { | |||||||
|   # Contents of all matching elements is returned in |   # Contents of all matching elements is returned in | ||||||
|   # a vector of strings. |   # a vector of strings. | ||||||
|   path <- paste0("//", tag) |   path <- paste0("//", tag) | ||||||
|   nodes <- xml_find_all(doc, path) |   nodes <- xml2::xml_find_all(doc, path) | ||||||
|   return(xml_text(nodes)) |   return(xml2::xml_text(nodes)) | ||||||
| } | } | ||||||
|  |  | ||||||
| # using node2text() ... | # using node2text() ... | ||||||
| @@ -145,7 +132,7 @@ URL <- paste0(eUtilsBase, | |||||||
|               "&id=", |               "&id=", | ||||||
|               GID, |               GID, | ||||||
|               "&version=2.0") |               "&version=2.0") | ||||||
| (myXML <- read_xml(URL)) | (myXML <- xml2::read_xml(URL)) | ||||||
|  |  | ||||||
| (taxID <- node2text(myXML, "TaxId")) | (taxID <- node2text(myXML, "TaxId")) | ||||||
| (organism <- node2text(myXML, "Organism")) | (organism <- node2text(myXML, "Organism")) | ||||||
|   | |||||||
| @@ -22,17 +22,18 @@ setwd("<your/project/directory>") | |||||||
|  |  | ||||||
|  |  | ||||||
| # ====  PACKAGES  ============================================================== | # ====  PACKAGES  ============================================================== | ||||||
| # Load all required packages. | # Check that required packages have been installed. Install if needed. | ||||||
|  |  | ||||||
| if (! require(seqinr, quietly=TRUE)) { | if (! requireNamespace("seqinr", quietly=TRUE)) { | ||||||
|   install.packages("seqinr") |   install.packages("seqinr") | ||||||
|   library(seqinr) |  | ||||||
| } | } | ||||||
| # Package information: | # Package information: | ||||||
| #  library(help = seqinr)       # basic information | #  library(help = seqinr)       # basic information | ||||||
| #  browseVignettes("seqinr")    # available vignettes | #  browseVignettes("seqinr")    # available vignettes | ||||||
| #  data(package = "seqinr")     # available datasets | #  data(package = "seqinr")     # available datasets | ||||||
|  |  | ||||||
|  | # Note: use package functions with the :: operator - e.g. | ||||||
|  | # seqinr::aaa("K") | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -9,21 +9,18 @@ | |||||||
| # ====== PACKAGES ============================================================== | # ====== PACKAGES ============================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! require(jsonlite, quietly = TRUE)) { | if (! requireNamespace("jsonlite", quietly = TRUE)) { | ||||||
|   install.packages("jsonlite") |   install.packages("jsonlite") | ||||||
|   library(jsonlite) |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! require(httr, quietly = TRUE)) { | if (! requireNamespace("httr", quietly = TRUE)) { | ||||||
|   install.packages("httr") |   install.packages("httr") | ||||||
|   library(httr) |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! require(xml2, quietly = TRUE)) { | if (! requireNamespace("xml2", quietly = TRUE)) { | ||||||
|   install.packages("xml2") |   install.packages("xml2") | ||||||
|   library(xml2) |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -226,10 +223,10 @@ dbFetchUniProtSeq <- function(ID) { | |||||||
|  |  | ||||||
|   URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID) |   URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID) | ||||||
|  |  | ||||||
|   response <- GET(URL) |   response <- httr::GET(URL) | ||||||
|  |  | ||||||
|   mySeq <- character() |   mySeq <- character() | ||||||
|   if (status_code(response) == 200) { |   if (httr::status_code(response) == 200) { | ||||||
|     x <- as.character(response) |     x <- as.character(response) | ||||||
|     x <- strsplit(x, "\n") |     x <- strsplit(x, "\n") | ||||||
|     mySeq <- dbSanitizeSequence(x) |     mySeq <- dbSanitizeSequence(x) | ||||||
| @@ -253,7 +250,7 @@ dbFetchPrositeFeatures <- function(ID) { | |||||||
|  |  | ||||||
|   URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi" |   URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi" | ||||||
|  |  | ||||||
|   response <- POST(URL, |   response <- httr::POST(URL, | ||||||
|                          body = list(meta = "opt1", |                          body = list(meta = "opt1", | ||||||
|                                      meta1_protein = "opt1", |                                      meta1_protein = "opt1", | ||||||
|                                      seq = ID, |                                      seq = ID, | ||||||
| @@ -261,9 +258,9 @@ dbFetchPrositeFeatures <- function(ID) { | |||||||
|                                      output = "tabular")) |                                      output = "tabular")) | ||||||
|  |  | ||||||
|   myFeatures <- data.frame() |   myFeatures <- data.frame() | ||||||
|   if (status_code(response) == 200) { |   if (httr::status_code(response) == 200) { | ||||||
|  |  | ||||||
|     lines <- unlist(strsplit(content(response, "text"), "\\n")) |     lines <- unlist(strsplit(httr::content(response, "text"), "\\n")) | ||||||
|  |  | ||||||
|     patt <- sprintf("\\|%s\\|", UniProtID) |     patt <- sprintf("\\|%s\\|", UniProtID) | ||||||
|     lines <- lines[grep(patt, lines)] |     lines <- lines[grep(patt, lines)] | ||||||
| @@ -289,8 +286,8 @@ node2text <- function(doc, tag) { | |||||||
|   # Contents of all matching elements is returned in |   # Contents of all matching elements is returned in | ||||||
|   # a vector of strings. |   # a vector of strings. | ||||||
|   path <- paste0("//", tag) |   path <- paste0("//", tag) | ||||||
|   nodes <- xml_find_all(doc, path) |   nodes <- xml2::xml_find_all(doc, path) | ||||||
|   return(xml_text(nodes)) |   return(xml2::xml_text(nodes)) | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -309,7 +306,7 @@ dbFetchNCBItaxData <- function(ID) { | |||||||
|                "db=protein", |                "db=protein", | ||||||
|                "&term=", ID, |                "&term=", ID, | ||||||
|                sep="") |                sep="") | ||||||
|   myXML <- read_xml(URL) |   myXML <- xml2::read_xml(URL) | ||||||
|   GID <- node2text(myXML, "Id") |   GID <- node2text(myXML, "Id") | ||||||
|  |  | ||||||
|   URL <- paste0(eUtilsBase, |   URL <- paste0(eUtilsBase, | ||||||
| @@ -318,7 +315,7 @@ dbFetchNCBItaxData <- function(ID) { | |||||||
|                 "&id=", |                 "&id=", | ||||||
|                 GID, |                 GID, | ||||||
|                 "&version=2.0") |                 "&version=2.0") | ||||||
|   myXML <- read_xml(URL) |   myXML <- xml2::read_xml(URL) | ||||||
|  |  | ||||||
|   x <- as.integer(node2text(myXML, "TaxId")) |   x <- as.integer(node2text(myXML, "TaxId")) | ||||||
|   y <- node2text(myXML, "Organism") |   y <- node2text(myXML, "Organism") | ||||||
| @@ -346,14 +343,14 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { | |||||||
|   #    for IDs that are not mapped. |   #    for IDs that are not mapped. | ||||||
|  |  | ||||||
|   URL <- "https://www.uniprot.org/uploadlists/" |   URL <- "https://www.uniprot.org/uploadlists/" | ||||||
|   response <- POST(URL, |   response <- httr::POST(URL, | ||||||
|                          body = list(from = mapFrom, |                          body = list(from = mapFrom, | ||||||
|                                      to = mapTo, |                                      to = mapTo, | ||||||
|                                      format = "tab", |                                      format = "tab", | ||||||
|                                      query = s)) |                                      query = s)) | ||||||
|  |  | ||||||
|   if (status_code(response) == 200) { # 200: oK |   if (httr::status_code(response) == 200) { # 200: oK | ||||||
|     myMap <- read.delim(file = textConnection(content(response)), |     myMap <- read.delim(file = textConnection(httr::content(response)), | ||||||
|                         sep = "\t", |                         sep = "\t", | ||||||
|                         stringsAsFactors = FALSE) |                         stringsAsFactors = FALSE) | ||||||
|     myMap <- myMap[ , c(1,3)] |     myMap <- myMap[ , c(1,3)] | ||||||
| @@ -362,12 +359,23 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") { | |||||||
|     myMap <- data.frame() |     myMap <- data.frame() | ||||||
|     warning(paste("No uniProt ID mapping returned:", |     warning(paste("No uniProt ID mapping returned:", | ||||||
|                   "server sent status", |                   "server sent status", | ||||||
|                   status_code(response))) |                   httr::status_code(response))) | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   return(myMap) |   return(myMap) | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # ====== TESTS ================================================================= | ||||||
|  |  | ||||||
|  | if (FALSE) { | ||||||
|  |   if (! requireNamespace("testthat", quietly = TRUE)) { | ||||||
|  |     install.packages("testthat") | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   # ToDo: test everything here | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # [END] | # [END] | ||||||
|   | |||||||
| @@ -3,14 +3,16 @@ | |||||||
| # Purpose:  Create a list of genome sequenced fungi with protein annotations and | # Purpose:  Create a list of genome sequenced fungi with protein annotations and | ||||||
| #               Mbp1 homologues. | #               Mbp1 homologues. | ||||||
| # | # | ||||||
| # Version: 1.1.2 | # Version: 1.2 | ||||||
| # | # | ||||||
| # Date:    2016 09 - 2017 09 | # Date:    2016  09  -  2019  01 | ||||||
| # Author:  Boris Steipe (boris.steipe@utoronto.ca) | # Author:  Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # V 1.1.2  Moved BLAST.R to ./scripts directory | # Versions | ||||||
| # V 1.1    Update 2017 | #          1.2    Change from require() to requireNamespace() | ||||||
| # V 1.0    First code 2016 | #          1.1.2  Moved BLAST.R to ./scripts directory | ||||||
|  | #          1.1    Update 2017 | ||||||
|  | #          1.0    First code 2016 | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| # | # | ||||||
| @@ -31,27 +33,25 @@ | |||||||
| # the respective intermediate results. | # the respective intermediate results. | ||||||
| # | # | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC>   Section  Title                                     Line | #TOC>   Section  Title                                     Line | ||||||
| #TOC> --------------------------------------------------- | #TOC> --------------------------------------------------------- | ||||||
| #TOC>   1        The strategy                          54 | #TOC>   1        The strategy                                55 | ||||||
| #TOC>   2        GOLD species                          66 | #TOC>   2        GOLD species                                67 | ||||||
| #TOC>   2.1      Initialize                            71 | #TOC>   2.1        Initialize                                72 | ||||||
| #TOC>   2.2      Import                                77 | #TOC>   2.2        Import                                    79 | ||||||
| #TOC>   2.3      Unique species                       129 | #TOC>   2.3        Unique species                           131 | ||||||
| #TOC>   3        BLAST species                        171 | #TOC>   3        BLAST species                              173 | ||||||
| #TOC>   3.1      find homologous proteins             178 | #TOC>   3.1        find homologous proteins                 180 | ||||||
| #TOC>   3.2      Identify species in "hits"           202 | #TOC>   3.2        Identify species in "hits"               204 | ||||||
| #TOC>   4        Intersect GOLD and BLAST species     247 | #TOC>   4        Intersect GOLD and BLAST species           249 | ||||||
| #TOC>   5        Cleanup and finish                   265 | #TOC>   5        Cleanup and finish                         267 | ||||||
| #TOC>  | #TOC>  | ||||||
| #TOC> ========================================================================== | #TOC> ========================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| #TOC> |  | ||||||
| #TOC> |  | ||||||
|  |  | ||||||
| # =    1  The strategy  ======================================================== | # =    1  The strategy  ======================================================== | ||||||
|  |  | ||||||
| # This script will create a list of "MYSPE" species and save it in an R object | # This script will create a list of "MYSPE" species and save it in an R object | ||||||
| @@ -70,9 +70,10 @@ | |||||||
| #  (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI. | #  (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI. | ||||||
|  |  | ||||||
| # ==   2.1  Initialize  ======================================================== | # ==   2.1  Initialize  ======================================================== | ||||||
| if (! require(httr)) { # httr provides interfaces to Webservers on the Internet |  | ||||||
|  | # httr provides interfaces to Webservers on the Internet | ||||||
|  | if (! requireNamespace("httr", quietly = TRUE)) { | ||||||
|   install.packages("httr") |   install.packages("httr") | ||||||
|     library(httr) |  | ||||||
| } | } | ||||||
|  |  | ||||||
| # ==   2.2  Import  ============================================================ | # ==   2.2  Import  ============================================================ | ||||||
|   | |||||||
| @@ -15,12 +15,14 @@ | |||||||
| #        Data: (3 mb) https://downloads.yeastgenome.org/curation/literature/go_slim_mapping.tab | #        Data: (3 mb) https://downloads.yeastgenome.org/curation/literature/go_slim_mapping.tab | ||||||
| # | # | ||||||
| # | # | ||||||
| # Version:  1.0 | # Version:  1.1 | ||||||
| # | # | ||||||
| # Date:     2017  10  06 | # Date:     2017  10  -  2019  01 | ||||||
| # Author:   Boris Steipe (boris.steipe@utoronto.ca) | # Author:   Boris Steipe (boris.steipe@utoronto.ca) | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #           1.1    Change from require() to requireNamespace(), | ||||||
|  | #                      use <package>::<function>() idiom throughout | ||||||
| #           1.0    First code copied from 2016 material. | #           1.0    First code copied from 2016 material. | ||||||
| # | # | ||||||
| # TODO: | # TODO: | ||||||
| @@ -28,16 +30,16 @@ | |||||||
| # | # | ||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
| if (! require(readr, quietly = TRUE)) { | if (! requireNamespace("readr", quietly = TRUE)) { | ||||||
|   install.packages("readr") |   install.packages("readr") | ||||||
|   library(readr) |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| # STRING functional interaction data | # STRING functional interaction data | ||||||
|  |  | ||||||
| # Read STRING Data (needs to be downloaded from database, see URL in Notes) | # Read STRING Data (needs to be downloaded from database, see URL in Notes) | ||||||
| STR <- read_delim("./data/4932.protein.links.full.v10.5.txt", delim = " ") | STR <- readr::read_delim("./data/4932.protein.links.full.v10.5.txt", | ||||||
|  |                          delim = " ") | ||||||
|  |  | ||||||
| # Subset only IDs and combined_score column | # Subset only IDs and combined_score column | ||||||
| STR <- STR[ , c("protein1", "protein2", "combined_score")] | STR <- STR[ , c("protein1", "protein2", "combined_score")] | ||||||
| @@ -61,7 +63,7 @@ myIntxGenes <- unique(c(STR$protein1, STR$protein2))  # yeast systematic gene | |||||||
| # | # | ||||||
| # Read GOSlim data  (needs to be downloaded from database, see URL in Notes) | # Read GOSlim data  (needs to be downloaded from database, see URL in Notes) | ||||||
|  |  | ||||||
| Gsl <- read_tsv("./data/go_slim_mapping.tab", | Gsl <- readr::read_tsv("./data/go_slim_mapping.tab", | ||||||
|                        col_names = c("ID", |                        col_names = c("ID", | ||||||
|                                      "name", |                                      "name", | ||||||
|                                      "SGDId", |                                      "SGDId", | ||||||
|   | |||||||
| @@ -7,11 +7,13 @@ | |||||||
| #          https://ncbi.github.io/blast-cloud/dev/api.html | #          https://ncbi.github.io/blast-cloud/dev/api.html | ||||||
| # | # | ||||||
| # | # | ||||||
| # Version: 3 | # Version: 3.1 | ||||||
| # Date:    2016 09 - 2017 11 | # Date:    2016 09 - 2019 01 | ||||||
| # Author:  Boris Steipe | # Author:  Boris Steipe | ||||||
| # | # | ||||||
| # Versions: | # Versions: | ||||||
|  | #    3.1   Change from require() to requireNamespace(), | ||||||
|  | #          use <package>::<function>() idiom throughout | ||||||
| #    3     parsing logic had not been fully implemented; Fixed. | #    3     parsing logic had not been fully implemented; Fixed. | ||||||
| #    2.1   bugfix in BLAST(), bug was blanking non-split deflines; | #    2.1   bugfix in BLAST(), bug was blanking non-split deflines; | ||||||
| #          refactored parseBLASTalignment() to handle lists with multiple hits. | #          refactored parseBLASTalignment() to handle lists with multiple hits. | ||||||
| @@ -29,9 +31,8 @@ | |||||||
| # ============================================================================== | # ============================================================================== | ||||||
|  |  | ||||||
|  |  | ||||||
| if (! require(httr, quietly = TRUE)) { | if (! requireNamespace("httr", quietly = TRUE)) { | ||||||
|   install.packages("httr") |   install.packages("httr") | ||||||
|   library(httr) |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -92,13 +93,13 @@ BLAST <- function(q, | |||||||
|       } |       } | ||||||
|  |  | ||||||
|       # send it off ... |       # send it off ... | ||||||
|       response <- GET(results$query) |       response <- httr::GET(results$query) | ||||||
|       if (http_status(response)$category != "Success" ) { |       if (httr::http_status(response)$category != "Success" ) { | ||||||
|         stop(sprintf("PANIC: Can't send query. BLAST server status error: %s", |         stop(sprintf("PANIC: Can't send query. BLAST server status error: %s", | ||||||
|                      http_status(response)$message)) |                      httr::http_status(response)$message)) | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       txt <- content(response, "text", encoding = "UTF-8") |       txt <- httr::content(response, "text", encoding = "UTF-8") | ||||||
|  |  | ||||||
|       patt <- "RID = (\\w+)" # match the request id |       patt <- "RID = (\\w+)" # match the request id | ||||||
|       results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2] |       results$rid  <- regmatches(txt, regexec(patt,  txt))[[1]][2] | ||||||
| @@ -127,13 +128,13 @@ BLAST <- function(q, | |||||||
|  |  | ||||||
|     while (TRUE) { |     while (TRUE) { | ||||||
|       # Check whether the result is ready |       # Check whether the result is ready | ||||||
|       response <- GET(checkStatus) |       response <- httr::GET(checkStatus) | ||||||
|       if (http_status(response)$category != "Success" ) { |       if (httr::http_status(response)$category != "Success" ) { | ||||||
|         stop(sprintf("PANIC: Can't check status. BLAST server status error: %s", |         stop(sprintf("PANIC: Can't check status. BLAST server status error: %s", | ||||||
|                      http_status(response)$message)) |                      httr::http_status(response)$message)) | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       txt <- content(response, "text", encoding = "UTF-8") |       txt <- httr::content(response, "text", encoding = "UTF-8") | ||||||
|  |  | ||||||
|       if (length(grep("Status=WAITING",  txt)) > 0) { |       if (length(grep("Status=WAITING",  txt)) > 0) { | ||||||
|         myTimeout <- myTimeout - EXTRAWAIT |         myTimeout <- myTimeout - EXTRAWAIT | ||||||
| @@ -184,13 +185,13 @@ BLAST <- function(q, | |||||||
|                       "&FORMAT_TYPE=Text", |                       "&FORMAT_TYPE=Text", | ||||||
|                       sep = "") |                       sep = "") | ||||||
|  |  | ||||||
|     response <- GET(retrieve) |     response <- httr::GET(retrieve) | ||||||
|     if (http_status(response)$category != "Success" ) { |     if (httr::http_status(response)$category != "Success" ) { | ||||||
|       stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s", |       stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s", | ||||||
|                    http_status(response)$message)) |                    httr::http_status(response)$message)) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     txt <- content(response, "text", encoding = "UTF-8") |     txt <- httr::content(response, "text", encoding = "UTF-8") | ||||||
|  |  | ||||||
|     # txt contains the whole set of results. Process: |     # txt contains the whole set of results. Process: | ||||||
|  |  | ||||||
| @@ -357,7 +358,7 @@ parseBLASTalignment <- function(hit) { | |||||||
| # ==== TESTS =================================================================== | # ==== TESTS =================================================================== | ||||||
|  |  | ||||||
| # define query: | # define query: | ||||||
| # q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain sequence | # q   <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain | ||||||
| #              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ", | #              "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ", | ||||||
| #              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP", | #              "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP", | ||||||
| #              sep="") | #              sep="") | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user