Use requireNamespace(), <package>::<function>() idiom,
Biocmanager:: - not biocLite()
This commit is contained in:
parent
8cdbb9f4cc
commit
cffd803c23
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-BLAST unit.
|
# R code accompanying the BIN-ALI-BLAST unit.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2017 10 23
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.2 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 1.1 Fixed parsing logic.
|
# 1.1 Fixed parsing logic.
|
||||||
# 1.0 First live version 2017.
|
# 1.0 First live version 2017.
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
@ -29,29 +31,15 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------
|
#TOC> ---------------------------------------------------
|
||||||
#TOC> 1 Preparations 41
|
#TOC> 1 Defining the APSES domain 42
|
||||||
#TOC> 2 Defining the APSES domain 54
|
#TOC> 2 Executing the BLAST search 64
|
||||||
#TOC> 3 Executing the BLAST search 76
|
#TOC> 3 Analysing results 86
|
||||||
#TOC> 4 Analysing results 98
|
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
# = 1 Preparations ========================================================
|
# = 1 Defining the APSES domain ===========================================
|
||||||
|
|
||||||
if (!require(Biostrings, quietly=TRUE)) {
|
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
biocLite("Biostrings")
|
|
||||||
library(Biostrings)
|
|
||||||
}
|
|
||||||
# Package information:
|
|
||||||
# library(help = Biostrings) # basic information
|
|
||||||
# browseVignettes("Biostrings") # available vignettes
|
|
||||||
# data(package = "Biostrings") # available datasets
|
|
||||||
|
|
||||||
|
|
||||||
# = 2 Defining the APSES domain ===========================================
|
|
||||||
|
|
||||||
# Load your protein database
|
# Load your protein database
|
||||||
source("makeProteinDB.R")
|
source("makeProteinDB.R")
|
||||||
@ -73,7 +61,7 @@ source("makeProteinDB.R")
|
|||||||
# BLAST search.
|
# BLAST search.
|
||||||
|
|
||||||
|
|
||||||
# = 3 Executing the BLAST search ==========================================
|
# = 2 Executing the BLAST search ==========================================
|
||||||
|
|
||||||
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
|
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
|
||||||
# through its Web API, and to parse results. Have a look at the script, then
|
# through its Web API, and to parse results. Have a look at the script, then
|
||||||
@ -91,11 +79,11 @@ BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence
|
|||||||
limits = "txid559292[ORGN]") # S. cerevisiae S288c
|
limits = "txid559292[ORGN]") # S. cerevisiae S288c
|
||||||
|
|
||||||
|
|
||||||
length(BLASTresults$hits) # There should be at least one hit there. Ask for advice
|
length(BLASTresults$hits) # There should be at least one hit there. Ask for
|
||||||
# in case this step fails.
|
# advice in case this step fails.
|
||||||
|
|
||||||
|
|
||||||
# = 4 Analysing results ===================================================
|
# = 3 Analysing results ===================================================
|
||||||
|
|
||||||
(topHit <- BLASTresults$hits[[1]]) # Get the top hit
|
(topHit <- BLASTresults$hits[[1]]) # Get the top hit
|
||||||
|
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-Dotplot unit.
|
# R code accompanying the BIN-ALI-Dotplot unit.
|
||||||
#
|
#
|
||||||
# Version: 0.1
|
# Version: 0.2
|
||||||
#
|
#
|
||||||
# Date: 2017 08 28
|
# Date: 2019 01 07
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 0.2 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
@ -23,24 +25,37 @@
|
|||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# = 1 ___Section___
|
|
||||||
|
|
||||||
# First, we install and load the Biostrings package.
|
#TOC> ==========================================================================
|
||||||
if (!require(Biostrings, quietly=TRUE)) {
|
#TOC>
|
||||||
if (! exists("biocLite")) {
|
#TOC> Section Title Line
|
||||||
source("https://bioconductor.org/biocLite.R")
|
#TOC> --------------------------------------
|
||||||
|
#TOC> 1 ___Section___ 39
|
||||||
|
#TOC> 2 Tasks 187
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
# = 1 ___Section___ =======================================================
|
||||||
|
|
||||||
|
if (!requireNamespace("BiocManager", quietly=TRUE)) {
|
||||||
|
install.packages("BiocManager")
|
||||||
}
|
}
|
||||||
biocLite("Biostrings")
|
if (!requireNamespace("Biostrings", quietly=TRUE)) {
|
||||||
library(Biostrings)
|
BiocManager::install("Biostrings")
|
||||||
}
|
}
|
||||||
|
# Package information:
|
||||||
# library(help = Biostrings) # basic information
|
# library(help = Biostrings) # basic information
|
||||||
# browseVignettes("Biostrings") # available vignettes
|
# browseVignettes("Biostrings") # available vignettes
|
||||||
# data(package = "Biostrings") # available datasets
|
# data(package = "Biostrings") # available datasets
|
||||||
|
|
||||||
|
if (!requireNamespace("seqinr", quietly=TRUE)) {
|
||||||
|
install.packages("seqinr")
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# Let's load BLOSUM62
|
# Let's load BLOSUM62
|
||||||
data(BLOSUM62)
|
data(BLOSUM62, package = "Biostrings")
|
||||||
|
|
||||||
# Now let's craft code for a dotplot. That's surprisingly simple. We build a
|
# Now let's craft code for a dotplot. That's surprisingly simple. We build a
|
||||||
# matrix that has as many rows as one sequence, as many columns as another. Then
|
# matrix that has as many rows as one sequence, as many columns as another. Then
|
||||||
@ -51,10 +66,10 @@ data(BLOSUM62)
|
|||||||
|
|
||||||
# First we fetch our sequences and split them into single characters.
|
# First we fetch our sequences and split them into single characters.
|
||||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||||
MBP1_SACCE <- s2c(myDB$protein$sequence[sel])
|
MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||||
MBP1_MYSPE <- s2c(myDB$protein$sequence[sel])
|
MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
# Check that we have two character vectors of the expected length.
|
# Check that we have two character vectors of the expected length.
|
||||||
str(MBP1_SACCE)
|
str(MBP1_SACCE)
|
||||||
@ -136,7 +151,7 @@ axis(4, at = c(1, seq(10, len, by=10)))
|
|||||||
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
|
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
|
||||||
# there already is a dotplot function in the seqinr package:
|
# there already is a dotplot function in the seqinr package:
|
||||||
|
|
||||||
dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
|
seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
|
||||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Our's
|
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Our's
|
||||||
|
|
||||||
# Which one do you prefer? You can probably see the block patterns that arise
|
# Which one do you prefer? You can probably see the block patterns that arise
|
||||||
@ -169,7 +184,7 @@ dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
# = 1 Tasks
|
# = 2 Tasks ===============================================================
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
131
BIN-ALI-MSA.R
131
BIN-ALI-MSA.R
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-MSA unit.
|
# R code accompanying the BIN-ALI-MSA unit.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2017 10
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.2 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 1.1 Added fetchMSAmotif()
|
# 1.1 Added fetchMSAmotif()
|
||||||
# 1.0 Fully refactored and rewritten for 2017
|
# 1.0 Fully refactored and rewritten for 2017
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
@ -29,22 +31,22 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ------------------------------------------------------------
|
#TOC> ------------------------------------------------------------------
|
||||||
#TOC> 1 Preparations 51
|
#TOC> 1 Preparations 54
|
||||||
#TOC> 2 Aligning full length MBP1 proteins 99
|
#TOC> 2 Aligning full length MBP1 proteins 96
|
||||||
#TOC> 2.1 Preparing Sequences 110
|
#TOC> 2.1 Preparing Sequences 107
|
||||||
#TOC> 2.2 Compute the MSA 135
|
#TOC> 2.2 Compute the MSA 132
|
||||||
#TOC> 3 Analyzing an MSA 156
|
#TOC> 3 Analyzing an MSA 153
|
||||||
#TOC> 4 Comparing MSAs 227
|
#TOC> 4 Comparing MSAs 224
|
||||||
#TOC> 4.1 Importing an alignment to msa 236
|
#TOC> 4.1 Importing an alignment to msa 233
|
||||||
#TOC> 4.1.1 importing an .aln file 245
|
#TOC> 4.1.1 importing an .aln file 242
|
||||||
#TOC> 4.1.2 Creating an MsaAAMultipleAlignment object 276
|
#TOC> 4.1.2 Creating an MsaAAMultipleAlignment object 273
|
||||||
#TOC> 4.2 More alignments 313
|
#TOC> 4.2 More alignments 324
|
||||||
#TOC> 4.3 Computing comparison metrics 325
|
#TOC> 4.3 Computing comparison metrics 336
|
||||||
#TOC> 5 Profile-Profile alignments 462
|
#TOC> 5 Profile-Profile alignments 473
|
||||||
#TOC> 6 Sequence Logos 539
|
#TOC> 6 Sequence Logos 546
|
||||||
#TOC> 6.1 Subsetting an alignment by motif 548
|
#TOC> 6.1 Subsetting an alignment by motif 555
|
||||||
#TOC> 6.2 Plot a Sequence Logo 591
|
#TOC> 6.2 Plot a Sequence Logo 604
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -59,28 +61,22 @@
|
|||||||
source("makeProteinDB.R")
|
source("makeProteinDB.R")
|
||||||
|
|
||||||
|
|
||||||
# Multiple sequence alignment algorithms are provided in
|
if (! requireNamespace("BiocManager", quietly=TRUE)) {
|
||||||
# the Bioconductor msa package.
|
install.packages("BiocManager")
|
||||||
|
|
||||||
if (! require(Biostrings, quietly=TRUE)) {
|
|
||||||
if (! exists("biocLite")) {
|
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
}
|
||||||
biocLite("Biostrings")
|
if (! requireNamespace("Biostrings", quietly=TRUE)) {
|
||||||
library(Biostrings)
|
BiocManager::install("Biostrings")
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = Biostrings) # basic information
|
# library(help = Biostrings) # basic information
|
||||||
# browseVignettes("Biostrings") # available vignettes
|
# browseVignettes("Biostrings") # available vignettes
|
||||||
# data(package = "Biostrings") # available datasets
|
# data(package = "Biostrings") # available datasets
|
||||||
|
|
||||||
|
# Multiple sequence alignment algorithms are provided in
|
||||||
|
# the Bioconductor msa package.
|
||||||
|
|
||||||
if (! require(msa, quietly=TRUE)) {
|
if (! requireNamespace("msa", quietly=TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
BiocManager::install("msa")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
|
||||||
biocLite("msa")
|
|
||||||
library(msa)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help=msa) # basic information
|
# library(help=msa) # basic information
|
||||||
@ -115,7 +111,7 @@ help(package = "msa")
|
|||||||
# of sequence.
|
# of sequence.
|
||||||
|
|
||||||
sel <- grep("MBP1", myDB$protein$name)
|
sel <- grep("MBP1", myDB$protein$name)
|
||||||
MBP1set <- AAStringSet(myDB$protein$sequence[sel])
|
MBP1set <- Biostrings::AAStringSet(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
# To help us make sense of the alignment we need to add the names for
|
# To help us make sense of the alignment we need to add the names for
|
||||||
# the sequences. Names for a seqSet object are held in the ranges slot...
|
# the sequences. Names for a seqSet object are held in the ranges slot...
|
||||||
@ -142,10 +138,10 @@ MBP1set
|
|||||||
|
|
||||||
|
|
||||||
# Let's run an alignment with "Muscle"
|
# Let's run an alignment with "Muscle"
|
||||||
(msaM <- msaMuscle( MBP1set, order = "aligned"))
|
(msaM <- msa::msaMuscle( MBP1set, order = "aligned"))
|
||||||
|
|
||||||
# ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method):
|
# ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method):
|
||||||
print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
|
msa::print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
|
||||||
|
|
||||||
|
|
||||||
# You see that the alignment object has sequence strings with hyphens as
|
# You see that the alignment object has sequence strings with hyphens as
|
||||||
@ -173,7 +169,7 @@ print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
|
|||||||
|
|
||||||
data("BLOSUM62") # fetch the BLOSUM62 package from the Biostrings package
|
data("BLOSUM62") # fetch the BLOSUM62 package from the Biostrings package
|
||||||
|
|
||||||
msaMScores <- msaConservationScore(msaM, substitutionMatrix = BLOSUM62)
|
msaMScores <- msa::msaConservationScore(msaM, substitutionMatrix = BLOSUM62)
|
||||||
plot(msaMScores, type = "l", col = "#205C5E", xlab = "Alignment Position")
|
plot(msaMScores, type = "l", col = "#205C5E", xlab = "Alignment Position")
|
||||||
|
|
||||||
# That plot shows the well-aligned regions (domains ?) of the sequence, but it
|
# That plot shows the well-aligned regions (domains ?) of the sequence, but it
|
||||||
@ -246,17 +242,17 @@ for (i in seq_along(highScoringRanges$lengths)) {
|
|||||||
# === 4.1.1 importing an .aln file
|
# === 4.1.1 importing an .aln file
|
||||||
|
|
||||||
# The seqinr package has a function to read CLUSTAL W formatted .aln files ...
|
# The seqinr package has a function to read CLUSTAL W formatted .aln files ...
|
||||||
if (! require(seqinr, quietly=TRUE)) {
|
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||||
install.packages(seqinr)
|
install.packages("seqinr")
|
||||||
library(seqinr)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help=seqinr) # basic information
|
# library(help=seqinr) # basic information
|
||||||
# browseVignettes("seqinr") # available vignettes
|
# browseVignettes("seqinr") # available vignettes
|
||||||
# data(package = "seqinr") # available datasets
|
# data(package = "seqinr") # available datasets
|
||||||
|
|
||||||
# read the donwloaded file
|
# read the T-coffee aligned file that you donwloaded from the EBI MSA tools
|
||||||
tmp <- read.alignment("msaT.aln", format = "clustal")
|
# (cf. http://steipe.biochemistry.utoronto.ca/abc/index.php/BIN-ALI-MSA)
|
||||||
|
tmp <- seqinr::read.alignment("msaT.aln", format = "clustal")
|
||||||
|
|
||||||
# read.alignment() returns a list. $seq is a list of strings, one for each
|
# read.alignment() returns a list. $seq is a list of strings, one for each
|
||||||
# complete alignment. However, they are converted to lower case.
|
# complete alignment. However, they are converted to lower case.
|
||||||
@ -278,12 +274,12 @@ for (i in seq_along(x)) {
|
|||||||
|
|
||||||
# MsaAAMultipleAlignment objects are S4 objects that contain AAStringSet objects
|
# MsaAAMultipleAlignment objects are S4 objects that contain AAStringSet objects
|
||||||
# in their @unmasked slot, and a few additional items. Rather then build the
|
# in their @unmasked slot, and a few additional items. Rather then build the
|
||||||
# object from scratch, we copy an axisting object, and overwrite the dta in its
|
# object from scratch, we copy an existing object, and overwrite the data in its
|
||||||
# slots with what we need. Our goal is pragmatic, we want an object that msa's
|
# slots with what we need. Our goal is pragmatic, we want an object that msa's
|
||||||
# functions will accept as input.
|
# functions will accept as input.
|
||||||
|
|
||||||
# First: convert our named char vector into an AAstringSet
|
# First: convert our named char vector into an AAstringSet
|
||||||
x <- AAStringSet(x)
|
x <- Biostrings::AAStringSet(x)
|
||||||
|
|
||||||
# Then: create a new MsaAAMultipleAlignment S4 object. The msa package has
|
# Then: create a new MsaAAMultipleAlignment S4 object. The msa package has
|
||||||
# defined what such an object should look like, with the SetClass() function. To
|
# defined what such an object should look like, with the SetClass() function. To
|
||||||
@ -294,8 +290,22 @@ x <- AAStringSet(x)
|
|||||||
|
|
||||||
str(msaM)
|
str(msaM)
|
||||||
|
|
||||||
|
# There is a catch however in the way R makes such operations specific to
|
||||||
|
# the packages they need them: the function that creates the class is
|
||||||
|
# defined as a "generic", and when it is called, R looks in the package
|
||||||
|
# namespace for a more specific function with precise instructions what
|
||||||
|
# to do. However, we have not loaded the package namespace - we access all
|
||||||
|
# of the functions directly with the msa:: prefix. This method breaks down
|
||||||
|
# when generic functions are involved. I.e. - we could make it work, but
|
||||||
|
# the amount of code we need then is unreasonable. The straightforward
|
||||||
|
# way is to load the package. We can still use the prefix notation for
|
||||||
|
# its functions, just to emphasize where the function comes from. But since
|
||||||
|
# the namespace then exists, we ensure that generics are properly dispatched.
|
||||||
|
|
||||||
|
library(msa) # load the msa package namespace
|
||||||
|
|
||||||
msaT <- new("MsaAAMultipleAlignment", # create new MsaAAMultipleAlignment object
|
msaT <- new("MsaAAMultipleAlignment", # create new MsaAAMultipleAlignment object
|
||||||
unmasked = x, # "unmasked" slot takes an AASringSet
|
unmasked = x, # "unmasked" slot takes an AAStringSet
|
||||||
version = "T-Coffee", # "version" slot takes a string
|
version = "T-Coffee", # "version" slot takes a string
|
||||||
params = list(), # "params" takes a list(), we leave the
|
params = list(), # "params" takes a list(), we leave the
|
||||||
# list empty, but we could add the
|
# list empty, but we could add the
|
||||||
@ -309,18 +319,18 @@ str(msaT)
|
|||||||
msaT # Now we have fabricated an msaAAMultipleAlignment object, and we can
|
msaT # Now we have fabricated an msaAAMultipleAlignment object, and we can
|
||||||
# use the msa package functions on it
|
# use the msa package functions on it
|
||||||
|
|
||||||
msaTScores <- msaConservationScore(msaT, substitutionMatrix = BLOSUM62)
|
msaTScores <- msa::msaConservationScore(msaT, substitutionMatrix = BLOSUM62)
|
||||||
|
|
||||||
# == 4.2 More alignments ===================================================
|
# == 4.2 More alignments ===================================================
|
||||||
|
|
||||||
# Next, we calculate alignments with msa's two other alignment options:
|
# Next, we calculate alignments with msa's two other alignment options:
|
||||||
# CLUSTAL Omega
|
# CLUSTAL Omega
|
||||||
(msaO <- msaClustalOmega( MBP1set, order = "aligned"))
|
(msaO <- msa::msaClustalOmega( MBP1set, order = "aligned"))
|
||||||
msaOScores <- msaConservationScore(msaO, substitutionMatrix = BLOSUM62)
|
msaOScores <- msa::msaConservationScore(msaO, substitutionMatrix = BLOSUM62)
|
||||||
|
|
||||||
# CLUSTAL W
|
# CLUSTAL W
|
||||||
(msaW <- msaClustalW( MBP1set, order = "aligned"))
|
(msaW <- msa::msaClustalW( MBP1set, order = "aligned"))
|
||||||
msaWScores <- msaConservationScore(msaW, substitutionMatrix = BLOSUM62)
|
msaWScores <- msa::msaConservationScore(msaW, substitutionMatrix = BLOSUM62)
|
||||||
|
|
||||||
|
|
||||||
# == 4.3 Computing comparison metrics ======================================
|
# == 4.3 Computing comparison metrics ======================================
|
||||||
@ -454,7 +464,7 @@ legend("bottomright",
|
|||||||
|
|
||||||
# Your alignment is going to be different from mine, due to the inclusion of
|
# Your alignment is going to be different from mine, due to the inclusion of
|
||||||
# MYSPE - but what I see is that MUSCLE gives the highest score overall, and
|
# MYSPE - but what I see is that MUSCLE gives the highest score overall, and
|
||||||
# achieves this with fewer indels then most, and the lowest number of gaps of
|
# achieves this with fewer indels than most, and the lowest number of gaps of
|
||||||
# all algorithms.
|
# all algorithms.
|
||||||
|
|
||||||
# To actually compare regions of alignments, we need to align alignments.
|
# To actually compare regions of alignments, we need to align alignments.
|
||||||
@ -470,12 +480,8 @@ legend("bottomright",
|
|||||||
# to compare two MSAs with each other, by aligning them. The algorithm is
|
# to compare two MSAs with each other, by aligning them. The algorithm is
|
||||||
# provided by the DECIPHER package.
|
# provided by the DECIPHER package.
|
||||||
|
|
||||||
if (! require(DECIPHER, quietly=TRUE)) {
|
if (! requireNamespace("DECIPHER", quietly=TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
BiocManager::install("DECIPHER")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
|
||||||
biocLite("DECIPHER")
|
|
||||||
library(DECIPHER)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = DECIPHER) # basic information
|
# library(help = DECIPHER) # basic information
|
||||||
@ -484,14 +490,14 @@ if (! require(DECIPHER, quietly=TRUE)) {
|
|||||||
|
|
||||||
# AlignProfiles() takes two AAStringSets as input. Let's compare the MUSCLE and
|
# AlignProfiles() takes two AAStringSets as input. Let's compare the MUSCLE and
|
||||||
# CLUSTAL W alignments: we could do this directly ...
|
# CLUSTAL W alignments: we could do this directly ...
|
||||||
AlignProfiles(msaW@unmasked, msaM@unmasked)
|
DECIPHER::AlignProfiles(msaW@unmasked, msaM@unmasked)
|
||||||
|
|
||||||
# But for ease of comparison, we'll reorder the sequences of the CLUSTAL W
|
# But for ease of comparison, we'll reorder the sequences of the CLUSTAL W
|
||||||
# alignment into the same order as the MUSCLE alignment:
|
# alignment into the same order as the MUSCLE alignment:
|
||||||
m <- as.character(msaM)
|
m <- as.character(msaM)
|
||||||
w <- as.character(msaW)[names(m)]
|
w <- as.character(msaW)[names(m)]
|
||||||
|
|
||||||
(ppa <- AlignProfiles(AAStringSet(w), AAStringSet(m)))
|
(ppa <- DECIPHER::AlignProfiles(msa::AAStringSet(w), msa::AAStringSet(m)))
|
||||||
|
|
||||||
# Conveniently, AlignProfiles() returns an AAStringSet, so we can use our
|
# Conveniently, AlignProfiles() returns an AAStringSet, so we can use our
|
||||||
# writeALN function to show it. Here is an arbitrary block, from somewhere in
|
# writeALN function to show it. Here is an arbitrary block, from somewhere in
|
||||||
@ -533,8 +539,8 @@ writeALN(ppa2, range = c(800, 960))
|
|||||||
# Again, go explore, and get a sense of what's going on. You may find that
|
# Again, go explore, and get a sense of what's going on. You may find that
|
||||||
# CLUSTAL W has a tendency to insert short gaps all over the alignment, whereas
|
# CLUSTAL W has a tendency to insert short gaps all over the alignment, whereas
|
||||||
# MUSCLE keeps indels in blocks. CLUSTAL's behaviour is exactly what I would
|
# MUSCLE keeps indels in blocks. CLUSTAL's behaviour is exactly what I would
|
||||||
# expect from an algorithm that builds alignments from pairwise local
|
# expect from an algorithm that builds alignments incrementally from pairwise
|
||||||
# alignments, without global refinement.
|
# local alignments, without global refinement.
|
||||||
|
|
||||||
|
|
||||||
# = 6 Sequence Logos ======================================================
|
# = 6 Sequence Logos ======================================================
|
||||||
@ -602,16 +608,15 @@ writeALN(fetchMSAmotif(msaM, wing))
|
|||||||
# ggseqlogo written by by Omar Waghi, a former UofT BCB student who is now at
|
# ggseqlogo written by by Omar Waghi, a former UofT BCB student who is now at
|
||||||
# the EBI.
|
# the EBI.
|
||||||
|
|
||||||
if (! require(ggseqlogo, quietly=TRUE)) {
|
if (! requireNamspace("ggseqlogo", quietly=TRUE)) {
|
||||||
install.packages(("ggseqlogo"))
|
install.packages(("ggseqlogo"))
|
||||||
library(ggseqlogo)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help=ggseqlogo) # basic information
|
# library(help=ggseqlogo) # basic information
|
||||||
# browseVignettes("ggseqlogo") # available vignettes
|
# browseVignettes("ggseqlogo") # available vignettes
|
||||||
# data(package = "ggseqlogo") # available datasets
|
# data(package = "ggseqlogo") # available datasets
|
||||||
|
|
||||||
ggseqlogo(as.character(motifAli))
|
ggseqlogo::ggseqlogo(as.character(motifAli))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
|
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
|
||||||
#
|
#
|
||||||
# Version: 1.4
|
# Version: 1.5
|
||||||
#
|
#
|
||||||
# Date: 2017 09 - 2017 11
|
# Date: 2017 09 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.5 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 1.4 Pull s2c() from seqinr package, rather then loading the
|
# 1.4 Pull s2c() from seqinr package, rather then loading the
|
||||||
# entire library.
|
# entire library.
|
||||||
# 1.3 Updated confirmation task with correct logic
|
# 1.3 Updated confirmation task with correct logic
|
||||||
@ -34,27 +36,30 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------------------------
|
#TOC> --------------------------------------------------------------------------
|
||||||
#TOC> 1 Prepare 52
|
#TOC> 1 Prepare 54
|
||||||
#TOC> 2 Biostrings Pairwise Alignment 66
|
#TOC> 2 Biostrings Pairwise Alignment 71
|
||||||
#TOC> 2.1 Optimal global alignment 84
|
#TOC> 2.1 Optimal global alignment 89
|
||||||
#TOC> 2.2 Optimal local alignment 147
|
#TOC> 2.2 Optimal local alignment 152
|
||||||
#TOC> 3 APSES Domain annotation by alignment 171
|
#TOC> 3 APSES Domain annotation by alignment 176
|
||||||
#TOC> 4 Update your database script 252
|
#TOC> 4 Update your database script 257
|
||||||
#TOC> 4.1 Preparing an annotation file ... 258
|
#TOC> 4.1 Preparing an annotation file ... 263
|
||||||
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 260
|
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 265
|
||||||
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 303
|
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 308
|
||||||
#TOC> 4.2 Execute and Validate 327
|
#TOC> 4.2 Execute and Validate 332
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
# = 1 Prepare =============================================================
|
# = 1 Prepare =============================================================
|
||||||
|
|
||||||
# To simplify code, we pull the function s2c(x) from the seqinr package,
|
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||||
# rather than using the lengthier idiom unlist(strsplit(x, "").
|
install.packages("seqinr")
|
||||||
# This assumes that the seqinr package has been installed previously.
|
}
|
||||||
s2c <- seqinr::s2c
|
# You can get package information with the following commands:
|
||||||
|
# library(help = seqinr) # basic information
|
||||||
|
# browseVignettes("seqinr") # available vignettes
|
||||||
|
# data(package = "seqinr") # available datasets
|
||||||
|
|
||||||
|
|
||||||
# You need to recreate the protein database that you have constructed in the
|
# You need to recreate the protein database that you have constructed in the
|
||||||
@ -66,13 +71,13 @@ source("makeProteinDB.R")
|
|||||||
# = 2 Biostrings Pairwise Alignment =======================================
|
# = 2 Biostrings Pairwise Alignment =======================================
|
||||||
|
|
||||||
|
|
||||||
if (!require(Biostrings, quietly=TRUE)) {
|
if (!requireNamespace("BiocManager", quietly=TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
install.packages("BiocManager")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
}
|
||||||
biocLite("Biostrings")
|
if (!requireNamespace("Biostrings", quietly=TRUE)) {
|
||||||
library(Biostrings)
|
BiocManager::install("Biostrings")
|
||||||
}
|
}
|
||||||
|
# Package information:
|
||||||
# library(help = Biostrings) # basic information
|
# library(help = Biostrings) # basic information
|
||||||
# browseVignettes("Biostrings") # available vignettes
|
# browseVignettes("Biostrings") # available vignettes
|
||||||
# data(package = "Biostrings") # available datasets
|
# data(package = "Biostrings") # available datasets
|
||||||
@ -88,15 +93,15 @@ if (!require(Biostrings, quietly=TRUE)) {
|
|||||||
|
|
||||||
# First: make AAString objects ...
|
# First: make AAString objects ...
|
||||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||||
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])
|
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||||
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
|
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])
|
||||||
|
|
||||||
?pairwiseAlignment
|
?pairwiseAlignment
|
||||||
# ... and align.
|
# ... and align.
|
||||||
# Global optimal alignment with end-gap penalties is default.
|
# Global optimal alignment with end-gap penalties is default.
|
||||||
ali1 <- pairwiseAlignment(
|
ali1 <- Biostrings::pairwiseAlignment(
|
||||||
aaMBP1_SACCE,
|
aaMBP1_SACCE,
|
||||||
aaMBP1_MYSPE,
|
aaMBP1_MYSPE,
|
||||||
substitutionMatrix = "BLOSUM62",
|
substitutionMatrix = "BLOSUM62",
|
||||||
@ -108,7 +113,7 @@ str(ali1) # ... it's complicated
|
|||||||
# This is a Biostrings alignment object. But we can use Biostrings functions to
|
# This is a Biostrings alignment object. But we can use Biostrings functions to
|
||||||
# tame it:
|
# tame it:
|
||||||
ali1
|
ali1
|
||||||
writePairwiseAlignments(ali1) # That should look familiar
|
Biostrings::writePairwiseAlignments(ali1) # That should look familiar
|
||||||
|
|
||||||
# And we can make the internal structure work for us (@ is for classes as
|
# And we can make the internal structure work for us (@ is for classes as
|
||||||
# $ is for lists ...)
|
# $ is for lists ...)
|
||||||
@ -147,7 +152,7 @@ percentID(ali1)
|
|||||||
# == 2.2 Optimal local alignment ===========================================
|
# == 2.2 Optimal local alignment ===========================================
|
||||||
|
|
||||||
# Compare with local optimal alignment (like EMBOSS Water)
|
# Compare with local optimal alignment (like EMBOSS Water)
|
||||||
ali2 <- pairwiseAlignment(
|
ali2 <- Biostrings::pairwiseAlignment(
|
||||||
aaMBP1_SACCE,
|
aaMBP1_SACCE,
|
||||||
aaMBP1_MYSPE,
|
aaMBP1_MYSPE,
|
||||||
type = "local",
|
type = "local",
|
||||||
@ -155,9 +160,9 @@ ali2 <- pairwiseAlignment(
|
|||||||
gapOpening = 50,
|
gapOpening = 50,
|
||||||
gapExtension = 10)
|
gapExtension = 10)
|
||||||
|
|
||||||
writePairwiseAlignments(ali2) # This has probably only aligned the N-terminal
|
Biostrings::writePairwiseAlignments(ali2)
|
||||||
# DNA binding domain - but that one has quite
|
# This has probably only aligned the N-terminal DNA binding domain - but that
|
||||||
# high sequence identity:
|
# one has quite high sequence identity:
|
||||||
percentID(ali2)
|
percentID(ali2)
|
||||||
|
|
||||||
# == TASK: ==
|
# == TASK: ==
|
||||||
@ -209,14 +214,14 @@ myDB$annotation[myDB$annotation$ID == proID &
|
|||||||
# the sequence, and used the start and end coordinates to extract a substring.
|
# the sequence, and used the start and end coordinates to extract a substring.
|
||||||
|
|
||||||
# Let's convert this to an AAstring and assign it:
|
# Let's convert this to an AAstring and assign it:
|
||||||
aaMB1_SACCE_APSES <- AAString(apses)
|
aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
|
||||||
|
|
||||||
# Now let's align these two sequences of very different length without end-gap
|
# Now let's align these two sequences of very different length without end-gap
|
||||||
# penalties using the "overlap" type. "overlap" turns the
|
# penalties using the "overlap" type. "overlap" turns the
|
||||||
# end-gap penalties off and that is crucially important since
|
# end-gap penalties off and that is crucially important since
|
||||||
# the sequences have very different length.
|
# the sequences have very different length.
|
||||||
|
|
||||||
aliApses <- pairwiseAlignment(
|
aliApses <- Biostrings::pairwiseAlignment(
|
||||||
aaMB1_SACCE_APSES,
|
aaMB1_SACCE_APSES,
|
||||||
aaMBP1_MYSPE,
|
aaMBP1_MYSPE,
|
||||||
type = "overlap",
|
type = "overlap",
|
||||||
@ -228,7 +233,7 @@ aliApses <- pairwiseAlignment(
|
|||||||
# homologous, and have (almost) no indels. The entire "pattern"
|
# homologous, and have (almost) no indels. The entire "pattern"
|
||||||
# sequence from QIYSAR ... to ... KPLFDF should be matched
|
# sequence from QIYSAR ... to ... KPLFDF should be matched
|
||||||
# with the "query". Is this correct?
|
# with the "query". Is this correct?
|
||||||
writePairwiseAlignments(aliApses)
|
Biostrings::writePairwiseAlignments(aliApses)
|
||||||
|
|
||||||
# If this is correct, you can extract the matched sequence from
|
# If this is correct, you can extract the matched sequence from
|
||||||
# the alignment object. The syntax is a bit different from what
|
# the alignment object. The syntax is a bit different from what
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-ALI-Similarity unit.
|
# R code accompanying the BIN-ALI-Similarity unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 20
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 1.0 Refactored for 2017; add aaindex, ternary plot.
|
# 1.0 Refactored for 2017; add aaindex, ternary plot.
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
@ -28,10 +30,10 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------
|
#TOC> ----------------------------------------------
|
||||||
#TOC> 1 Amino Acid Properties 43
|
#TOC> 1 Amino Acid Properties 41
|
||||||
#TOC> 2 Mutation Data matrix 163
|
#TOC> 2 Mutation Data matrix 158
|
||||||
#TOC> 3 Background score 205
|
#TOC> 3 Background score 199
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -41,9 +43,8 @@
|
|||||||
# A large collection of amino acid property tables is available via the seqinr
|
# A large collection of amino acid property tables is available via the seqinr
|
||||||
# package:
|
# package:
|
||||||
|
|
||||||
if (!require(seqinr)) {
|
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||||
install.packages("seqinr")
|
install.packages("seqinr")
|
||||||
library(seqinr)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = seqinr) # basic information
|
# library(help = seqinr) # basic information
|
||||||
@ -127,9 +128,8 @@ text(Y$I, K$I, names(Y$I))
|
|||||||
# plots are in general unintuitive and hard to interpret. One alternative is a
|
# plots are in general unintuitive and hard to interpret. One alternative is a
|
||||||
# so-called "ternary plot":
|
# so-called "ternary plot":
|
||||||
|
|
||||||
if (!require(ggtern)) {
|
if (! requireNamespace("ggtern", quietly=TRUE)) {
|
||||||
install.packages("ggtern")
|
install.packages("ggtern")
|
||||||
library(ggtern)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = ggtern) # basic information
|
# library(help = ggtern) # basic information
|
||||||
@ -145,12 +145,11 @@ myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
|
|||||||
stringsAsFactors = FALSE)
|
stringsAsFactors = FALSE)
|
||||||
rownames(myDat) <- names(Y$I)
|
rownames(myDat) <- names(Y$I)
|
||||||
|
|
||||||
ggtern(data = myDat,
|
ggtern::ggtern(data = myDat,
|
||||||
aes(x = vol,
|
ggplot2::aes(x = vol,
|
||||||
y = phi,
|
y = phi,
|
||||||
z = pK,
|
z = pK,
|
||||||
label = rownames(myDat))) +
|
label = rownames(myDat))) + ggplot2::geom_text()
|
||||||
geom_text()
|
|
||||||
|
|
||||||
# This results in a mapping of amino acids relative to each other that is
|
# This results in a mapping of amino acids relative to each other that is
|
||||||
# similar to the Venn diagram you have seen in the notes.
|
# similar to the Venn diagram you have seen in the notes.
|
||||||
@ -162,12 +161,11 @@ ggtern(data = myDat,
|
|||||||
|
|
||||||
# The Biostrings package contains the most common mutation data matrices.
|
# The Biostrings package contains the most common mutation data matrices.
|
||||||
|
|
||||||
if (!require(Biostrings, quietly=TRUE)) {
|
if (! requireNamespace("BiocManager", quietly=TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
install.packages("BiocManager")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
}
|
||||||
biocLite("Biostrings")
|
if (! requireNamespace("Biostrings", quietly=TRUE)) {
|
||||||
library(Biostrings)
|
BiocManager::install("Biostrings")
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help=Biostrings) # basic information
|
# library(help=Biostrings) # basic information
|
||||||
@ -200,7 +198,9 @@ BLOSUM62["W", "R"]
|
|||||||
|
|
||||||
# = 3 Background score ====================================================
|
# = 3 Background score ====================================================
|
||||||
|
|
||||||
# The mutation data matrix is designed to give high scores to homologous sequences, low scores to non-homologous sequences. What score on average should we expect for a random sequence?
|
# The mutation data matrix is designed to give high scores to homologous
|
||||||
|
# sequences, low scores to non-homologous sequences. What score on average
|
||||||
|
# should we expect for a random sequence?
|
||||||
|
|
||||||
# If we sample amino acid pairs at random, we will get a score that is the
|
# If we sample amino acid pairs at random, we will get a score that is the
|
||||||
# average of the individual pairscores in the matrix. Omitting the ambiguity
|
# average of the individual pairscores in the matrix. Omitting the ambiguity
|
||||||
@ -219,12 +219,12 @@ sum(BLOSUM62[1:20, 1:20])/400
|
|||||||
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
|
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
|
||||||
# low-complexity amino acid sequence ...
|
# low-complexity amino acid sequence ...
|
||||||
|
|
||||||
aa3FG7 <- readAAStringSet("./data/3FG7.fa")[[1]]
|
aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
|
||||||
|
|
||||||
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
|
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
|
||||||
# with an exceptionally high percentage of hydrophobic residues.
|
# with an exceptionally high percentage of hydrophobic residues.
|
||||||
|
|
||||||
aa2F1C <- readAAStringSet("./data/2F1C.fa")[[1]]
|
aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
|
||||||
|
|
||||||
# Here is a function that takes two sequences and
|
# Here is a function that takes two sequences and
|
||||||
# returns their average pairscore.
|
# returns their average pairscore.
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-Data_integration unit.
|
# R code accompanying the BIN-Data_integration unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0.1
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2018 10 30
|
# Date: 2018 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 1.0.1 Bugfix: UniProt ID Mapping service API change
|
# 1.0.1 Bugfix: UniProt ID Mapping service API change
|
||||||
# 1.0 First live version
|
# 1.0 First live version
|
||||||
#
|
#
|
||||||
@ -31,8 +33,8 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -------------------------------------------------
|
#TOC> -------------------------------------------------
|
||||||
#TOC> 1 Identifier mapping 40
|
#TOC> 1 Identifier mapping 42
|
||||||
#TOC> 2 Cross-referencing tables 164
|
#TOC> 2 Cross-referencing tables 165
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -54,9 +56,8 @@
|
|||||||
|
|
||||||
# To begin, we load httr, which supports sending and receiving data via the
|
# To begin, we load httr, which supports sending and receiving data via the
|
||||||
# http protocol, just like a Web browser.
|
# http protocol, just like a Web browser.
|
||||||
if (!require(httr, quietly=TRUE)) {
|
if (! requireNamespace("httpr", quietly=TRUE)) {
|
||||||
install.packages("httr")
|
install.packages("httpr")
|
||||||
library(httr)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = httr) # basic information
|
# library(help = httr) # basic information
|
||||||
@ -75,22 +76,22 @@ myQueryIDs <- "NP_010227 NP_00000 NP_011036"
|
|||||||
# of the request. GET() and POST() are functions from httr.
|
# of the request. GET() and POST() are functions from httr.
|
||||||
|
|
||||||
URL <- "https://www.uniprot.org/mapping/"
|
URL <- "https://www.uniprot.org/mapping/"
|
||||||
response <- POST(URL,
|
response <- httr::POST(URL,
|
||||||
body = list(from = "P_REFSEQ_AC", # Refseq Protein
|
body = list(from = "P_REFSEQ_AC", # Refseq Protein
|
||||||
to = "ACC", # UniProt ID
|
to = "ACC", # UniProt ID
|
||||||
format = "tab",
|
format = "tab",
|
||||||
query = myQueryIDs))
|
query = myQueryIDs))
|
||||||
|
|
||||||
cat(content(response))
|
cat(httr::content(response))
|
||||||
|
|
||||||
# We need to check the status code - if it is not 200, an error ocurred and we
|
# We need to check the status code - if it is not 200, an error ocurred and we
|
||||||
# can't process the result:
|
# can't process the result:
|
||||||
status_code(response)
|
httr::status_code(response)
|
||||||
|
|
||||||
# If the query is successful, tabbed text is returned. We can assign that to a
|
# If the query is successful, tabbed text is returned. We can assign that to a
|
||||||
# data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument.
|
# data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument.
|
||||||
|
|
||||||
myMappedIDs <- read.delim(file = textConnection(content(response)),
|
myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
|
||||||
sep = "\t",
|
sep = "\t",
|
||||||
stringsAsFactors = FALSE)
|
stringsAsFactors = FALSE)
|
||||||
myMappedIDs
|
myMappedIDs
|
||||||
@ -132,14 +133,14 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
|||||||
# for IDs that are not mapped.
|
# for IDs that are not mapped.
|
||||||
|
|
||||||
URL <- "https://www.uniprot.org/uploadlists/"
|
URL <- "https://www.uniprot.org/uploadlists/"
|
||||||
response <- POST(URL,
|
response <- httr::POST(URL,
|
||||||
body = list(from = mapFrom,
|
body = list(from = mapFrom,
|
||||||
to = mapTo,
|
to = mapTo,
|
||||||
format = "tab",
|
format = "tab",
|
||||||
query = s))
|
query = s))
|
||||||
|
|
||||||
if (status_code(response) == 200) { # 200: oK
|
if (httr::status_code(response) == 200) { # 200: oK
|
||||||
myMap <- read.delim(file = textConnection(content(response)),
|
myMap <- read.delim(file = textConnection(httr::content(response)),
|
||||||
sep = "\t",
|
sep = "\t",
|
||||||
stringsAsFactors = FALSE)
|
stringsAsFactors = FALSE)
|
||||||
myMap <- myMap[ , c(1,3)]
|
myMap <- myMap[ , c(1,3)]
|
||||||
@ -148,7 +149,7 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
|||||||
myMap <- data.frame()
|
myMap <- data.frame()
|
||||||
warning(paste("No uniProt ID mapping returned:",
|
warning(paste("No uniProt ID mapping returned:",
|
||||||
"server sent status",
|
"server sent status",
|
||||||
status_code(response)))
|
httr::status_code(response)))
|
||||||
}
|
}
|
||||||
|
|
||||||
return(myMap)
|
return(myMap)
|
||||||
@ -168,7 +169,8 @@ myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
|
|||||||
# Nomenclature commission. How do we map one set of identifiers to another one?
|
# Nomenclature commission. How do we map one set of identifiers to another one?
|
||||||
|
|
||||||
# The function to use is match().
|
# The function to use is match().
|
||||||
# Here is a tiny set of identifiers taken from a much larger table to illustrate the principle:
|
# Here is a tiny set of identifiers taken from a much larger table to
|
||||||
|
# illustrate the principle:
|
||||||
#
|
#
|
||||||
|
|
||||||
myIDs <- data.frame(uID = c("P38903", "P31383", "P47177", "P47096", "Q07747",
|
myIDs <- data.frame(uID = c("P38903", "P31383", "P47177", "P47096", "Q07747",
|
||||||
|
@ -3,12 +3,15 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
|
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 11 12
|
# Date: 2017 11 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
|
# use Biocmanager:: not biocLite()
|
||||||
# 1.0 New code.
|
# 1.0 New code.
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
@ -27,59 +30,65 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------------------
|
#TOC> --------------------------------------------------------------------
|
||||||
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 39
|
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 42
|
||||||
#TOC> 2 Fetch GO Annotations 89
|
#TOC> 2 Fetch GO Annotations 98
|
||||||
#TOC> 3 Semantic Similarities 98
|
#TOC> 3 Semantic Similarities 107
|
||||||
#TOC> 4 GO Term Enrichment in Gene Sets 116
|
#TOC> 4 GO Term Enrichment in Gene Sets 125
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
# = 1 Preparations: Packages, AnnotationDB, Setup =========================
|
# = 1 Preparations: Packages, AnnotationDB, Setup =========================
|
||||||
|
|
||||||
|
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||||
|
install.packages("BiocManager")
|
||||||
|
}
|
||||||
|
|
||||||
# GOSim is an R-package in the Bioconductor project.
|
# GOSim is an R-package in the Bioconductor project.
|
||||||
if (! require(GOSim, quietly=TRUE)) {
|
if (! requireNamespace("GOSim", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
BiocManager::install("GOSim")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
|
||||||
biocLite("GOSim")
|
|
||||||
library(GOSim)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = GOSim) # basic information
|
# library(help = GOSim) # basic information
|
||||||
# browseVignettes("GOSim") # available vignettes
|
# browseVignettes("GOSim") # available vignettes
|
||||||
# data(package = "GOSim") # available datasets
|
# data(package = "GOSim") # available datasets
|
||||||
|
|
||||||
|
# GOSim makes extensive assumptions about loaded packages, and many base
|
||||||
|
# methods are masked. We will thus use library(GOSim) to load it
|
||||||
|
# in its entirety and with all packages it depends on. We will still use
|
||||||
|
# the <package>::<function>() syntax in the code below, but this now serves
|
||||||
|
# more of a didactic purpose, rather than actual syntax requirements.
|
||||||
|
|
||||||
|
library(GOSim)
|
||||||
|
|
||||||
# GOSim loads human annotations by default. We load yeast annotations instead...
|
# GOSim loads human annotations by default. We load yeast annotations instead...
|
||||||
if (!require(org.Sc.sgd.db, quietly=TRUE)) {
|
if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
BiocManager::install("org.Sc.sgd.db")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
}
|
||||||
biocLite("org.Sc.sgd.db")
|
|
||||||
|
# Bioconductor annotation packages won't work stably unless we actually load
|
||||||
|
# them:
|
||||||
library(org.Sc.sgd.db)
|
library(org.Sc.sgd.db)
|
||||||
}
|
|
||||||
|
|
||||||
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
|
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
|
||||||
# databases exist for all model organisms. It's a kind of a fancy data frame
|
# databases exist for all model organisms. It's a kind of a fancy data frame
|
||||||
# from which we can get annotations by rows (genes) with the keys() funtion ...
|
# from which we can get annotations by rows (genes) with the keys() funtion ...
|
||||||
keys(org.Sc.sgd.db)[1500:1510]
|
AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]
|
||||||
|
|
||||||
# ... and the types of available annotations with the columns() function
|
# ... and the types of available annotations with the columns() function
|
||||||
columns(org.Sc.sgd.db)
|
AnnotationDbi::columns(org.Sc.sgd.db)
|
||||||
|
|
||||||
# Note that one of the columns is "GO" ... and we load that into the
|
# Note that one of the columns is "GO" ... and we load that into the
|
||||||
# datastructures used by GOSim:
|
# datastructures used by GOSim:
|
||||||
|
|
||||||
# Choose GOterms to use
|
# Choose GOterms to use
|
||||||
setEvidenceLevel(evidences="all",
|
GOSim::setEvidenceLevel(evidences = "all",
|
||||||
organism = org.Sc.sgdORGANISM,
|
organism = org.Sc.sgdORGANISM,
|
||||||
gomap = org.Sc.sgdGO)
|
gomap = org.Sc.sgdGO)
|
||||||
|
|
||||||
# Use Biological Process ontology
|
# Use Biological Process ontology
|
||||||
setOntology("BP", loadIC=FALSE)
|
GOSim::setOntology("BP", loadIC = FALSE)
|
||||||
|
|
||||||
# confirm that we loaded the correct ontology
|
# confirm that we loaded the correct ontology
|
||||||
head(get("gomap", envir = GOSimEnv))
|
head(get("gomap", envir = GOSimEnv))
|
||||||
@ -92,7 +101,7 @@ head(get("gomap", envir=GOSimEnv))
|
|||||||
# All keys being used here are yeast systematic names.
|
# All keys being used here are yeast systematic names.
|
||||||
|
|
||||||
# Get one set of annotations
|
# Get one set of annotations
|
||||||
getGOInfo(c("YDL056W")) # Mbp1
|
GOSim::getGOInfo(c("YDL056W")) # Mbp1
|
||||||
|
|
||||||
|
|
||||||
# = 3 Semantic Similarities ===============================================
|
# = 3 Semantic Similarities ===============================================
|
||||||
@ -105,30 +114,30 @@ getGOInfo(c("YDL056W")) # Mbp1
|
|||||||
# in this package.
|
# in this package.
|
||||||
|
|
||||||
# Mbp1 and...
|
# Mbp1 and...
|
||||||
getGeneSim("YDL056W", "YLR182W", similarity = "OA") # Swi6 - MCB complex
|
GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex
|
||||||
getGeneSim("YDL056W", "YER111C", similarity = "OA") # Swi4 - collaborators
|
GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators
|
||||||
getGeneSim("YDL056W", "YBR160W", similarity = "OA") # Cdc28 - mediator
|
GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator
|
||||||
getGeneSim("YDL056W", "YGR108W", similarity = "OA") # Clb1 - antagonist
|
GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist
|
||||||
getGeneSim("YDL056W", "YLR079W", similarity = "OA") # Sic1 - antagonist
|
GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist
|
||||||
getGeneSim("YDL056W", "YJL130C", similarity = "OA") # Pgk1 - Gluconeogenesis
|
GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis
|
||||||
|
|
||||||
|
|
||||||
# = 4 GO Term Enrichment in Gene Sets =====================================
|
# = 4 GO Term Enrichment in Gene Sets =====================================
|
||||||
|
|
||||||
|
|
||||||
# Calculating GO term enrichment in gene sets is done with the topGO package.
|
# Calculating GO term enrichment in gene sets is done with the Bioconductor
|
||||||
if (! require(topGO, quietly=TRUE)) {
|
# topGO package.
|
||||||
if (! exists("biocLite")) {
|
if (! requireNamespace("topGO", quietly = TRUE)) {
|
||||||
source("https://bioconductor.org/biocLite.R")
|
BiocManager::install("topGO")
|
||||||
}
|
|
||||||
biocLite("topGO")
|
|
||||||
library(topGO)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = topGO) # basic information
|
# library(help = topGO) # basic information
|
||||||
# browseVignettes("topGO") # available vignettes
|
# browseVignettes("topGO") # available vignettes
|
||||||
# data(package = "topGO") # available datasets
|
# data(package = "topGO") # available datasets
|
||||||
|
|
||||||
|
# Once again - assumptions are made by GOsim that require us to load the
|
||||||
|
# topGO package wholesale:
|
||||||
|
library(topGO)
|
||||||
|
|
||||||
# Let's define a gene set: GOterm enrichment for G1/S switch activators:
|
# Let's define a gene set: GOterm enrichment for G1/S switch activators:
|
||||||
mySet <- c("YFR028C", # Cdc14
|
mySet <- c("YFR028C", # Cdc14
|
||||||
@ -141,7 +150,7 @@ mySet <- c("YFR028C", # Cdc14
|
|||||||
"YPL256C", # Cln2
|
"YPL256C", # Cln2
|
||||||
"YAL040C") # Cln3
|
"YAL040C") # Cln3
|
||||||
|
|
||||||
allGenes <- keys(org.Sc.sgd.db)
|
allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
|
||||||
allGenes <- allGenes[grep("^Y", allGenes)] # This is the context against which
|
allGenes <- allGenes[grep("^Y", allGenes)] # This is the context against which
|
||||||
# we define enrichment
|
# we define enrichment
|
||||||
|
|
||||||
@ -164,7 +173,7 @@ setdiff(fullSet, mySet) # These are annotated to that term but not in mySet.
|
|||||||
|
|
||||||
# What are these genes?
|
# What are these genes?
|
||||||
# Select annotations from the annotation database:
|
# Select annotations from the annotation database:
|
||||||
select(org.Sc.sgd.db,
|
AnnotationDbi::select(org.Sc.sgd.db,
|
||||||
keys = setdiff(fullSet, mySet),
|
keys = setdiff(fullSet, mySet),
|
||||||
columns = c("COMMON", "DESCRIPTION"))
|
columns = c("COMMON", "DESCRIPTION"))
|
||||||
|
|
||||||
|
@ -3,12 +3,15 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PHYLO-Data_preparation unit.
|
# R code accompanying the BIN-PHYLO-Data_preparation unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 31
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
|
# use Biocmanager:: not biocLite()
|
||||||
# 1.0 First 2017 version
|
# 1.0 First 2017 version
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
@ -28,12 +31,12 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------
|
#TOC> ---------------------------------------------------------
|
||||||
#TOC> 1 Preparations 41
|
#TOC> 1 Preparations 44
|
||||||
#TOC> 2 Fetching sequences 78
|
#TOC> 2 Fetching sequences 76
|
||||||
#TOC> 3 Multiple Sequence Alignment 119
|
#TOC> 3 Multiple Sequence Alignment 117
|
||||||
#TOC> 4 Reviewing and Editing Alignments 138
|
#TOC> 4 Reviewing and Editing Alignments 136
|
||||||
#TOC> 4.1 Masking workflow 154
|
#TOC> 4.1 Masking workflow 152
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -49,12 +52,11 @@ source("makeProteinDB.R")
|
|||||||
|
|
||||||
# Load packages we need
|
# Load packages we need
|
||||||
|
|
||||||
if (! require(Biostrings, quietly=TRUE)) {
|
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
install.packages("BiocManager")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
}
|
||||||
biocLite("Biostrings")
|
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||||
library(Biostrings)
|
BiocManager::install("Biostrings")
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = Biostrings) # basic information
|
# library(help = Biostrings) # basic information
|
||||||
@ -62,12 +64,8 @@ if (! require(Biostrings, quietly=TRUE)) {
|
|||||||
# data(package = "Biostrings") # available datasets
|
# data(package = "Biostrings") # available datasets
|
||||||
|
|
||||||
|
|
||||||
if (! require(msa, quietly=TRUE)) {
|
if (! requireNamespace("msa", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
BiocManager::install("msa")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
|
||||||
biocLite("msa")
|
|
||||||
library(msa)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = msa) # basic information
|
# library(help = msa) # basic information
|
||||||
@ -123,8 +121,8 @@ tail(APSI)
|
|||||||
# the MSA algorithms in Biostrings.
|
# the MSA algorithms in Biostrings.
|
||||||
#
|
#
|
||||||
|
|
||||||
APSESSet <- AAStringSet(APSI)
|
APSESSet <- Biostrings::AAStringSet(APSI)
|
||||||
APSESMsa <- msaMuscle(APSESSet, order = "aligned")
|
APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
|
||||||
|
|
||||||
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
|
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
|
||||||
# that happens in your case, just use msaClustalOmega() instead.
|
# that happens in your case, just use msaClustalOmega() instead.
|
||||||
|
@ -3,12 +3,15 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
|
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0.2
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 31
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
|
# use Biocmanager:: not biocLite()
|
||||||
# 1.0.2 Typo in variable name, style changes
|
# 1.0.2 Typo in variable name, style changes
|
||||||
# 1.0.1 Wrong section heading
|
# 1.0.1 Wrong section heading
|
||||||
# 1.0 First 2017 version
|
# 1.0 First 2017 version
|
||||||
@ -31,11 +34,11 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------
|
#TOC> --------------------------------------------------
|
||||||
#TOC> 1 Preparation and Tree Plot 43
|
#TOC> 1 Preparation and Tree Plot 46
|
||||||
#TOC> 2 Tree Analysis 82
|
#TOC> 2 Tree Analysis 86
|
||||||
#TOC> 2.1 Rooting Trees 141
|
#TOC> 2.1 Rooting Trees 145
|
||||||
#TOC> 2.2 Rotating Clades 187
|
#TOC> 2.2 Rotating Clades 190
|
||||||
#TOC> 2.3 Computing tree distances 234
|
#TOC> 2.3 Computing tree distances 241
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -43,19 +46,17 @@
|
|||||||
# = 1 Preparation and Tree Plot ===========================================
|
# = 1 Preparation and Tree Plot ===========================================
|
||||||
|
|
||||||
|
|
||||||
if (!require(Rphylip, quietly=TRUE)) {
|
if (! requireNamespace("ape", quietly = TRUE)) {
|
||||||
install.packages("Rphylip")
|
install.packages("ape")
|
||||||
library(Rphylip)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = Rphylip) # basic information
|
# library(help = ape) # basic information
|
||||||
# browseVignettes("Rphylip") # available vignettes
|
# browseVignettes("ape") # available vignettes
|
||||||
# data(package = "Rphylip") # available datasets
|
# data(package = "ape") # available datasets
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Read the species tree that you have created at the phyloT Website:
|
# Read the species tree that you have created at the phyloT Website:
|
||||||
fungiTree <- read.tree("fungiTree.txt")
|
fungiTree <- ape::read.tree("fungiTree.txt")
|
||||||
|
|
||||||
plot(fungiTree)
|
plot(fungiTree)
|
||||||
|
|
||||||
@ -73,7 +74,10 @@ for (i in seq_along(fungiTree$tip.label)) {
|
|||||||
|
|
||||||
# Plot the tree
|
# Plot the tree
|
||||||
plot(fungiTree, cex = 1.0, root.edge = TRUE, no.margin = TRUE)
|
plot(fungiTree, cex = 1.0, root.edge = TRUE, no.margin = TRUE)
|
||||||
nodelabels(text = fungiTree$node.label, cex = 0.6, adj = 0.2, bg = "#D4F2DA")
|
ape::nodelabels(text = fungiTree$node.label,
|
||||||
|
cex = 0.6,
|
||||||
|
adj = 0.2,
|
||||||
|
bg = "#D4F2DA")
|
||||||
# Note that you can use the arrow buttons in the menu above the plot to scroll
|
# Note that you can use the arrow buttons in the menu above the plot to scroll
|
||||||
# back to plots you have created earlier - so you can reference back to the
|
# back to plots you have created earlier - so you can reference back to the
|
||||||
# species tree.
|
# species tree.
|
||||||
@ -91,10 +95,10 @@ nodelabels(text = fungiTree$node.label, cex = 0.6, adj = 0.2, bg = "#D4F2DA")
|
|||||||
# trees in Newick format and visualize them elsewhere.
|
# trees in Newick format and visualize them elsewhere.
|
||||||
|
|
||||||
# The "phylo" class object is one of R's "S3" objects and methods to plot and
|
# The "phylo" class object is one of R's "S3" objects and methods to plot and
|
||||||
# print it have been defined with the Rphylip package, and the package ape that
|
# print it have been defined with the Rphylip package, and in ape. You can
|
||||||
# Rphylip has loaded. You can simply call plot(<your-tree>) and R knows what to
|
# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
|
||||||
# do with <your-tree> and how to plot it. The underlying function is
|
# to plot it. The underlying function is plot.phylo(), and documentation for its
|
||||||
# plot.phylo(), and documentation for its many options can by found by typing:
|
# many options can by found by typing:
|
||||||
|
|
||||||
?plot.phylo
|
?plot.phylo
|
||||||
|
|
||||||
@ -125,40 +129,39 @@ apsTree$edge.length
|
|||||||
|
|
||||||
# show the node / edge and tip labels on a plot
|
# show the node / edge and tip labels on a plot
|
||||||
plot(apsTree)
|
plot(apsTree)
|
||||||
nodelabels()
|
ape::nodelabels()
|
||||||
edgelabels()
|
ape::edgelabels()
|
||||||
tiplabels()
|
ape::tiplabels()
|
||||||
|
|
||||||
# show the number of nodes, edges and tips
|
# show the number of nodes, edges and tips
|
||||||
Nnode(apsTree)
|
ape::Nnode(apsTree)
|
||||||
Nedge(apsTree)
|
ape::Nedge(apsTree)
|
||||||
Ntip(apsTree)
|
ape::Ntip(apsTree)
|
||||||
|
|
||||||
|
|
||||||
# Finally, write the tree to console in Newick format
|
# Finally, write the tree to console in Newick format
|
||||||
write.tree(apsTree)
|
ape::write.tree(apsTree)
|
||||||
|
|
||||||
# == 2.1 Rooting Trees =====================================================
|
# == 2.1 Rooting Trees =====================================================
|
||||||
|
|
||||||
# In order to analyse the tree, it is helpful to root it first and reorder its
|
# In order to analyse the tree, it is helpful to root it first and reorder its
|
||||||
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
||||||
|
|
||||||
is.rooted(apsTree)
|
ape::is.rooted(apsTree)
|
||||||
|
|
||||||
# You can root the tree with the command root() from the "ape" package. ape is
|
# You can root the tree with the command root() from the "ape" package.
|
||||||
# automatically installed and loaded with Rphylip.
|
|
||||||
|
|
||||||
plot(apsTree)
|
plot(apsTree)
|
||||||
|
|
||||||
# add labels for internal nodes and tips
|
# add labels for internal nodes and tips
|
||||||
nodelabels(cex = 0.5, frame = "circle")
|
ape::nodelabels(cex = 0.5, frame = "circle")
|
||||||
tiplabels(cex = 0.5, frame = "rect")
|
ape::tiplabels(cex = 0.5, frame = "rect")
|
||||||
|
|
||||||
# The outgroup of the tree is tip "11" in my sample tree, it may be a different
|
# The outgroup of the tree is tip "11" in my sample tree, it may be a different
|
||||||
# number in yours. Substitute the correct node number below for "outgroup".
|
# number in yours. Substitute the correct node number below for "outgroup".
|
||||||
apsTree <- root(apsTree, outgroup = 11, resolve.root = TRUE)
|
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
|
||||||
plot(apsTree)
|
plot(apsTree)
|
||||||
is.rooted(apsTree)
|
ape::is.rooted(apsTree)
|
||||||
|
|
||||||
# This tree _looks_ unchanged, beacuse when the root trifurcation was resolved,
|
# This tree _looks_ unchanged, beacuse when the root trifurcation was resolved,
|
||||||
# an edge of length zero was added to connect the MRCA (Most Recent Common
|
# an edge of length zero was added to connect the MRCA (Most Recent Common
|
||||||
@ -172,7 +175,7 @@ apsTree$edge.length
|
|||||||
# overlap.
|
# overlap.
|
||||||
apsTree$edge.length[1] <- 0.1
|
apsTree$edge.length[1] <- 0.1
|
||||||
plot(apsTree, cex = 0.7)
|
plot(apsTree, cex = 0.7)
|
||||||
nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
|
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
|
||||||
|
|
||||||
|
|
||||||
# This procedure does however not assign an actual length to a root edge, and
|
# This procedure does however not assign an actual length to a root edge, and
|
||||||
@ -181,7 +184,7 @@ nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
|
|||||||
|
|
||||||
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
|
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
|
||||||
plot(apsTree, cex = 0.7, root.edge = TRUE)
|
plot(apsTree, cex = 0.7, root.edge = TRUE)
|
||||||
nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
|
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
|
||||||
|
|
||||||
|
|
||||||
# == 2.2 Rotating Clades ===================================================
|
# == 2.2 Rotating Clades ===================================================
|
||||||
@ -192,9 +195,9 @@ nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
|
|||||||
# We can either rotate around individual internal nodes ...
|
# We can either rotate around individual internal nodes ...
|
||||||
layout(matrix(1:2, 1, 2))
|
layout(matrix(1:2, 1, 2))
|
||||||
plot(apsTree, no.margin = TRUE, root.edge = TRUE)
|
plot(apsTree, no.margin = TRUE, root.edge = TRUE)
|
||||||
nodelabels(node = 17, cex = 0.7, bg = "#ff8866")
|
ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
|
||||||
plot(rotate(apsTree, node = 17), no.margin = TRUE, root.edge = TRUE)
|
plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
|
||||||
nodelabels(node = 17, cex = 0.7, bg = "#88ff66")
|
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
|
||||||
# Note that the species at the bottom of the clade descending from node
|
# Note that the species at the bottom of the clade descending from node
|
||||||
# 17 is now plotted at the top.
|
# 17 is now plotted at the top.
|
||||||
layout(matrix(1), widths = 1.0, heights = 1.0)
|
layout(matrix(1), widths = 1.0, heights = 1.0)
|
||||||
@ -211,11 +214,15 @@ nOrg <- length(apsTree$tip.label)
|
|||||||
layout(matrix(1:2, 1, 2))
|
layout(matrix(1:2, 1, 2))
|
||||||
plot(fungiTree,
|
plot(fungiTree,
|
||||||
no.margin = TRUE, root.edge = TRUE)
|
no.margin = TRUE, root.edge = TRUE)
|
||||||
nodelabels(text = fungiTree$node.label, cex = 0.5, adj = 0.2, bg = "#D4F2DA")
|
ape::nodelabels(text = fungiTree$node.label,
|
||||||
|
cex = 0.5,
|
||||||
|
adj = 0.2,
|
||||||
|
bg = "#D4F2DA")
|
||||||
|
|
||||||
plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
|
plot(ape::rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
|
||||||
no.margin = TRUE, root.edge = TRUE)
|
no.margin = TRUE,
|
||||||
add.scale.bar(length = 0.5)
|
root.edge = TRUE)
|
||||||
|
ape::add.scale.bar(length = 0.5)
|
||||||
layout(matrix(1), widths = 1.0, heights = 1.0)
|
layout(matrix(1), widths = 1.0, heights = 1.0)
|
||||||
|
|
||||||
# Task: Study the two trees and consider their similarities and differences.
|
# Task: Study the two trees and consider their similarities and differences.
|
||||||
@ -236,9 +243,8 @@ layout(matrix(1), widths = 1.0, heights = 1.0)
|
|||||||
|
|
||||||
# Many superb phylogeny tools are contributed by the phangorn package.
|
# Many superb phylogeny tools are contributed by the phangorn package.
|
||||||
|
|
||||||
if (!require(phangorn, quietly=TRUE)) {
|
if (! requireNamespace("phangorn", quietly = TRUE)) {
|
||||||
install.packages("phangorn")
|
install.packages("phangorn")
|
||||||
library(phangorn)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = phangorn) # basic information
|
# library(help = phangorn) # basic information
|
||||||
@ -253,14 +259,14 @@ apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
|
|||||||
# phangorn provides several functions to compute tree-differences (and there
|
# phangorn provides several functions to compute tree-differences (and there
|
||||||
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
|
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
|
||||||
# "symmetric difference"
|
# "symmetric difference"
|
||||||
treedist(fungiTree, apsTree2, check.labels = TRUE)
|
phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
|
||||||
|
|
||||||
# Numbers. What do they mean? How much more similar is our apsTree to the
|
# Numbers. What do they mean? How much more similar is our apsTree to the
|
||||||
# (presumably) ground truth of fungiTree than a random tree would be?
|
# (presumably) ground truth of fungiTree than a random tree would be?
|
||||||
# The ape package (which was loaded with RPhylip) provides the function rtree()
|
# The ape package provides the function rtree()
|
||||||
# to compute random trees.
|
# to compute random trees.
|
||||||
|
|
||||||
rtree(n = length(apsTree2$tip.label), # number of tips
|
ape::rtree(n = length(apsTree2$tip.label), # number of tips
|
||||||
rooted = TRUE, # we rooted the tree above,
|
rooted = TRUE, # we rooted the tree above,
|
||||||
# and fungiTree is rooted anyway
|
# and fungiTree is rooted anyway
|
||||||
tip.label = apsTree2$tip.label, # use the apsTree2 labels
|
tip.label = apsTree2$tip.label, # use the apsTree2 labels
|
||||||
@ -278,17 +284,17 @@ colnames(myTreeDistances) <- c("symm", "path")
|
|||||||
|
|
||||||
set.seed(112358)
|
set.seed(112358)
|
||||||
for (i in 1:N) {
|
for (i in 1:N) {
|
||||||
xTree <- rtree(n = length(apsTree2$tip.label),
|
xTree <- ape::rtree(n = length(apsTree2$tip.label),
|
||||||
rooted = TRUE,
|
rooted = TRUE,
|
||||||
tip.label = apsTree2$tip.label,
|
tip.label = apsTree2$tip.label,
|
||||||
br = NULL)
|
br = NULL)
|
||||||
myTreeDistances[i, ] <- treedist(fungiTree, xTree)
|
myTreeDistances[i, ] <- phangorn::treedist(fungiTree, xTree)
|
||||||
}
|
}
|
||||||
set.seed(NULL) # reset the random number generator
|
set.seed(NULL) # reset the random number generator
|
||||||
|
|
||||||
table(myTreeDistances[, "symm"])
|
table(myTreeDistances[, "symm"])
|
||||||
|
|
||||||
(symmObs <- treedist(fungiTree, apsTree2)[1])
|
(symmObs <- phangorn::treedist(fungiTree, apsTree2)[1])
|
||||||
|
|
||||||
# Random events less-or-equal to observation, divided by total number of
|
# Random events less-or-equal to observation, divided by total number of
|
||||||
# events gives us the empirical p-value.
|
# events gives us the empirical p-value.
|
||||||
@ -298,7 +304,7 @@ cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n
|
|||||||
hist(myTreeDistances[, "path"],
|
hist(myTreeDistances[, "path"],
|
||||||
col = "aliceblue",
|
col = "aliceblue",
|
||||||
main = "Distances of random Trees to fungiTree")
|
main = "Distances of random Trees to fungiTree")
|
||||||
(pathObs <- treedist(fungiTree, apsTree2)[2])
|
(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
|
||||||
abline(v = pathObs, col = "chartreuse")
|
abline(v = pathObs, col = "chartreuse")
|
||||||
|
|
||||||
# Random events less-or-equal to observation, divided by total number of
|
# Random events less-or-equal to observation, divided by total number of
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PHYLO-Tree_building unit.
|
# R code accompanying the BIN-PHYLO-Tree_building unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10. 31
|
# Date: 2017 10. 31
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
# 1.0 First 2017 version
|
# 1.0 First 2017 version
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
@ -29,14 +31,14 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -------------------------------------------------------
|
#TOC> -----------------------------------------------------------
|
||||||
#TOC> 1 Calculating Trees 43
|
#TOC> 1 Calculating Trees 46
|
||||||
#TOC> 1.1 PROMLPATH ... 64
|
#TOC> 1.1 PROMLPATH ... 66
|
||||||
#TOC> 1.1.1 ... on the Mac 69
|
#TOC> 1.1.1 ... on the Mac 71
|
||||||
#TOC> 1.1.2 ... on Windows 80
|
#TOC> 1.1.2 ... on Windows 82
|
||||||
#TOC> 1.1.3 ... on Linux 94
|
#TOC> 1.1.3 ... on Linux 96
|
||||||
#TOC> 1.1.4 Confirming PROMLPATH 99
|
#TOC> 1.1.4 Confirming PROMLPATH 101
|
||||||
#TOC> 1.2 Building a maximum likelihood tree 108
|
#TOC> 1.2 Building a maximum likelihood tree 110
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -50,9 +52,8 @@
|
|||||||
# After you have installed Phylip on your computer, install the R package that
|
# After you have installed Phylip on your computer, install the R package that
|
||||||
# provides an interface to the Phylip functions.
|
# provides an interface to the Phylip functions.
|
||||||
|
|
||||||
if (!require(Rphylip, quietly=TRUE)) {
|
if (! requireNamespace("Rphylip", quietly = TRUE)) {
|
||||||
install.packages("Rphylip")
|
install.packages("Rphylip")
|
||||||
library(Rphylip)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = Rphylip) # basic information
|
# library(help = Rphylip) # basic information
|
||||||
@ -110,7 +111,7 @@ list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
|
|||||||
# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
|
# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
|
||||||
# as a "proseq" object with the read.protein() function of the RPhylip package:
|
# as a "proseq" object with the read.protein() function of the RPhylip package:
|
||||||
|
|
||||||
apsIn <- read.protein("APSESphyloSet.mfa")
|
apsIn <- Rphylip::read.protein("APSESphyloSet.mfa")
|
||||||
|
|
||||||
# ... and you are ready to build a tree.
|
# ... and you are ready to build a tree.
|
||||||
|
|
||||||
@ -125,7 +126,7 @@ apsIn <- read.protein("APSESphyloSet.mfa")
|
|||||||
# process will take us about 5 to 10 minutes. Run this, and anjoy a good cup
|
# process will take us about 5 to 10 minutes. Run this, and anjoy a good cup
|
||||||
# of coffee while you are waiting.
|
# of coffee while you are waiting.
|
||||||
|
|
||||||
apsTree <- Rproml(apsIn, path=PROMLPATH)
|
apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
|
||||||
|
|
||||||
# A quick first look:
|
# A quick first look:
|
||||||
|
|
||||||
|
@ -3,12 +3,15 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PPI-Analysis unit.
|
# R code accompanying the BIN-PPI-Analysis unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 08 - 2017 11
|
# Date: 2017 08 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
|
# use Biocmanager:: not biocLite()
|
||||||
# 1.0 First live version
|
# 1.0 First live version
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
@ -29,13 +32,13 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------------------
|
#TOC> ---------------------------------------------------------------
|
||||||
#TOC> 1 Setup and data 43
|
#TOC> 1 Setup and data 46
|
||||||
#TOC> 2 Functional Edges in the Human Proteome 80
|
#TOC> 2 Functional Edges in the Human Proteome 82
|
||||||
#TOC> 2.1 Cliques 123
|
#TOC> 2.1 Cliques 125
|
||||||
#TOC> 2.2 Communities 164
|
#TOC> 2.2 Communities 166
|
||||||
#TOC> 2.3 Betweenness Centrality 178
|
#TOC> 2.3 Betweenness Centrality 180
|
||||||
#TOC> 3 biomaRt 224
|
#TOC> 3 biomaRt 226
|
||||||
#TOC> 4 Task for submission 295
|
#TOC> 4 Task for submission 296
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -45,9 +48,8 @@
|
|||||||
|
|
||||||
# Not surprisingly, the analysis of PPI networks needs iGraph:
|
# Not surprisingly, the analysis of PPI networks needs iGraph:
|
||||||
|
|
||||||
if (!require(igraph, quietly=TRUE)) {
|
if (! requireNamespace("igraph", quietly = TRUE)) {
|
||||||
install.packages("igraph")
|
install.packages("igraph")
|
||||||
library(igraph)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = igraph) # basic information
|
# library(help = igraph) # basic information
|
||||||
@ -88,9 +90,9 @@ head(STRINGedges)
|
|||||||
|
|
||||||
|
|
||||||
# Make a graph from this dataframe
|
# Make a graph from this dataframe
|
||||||
?graph_from_data_frame
|
?igraph::graph_from_data_frame
|
||||||
|
|
||||||
gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE)
|
gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
|
||||||
|
|
||||||
# CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
|
# CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
|
||||||
# layout of such large graphs is possible, but requires specialized code. Google
|
# layout of such large graphs is possible, but requires specialized code. Google
|
||||||
@ -99,13 +101,13 @@ gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE)
|
|||||||
|
|
||||||
# Of course simple computations on this graph are reasonably fast:
|
# Of course simple computations on this graph are reasonably fast:
|
||||||
|
|
||||||
compSTR <- components(gSTR)
|
compSTR <- igraph::components(gSTR)
|
||||||
summary(compSTR) # our graph is fully connected!
|
summary(compSTR) # our graph is fully connected!
|
||||||
|
|
||||||
hist(log(degree(gSTR)), col="#FEE0AF")
|
hist(log(igraph::degree(gSTR)), col="#FEE0AF")
|
||||||
# this actually does look rather scale-free
|
# this actually does look rather scale-free
|
||||||
|
|
||||||
(freqRank <- table(degree(gSTR)))
|
(freqRank <- table(igraph::degree(gSTR)))
|
||||||
plot(log10(as.numeric(names(freqRank)) + 1),
|
plot(log10(as.numeric(names(freqRank)) + 1),
|
||||||
log10(as.numeric(freqRank)), type = "b",
|
log10(as.numeric(freqRank)), type = "b",
|
||||||
pch = 21, bg = "#FEE0AF",
|
pch = 21, bg = "#FEE0AF",
|
||||||
@ -126,29 +128,29 @@ abline(regressionLine, col = "firebrick")
|
|||||||
# subgraph, i.e. a subgraph in which every node is connected to every other.
|
# subgraph, i.e. a subgraph in which every node is connected to every other.
|
||||||
# Biological complexes often appear as cliques in interaction graphs.
|
# Biological complexes often appear as cliques in interaction graphs.
|
||||||
|
|
||||||
clique_num(gSTR)
|
igraph::clique_num(gSTR)
|
||||||
# The largest clique has 63 members.
|
# The largest clique has 63 members.
|
||||||
|
|
||||||
(C <- largest_cliques(gSTR)[[1]])
|
(C <- igraph::largest_cliques(gSTR)[[1]])
|
||||||
|
|
||||||
# Pick one of the proteins and find out what this fully connected cluster of 63
|
# Pick one of the proteins and find out what this fully connected cluster of 63
|
||||||
# proteins is (you can simply Google for any of the IDs). Is this expected?
|
# proteins is (you can simply Google for any of the IDs). Is this expected?
|
||||||
|
|
||||||
# Plot this ...
|
# Plot this ...
|
||||||
R <- induced_subgraph(gSTR, C) # makes a graph from a selected set of vertices
|
R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
|
||||||
|
|
||||||
# color the vertices along a color spectrum
|
# color the vertices along a color spectrum
|
||||||
vCol <- rainbow(gorder(R)) # gorder(): order of a graph = number of nodes
|
vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
|
||||||
|
|
||||||
# color the edges to have the same color as the originating node
|
# color the edges to have the same color as the originating node
|
||||||
eCol <- character()
|
eCol <- character()
|
||||||
for (i in seq_along(vCol)) {
|
for (i in seq_along(vCol)) {
|
||||||
eCol <- c(eCol, rep(vCol[i], gorder(R)))
|
eCol <- c(eCol, rep(vCol[i], igraph::gorder(R)))
|
||||||
}
|
}
|
||||||
|
|
||||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||||
plot(R,
|
plot(R,
|
||||||
layout = layout_in_circle(R),
|
layout = igraph::layout_in_circle(R),
|
||||||
vertex.size = 3,
|
vertex.size = 3,
|
||||||
vertex.color = vCol,
|
vertex.color = vCol,
|
||||||
edge.color = eCol,
|
edge.color = eCol,
|
||||||
@ -164,14 +166,14 @@ par(oPar)
|
|||||||
# == 2.2 Communities =======================================================
|
# == 2.2 Communities =======================================================
|
||||||
|
|
||||||
set.seed(112358) # set RNG seed for repeatable randomness
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
gSTRclusters <- cluster_infomap(gSTR)
|
gSTRclusters <- igraph::cluster_infomap(gSTR)
|
||||||
set.seed(NULL) # reset the RNG
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
modularity(gSTRclusters) # ... measures how separated the different membership
|
igraph::modularity(gSTRclusters) # ... measures how separated the different
|
||||||
# types are from each other
|
# membership types are from each other
|
||||||
tMem <- table(membership(gSTRclusters))
|
tMem <- table(igraph::membership(gSTRclusters))
|
||||||
length(tMem) # More than 2000 communities identified
|
length(tMem) # More than 2000 communities identified
|
||||||
hist(tMem, breaks = 50) # most clusters are small ...
|
hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ...
|
||||||
range(tMem) # ... but one has > 100 members
|
range(tMem) # ... but one has > 100 members
|
||||||
|
|
||||||
|
|
||||||
@ -179,14 +181,14 @@ range(tMem) # ... but one has > 100 members
|
|||||||
|
|
||||||
# Let's find the nodes with the 10 - highest betweenness centralities.
|
# Let's find the nodes with the 10 - highest betweenness centralities.
|
||||||
#
|
#
|
||||||
BC <- centr_betw(gSTR)
|
BC <- igraph::centr_betw(gSTR)
|
||||||
|
|
||||||
# remember: BC$res contains the results
|
# remember: BC$res contains the results
|
||||||
head(BC$res)
|
head(BC$res)
|
||||||
|
|
||||||
BC$res[1] # betweeness centrality of node 1 in the graph ...
|
BC$res[1] # betweeness centrality of node 1 in the graph ...
|
||||||
# ... which one is node 1?
|
# ... which one is node 1?
|
||||||
V(gSTR)[1]
|
igraph::V(gSTR)[1]
|
||||||
|
|
||||||
# to get the ten-highest nodes, we simply label the elements of BC with their
|
# to get the ten-highest nodes, we simply label the elements of BC with their
|
||||||
# index ...
|
# index ...
|
||||||
@ -203,7 +205,7 @@ head(sBC)
|
|||||||
|
|
||||||
# We can use the first ten labels to subset the nodes in gSTR and fetch the
|
# We can use the first ten labels to subset the nodes in gSTR and fetch the
|
||||||
# IDs...
|
# IDs...
|
||||||
(ENSPsel <- names(V(gSTR)[BCsel]))
|
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
|
||||||
|
|
||||||
# We are going to use these IDs to produce some output for a submitted task:
|
# We are going to use these IDs to produce some output for a submitted task:
|
||||||
# so I need you to personalize ENSPsel with the following
|
# so I need you to personalize ENSPsel with the following
|
||||||
@ -231,12 +233,11 @@ set.seed(NULL) # reset the random number generator
|
|||||||
# day), simply a few lines of sample code to get you started on the specific use
|
# day), simply a few lines of sample code to get you started on the specific use
|
||||||
# case of retrieving descriptions for ensembl protein IDs.
|
# case of retrieving descriptions for ensembl protein IDs.
|
||||||
|
|
||||||
if (!require(biomaRt, quietly=TRUE)) {
|
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
install.packages("BiocManager")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
}
|
||||||
biocLite("biomaRt")
|
if (! requireNamespace("biomaRt", quietly = TRUE)) {
|
||||||
library(biomaRt)
|
BiocManager::install("biomaRt")
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = biomaRt) # basic information
|
# library(help = biomaRt) # basic information
|
||||||
@ -244,14 +245,14 @@ if (!require(biomaRt, quietly=TRUE)) {
|
|||||||
# data(package = "biomaRt") # available datasets
|
# data(package = "biomaRt") # available datasets
|
||||||
|
|
||||||
# define which dataset to use ...
|
# define which dataset to use ...
|
||||||
myMart <- useMart("ensembl", dataset="hsapiens_gene_ensembl")
|
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
|
||||||
|
|
||||||
# what filters are defined?
|
# what filters are defined?
|
||||||
(filters <- listFilters(myMart))
|
(filters <- biomaRt::listFilters(myMart))
|
||||||
|
|
||||||
|
|
||||||
# and what attributes can we filter for?
|
# and what attributes can we filter for?
|
||||||
(attributes <- listAttributes(myMart))
|
(attributes <- biomaRt::listAttributes(myMart))
|
||||||
|
|
||||||
|
|
||||||
# Soooo many options - let's look for the correct name of filters that are
|
# Soooo many options - let's look for the correct name of filters that are
|
||||||
@ -264,7 +265,7 @@ attributes[grep("description", attributes$description, ignore.case=TRUE), ]
|
|||||||
|
|
||||||
|
|
||||||
# ... so we can put this together: here is a syntax example:
|
# ... so we can put this together: here is a syntax example:
|
||||||
getBM(filters = "ensembl_peptide_id",
|
biomaRt::getBM(filters = "ensembl_peptide_id",
|
||||||
attributes = c("hgnc_symbol",
|
attributes = c("hgnc_symbol",
|
||||||
"wikigene_description",
|
"wikigene_description",
|
||||||
"interpro_description",
|
"interpro_description",
|
||||||
@ -279,7 +280,7 @@ CPdefs <- list() # Since we don't know how many matches one of our queries
|
|||||||
# will return, we'll put the result dataframes into a list.
|
# will return, we'll put the result dataframes into a list.
|
||||||
|
|
||||||
for (ID in ENSPsel) {
|
for (ID in ENSPsel) {
|
||||||
CPdefs[[ID]] <- getBM(filters = "ensembl_peptide_id",
|
CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
|
||||||
attributes = c("hgnc_symbol",
|
attributes = c("hgnc_symbol",
|
||||||
"wikigene_description",
|
"wikigene_description",
|
||||||
"interpro_description",
|
"interpro_description",
|
||||||
|
@ -3,13 +3,17 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-SEQA-Comparison unit
|
# R code accompanying the BIN-SEQA-Comparison unit
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 11 17
|
# Date: 2017 11 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# V 1.0 First live version 2017
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
|
# use <package>::<function>() idiom throughout,
|
||||||
|
# use Biocmanager:: not biocLite()
|
||||||
|
# Versions:
|
||||||
|
# 1.0 First live version 2017
|
||||||
|
# 0.1 First code copied from BCH441_A03_makeYFOlist.R
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
#
|
||||||
@ -27,24 +31,23 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------------------
|
#TOC> ----------------------------------------------------------
|
||||||
#TOC> 1 Preparation 41
|
#TOC> 1 Preparation 47
|
||||||
#TOC> 2 Aggregate properties 63
|
#TOC> 2 Aggregate properties 68
|
||||||
#TOC> 3 Sequence Composition Enrichment 106
|
#TOC> 3 Sequence Composition Enrichment 111
|
||||||
#TOC> 3.1 Barplot, and side-by-side barplot 129
|
#TOC> 3.1 Barplot, and side-by-side barplot 134
|
||||||
#TOC> 3.2 Plotting ratios 164
|
#TOC> 3.2 Plotting ratios 169
|
||||||
#TOC> 3.3 Plotting log ratios 180
|
#TOC> 3.3 Plotting log ratios 185
|
||||||
#TOC> 3.4 Sort by frequency 195
|
#TOC> 3.4 Sort by frequency 200
|
||||||
#TOC> 3.5 Color by amino acid type 210
|
#TOC> 3.5 Color by amino acid type 215
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
# = 1 Preparation =========================================================
|
# = 1 Preparation =========================================================
|
||||||
|
|
||||||
if (!require(seqinr, quietly=TRUE)) {
|
if (! requireNamespace("seqinr", quietly = TRUE)) {
|
||||||
install.packages("seqinr")
|
install.packages("seqinr")
|
||||||
library(seqinr)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = seqinr) # basic information
|
# library(help = seqinr) # basic information
|
||||||
@ -66,7 +69,7 @@ if (!require(seqinr, quietly=TRUE)) {
|
|||||||
|
|
||||||
|
|
||||||
# Let's try a simple function from seqinr: computing the pI of the sequence
|
# Let's try a simple function from seqinr: computing the pI of the sequence
|
||||||
?computePI
|
?seqinr::computePI
|
||||||
|
|
||||||
# This takes as input a vector of upper-case AA codes
|
# This takes as input a vector of upper-case AA codes
|
||||||
|
|
||||||
@ -82,12 +85,12 @@ s <- unlist(s) # strsplit() returns a list! Why?
|
|||||||
# the function s2c() to convert strings into
|
# the function s2c() to convert strings into
|
||||||
# character vectors (and c2s to convert them back).
|
# character vectors (and c2s to convert them back).
|
||||||
|
|
||||||
s2c(mySeq)
|
seqinr::s2c(mySeq)
|
||||||
|
|
||||||
|
|
||||||
computePI(s2c(mySeq)) # isoelectric point
|
seqinr::computePI(s2c(mySeq)) # isoelectric point
|
||||||
pmw(s2c(mySeq)) # molecular weight
|
seqinr::pmw(s2c(mySeq)) # molecular weight
|
||||||
AAstat(s2c(mySeq)) # This also plots the distribution of
|
seqinr::AAstat(s2c(mySeq)) # This also plots the distribution of
|
||||||
# values along the sequence
|
# values along the sequence
|
||||||
|
|
||||||
# A true Labor of Love has gone into the
|
# A true Labor of Love has gone into the
|
||||||
@ -117,7 +120,7 @@ aaindex[[459]]$D
|
|||||||
# with the amino acid counts in our sequence.
|
# with the amino acid counts in our sequence.
|
||||||
|
|
||||||
(refData <- aaindex[[459]]$I) # reference frequencies in %
|
(refData <- aaindex[[459]]$I) # reference frequencies in %
|
||||||
names(refData) <- a(names(refData)) # change names to single-letter
|
names(refData) <- seqinr::a(names(refData)) # change names to single-letter
|
||||||
# code using seqinr's "a()" function
|
# code using seqinr's "a()" function
|
||||||
sum(refData)
|
sum(refData)
|
||||||
refData # ... in %
|
refData # ... in %
|
||||||
|
@ -3,12 +3,15 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-Sequence unit.
|
# R code accompanying the BIN-Sequence unit.
|
||||||
#
|
#
|
||||||
# Version: 1.3
|
# Version: 1.4
|
||||||
#
|
#
|
||||||
# Date: 2017 09 - 2019 01
|
# Date: 2017 09 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.4 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
|
# use Biocmanager:: not biocLite()
|
||||||
# 1.3 Update set.seed() usage
|
# 1.3 Update set.seed() usage
|
||||||
# 1.2 Removed irrelevant task. How did that even get in there? smh
|
# 1.2 Removed irrelevant task. How did that even get in there? smh
|
||||||
# 1.1 Add chartr()
|
# 1.1 Add chartr()
|
||||||
@ -30,23 +33,23 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------------------
|
#TOC> ----------------------------------------------------
|
||||||
#TOC> 1 Prepare 60
|
#TOC> 1 Prepare 63
|
||||||
#TOC> 2 Storing Sequence 78
|
#TOC> 2 Storing Sequence 80
|
||||||
#TOC> 3 String properties 107
|
#TOC> 3 String properties 109
|
||||||
#TOC> 4 Substrings 114
|
#TOC> 4 Substrings 116
|
||||||
#TOC> 5 Creating strings: sprintf() 135
|
#TOC> 5 Creating strings: sprintf() 137
|
||||||
#TOC> 6 Changing strings 170
|
#TOC> 6 Changing strings 172
|
||||||
#TOC> 6.1.1 Changing case 172
|
#TOC> 6.1.1 Changing case 174
|
||||||
#TOC> 6.1.2 Reverse 177
|
#TOC> 6.1.2 Reverse 179
|
||||||
#TOC> 6.1.3 Change characters 181
|
#TOC> 6.1.3 Change characters 183
|
||||||
#TOC> 6.1.4 Substitute characters 209
|
#TOC> 6.1.4 Substitute characters 211
|
||||||
#TOC> 6.2 stringi and stringr 229
|
#TOC> 6.2 stringi and stringr 231
|
||||||
#TOC> 6.3 dbSanitizeSequence() 239
|
#TOC> 6.3 dbSanitizeSequence() 241
|
||||||
#TOC> 7 Permuting and sampling 251
|
#TOC> 7 Permuting and sampling 253
|
||||||
#TOC> 7.1 Permutations 258
|
#TOC> 7.1 Permutations 260
|
||||||
#TOC> 7.2 Sampling 304
|
#TOC> 7.2 Sampling 306
|
||||||
#TOC> 7.2.1 Equiprobable characters 306
|
#TOC> 7.2.1 Equiprobable characters 308
|
||||||
#TOC> 7.2.2 Defined probability vector 348
|
#TOC> 7.2.2 Defined probability vector 350
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -62,12 +65,11 @@
|
|||||||
# Much basic sequence handling is supported by the Bioconductor package
|
# Much basic sequence handling is supported by the Bioconductor package
|
||||||
# Biostrings.
|
# Biostrings.
|
||||||
|
|
||||||
if (! require(Biostrings, quietly=TRUE)) {
|
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
install.packages("BiocManager")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
}
|
||||||
biocLite("Biostrings")
|
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||||
library(Biostrings)
|
BiocManager::install("Biostrings")
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = Biostrings) # basic information
|
# library(help = Biostrings) # basic information
|
||||||
@ -86,7 +88,7 @@ if (! require(Biostrings, quietly=TRUE)) {
|
|||||||
|
|
||||||
# ... or as more complex objects with rich metadata e.g. as a Biostrings
|
# ... or as more complex objects with rich metadata e.g. as a Biostrings
|
||||||
# DNAstring, RNAstring, AAString, etc.
|
# DNAstring, RNAstring, AAString, etc.
|
||||||
(a <- AAString("DIVMTQ"))
|
(a <- Biostrings::AAString("DIVMTQ"))
|
||||||
|
|
||||||
# ... and all of these representations can be interconverted:
|
# ... and all of these representations can be interconverted:
|
||||||
|
|
||||||
@ -314,6 +316,7 @@ N <- 100
|
|||||||
set.seed(16818) # set RNG seed for repeatable randomness
|
set.seed(16818) # set RNG seed for repeatable randomness
|
||||||
v <- sample(nuc, N, replace = TRUE)
|
v <- sample(nuc, N, replace = TRUE)
|
||||||
set.seed(NULL) # reset the RNG
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
(mySeq <- paste(v, collapse = ""))
|
(mySeq <- paste(v, collapse = ""))
|
||||||
|
|
||||||
# What's the GC content?
|
# What's the GC content?
|
||||||
@ -323,9 +326,8 @@ sum(table(v)[c("G", "C")]) # 51 is close to expected
|
|||||||
# What's the number of CpG motifs? Easy to check with the stringi
|
# What's the number of CpG motifs? Easy to check with the stringi
|
||||||
# stri_match_all() function
|
# stri_match_all() function
|
||||||
|
|
||||||
if (! require(stringi, quietly=TRUE)) {
|
if (! requireNamespace("stringi", quietly = TRUE)) {
|
||||||
install.packages("stringi")
|
install.packages("stringi")
|
||||||
library(stringi)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = stringi) # basic information
|
# library(help = stringi) # basic information
|
||||||
|
@ -29,27 +29,27 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------------------------
|
#TOC> -----------------------------------------------------------------------
|
||||||
#TOC> 1 A Relational Datamodel in R: review 62
|
#TOC> 1 A Relational Datamodel in R: review 57
|
||||||
#TOC> 1.1 Building a sample database structure 102
|
#TOC> 1.1 Building a sample database structure 97
|
||||||
#TOC> 1.1.1 completing the database 213
|
#TOC> 1.1.1 completing the database 208
|
||||||
#TOC> 1.2 Querying the database 248
|
#TOC> 1.2 Querying the database 243
|
||||||
#TOC> 1.3 Task: submit for credit (part 1/2) 277
|
#TOC> 1.3 Task: submit for credit (part 1/2) 272
|
||||||
#TOC> 2 Implementing the protein datamodel 289
|
#TOC> 2 Implementing the protein datamodel 284
|
||||||
#TOC> 2.1 JSON formatted source data 315
|
#TOC> 2.1 JSON formatted source data 310
|
||||||
#TOC> 2.2 "Sanitizing" sequence data 355
|
#TOC> 2.2 "Sanitizing" sequence data 350
|
||||||
#TOC> 2.3 Create a protein table for our data model 375
|
#TOC> 2.3 Create a protein table for our data model 370
|
||||||
#TOC> 2.3.1 Initialize the database 377
|
#TOC> 2.3.1 Initialize the database 372
|
||||||
#TOC> 2.3.2 Add data 389
|
#TOC> 2.3.2 Add data 384
|
||||||
#TOC> 2.4 Complete the database 409
|
#TOC> 2.4 Complete the database 404
|
||||||
#TOC> 2.4.1 Examples of navigating the database 436
|
#TOC> 2.4.1 Examples of navigating the database 431
|
||||||
#TOC> 2.5 Updating the database 468
|
#TOC> 2.5 Updating the database 463
|
||||||
#TOC> 3 Add your own data 480
|
#TOC> 3 Add your own data 475
|
||||||
#TOC> 3.1 Find a protein 488
|
#TOC> 3.1 Find a protein 483
|
||||||
#TOC> 3.2 Put the information into JSON files 517
|
#TOC> 3.2 Put the information into JSON files 512
|
||||||
#TOC> 3.3 Create an R script to create your own database 540
|
#TOC> 3.3 Create an R script to create your own database 535
|
||||||
#TOC> 3.3.1 Check and validate 560
|
#TOC> 3.3.1 Check and validate 555
|
||||||
#TOC> 3.4 Task: submit for credit (part 2/2) 601
|
#TOC> 3.4 Task: submit for credit (part 2/2) 596
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -328,11 +328,11 @@ file.show("./data/MBP1_SACCE.json")
|
|||||||
# sanitize the sequence at some point. But since we need to do that
|
# sanitize the sequence at some point. But since we need to do that
|
||||||
# anyway, it is easier to see the whole sequence if we store it in chunks.
|
# anyway, it is easier to see the whole sequence if we store it in chunks.
|
||||||
|
|
||||||
# Let's load the "jsonlite" package and have a look at how it reads this data.
|
# Let's make sure the "jsonlite" package exists on your computer, then we'll
|
||||||
|
# explore how it reads this data.
|
||||||
|
|
||||||
if (! require(jsonlite, quietly=TRUE)) {
|
if (! requireNamespace("jsonlite", quietly = TRUE)) {
|
||||||
install.packages("jsonlite")
|
install.packages("jsonlite")
|
||||||
library(jsonlite)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = jsonlite) # basic information
|
# library(help = jsonlite) # basic information
|
||||||
@ -340,7 +340,7 @@ if (! require(jsonlite, quietly=TRUE)) {
|
|||||||
# data(package = "jsonlite") # available datasets
|
# data(package = "jsonlite") # available datasets
|
||||||
|
|
||||||
|
|
||||||
x <- fromJSON("./data/MBP1_SACCE.json")
|
x <- jsonlite::fromJSON("./data/MBP1_SACCE.json")
|
||||||
str(x)
|
str(x)
|
||||||
|
|
||||||
x$name
|
x$name
|
||||||
@ -389,7 +389,7 @@ str(myDB)
|
|||||||
|
|
||||||
dbAddProtein
|
dbAddProtein
|
||||||
|
|
||||||
myDB <- dbAddProtein(myDB, fromJSON("./data/MBP1_SACCE.json"))
|
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
|
||||||
str(myDB)
|
str(myDB)
|
||||||
|
|
||||||
# Lets check that the 833 amino acids of the yeast MBP1 sequence have
|
# Lets check that the 833 amino acids of the yeast MBP1 sequence have
|
||||||
|
@ -3,12 +3,15 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the FND-Genetic_code unit.
|
# R code accompanying the FND-Genetic_code unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0.1
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 12
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
|
# use Biocmanager:: not biocLite()
|
||||||
# 1.0.1 Comment on "incomplete final line" warning in FASTA
|
# 1.0.1 Comment on "incomplete final line" warning in FASTA
|
||||||
# 1.0 First live version
|
# 1.0 First live version
|
||||||
#
|
#
|
||||||
@ -27,14 +30,14 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------------------------
|
#TOC> ----------------------------------------------------------------
|
||||||
#TOC> 1 Storing the genetic code 47
|
#TOC> 1 Storing the genetic code 45
|
||||||
#TOC> 1.1 Genetic code in Biostrings 65
|
#TOC> 1.1 Genetic code in Biostrings 63
|
||||||
#TOC> 2 Working with the genetic code 97
|
#TOC> 2 Working with the genetic code 94
|
||||||
#TOC> 2.1 Translate a sequence. 126
|
#TOC> 2.1 Translate a sequence. 129
|
||||||
#TOC> 3 An alternative representation: 3D array 208
|
#TOC> 3 An alternative representation: 3D array 212
|
||||||
#TOC> 3.1 Print a Genetic code table 241
|
#TOC> 3.1 Print a Genetic code table 246
|
||||||
#TOC> 4 Tasks 267
|
#TOC> 4 Tasks 272
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -63,12 +66,11 @@ x["TAA"]
|
|||||||
# available in the Bioconductor "Biostrings" package:
|
# available in the Bioconductor "Biostrings" package:
|
||||||
|
|
||||||
|
|
||||||
if (! require(Biostrings, quietly=TRUE)) {
|
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
install.packages("BiocManager")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
}
|
||||||
biocLite("Biostrings")
|
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||||
library(Biostrings)
|
BiocManager::install("Biostrings")
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = Biostrings) # basic information
|
# library(help = Biostrings) # basic information
|
||||||
@ -77,45 +79,51 @@ if (! require(Biostrings, quietly=TRUE)) {
|
|||||||
|
|
||||||
|
|
||||||
# The standard genetic code vector
|
# The standard genetic code vector
|
||||||
GENETIC_CODE
|
Biostrings::GENETIC_CODE
|
||||||
|
|
||||||
# The table of genetic codes. This information corresponds to this page
|
# The table of genetic codes. This information corresponds to this page
|
||||||
# at the NCBI:
|
# at the NCBI:
|
||||||
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
|
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
|
||||||
GENETIC_CODE_TABLE
|
Biostrings::GENETIC_CODE_TABLE
|
||||||
|
|
||||||
# Most of the alternative codes are mitochondrial codes. The id of the
|
# Most of the alternative codes are mitochondrial codes. The id of the
|
||||||
# Alternative Yeast Nuclear code is "12"
|
# Alternative Yeast Nuclear code is "12"
|
||||||
getGeneticCode("12") # Alternative Yeast Nuclear
|
Biostrings::getGeneticCode("12") # Alternative Yeast Nuclear
|
||||||
|
|
||||||
|
|
||||||
# = 2 Working with the genetic code =======================================
|
# = 2 Working with the genetic code =======================================
|
||||||
|
|
||||||
# GENETIC_CODE is a "named vector"
|
# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
|
||||||
|
# to a "local" variable, rather than retrieving it from the package all the
|
||||||
|
# time.
|
||||||
|
|
||||||
str(GENETIC_CODE)
|
genCode <- Biostrings::GENETIC_CODE
|
||||||
|
|
||||||
|
# This is a named vector of characters ...
|
||||||
|
|
||||||
|
str(genCode)
|
||||||
|
|
||||||
# ... which also stores the alternative initiation codons TTG and CTG in
|
# ... which also stores the alternative initiation codons TTG and CTG in
|
||||||
# an attribute of the vector. (Alternative initiation codons sometimes are
|
# an attribute of the vector. (Alternative initiation codons sometimes are
|
||||||
# used instead of ATG to intiate translation, if if not ATG they are translated
|
# used instead of ATG to intiate translation, if if not ATG they are translated
|
||||||
# with fMet.)
|
# with fMet.)
|
||||||
|
|
||||||
attr(GENETIC_CODE, "alt_init_codons")
|
attr(genCode, "alt_init_codons")
|
||||||
|
|
||||||
# But the key to use this vector is in the "names" which we use for subsetting
|
# But the key to use this vector is in the "names" which we use for subsetting
|
||||||
# the list of amino acids in whatever way we need.
|
# the list of amino acids in whatever way we need.
|
||||||
names(GENETIC_CODE)
|
names(genCode)
|
||||||
|
|
||||||
# The translation of "TGG" ...
|
# The translation of "TGG" ...
|
||||||
GENETIC_CODE["TGG"]
|
genCode["TGG"]
|
||||||
|
|
||||||
# All stop codons
|
# All stop codons
|
||||||
names(GENETIC_CODE)[GENETIC_CODE == "*"]
|
names(genCode)[genCode == "*"]
|
||||||
|
|
||||||
# All start codons
|
# All start codons
|
||||||
names(GENETIC_CODE)[GENETIC_CODE == "M"] # ... or
|
names(genCode)[genCode == "M"] # ... or
|
||||||
c(names(GENETIC_CODE)[GENETIC_CODE == "M"],
|
c(names(genCode)[genCode == "M"],
|
||||||
attr(GENETIC_CODE, "alt_init_codons"))
|
attr(genCode, "alt_init_codons"))
|
||||||
|
|
||||||
|
|
||||||
# == 2.1 Translate a sequence. =============================================
|
# == 2.1 Translate a sequence. =============================================
|
||||||
@ -165,7 +173,7 @@ nchar(mbp1)/3
|
|||||||
# attributes that are useful for Biostrings. Thus we convert the sequence first
|
# attributes that are useful for Biostrings. Thus we convert the sequence first
|
||||||
# with DNAstring(), then split it up, then convert it into a plain
|
# with DNAstring(), then split it up, then convert it into a plain
|
||||||
# character vector.
|
# character vector.
|
||||||
mbp1Codons <- as.character(codons(DNAString(mbp1)))
|
mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
|
||||||
|
|
||||||
head(mbp1Codons)
|
head(mbp1Codons)
|
||||||
|
|
||||||
@ -173,7 +181,7 @@ head(mbp1Codons)
|
|||||||
|
|
||||||
mbp1AA <- character(834)
|
mbp1AA <- character(834)
|
||||||
for (i in seq_along(mbp1Codons)) {
|
for (i in seq_along(mbp1Codons)) {
|
||||||
mbp1AA[i] <- GENETIC_CODE[mbp1Codons[i]]
|
mbp1AA[i] <- genCode[mbp1Codons[i]]
|
||||||
}
|
}
|
||||||
|
|
||||||
head(mbp1Codons)
|
head(mbp1Codons)
|
||||||
@ -196,7 +204,8 @@ sort(table(mbp1AA), decreasing = TRUE)
|
|||||||
mbp1AA <- mbp1AA[-(length(mbp1AA))]
|
mbp1AA <- mbp1AA[-(length(mbp1AA))]
|
||||||
tail(mbp1AA) # Note the stop is gone!
|
tail(mbp1AA) # Note the stop is gone!
|
||||||
|
|
||||||
# paste it together, collapsing the elements without separation-character
|
# paste it together, collapsing the elements using an empty string as the
|
||||||
|
# separation-character (i.e.: nothing)
|
||||||
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
|
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
|
||||||
|
|
||||||
|
|
||||||
@ -204,14 +213,15 @@ tail(mbp1AA) # Note the stop is gone!
|
|||||||
|
|
||||||
|
|
||||||
# We don't use 3D arrays often - usually just 2D tables and data frames, so
|
# We don't use 3D arrays often - usually just 2D tables and data frames, so
|
||||||
# here is a good opportunity to review the syntax with a genetic code cube:
|
# here is a good opportunity to review the syntax of 3D arrays with a
|
||||||
|
# genetic code cube:
|
||||||
|
|
||||||
# Initialize, using A C G T as the names of the elements in each dimension
|
# Initialize, using A G C T as the names of the elements in each dimension
|
||||||
cCube <- array(data = character(64),
|
cCube <- array(data = character(64),
|
||||||
dim = c(4, 4, 4),
|
dim = c(4, 4, 4),
|
||||||
dimnames = list(c("A", "C", "G", "T"),
|
dimnames = list(c("A", "G", "C", "T"),
|
||||||
c("A", "C", "G", "T"),
|
c("A", "G", "C", "T"),
|
||||||
c("A", "C", "G", "T")))
|
c("A", "G", "C", "T")))
|
||||||
|
|
||||||
# fill it with amino acid codes using three nested loops
|
# fill it with amino acid codes using three nested loops
|
||||||
for (i in 1:4) {
|
for (i in 1:4) {
|
||||||
@ -222,7 +232,7 @@ for (i in 1:4) {
|
|||||||
dimnames(cCube)[[3]][k],
|
dimnames(cCube)[[3]][k],
|
||||||
sep = "",
|
sep = "",
|
||||||
collapse = "")
|
collapse = "")
|
||||||
cCube[i, j, k] <- GENETIC_CODE[myCodon]
|
cCube[i, j, k] <- genCode[myCodon]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -291,14 +301,14 @@ for (i in nuc) {
|
|||||||
# Solution:
|
# Solution:
|
||||||
|
|
||||||
# Fetch the code
|
# Fetch the code
|
||||||
GENETIC_CODE_TABLE
|
Biostrings::GENETIC_CODE_TABLE
|
||||||
GENETIC_CODE_TABLE$name[GENETIC_CODE_TABLE$id == "12"]
|
Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
|
||||||
altYcode <- getGeneticCode("12")
|
altYcode <- Biostrings::getGeneticCode("12")
|
||||||
|
|
||||||
# what's the difference?
|
# what's the difference?
|
||||||
(delta <- which(GENETIC_CODE != altYcode))
|
(delta <- which(Biostrings::GENETIC_CODE != altYcode))
|
||||||
|
|
||||||
GENETIC_CODE[delta]
|
Biostrings::GENETIC_CODE[delta]
|
||||||
altYcode[delta]
|
altYcode[delta]
|
||||||
|
|
||||||
# translate
|
# translate
|
||||||
@ -319,7 +329,7 @@ for (i in nuc) {
|
|||||||
#
|
#
|
||||||
#
|
#
|
||||||
# Solution:
|
# Solution:
|
||||||
table(table(GENETIC_CODE))
|
table(table(Biostrings::GENETIC_CODE))
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the FND-MAT-Graphs_and_networks unit.
|
# R code accompanying the FND-MAT-Graphs_and_networks unit.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2017 10 - 2019 01
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.2 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 1.1 Update set.seed() usage
|
# 1.1 Update set.seed() usage
|
||||||
# 1.0 First final version for learning units.
|
# 1.0 First final version for learning units.
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
@ -30,17 +32,17 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ------------------------------------------------------------
|
#TOC> ------------------------------------------------------------
|
||||||
#TOC> 1 Review 48
|
#TOC> 1 Review 50
|
||||||
#TOC> 2 DEGREE DISTRIBUTIONS 201
|
#TOC> 2 DEGREE DISTRIBUTIONS 204
|
||||||
#TOC> 2.1 Random graph 207
|
#TOC> 2.1 Random graph 210
|
||||||
#TOC> 2.2 scale-free graph (Barabasi-Albert) 255
|
#TOC> 2.2 scale-free graph (Barabasi-Albert) 258
|
||||||
#TOC> 2.3 Random geometric graph 320
|
#TOC> 2.3 Random geometric graph 323
|
||||||
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 442
|
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 445
|
||||||
#TOC> 3.1 Basics 445
|
#TOC> 3.1 Basics 448
|
||||||
#TOC> 3.2 Components 517
|
#TOC> 3.2 Components 520
|
||||||
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 536
|
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 539
|
||||||
#TOC> 4.1 Diameter 573
|
#TOC> 4.1 Diameter 576
|
||||||
#TOC> 5 GRAPH CLUSTERING 641
|
#TOC> 5 GRAPH CLUSTERING 645
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -123,9 +125,8 @@ set.seed(NULL) # reset the RNG
|
|||||||
# standard package for work with graphs in r is "igraph". We'll go into more
|
# standard package for work with graphs in r is "igraph". We'll go into more
|
||||||
# details of the igraph package a bit later, for now we just use it to plot:
|
# details of the igraph package a bit later, for now we just use it to plot:
|
||||||
|
|
||||||
if (! require(igraph, quietly=TRUE)) {
|
if (! requireNamespace("igraph", quietly = TRUE)) {
|
||||||
install.packages("igraph")
|
install.packages("igraph")
|
||||||
library(igraph)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = igraph) # basic information
|
# library(help = igraph) # basic information
|
||||||
@ -133,10 +134,12 @@ if (! require(igraph, quietly=TRUE)) {
|
|||||||
# data(package = "igraph") # available datasets
|
# data(package = "igraph") # available datasets
|
||||||
|
|
||||||
|
|
||||||
myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected")
|
myG <- igraph::graph_from_adjacency_matrix(myRandAM, mode = "undirected")
|
||||||
|
|
||||||
set.seed(112358) # set RNG seed for repeatable randomness
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates
|
# calculate layout coordinates
|
||||||
|
myGxy <- igraph::layout_with_graphopt(myG,
|
||||||
|
charge=0.0012)
|
||||||
set.seed(NULL) # reset the RNG
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
|
|
||||||
@ -157,9 +160,9 @@ plot(myG,
|
|||||||
rescale = FALSE,
|
rescale = FALSE,
|
||||||
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
||||||
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
||||||
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
|
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
|
||||||
vertex.size = 1600 + (300 * degree(myG)),
|
vertex.size = 1600 + (300 * igraph::degree(myG)),
|
||||||
vertex.label = sprintf("%s(%i)", names(V(myG)), degree(myG)),
|
vertex.label = sprintf("%s(%i)", names(igraph::V(myG)), igraph::degree(myG)),
|
||||||
vertex.label.family = "sans",
|
vertex.label.family = "sans",
|
||||||
vertex.label.cex = 0.7)
|
vertex.label.cex = 0.7)
|
||||||
par(oPar) # reset plot window
|
par(oPar) # reset plot window
|
||||||
@ -179,10 +182,10 @@ sum(myRandAM)
|
|||||||
rowSums(myRandAM) + colSums(myRandAM) # check this against the plot!
|
rowSums(myRandAM) + colSums(myRandAM) # check this against the plot!
|
||||||
|
|
||||||
# The function degree() gives the same values
|
# The function degree() gives the same values
|
||||||
degree(myG)
|
igraph::degree(myG)
|
||||||
|
|
||||||
# Let's plot the degree distribution in a histogram:
|
# Let's plot the degree distribution in a histogram:
|
||||||
degG <- degree(myG)
|
degG <- igraph::degree(myG)
|
||||||
brk <- seq(min(degG)-0.5, max(degG)+0.5, by=1) # define histogram breaks
|
brk <- seq(min(degG)-0.5, max(degG)+0.5, by=1) # define histogram breaks
|
||||||
hist(degG, breaks=brk, col="#A5CCF5",
|
hist(degG, breaks=brk, col="#A5CCF5",
|
||||||
xlim = c(-1,8), xaxt = "n",
|
xlim = c(-1,8), xaxt = "n",
|
||||||
@ -212,8 +215,8 @@ set.seed(31415927) # set RNG seed for repeatable randomness
|
|||||||
my200AM <- makeRandomAM(as.character(1:N), p = 0.015)
|
my200AM <- makeRandomAM(as.character(1:N), p = 0.015)
|
||||||
set.seed(NULL) # reset the RNG
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected")
|
myG200 <- igraph::graph_from_adjacency_matrix(my200AM, mode = "undirected")
|
||||||
myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout
|
myGxy <- igraph::layout_with_graphopt(myG200, charge=0.0001) # calculate layout
|
||||||
# coordinates
|
# coordinates
|
||||||
|
|
||||||
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
|
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
|
||||||
@ -222,8 +225,8 @@ plot(myG200,
|
|||||||
rescale = FALSE,
|
rescale = FALSE,
|
||||||
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
||||||
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
||||||
vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1],
|
vertex.color=heat.colors(max(igraph::degree(myG200)+1))[igraph::degree(myG200)+1],
|
||||||
vertex.size = 150 + (60 * degree(myG200)),
|
vertex.size = 150 + (60 * igraph::degree(myG200)),
|
||||||
vertex.label = NA)
|
vertex.label = NA)
|
||||||
par(oPar) # restore graphics state
|
par(oPar) # restore graphics state
|
||||||
|
|
||||||
@ -231,7 +234,7 @@ par(oPar) # restore graphics state
|
|||||||
# biological graphs look approximately like this.
|
# biological graphs look approximately like this.
|
||||||
|
|
||||||
# Calculate degree distributions
|
# Calculate degree distributions
|
||||||
dg <- degree(myG200)
|
dg <- igraph::degree(myG200)
|
||||||
brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1)
|
brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1)
|
||||||
hist(dg, breaks=brk, col="#A5F5CC",
|
hist(dg, breaks=brk, col="#A5F5CC",
|
||||||
xlim = c(-1,11), xaxt = "n",
|
xlim = c(-1,11), xaxt = "n",
|
||||||
@ -263,10 +266,10 @@ plot(log10(as.numeric(names(freqRank)) + 1),
|
|||||||
N <- 200
|
N <- 200
|
||||||
|
|
||||||
set.seed(31415927) # set RNG seed for repeatable randomness
|
set.seed(31415927) # set RNG seed for repeatable randomness
|
||||||
GBA <- sample_pa(N, power = 0.8, directed = FALSE)
|
GBA <- igraph::sample_pa(N, power = 0.8, directed = FALSE)
|
||||||
set.seed(NULL) # reset the RNG
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates
|
GBAxy <- igraph::layout_with_graphopt(GBA, charge=0.0001)
|
||||||
|
|
||||||
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
|
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
|
||||||
plot(GBA,
|
plot(GBA,
|
||||||
@ -274,8 +277,8 @@ plot(GBA,
|
|||||||
rescale = FALSE,
|
rescale = FALSE,
|
||||||
xlim = c(min(GBAxy[,1]) * 0.99, max(GBAxy[,1]) * 1.01),
|
xlim = c(min(GBAxy[,1]) * 0.99, max(GBAxy[,1]) * 1.01),
|
||||||
ylim = c(min(GBAxy[,2]) * 0.99, max(GBAxy[,2]) * 1.01),
|
ylim = c(min(GBAxy[,2]) * 0.99, max(GBAxy[,2]) * 1.01),
|
||||||
vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1],
|
vertex.color=heat.colors(max(igraph::degree(GBA)+1))[igraph::degree(GBA)+1],
|
||||||
vertex.size = 200 + (30 * degree(GBA)),
|
vertex.size = 200 + (30 * igraph::degree(GBA)),
|
||||||
vertex.label = NA)
|
vertex.label = NA)
|
||||||
par(oPar) # restore grphics state
|
par(oPar) # restore grphics state
|
||||||
|
|
||||||
@ -287,7 +290,7 @@ par(oPar) # restore grphics state
|
|||||||
# singletons.
|
# singletons.
|
||||||
|
|
||||||
# What's the degree distribution of this graph?
|
# What's the degree distribution of this graph?
|
||||||
(dg <- degree(GBA))
|
(dg <- igraph::degree(GBA))
|
||||||
brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1)
|
brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1)
|
||||||
hist(dg, breaks=brk, col="#DCF5B5",
|
hist(dg, breaks=brk, col="#DCF5B5",
|
||||||
xlim = c(0,max(dg)+1), xaxt = "n",
|
xlim = c(0,max(dg)+1), xaxt = "n",
|
||||||
@ -307,8 +310,8 @@ plot(log10(as.numeric(names(freqRank)) + 1),
|
|||||||
# Sort-of linear, but many of the higher ranked nodes have a frequency of only
|
# Sort-of linear, but many of the higher ranked nodes have a frequency of only
|
||||||
# one. That behaviour smooths out in larger graphs:
|
# one. That behaviour smooths out in larger graphs:
|
||||||
#
|
#
|
||||||
X <- sample_pa(100000, power = 0.8, directed = FALSE) # 100,000 nodes
|
X <- igraph::sample_pa(1e5, power = 0.8, directed = FALSE) # 100,000 nodes
|
||||||
freqRank <- table(degree(X))
|
freqRank <- table(igraph::degree(X))
|
||||||
plot(log10(as.numeric(names(freqRank)) + 1),
|
plot(log10(as.numeric(names(freqRank)) + 1),
|
||||||
log10(as.numeric(freqRank)), type = "b",
|
log10(as.numeric(freqRank)), type = "b",
|
||||||
xlab = "log(Rank)", ylab = "log(frequency)",
|
xlab = "log(Rank)", ylab = "log(frequency)",
|
||||||
@ -404,7 +407,7 @@ rGAM <- makeRandomGeometricAM(as.character(1:N), t = 0.4)
|
|||||||
set.seed(NULL) # reset the RNG
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
|
|
||||||
myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
|
myGRG <- igraph::graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
|
||||||
|
|
||||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||||
plot(myGRG,
|
plot(myGRG,
|
||||||
@ -412,13 +415,13 @@ plot(myGRG,
|
|||||||
rescale = FALSE,
|
rescale = FALSE,
|
||||||
xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1),
|
xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1),
|
||||||
ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1),
|
ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1),
|
||||||
vertex.color=heat.colors(max(degree(myGRG)+1))[degree(myGRG)+1],
|
vertex.color=heat.colors(max(igraph::degree(myGRG)+1))[igraph::degree(myGRG)+1],
|
||||||
vertex.size = 0.1 + (0.2 * degree(myGRG)),
|
vertex.size = 0.1 + (0.2 * igraph::degree(myGRG)),
|
||||||
vertex.label = NA)
|
vertex.label = NA)
|
||||||
par(oPar)
|
par(oPar)
|
||||||
|
|
||||||
# degree distribution:
|
# degree distribution:
|
||||||
(dg <- degree(myGRG))
|
(dg <- igraph::degree(myGRG))
|
||||||
brk <- seq(min(dg) - 0.5, max(dg) + 0.5, by = 1)
|
brk <- seq(min(dg) - 0.5, max(dg) + 0.5, by = 1)
|
||||||
hist(dg, breaks = brk, col = "#FCC6D2",
|
hist(dg, breaks = brk, col = "#FCC6D2",
|
||||||
xlim = c(0, 25), xaxt = "n",
|
xlim = c(0, 25), xaxt = "n",
|
||||||
@ -450,7 +453,7 @@ summary(myG)
|
|||||||
|
|
||||||
# This output means: this is an IGRAPH graph, with U = UN-directed edges
|
# This output means: this is an IGRAPH graph, with U = UN-directed edges
|
||||||
# and N = named nodes, that has 20 nodes and 20 edges. For details, see
|
# and N = named nodes, that has 20 nodes and 20 edges. For details, see
|
||||||
?print.igraph
|
?igraph::print.igraph
|
||||||
|
|
||||||
mode(myG)
|
mode(myG)
|
||||||
class(myG)
|
class(myG)
|
||||||
@ -463,11 +466,11 @@ class(myG)
|
|||||||
# recipes, called _games_ in this package.
|
# recipes, called _games_ in this package.
|
||||||
|
|
||||||
# Two basic functions retrieve nodes "Vertices", and "Edges":
|
# Two basic functions retrieve nodes "Vertices", and "Edges":
|
||||||
V(myG)
|
igraph::V(myG)
|
||||||
E(myG)
|
igraph::E(myG)
|
||||||
|
|
||||||
# additional properties can be retrieved from the Vertices ...
|
# additional properties can be retrieved from the Vertices ...
|
||||||
V(myG)$name
|
igraph::V(myG)$name
|
||||||
|
|
||||||
|
|
||||||
# As with many R objects, loading the package provides special functions that
|
# As with many R objects, loading the package provides special functions that
|
||||||
@ -487,12 +490,12 @@ plot(myG) # this is the result of default plot parameters
|
|||||||
# Plot with some customizing parameters
|
# Plot with some customizing parameters
|
||||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||||
plot(myG,
|
plot(myG,
|
||||||
layout = layout_with_fr(myG),
|
layout = igraph::layout_with_fr(myG),
|
||||||
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
|
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
|
||||||
vertex.size = 9 + (2 * degree(myG)),
|
vertex.size = 9 + (2 * igraph::degree(myG)),
|
||||||
vertex.label.cex = 0.5 + (0.05 * degree(myG)),
|
vertex.label.cex = 0.5 + (0.05 * igraph::degree(myG)),
|
||||||
edge.width = 2,
|
edge.width = 2,
|
||||||
vertex.label = V(myG)$name,
|
vertex.label = igraph::V(myG)$name,
|
||||||
vertex.label.family = "sans",
|
vertex.label.family = "sans",
|
||||||
vertex.label.cex = 0.9)
|
vertex.label.cex = 0.9)
|
||||||
par(oPar)
|
par(oPar)
|
||||||
@ -500,12 +503,12 @@ par(oPar)
|
|||||||
# ... or with a different layout:
|
# ... or with a different layout:
|
||||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||||
plot(myG,
|
plot(myG,
|
||||||
layout = layout_in_circle(myG),
|
layout = igraph::layout_in_circle(myG),
|
||||||
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
|
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
|
||||||
vertex.size = 9 + (2 * degree(myG)),
|
vertex.size = 9 + (2 * igraph::degree(myG)),
|
||||||
vertex.label.cex = 0.5 + (0.05 * degree(myG)),
|
vertex.label.cex = 0.5 + (0.05 * igraph::degree(myG)),
|
||||||
edge.width = 2,
|
edge.width = 2,
|
||||||
vertex.label = V(myG)$name,
|
vertex.label = igraph::V(myG)$name,
|
||||||
vertex.label.family = "sans",
|
vertex.label.family = "sans",
|
||||||
vertex.label.cex = 0.9)
|
vertex.label.cex = 0.9)
|
||||||
par(oPar)
|
par(oPar)
|
||||||
@ -518,18 +521,18 @@ par(oPar)
|
|||||||
|
|
||||||
# The igraph function components() tells us whether there are components of the
|
# The igraph function components() tells us whether there are components of the
|
||||||
# graph in which there is no path to other components.
|
# graph in which there is no path to other components.
|
||||||
components(myG)
|
igraph::components(myG)
|
||||||
|
|
||||||
# In the _membership_ vector, nodes are annotated with the index of the
|
# In the _membership_ vector, nodes are annotated with the index of the
|
||||||
# component they are part of. Sui7 is the only node of component 2, Cyj1 is in
|
# component they are part of. Sui7 is the only node of component 2, Cyj1 is in
|
||||||
# the third component etc. This is perhaps more clear if we sort by component
|
# the third component etc. This is perhaps more clear if we sort by component
|
||||||
# index
|
# index
|
||||||
sort(components(myG)$membership, decreasing = TRUE)
|
sort(igraph::components(myG)$membership, decreasing = TRUE)
|
||||||
|
|
||||||
# Retrieving e.g. the members of the first component from the list can be done by subsetting:
|
# Retrieving e.g. the members of the first component from the list can be done by subsetting:
|
||||||
|
|
||||||
(sel <- components(myG)$membership == 1) # boolean vector ..
|
(sel <- igraph::components(myG)$membership == 1) # boolean vector ..
|
||||||
(c1 <- components(myG)$membership[sel])
|
(c1 <- igraph::components(myG)$membership[sel])
|
||||||
names(c1)
|
names(c1)
|
||||||
|
|
||||||
|
|
||||||
@ -542,9 +545,9 @@ names(c1)
|
|||||||
# preferential-attachment ... but igraph has ways to simulate the basic ones
|
# preferential-attachment ... but igraph has ways to simulate the basic ones
|
||||||
# (and we could easily simulate our own). Look at the following help pages:
|
# (and we could easily simulate our own). Look at the following help pages:
|
||||||
|
|
||||||
?sample_gnm # see also sample_gnp for the Erdös-Rényi models
|
?igraph::sample_gnm # see also sample_gnp for the Erdös-Rényi models
|
||||||
?sample_smallworld # for the Watts & Strogatz model
|
?igraph::sample_smallworld # for the Watts & Strogatz model
|
||||||
?sample_pa # for the Barabasi-Albert model
|
?igraph::sample_pa # for the Barabasi-Albert model
|
||||||
|
|
||||||
# But note that there are many more sample_ functions. Check out the docs!
|
# But note that there are many more sample_ functions. Check out the docs!
|
||||||
|
|
||||||
@ -554,7 +557,7 @@ names(c1)
|
|||||||
# layout drawas them, obviously.
|
# layout drawas them, obviously.
|
||||||
|
|
||||||
set.seed(112358) # set RNG seed for repeatable randomness
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
myGxy <- layout_with_fr(myG) # calculate layout coordinates
|
myGxy <- igraph::layout_with_fr(myG) # calculate layout coordinates
|
||||||
set.seed(NULL) # reset the RNG
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
oPar <- par(mar = rep(0, 4)) # turn margins off, save graphics state
|
oPar <- par(mar = rep(0, 4)) # turn margins off, save graphics state
|
||||||
@ -563,30 +566,31 @@ plot(myG,
|
|||||||
rescale = FALSE,
|
rescale = FALSE,
|
||||||
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
||||||
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
||||||
vertex.color=heat.colors(max(degree(myG) + 1))[degree(myG) + 1],
|
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
|
||||||
vertex.size = 20 + (10 * degree(myG)),
|
vertex.size = 20 + (10 * igraph::degree(myG)),
|
||||||
vertex.label = V(myG)$name,
|
vertex.label = igraph::V(myG)$name,
|
||||||
vertex.label.family = "sans",
|
vertex.label.family = "sans",
|
||||||
vertex.label.cex = 0.8)
|
vertex.label.cex = 0.8)
|
||||||
par(oPar) # restore graphics state
|
par(oPar) # restore graphics state
|
||||||
|
|
||||||
# == 4.1 Diameter ==========================================================
|
# == 4.1 Diameter ==========================================================
|
||||||
|
|
||||||
diameter(myG) # The diameter of a graph is its maximum length shortest path.
|
igraph::diameter(myG) # The diameter of a graph is its maximum length
|
||||||
|
# shortest path.
|
||||||
|
|
||||||
# let's plot this path: here are the nodes ...
|
# let's plot this path: here are the nodes ...
|
||||||
get_diameter(myG)
|
igraph::get_diameter(myG)
|
||||||
|
|
||||||
# ... and we can get the x, y coordinates from iGxy by subsetting with the node
|
# ... and we can get the x, y coordinates from iGxy by subsetting with the node
|
||||||
# names. The we draw the diameter-path with a transparent, thick pink line:
|
# names. The we draw the diameter-path with a transparent, thick pink line:
|
||||||
lines(myGxy[get_diameter(myG),], lwd=10, col="#ff63a788")
|
lines(myGxy[igraph::get_diameter(myG),], lwd=10, col="#ff63a788")
|
||||||
|
|
||||||
# == Centralization scores
|
# == Centralization scores
|
||||||
|
|
||||||
?centralize
|
?igraph::centralize
|
||||||
# replot our graph, and color by log_betweenness:
|
# replot our graph, and color by log_betweenness:
|
||||||
|
|
||||||
bC <- centr_betw(myG) # calculate betweenness centrality
|
bC <- igraph::centr_betw(myG) # calculate betweenness centrality
|
||||||
nodeBetw <- bC$res
|
nodeBetw <- bC$res
|
||||||
nodeBetw <- round(log(nodeBetw +1)) + 1
|
nodeBetw <- round(log(nodeBetw +1)) + 1
|
||||||
|
|
||||||
@ -597,8 +601,8 @@ plot(myG,
|
|||||||
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
||||||
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
||||||
vertex.color=heat.colors(max(nodeBetw))[nodeBetw],
|
vertex.color=heat.colors(max(nodeBetw))[nodeBetw],
|
||||||
vertex.size = 20 + (10 * degree(myG)),
|
vertex.size = 20 + (10 * igraph::degree(myG)),
|
||||||
vertex.label = V(myG)$name,
|
vertex.label = igraph::V(myG)$name,
|
||||||
vertex.label.family = "sans",
|
vertex.label.family = "sans",
|
||||||
vertex.label.cex = 0.7)
|
vertex.label.cex = 0.7)
|
||||||
par(oPar)
|
par(oPar)
|
||||||
@ -613,7 +617,7 @@ par(oPar)
|
|||||||
#
|
#
|
||||||
# Lets plot betweenness centrality for our random geometric graph:
|
# Lets plot betweenness centrality for our random geometric graph:
|
||||||
|
|
||||||
bCmyGRG <- centr_betw(myGRG) # calculate betweenness centrality
|
bCmyGRG <- igraph::centr_betw(myGRG) # calculate betweenness centrality
|
||||||
|
|
||||||
nodeBetw <- bCmyGRG$res
|
nodeBetw <- bCmyGRG$res
|
||||||
nodeBetw <- round((log(nodeBetw +1))^2.5) + 1
|
nodeBetw <- round((log(nodeBetw +1))^2.5) + 1
|
||||||
@ -630,9 +634,9 @@ plot(myGRG,
|
|||||||
vertex.label = NA)
|
vertex.label = NA)
|
||||||
par(oPar)
|
par(oPar)
|
||||||
|
|
||||||
diameter(myGRG)
|
igraph::diameter(myGRG)
|
||||||
lines(rGAM$x[get_diameter(myGRG)],
|
lines(rGAM$x[igraph::get_diameter(myGRG)],
|
||||||
rGAM$y[get_diameter(myGRG)],
|
rGAM$y[igraph::get_diameter(myGRG)],
|
||||||
lwd = 10,
|
lwd = 10,
|
||||||
col = "#ff335533")
|
col = "#ff335533")
|
||||||
|
|
||||||
@ -648,11 +652,11 @@ lines(rGAM$x[get_diameter(myGRG)],
|
|||||||
# http://www.ncbi.nlm.nih.gov/pubmed/18216267 and htttp://www.mapequation.org
|
# http://www.ncbi.nlm.nih.gov/pubmed/18216267 and htttp://www.mapequation.org
|
||||||
|
|
||||||
|
|
||||||
myGRGclusters <- cluster_infomap(myGRG)
|
myGRGclusters <- igraph::cluster_infomap(myGRG)
|
||||||
modularity(myGRGclusters) # ... measures how separated the different membership
|
igraph::modularity(myGRGclusters) # ... measures how separated the different
|
||||||
# types are from each other
|
# membership types are from each other
|
||||||
membership(myGRGclusters) # which nodes are in what cluster?
|
igraph::membership(myGRGclusters) # which nodes are in what cluster?
|
||||||
table(membership(myGRGclusters)) # how large are the clusters?
|
table(igraph::membership(myGRGclusters)) # how large are the clusters?
|
||||||
|
|
||||||
# The largest cluster has 48 members, the second largest has 25, etc.
|
# The largest cluster has 48 members, the second largest has 25, etc.
|
||||||
|
|
||||||
@ -661,7 +665,7 @@ table(membership(myGRGclusters)) # how large are the clusters?
|
|||||||
# their cluster membership:
|
# their cluster membership:
|
||||||
|
|
||||||
# first, make a vector with as many grey colors as we have communities ...
|
# first, make a vector with as many grey colors as we have communities ...
|
||||||
commColors <- rep("#f1eef6", max(membership(myGRGclusters)))
|
commColors <- rep("#f1eef6", max(igraph::membership(myGRGclusters)))
|
||||||
# ... then overwrite the first five with "real colors" - something like rust,
|
# ... then overwrite the first five with "real colors" - something like rust,
|
||||||
# lilac, pink, and mauve or so.
|
# lilac, pink, and mauve or so.
|
||||||
commColors[1:5] <- c("#980043", "#dd1c77", "#df65b0", "#c994c7", "#d4b9da")
|
commColors[1:5] <- c("#980043", "#dd1c77", "#df65b0", "#c994c7", "#d4b9da")
|
||||||
@ -673,8 +677,8 @@ plot(myGRG,
|
|||||||
rescale = FALSE,
|
rescale = FALSE,
|
||||||
xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1),
|
xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1),
|
||||||
ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1),
|
ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1),
|
||||||
vertex.color=commColors[membership(myGRGclusters)],
|
vertex.color=commColors[igraph::membership(myGRGclusters)],
|
||||||
vertex.size = 0.1 + (0.1 * degree(myGRG)),
|
vertex.size = 0.1 + (0.1 * igraph::degree(myGRG)),
|
||||||
vertex.label = NA)
|
vertex.label = NA)
|
||||||
par(oPar)
|
par(oPar)
|
||||||
|
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the FND-STA-Probability_distribution unit.
|
# R code accompanying the FND-STA-Probability_distribution unit.
|
||||||
#
|
#
|
||||||
# Version: 1.2
|
# Version: 1.3
|
||||||
#
|
#
|
||||||
# Date: 2017 10 - 2019 01
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.3 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
# 1.2 Update set.seed() usage
|
# 1.2 Update set.seed() usage
|
||||||
# 1.1 Corrected empirical p-value
|
# 1.1 Corrected empirical p-value
|
||||||
# 1.0 First code live version
|
# 1.0 First code live version
|
||||||
@ -28,21 +30,21 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -------------------------------------------------------------------------
|
#TOC> -----------------------------------------------------------------------------
|
||||||
#TOC> 1 Introduction 50
|
#TOC> 1 Introduction 52
|
||||||
#TOC> 2 Three fundamental distributions 113
|
#TOC> 2 Three fundamental distributions 115
|
||||||
#TOC> 2.1 The Poisson Distribution 116
|
#TOC> 2.1 The Poisson Distribution 118
|
||||||
#TOC> 2.2 The uniform distribution 170
|
#TOC> 2.2 The uniform distribution 172
|
||||||
#TOC> 2.3 The Normal Distribution 190
|
#TOC> 2.3 The Normal Distribution 192
|
||||||
#TOC> 3 quantile-quantile comparison 231
|
#TOC> 3 quantile-quantile comparison 233
|
||||||
#TOC> 3.1 qqnorm() 241
|
#TOC> 3.1 qqnorm() 243
|
||||||
#TOC> 3.2 qqplot() 307
|
#TOC> 3.2 qqplot() 309
|
||||||
#TOC> 4 Quantifying the difference 324
|
#TOC> 4 Quantifying the difference 326
|
||||||
#TOC> 4.1 Chi2 test for discrete distributions 359
|
#TOC> 4.1 Chi2 test for discrete distributions 361
|
||||||
#TOC> 4.2 Kullback-Leibler divergence 451
|
#TOC> 4.2 Kullback-Leibler divergence 452
|
||||||
#TOC> 4.2.1 An example from tossing dice 462
|
#TOC> 4.2.1 An example from tossing dice 463
|
||||||
#TOC> 4.2.2 An example from lognormal distributions 585
|
#TOC> 4.2.2 An example from lognormal distributions 586
|
||||||
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 628
|
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 629
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -385,9 +387,8 @@ hist(rG1.5, breaks = myBreaks, col = myCols[4])
|
|||||||
# package information - plotrix has _many_ useful utilities to enhance
|
# package information - plotrix has _many_ useful utilities to enhance
|
||||||
# plots or produce informative visualizations.
|
# plots or produce informative visualizations.
|
||||||
|
|
||||||
if (! require(plotrix, quietly=TRUE)) {
|
if (! requireNamespace("plotrix", quietly = TRUE)) {
|
||||||
install.packages("plotrix")
|
install.packages("plotrix")
|
||||||
library(plotrix)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = plotrix) # basic information
|
# library(help = plotrix) # basic information
|
||||||
@ -395,7 +396,7 @@ if (! require(plotrix, quietly=TRUE)) {
|
|||||||
# data(package = "plotrix") # available datasets
|
# data(package = "plotrix") # available datasets
|
||||||
|
|
||||||
|
|
||||||
h <- multhist(list(rL1, rL2, rG1.2, rG1.5, rG1.9 ),
|
h <- plotrix::multhist(list(rL1, rL2, rG1.2, rG1.5, rG1.9 ),
|
||||||
breaks = myBreaks,
|
breaks = myBreaks,
|
||||||
col = myCols)
|
col = myCols)
|
||||||
legend("topright",
|
legend("topright",
|
||||||
|
151
RPR-Biostrings.R
151
RPR-Biostrings.R
@ -3,12 +3,15 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Biostrings unit.
|
# R code accompanying the RPR-Biostrings unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 20
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
|
# use Biocmanager:: not biocLite()
|
||||||
# 1.0 2017 Revisions
|
# 1.0 2017 Revisions
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
@ -28,19 +31,19 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------------
|
#TOC> ---------------------------------------------------------------
|
||||||
#TOC> 1 The Biostrings Package 52
|
#TOC> 1 The Biostrings Package 55
|
||||||
#TOC> 2 Getting Data into Biostrings Objects 85
|
#TOC> 2 Getting Data into Biostrings Objects 86
|
||||||
#TOC> 3 Working with Biostrings Objects 106
|
#TOC> 3 Working with Biostrings Objects 108
|
||||||
#TOC> 3.1 Properties 109
|
#TOC> 3.1 Properties 125
|
||||||
#TOC> 3.2 Subsetting 146
|
#TOC> 3.2 Subsetting 163
|
||||||
#TOC> 3.3 Operators 158
|
#TOC> 3.3 Operators 175
|
||||||
#TOC> 3.4 Transformations 165
|
#TOC> 3.4 Transformations 182
|
||||||
#TOC> 4 Getting Data out of Biostrings Objects 172
|
#TOC> 4 Getting Data out of Biostrings Objects 189
|
||||||
#TOC> 5 More 181
|
#TOC> 5 More 198
|
||||||
#TOC> 5.1 Views 183
|
#TOC> 5.1 Views 200
|
||||||
#TOC> 5.2 Iranges 195
|
#TOC> 5.2 Iranges 214
|
||||||
#TOC> 5.3 StringSets 201
|
#TOC> 5.3 StringSets 220
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -54,14 +57,12 @@
|
|||||||
|
|
||||||
# First, we install and load the Biostrings package from bioconductor
|
# First, we install and load the Biostrings package from bioconductor
|
||||||
|
|
||||||
if (! require(Biostrings, quietly=TRUE)) {
|
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
install.packages("BiocManager")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
}
|
||||||
biocLite("Biostrings")
|
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||||
library(Biostrings)
|
BiocManager::install("Biostrings")
|
||||||
}
|
}
|
||||||
|
|
||||||
# Examine the package information:
|
# Examine the package information:
|
||||||
library(help = Biostrings) # basic information
|
library(help = Biostrings) # basic information
|
||||||
browseVignettes("Biostrings") # available vignettes
|
browseVignettes("Biostrings") # available vignettes
|
||||||
@ -72,72 +73,88 @@ data(package = "Biostrings") # available datasets
|
|||||||
# of a "class" in R as a special kind of list), that can take on particular
|
# of a "class" in R as a special kind of list), that can take on particular
|
||||||
# flavours for RNA, DNA or amino acid sequence information.
|
# flavours for RNA, DNA or amino acid sequence information.
|
||||||
|
|
||||||
class(RNAString("AUG"))
|
class(Biostrings::RNAString("AUG"))
|
||||||
class(DNAString("ATG"))
|
class(Biostrings::DNAString("ATG"))
|
||||||
class(AAString("M"))
|
class(Biostrings::AAString("M"))
|
||||||
|
|
||||||
# An essential property of Biostrings objects is that they only allow letters
|
# An essential property of Biostrings objects is that they only allow letters
|
||||||
# from the applicable IUPAC alphabet:
|
# from the applicable IUPAC alphabet:
|
||||||
RNAString("AUG")
|
Biostrings::RNAString("AUG")
|
||||||
DNAString("AUG") # Error! No "U" in IUPAC DNA codes
|
Biostrings::DNAString("AUG") # Error! No "U" in IUPAC DNA codes
|
||||||
|
|
||||||
|
|
||||||
# = 2 Getting Data into Biostrings Objects ================================
|
# = 2 Getting Data into Biostrings Objects ================================
|
||||||
|
|
||||||
|
|
||||||
# Example: read FASTA. Extract sequence. Convert to DNAString object.
|
# Example: read FASTA. Extract sequence. Convert to DNAString object.
|
||||||
x <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
|
rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||||
x <- dbSanitizeSequence(x)
|
rawSeq <- dbSanitizeSequence(rawSeq)
|
||||||
myDNAseq <- DNAString(x) # takes the nucleotide sequence and converts into a
|
biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
|
||||||
# object of class DNAstring
|
# into an object of class DNAstring
|
||||||
|
|
||||||
# Multi FASTA files can be read directly as a "XStringSet) ...
|
# Multi FASTA files can be read directly as a "XStringSet) ...
|
||||||
(myDNASet <- readDNAStringSet("./data/S288C_YDL056W_MBP1_coding.fsa"))
|
rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
|
||||||
|
(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
|
||||||
|
|
||||||
# ... and if you subset one sequence from the set, you get an XString object
|
# ... and if you subset one sequence from the set, you get an XString object
|
||||||
# back again.
|
# back again.
|
||||||
(Xseq <- myDNASet[[1]])
|
(Xseq <- biosDNASet[[1]])
|
||||||
|
|
||||||
myDNAseq == Xseq # the comparison evaluates to TRUE ...
|
biosDNAseq == Xseq # the comparison evaluates to TRUE ...
|
||||||
identical(myDNAseq, Xseq) # ... and indeed the objects are deemed identical.
|
identical(biosDNAseq, Xseq) # ... and indeed the objects are deemed identical.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# = 3 Working with Biostrings Objects =====================================
|
# = 3 Working with Biostrings Objects =====================================
|
||||||
|
|
||||||
|
# Biostrings is a highly engineered package that is tightly integrated into
|
||||||
|
# the Bioconductor world - unfortunately that brings with it a somewhat
|
||||||
|
# undesirable level of computational overhead and dependencies. Using the
|
||||||
|
# package as we normally do - i.e. calling required functions with their
|
||||||
|
# explicit package prefix is therefore not advisable. There are generics
|
||||||
|
# that won't be propery dispatched. If you only need a small number of
|
||||||
|
# functions for a very specific context, you will probably get away with
|
||||||
|
# Biostrings::<function>() - but even in the demonstration code of this script
|
||||||
|
# not everything works out of the box. We'll therefore load the library,
|
||||||
|
# but we'll (redundantly) use the prefix anyway so as to emphasize where
|
||||||
|
# the functions come from.
|
||||||
|
|
||||||
|
library(Biostrings)
|
||||||
|
|
||||||
|
|
||||||
# == 3.1 Properties ========================================================
|
# == 3.1 Properties ========================================================
|
||||||
str(myDNAseq)
|
str(rawSeq)
|
||||||
length(myDNAseq) # This gives you the _number of nucleotides_!
|
str(biosDNAseq)
|
||||||
# By comparison ...
|
|
||||||
length(x) # ... is 1: one string only. To get the number of
|
|
||||||
# characters in a string, you need nchar().
|
|
||||||
nchar(x) # However ...
|
|
||||||
nchar(myDNAseq) # ... also works.
|
|
||||||
|
|
||||||
uniqueLetters(myDNAseq)
|
length(rawSeq) # ... is 1: one string only. To get the number of
|
||||||
|
# characters in a string, you need nchar().
|
||||||
|
length(biosDNAseq) # but the length of a "Bstring" is the number of elements
|
||||||
|
nchar(rawSeq)
|
||||||
|
nchar(biosDNAseq) # ... but nchar() works too.
|
||||||
|
|
||||||
|
(uL <- Biostrings::uniqueLetters(biosDNAseq))
|
||||||
|
|
||||||
# Count frequencies - with strings, you would strsplit() into a character
|
# Count frequencies - with strings, you would strsplit() into a character
|
||||||
# vector and then use table(). biost
|
# vector and then use table(). biost
|
||||||
alphabetFrequency(myDNAseq)
|
Biostrings::alphabetFrequency(biosDNAseq)
|
||||||
|
|
||||||
# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
|
# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
|
||||||
# returns.
|
# returns.
|
||||||
letterFrequency(myDNAseq, uniqueLetters(myDNAseq))
|
Biostrings::letterFrequency(biosDNAseq, uL)
|
||||||
|
sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
|
||||||
|
length(biosDNAseq) # GC contents
|
||||||
|
|
||||||
sum(letterFrequency(myDNAseq, c("G", "C"))) / length(myDNAseq) # GC contents
|
Biostrings::dinucleotideFrequency(biosDNAseq)
|
||||||
|
barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
|
||||||
|
|
||||||
dinucleotideFrequency(myDNAseq)
|
(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
|
||||||
barplot(sort(dinucleotideFrequency(myDNAseq)), cex.names = 0.5)
|
|
||||||
|
|
||||||
(triNuc <- trinucleotideFrequency(myDNAseq))
|
|
||||||
barplot(sort(triNuc), col="#4499EE33")
|
barplot(sort(triNuc), col="#4499EE33")
|
||||||
triNuc[triNuc == max(triNuc)]
|
triNuc[triNuc == max(triNuc)]
|
||||||
triNuc[triNuc == min(triNuc)]
|
triNuc[triNuc == min(triNuc)]
|
||||||
max(triNuc) / min(triNuc) # AAA is more than 13 times as frequent as CGT
|
max(triNuc) / min(triNuc) # AAA is more than 13 times as frequent as CGT
|
||||||
|
|
||||||
# compare to a shuffled sequence:
|
# compare to a shuffled sequence:
|
||||||
(triNuc <- trinucleotideFrequency(sample(myDNAseq)))
|
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
|
||||||
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
|
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
|
||||||
|
|
||||||
# Interpret this plot.
|
# Interpret this plot.
|
||||||
@ -146,34 +163,34 @@ barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
|
|||||||
# == 3.2 Subsetting ========================================================
|
# == 3.2 Subsetting ========================================================
|
||||||
|
|
||||||
# Subsetting any XString object works as expected:
|
# Subsetting any XString object works as expected:
|
||||||
myDNAseq[4:15]
|
biosDNAseq[4:15]
|
||||||
|
|
||||||
# ... well - maybe not expected, because x[4:15] would not work.
|
# ... well - maybe not expected, because rawSeq[4:15] would not work.
|
||||||
|
|
||||||
# Alternatively to the "[" operator, use the subseq() function - especially for
|
# Alternatively to the "[" operator, use the subseq() function - especially for
|
||||||
# long sequences. This is far more efficient.
|
# long sequences. This is far more efficient.
|
||||||
subseq(myDNAseq, start = 1, end = 30)
|
Biostrings::subseq(biosDNAseq, start = 1, end = 30)
|
||||||
|
|
||||||
|
|
||||||
# == 3.3 Operators =========================================================
|
# == 3.3 Operators =========================================================
|
||||||
|
|
||||||
# RNAstring() and DNAstring() objects compare U and T as equals!
|
# RNAstring() and DNAstring() objects compare U and T as equals!
|
||||||
RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
|
Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
|
||||||
DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
|
Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
|
||||||
|
|
||||||
|
|
||||||
# == 3.4 Transformations ===================================================
|
# == 3.4 Transformations ===================================================
|
||||||
|
|
||||||
myDNAseq[4:15]
|
biosDNAseq[4:15]
|
||||||
reverseComplement(myDNAseq[4:15])
|
Biostrings::reverseComplement(biosDNAseq[4:15])
|
||||||
translate(myDNAseq[4:15])
|
Biostrings::translate(biosDNAseq[4:15])
|
||||||
|
|
||||||
|
|
||||||
# = 4 Getting Data out of Biostrings Objects ==============================
|
# = 4 Getting Data out of Biostrings Objects ==============================
|
||||||
|
|
||||||
# If you need a character object, use toString():
|
# If you need a character object, use toString():
|
||||||
|
|
||||||
toString(myDNAseq[4:15])
|
Biostrings::toString(biosDNAseq[4:15])
|
||||||
|
|
||||||
# save() and load() works like on all other R objects.
|
# save() and load() works like on all other R objects.
|
||||||
|
|
||||||
@ -185,7 +202,9 @@ toString(myDNAseq[4:15])
|
|||||||
# Biostring "Views" are objects that store multiple substrings of one
|
# Biostring "Views" are objects that store multiple substrings of one
|
||||||
# Biostring object.
|
# Biostring object.
|
||||||
|
|
||||||
(myView <- Views(myDNAseq, start = c(1, 19, 37), end = c(15, 30, 45)))
|
(myView <- Biostrings::Views(biosDNAseq,
|
||||||
|
start = c(1, 19, 37),
|
||||||
|
end = c(15, 30, 45)))
|
||||||
|
|
||||||
# Views are convenient to store feature annotations
|
# Views are convenient to store feature annotations
|
||||||
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
|
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
|
||||||
@ -202,20 +221,20 @@ cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
|
|||||||
|
|
||||||
# Biostring "StringSets" store multiple sequences.
|
# Biostring "StringSets" store multiple sequences.
|
||||||
#
|
#
|
||||||
ompA <- AAString("MKKTAIAIAVALAGFATVAQA")
|
ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
|
||||||
sample(ompA) # sample can work directly on a Biostring object to shuffle it
|
sample(ompA) # sample can work directly on a Biostring object to shuffle it
|
||||||
|
|
||||||
x[1] <- toString(ompA)
|
x <- Biostrings::toString(ompA)
|
||||||
for (i in 2:10) {
|
for (i in 2:10) {
|
||||||
x[i] <- toString(sample(ompA))
|
x[i] <- Biostrings::toString(sample(ompA))
|
||||||
}
|
}
|
||||||
shuffledPeptideSet <- AAStringSet(x)
|
shuffledPeptideSet <- Biostrings::AAStringSet(x)
|
||||||
names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
|
names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
|
||||||
shuffledPeptideSet
|
shuffledPeptideSet
|
||||||
|
|
||||||
length(shuffledPeptideSet)
|
length(shuffledPeptideSet)
|
||||||
width(shuffledPeptideSet)
|
Biostrings::width(shuffledPeptideSet)
|
||||||
alphabetFrequency(shuffledPeptideSet)
|
Biostrings::alphabetFrequency(shuffledPeptideSet)
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
167
RPR-GEO2R.R
167
RPR-GEO2R.R
@ -3,12 +3,15 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR_GEO2R unit.
|
# R code accompanying the RPR_GEO2R unit.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2017 09 - 2018 01
|
# Date: 2017 09 - 2019 01
|
||||||
# Author: Boris Steipe <boris.steipe@utoronto.ca>
|
# Author: Boris Steipe <boris.steipe@utoronto.ca>
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.2 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
|
# use Biocmanager:: not biocLite()
|
||||||
# 1.1 Add section on GPL annotations
|
# 1.1 Add section on GPL annotations
|
||||||
# 1.0 Updates for BCH441 2017
|
# 1.0 Updates for BCH441 2017
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
@ -33,19 +36,19 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------------------------
|
#TOC> --------------------------------------------------------------------------
|
||||||
#TOC> 1 Preparations 53
|
#TOC> 1 Preparations 56
|
||||||
#TOC> 2 Loading a GEO Dataset 84
|
#TOC> 2 Loading a GEO Dataset 82
|
||||||
#TOC> 3 Column wise analysis - time points 154
|
#TOC> 3 Column wise analysis - time points 152
|
||||||
#TOC> 3.1 Task - Comparison of experiments 160
|
#TOC> 3.1 Task - Comparison of experiments 158
|
||||||
#TOC> 3.2 Grouped Samples 207
|
#TOC> 3.2 Grouped Samples 205
|
||||||
#TOC> 4 Row-wise Analysis: Expression Profiles 242
|
#TOC> 4 Row-wise Analysis: Expression Profiles 240
|
||||||
#TOC> 4.1 Task - Read a table of features 277
|
#TOC> 4.1 Task - Read a table of features 275
|
||||||
#TOC> 4.2 Selected Expression profiles 325
|
#TOC> 4.2 Selected Expression profiles 323
|
||||||
#TOC> 5 Differential Expression 366
|
#TOC> 5 Differential Expression 364
|
||||||
#TOC> 5.1 Final task: Gene descriptions 490
|
#TOC> 5.1 Final task: Gene descriptions 504
|
||||||
#TOC> 6 Improving on Discovery by Differential Expression 495
|
#TOC> 6 Improving on Discovery by Differential Expression 510
|
||||||
#TOC> 7 Annotation data 577
|
#TOC> 7 Annotation data 594
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -55,12 +58,11 @@
|
|||||||
# To load and analyze GEO datasets we use a number of Bioconductor packages:
|
# To load and analyze GEO datasets we use a number of Bioconductor packages:
|
||||||
|
|
||||||
|
|
||||||
if (! require(Biobase, quietly=TRUE)) {
|
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
install.packages("BiocManager")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
}
|
||||||
biocLite("Biobase")
|
if (! requireNamespace("Biobase", quietly = TRUE)) {
|
||||||
library(Biobase)
|
BiocManager::install("Biobase")
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = Biobase) # basic information
|
# library(help = Biobase) # basic information
|
||||||
@ -68,12 +70,8 @@ if (! require(Biobase, quietly=TRUE)) {
|
|||||||
# data(package = "Biobase") # available datasets
|
# data(package = "Biobase") # available datasets
|
||||||
|
|
||||||
|
|
||||||
if (! require(GEOquery, quietly=TRUE)) {
|
if (! requireNamespace("GEOquery", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
BiocManager::install("GEOquery")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
|
||||||
biocLite("GEOquery")
|
|
||||||
library(GEOquery)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = GEOquery) # basic information
|
# library(help = GEOquery) # basic information
|
||||||
@ -94,7 +92,7 @@ if (! require(GEOquery, quietly=TRUE)) {
|
|||||||
# I have experienced outages over several hours. If the command below does
|
# I have experienced outages over several hours. If the command below does
|
||||||
# not work for you, skip ahead to the fallback procedure.
|
# not work for you, skip ahead to the fallback procedure.
|
||||||
|
|
||||||
GSE3635 <- getGEO("GSE3635", GSEMatrix =TRUE, getGPL=FALSE)
|
GSE3635 <- GEOquery::getGEO("GSE3635", GSEMatrix =TRUE, getGPL=FALSE)
|
||||||
# Note: GEO2R scripts call the expression data set
|
# Note: GEO2R scripts call the expression data set
|
||||||
# "gset" throughout ... in this script I give
|
# "gset" throughout ... in this script I give
|
||||||
# it the name "GSE3635" for clarity.
|
# it the name "GSE3635" for clarity.
|
||||||
@ -136,14 +134,14 @@ help("ExpressionSet-class")
|
|||||||
GSE3635
|
GSE3635
|
||||||
|
|
||||||
# Access contents via methods:
|
# Access contents via methods:
|
||||||
featureNames(GSE3635)[1:20] # Rows. What are these features?
|
Biobase::featureNames(GSE3635)[1:20] # Rows. What are these features?
|
||||||
sampleNames(GSE3635)[1:10] # Columns. What are these columns?
|
Biobase::sampleNames(GSE3635)[1:10] # Columns. What are these columns?
|
||||||
|
|
||||||
# Access contents by subsetting:
|
# Access contents by subsetting:
|
||||||
( tmp <- GSE3635[12:17, 1:6] )
|
( tmp <- GSE3635[12:17, 1:6] )
|
||||||
|
|
||||||
# Access data
|
# Access data
|
||||||
exprs(tmp) # exprs() gives us the actual expression values.
|
Biobase::exprs(tmp) # exprs() gives us the actual expression values.
|
||||||
|
|
||||||
|
|
||||||
#TASK> What are the data:
|
#TASK> What are the data:
|
||||||
@ -160,9 +158,9 @@ exprs(tmp) # exprs() gives us the actual expression values.
|
|||||||
# == 3.1 Task - Comparison of experiments ==================================
|
# == 3.1 Task - Comparison of experiments ==================================
|
||||||
|
|
||||||
# Get an overview of the distribution of data values in individual columns
|
# Get an overview of the distribution of data values in individual columns
|
||||||
summary(exprs(GSE3635)[ , 1])
|
summary(Biobase::exprs(GSE3635)[ , 1])
|
||||||
summary(exprs(GSE3635)[ , 4])
|
summary(Biobase::exprs(GSE3635)[ , 4])
|
||||||
summary(exprs(GSE3635)[ , 7])
|
summary(Biobase::exprs(GSE3635)[ , 7])
|
||||||
|
|
||||||
# as a boxplot
|
# as a boxplot
|
||||||
cyclicPalette <- colorRampPalette(c("#00AAFF",
|
cyclicPalette <- colorRampPalette(c("#00AAFF",
|
||||||
@ -173,7 +171,7 @@ cyclicPalette <- colorRampPalette(c("#00AAFF",
|
|||||||
"#FFAA00",
|
"#FFAA00",
|
||||||
"#00AAFF"))
|
"#00AAFF"))
|
||||||
tCols <- cyclicPalette(13)
|
tCols <- cyclicPalette(13)
|
||||||
boxplot(exprs(GSE3635), col = tCols)
|
boxplot(Biobase::exprs(GSE3635), col = tCols)
|
||||||
|
|
||||||
|
|
||||||
#TASK> Study this boxplot. What's going on? Are these expression values?
|
#TASK> Study this boxplot. What's going on? Are these expression values?
|
||||||
@ -181,11 +179,11 @@ boxplot(exprs(GSE3635), col = tCols)
|
|||||||
|
|
||||||
|
|
||||||
# Lets plot the distributions of values in a more fine-grained manner:
|
# Lets plot the distributions of values in a more fine-grained manner:
|
||||||
hT0 <- hist(exprs(GSE3635)[ , 1], breaks = 100)
|
hT0 <- hist(Biobase::exprs(GSE3635)[ , 1], breaks = 100)
|
||||||
hT3 <- hist(exprs(GSE3635)[ , 4], breaks = 100)
|
hT3 <- hist(Biobase::exprs(GSE3635)[ , 4], breaks = 100)
|
||||||
hT6 <- hist(exprs(GSE3635)[ , 7], breaks = 100)
|
hT6 <- hist(Biobase::exprs(GSE3635)[ , 7], breaks = 100)
|
||||||
hT9 <- hist(exprs(GSE3635)[ , 10], breaks = 100)
|
hT9 <- hist(Biobase::exprs(GSE3635)[ , 10], breaks = 100)
|
||||||
hT12 <- hist(exprs(GSE3635)[ , 13], breaks = 100)
|
hT12 <- hist(Biobase::exprs(GSE3635)[ , 13], breaks = 100)
|
||||||
|
|
||||||
|
|
||||||
plot( hT0$mids, hT0$counts, type = "l", col = tCols[1], xlim = c(-0.5, 0.5))
|
plot( hT0$mids, hT0$counts, type = "l", col = tCols[1], xlim = c(-0.5, 0.5))
|
||||||
@ -218,7 +216,7 @@ for (i in 1:nchar(gsms)) {
|
|||||||
sml <- paste("G", sml, sep="") # set group names
|
sml <- paste("G", sml, sep="") # set group names
|
||||||
|
|
||||||
# order samples by group
|
# order samples by group
|
||||||
ex <- exprs(GSE3635)[ , order(sml)]
|
ex <- Biobase::exprs(GSE3635)[ , order(sml)]
|
||||||
sml <- sml[order(sml)]
|
sml <- sml[order(sml)]
|
||||||
fl <- as.factor(sml)
|
fl <- as.factor(sml)
|
||||||
labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we
|
labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we
|
||||||
@ -231,8 +229,8 @@ labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we
|
|||||||
GEOcols <- c("#dfeaf4", "#f4dfdf", "#f2cb98", "#dcdaa5",
|
GEOcols <- c("#dfeaf4", "#f4dfdf", "#f2cb98", "#dcdaa5",
|
||||||
"#dff4e4", "#f4dff4", "#AABBCC")
|
"#dff4e4", "#f4dff4", "#AABBCC")
|
||||||
dev.new(width = 4 + dim(GSE3635)[[2]] / 5, height = 6) # plot into a new window
|
dev.new(width = 4 + dim(GSE3635)[[2]] / 5, height = 6) # plot into a new window
|
||||||
par(mar = c(2 + round(max(nchar(sampleNames(GSE3635))) / 2), 4, 2, 1))
|
par(mar = c(2 + round(max(nchar(Biobase::sampleNames(GSE3635))) / 2), 4, 2, 1))
|
||||||
title <- paste ("GSE3635", '/', annotation(GSE3635),
|
title <- paste ("GSE3635", '/', Biobase::annotation(GSE3635),
|
||||||
" grouped samples", sep ='')
|
" grouped samples", sep ='')
|
||||||
boxplot(ex, boxwex = 0.6, notch = TRUE, main = title, outline=FALSE,
|
boxplot(ex, boxwex = 0.6, notch = TRUE, main = title, outline=FALSE,
|
||||||
las = 2, col = GEOcols[fl])
|
las = 2, col = GEOcols[fl])
|
||||||
@ -331,7 +329,7 @@ gName <- "MBP1"
|
|||||||
(iFeature <- which(SGD_features$name == gName))
|
(iFeature <- which(SGD_features$name == gName))
|
||||||
(iExprs <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature]))
|
(iExprs <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature]))
|
||||||
plot(seq(0, 120, by = 10),
|
plot(seq(0, 120, by = 10),
|
||||||
exprs(GSE3635)[iExprs, ],
|
Biobase::exprs(GSE3635)[iExprs, ],
|
||||||
main = paste("Expression profile for", gName),
|
main = paste("Expression profile for", gName),
|
||||||
xlab = "time (min)",
|
xlab = "time (min)",
|
||||||
ylab = "expression",
|
ylab = "expression",
|
||||||
@ -368,12 +366,8 @@ SGD_features$description[iFeature]
|
|||||||
# GEO2R discovers the top differentially expressed expressed genes by
|
# GEO2R discovers the top differentially expressed expressed genes by
|
||||||
# using functions in the Bioconductor limma package.
|
# using functions in the Bioconductor limma package.
|
||||||
|
|
||||||
if (! require(limma, quietly=TRUE)) {
|
if (! requireNamespace("limma", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
BiocManager::install("limma")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
|
||||||
biocLite("limma")
|
|
||||||
library(limma)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = limma) # basic information
|
# library(help = limma) # basic information
|
||||||
@ -392,6 +386,20 @@ if (! require(limma, quietly=TRUE)) {
|
|||||||
# the groups
|
# the groups
|
||||||
# 4. Format results.
|
# 4. Format results.
|
||||||
|
|
||||||
|
# Biobase is a highly engineered package that is tightly integrated into
|
||||||
|
# the Bioconductor world - unfortunately that brings with it a somewhat
|
||||||
|
# undesirable level of computational overhead and dependencies. Using the
|
||||||
|
# package as we normally do - i.e. calling required functions with their
|
||||||
|
# explicit package prefix is therefore not advisable. There are generics
|
||||||
|
# that won't be propery dispatched. If you only need a small number of
|
||||||
|
# functions for a very specific context, you will probably get away with
|
||||||
|
# Biobase::<function>() - but even in the demonstration code of this script
|
||||||
|
# not everything works out of the box. We'll therefore load the library,
|
||||||
|
# but we'll (redundantly) use the prefix anyway so as to emphasize where
|
||||||
|
# the functions come from.
|
||||||
|
|
||||||
|
library(Biobase)
|
||||||
|
|
||||||
# We are recapitulating the experiment in which we assigned the 0, 10, 60 and
|
# We are recapitulating the experiment in which we assigned the 0, 10, 60 and
|
||||||
# 70 minute samples to one group, the 30, 40, 90 and 100 minute samples to
|
# 70 minute samples to one group, the 30, 40, 90 and 100 minute samples to
|
||||||
# another group, and calculated differential expression values between these
|
# another group, and calculated differential expression values between these
|
||||||
@ -415,15 +423,15 @@ myDesign
|
|||||||
|
|
||||||
# Now we can calculate the fit of all rows to a linear model that depends
|
# Now we can calculate the fit of all rows to a linear model that depends
|
||||||
# on the two groups as specified in the design:
|
# on the two groups as specified in the design:
|
||||||
myFit <- lmFit(mySet, myDesign)
|
myFit <- limma::lmFit(mySet, myDesign)
|
||||||
|
|
||||||
# Next we calculate the contrasts, given the fit ...
|
# Next we calculate the contrasts, given the fit ...
|
||||||
myCont.matrix <- makeContrasts(A - B, levels = myDesign)
|
myCont.matrix <- limma::makeContrasts(A - B, levels = myDesign)
|
||||||
myFit2 <- contrasts.fit(myFit, myCont.matrix)
|
myFit2 <- limma::contrasts.fit(myFit, myCont.matrix)
|
||||||
|
|
||||||
# ... compute appropriate probabilites from a modified t-test
|
# ... compute appropriate probabilites from a modified t-test
|
||||||
# (empirical Bayes) ...
|
# (empirical Bayes) ...
|
||||||
myFit2 <- eBayes(myFit2, 0.01)
|
myFit2 <- limma::eBayes(myFit2, 0.01)
|
||||||
|
|
||||||
# ... add the gene names to the fit - object ...
|
# ... add the gene names to the fit - object ...
|
||||||
myFit2$genes <- featureNames(mySet)
|
myFit2$genes <- featureNames(mySet)
|
||||||
@ -433,7 +441,10 @@ myFit2$genes <- featureNames(mySet)
|
|||||||
# gave us only the top 250 genes, but we might as well do 1000, just so we
|
# gave us only the top 250 genes, but we might as well do 1000, just so we
|
||||||
# can be reasonable sure that our gens of interest are included.
|
# can be reasonable sure that our gens of interest are included.
|
||||||
N <- 1000
|
N <- 1000
|
||||||
myTable <- topTable(myFit2, adjust.method = "fdr", sort.by = "B", number = N)
|
myTable <- limma::topTable(myFit2,
|
||||||
|
adjust.method = "fdr",
|
||||||
|
sort.by = "B",
|
||||||
|
number = N)
|
||||||
|
|
||||||
str(myTable)
|
str(myTable)
|
||||||
# The gene names are now in the $ID column
|
# The gene names are now in the $ID column
|
||||||
@ -461,7 +472,7 @@ abline(h = 0, col = "#00000055")
|
|||||||
|
|
||||||
for (i in 1:10) {
|
for (i in 1:10) {
|
||||||
thisID <- myTable$ID[i]
|
thisID <- myTable$ID[i]
|
||||||
points(seq(0, 120, by = 10), exprs(GSE3635)[thisID, ], type = "b")
|
points(seq(0, 120, by = 10), Biobase::exprs(GSE3635)[thisID, ], type = "b")
|
||||||
}
|
}
|
||||||
|
|
||||||
# Our guess that we might discover interesting genes be selecting groups A and B
|
# Our guess that we might discover interesting genes be selecting groups A and B
|
||||||
@ -480,7 +491,10 @@ for (i in 1:10) {
|
|||||||
myControls <- c("Cdc14", "Mbp1", "Swi6", "Swi4", "Whi5", "Cln1", "Cln2", "Cln3")
|
myControls <- c("Cdc14", "Mbp1", "Swi6", "Swi4", "Whi5", "Cln1", "Cln2", "Cln3")
|
||||||
for (name in toupper(myControls)) {
|
for (name in toupper(myControls)) {
|
||||||
thisID <- SGD_features$sysName[which(SGD_features$name == name)]
|
thisID <- SGD_features$sysName[which(SGD_features$name == name)]
|
||||||
points(seq(0, 120, by=10), exprs(GSE3635)[thisID, ], type="b", col="#AA0000")
|
points(seq(0, 120, by=10),
|
||||||
|
Biobase::exprs(GSE3635)[thisID, ],
|
||||||
|
type="b",
|
||||||
|
col="#AA0000")
|
||||||
}
|
}
|
||||||
|
|
||||||
# Indeed, the discovered gene profiles look much "cleaner" than the real cycle
|
# Indeed, the discovered gene profiles look much "cleaner" than the real cycle
|
||||||
@ -504,7 +518,7 @@ for (name in toupper(myControls)) {
|
|||||||
gName <- "CLN2"
|
gName <- "CLN2"
|
||||||
(iFeature <- which(SGD_features$name == gName))
|
(iFeature <- which(SGD_features$name == gName))
|
||||||
(iExprs <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature]))
|
(iExprs <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature]))
|
||||||
Cln2Profile <- exprs(GSE3635)[iExprs, ]
|
Cln2Profile <- Biobase::exprs(GSE3635)[iExprs, ]
|
||||||
plot(seq(0, 120, by = 10),
|
plot(seq(0, 120, by = 10),
|
||||||
Cln2Profile,
|
Cln2Profile,
|
||||||
ylim = c(-1, 1),
|
ylim = c(-1, 1),
|
||||||
@ -519,16 +533,16 @@ abline(v = 60, col = "#00000055")
|
|||||||
# Set up a vector of correlation values
|
# Set up a vector of correlation values
|
||||||
|
|
||||||
|
|
||||||
myCorrelations <- numeric(nrow(exprs(GSE3635)))
|
myCorrelations <- numeric(nrow(Biobase::exprs(GSE3635)))
|
||||||
names(myCorrelations) <- featureNames(GSE3635)
|
names(myCorrelations) <- Biobase::featureNames(GSE3635)
|
||||||
for (i in 1:length(myCorrelations)) {
|
for (i in 1:length(myCorrelations)) {
|
||||||
myCorrelations[i] <- cor(Cln2Profile, exprs(GSE3635)[i, ])
|
myCorrelations[i] <- cor(Cln2Profile, Biobase::exprs(GSE3635)[i, ])
|
||||||
}
|
}
|
||||||
|
|
||||||
myTopC <- order(myCorrelations, decreasing = TRUE)[1:10] # top ten
|
myTopC <- order(myCorrelations, decreasing = TRUE)[1:10] # top ten
|
||||||
|
|
||||||
# Number 1
|
# Number 1
|
||||||
(ID <- featureNames(GSE3635)[myTopC[1]])
|
(ID <- Biobase::featureNames(GSE3635)[myTopC[1]])
|
||||||
|
|
||||||
# Get information
|
# Get information
|
||||||
SGD_features[which(SGD_features$sysName == ID), ]
|
SGD_features[which(SGD_features$sysName == ID), ]
|
||||||
@ -537,12 +551,13 @@ SGD_features[which(SGD_features$sysName == ID), ]
|
|||||||
|
|
||||||
# Let's plot the rest
|
# Let's plot the rest
|
||||||
for (i in 2:length(myTopC)) {
|
for (i in 2:length(myTopC)) {
|
||||||
ID <- featureNames(GSE3635)[myTopC[i]]
|
ID <- Biobase::featureNames(GSE3635)[myTopC[i]]
|
||||||
points(seq(0, 120, by = 10),
|
points(seq(0, 120, by = 10),
|
||||||
exprs(GSE3635)[ID, ],
|
Biobase::exprs(GSE3635)[ID, ],
|
||||||
type = "b",
|
type = "b",
|
||||||
col= "chartreuse")
|
col= "chartreuse")
|
||||||
print(SGD_features[which(SGD_features$sysName == ID), c("name", "description")])
|
print(SGD_features[which(SGD_features$sysName == ID),
|
||||||
|
c("name", "description")])
|
||||||
}
|
}
|
||||||
|
|
||||||
# Note that all of these genes are highly correlated with a known cell cycle
|
# Note that all of these genes are highly correlated with a known cell cycle
|
||||||
@ -554,12 +569,13 @@ for (i in 2:length(myTopC)) {
|
|||||||
# And we haven't even looked at the anticorrelated genes yet...
|
# And we haven't even looked at the anticorrelated genes yet...
|
||||||
myBottomC <- order(myCorrelations, decreasing = FALSE)[1:10] # bottom ten
|
myBottomC <- order(myCorrelations, decreasing = FALSE)[1:10] # bottom ten
|
||||||
for (i in 1:length(myBottomC)) {
|
for (i in 1:length(myBottomC)) {
|
||||||
ID <- featureNames(GSE3635)[myBottomC[i]]
|
ID <- Biobase::featureNames(GSE3635)[myBottomC[i]]
|
||||||
points(seq(0, 120, by = 10),
|
points(seq(0, 120, by = 10),
|
||||||
exprs(GSE3635)[ID, ],
|
Biobase::exprs(GSE3635)[ID, ],
|
||||||
type = "b",
|
type = "b",
|
||||||
col= "coral")
|
col= "coral")
|
||||||
print(SGD_features[which(SGD_features$sysName == ID), c("name", "description")])
|
print(SGD_features[which(SGD_features$sysName == ID),
|
||||||
|
c("name", "description")])
|
||||||
}
|
}
|
||||||
# ... which are very interesting in their own right.
|
# ... which are very interesting in their own right.
|
||||||
|
|
||||||
@ -583,7 +599,7 @@ for (i in 1:length(myBottomC)) {
|
|||||||
# we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL
|
# we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL
|
||||||
# annotations were not loaded. We could use getGPL = TRUE instead ...
|
# annotations were not loaded. We could use getGPL = TRUE instead ...
|
||||||
|
|
||||||
GSE3635annot <- getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE)
|
GSE3635annot <- GEOquery::getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE)
|
||||||
GSE3635annot <- GSE3635annot[[1]]
|
GSE3635annot <- GSE3635annot[[1]]
|
||||||
|
|
||||||
# ... and the feature data is then available in the GSE3635@featureData@data
|
# ... and the feature data is then available in the GSE3635@featureData@data
|
||||||
@ -597,13 +613,8 @@ GSE3635annot@featureData@data[ 1:20 , ]
|
|||||||
myAnnot <- GSE3635annot@featureData@data[ , c("SPOT_ID", "Gene")]
|
myAnnot <- GSE3635annot@featureData@data[ , c("SPOT_ID", "Gene")]
|
||||||
str(myAnnot)
|
str(myAnnot)
|
||||||
|
|
||||||
# ... Note that this is a data frame, but - oy veh - the gene symbols are
|
# ... Note that this is a data frame and it is easy to find things we
|
||||||
# factors. Really, we need to fix this! To convert a factor into a string,
|
# might be looking for ...
|
||||||
# we need to cast it to character.
|
|
||||||
|
|
||||||
myAnnot$Gene <- as.character(myAnnot$Gene)
|
|
||||||
|
|
||||||
# ... whereupon we can find things we might be looking for ...
|
|
||||||
myAnnot[which(myAnnot$Gene == "MBP1"), ]
|
myAnnot[which(myAnnot$Gene == "MBP1"), ]
|
||||||
|
|
||||||
# ... or identify rows that might give us trouble, such as probes that
|
# ... or identify rows that might give us trouble, such as probes that
|
||||||
@ -614,13 +625,11 @@ myAnnot[which(myAnnot$Gene == "MBP1"), ]
|
|||||||
GSE3635@annotation # "GPL1914"
|
GSE3635@annotation # "GPL1914"
|
||||||
|
|
||||||
# ... and downloaded it directly from NCBI:
|
# ... and downloaded it directly from NCBI:
|
||||||
GPL1914 <- getGEO("GPL1914")
|
GPL1914 <- GEOquery::getGEO("GPL1914")
|
||||||
str(GPL1914)
|
str(GPL1914)
|
||||||
|
|
||||||
# ... from which we can get the data - which is however NOT necessarily
|
# ... from which we can get the data - which is however NOT necessarily
|
||||||
# matched to the rows of our expression dataset. Note that here too: the
|
# matched to the rows of our expression dataset.
|
||||||
# majority of data elements are factors and will likely have to be converted
|
|
||||||
# before use.
|
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -3,12 +3,15 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Genetic_code_optimality unit.
|
# R code accompanying the RPR-Genetic_code_optimality unit.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2017 10 - 2019 01
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.2 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
|
# use Biocmanager:: not biocLite()
|
||||||
# 1.1 Update set.seed() usage
|
# 1.1 Update set.seed() usage
|
||||||
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
|
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
|
||||||
# 1.0 New material.
|
# 1.0 New material.
|
||||||
@ -30,16 +33,16 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------------------
|
#TOC> --------------------------------------------------------------
|
||||||
#TOC> 1 Designing a computational experiment 54
|
#TOC> 1 Designing a computational experiment 57
|
||||||
#TOC> 2 Setting up the tools 70
|
#TOC> 2 Setting up the tools 73
|
||||||
#TOC> 2.1 Natural and alternative genetic codes 73
|
#TOC> 2.1 Natural and alternative genetic codes 76
|
||||||
#TOC> 2.2 Effect of mutations 132
|
#TOC> 2.2 Effect of mutations 134
|
||||||
#TOC> 2.2.1 reverse-translate 143
|
#TOC> 2.2.1 reverse-translate 145
|
||||||
#TOC> 2.2.2 Randomly mutate 168
|
#TOC> 2.2.2 Randomly mutate 170
|
||||||
#TOC> 2.2.3 Forward- translate 193
|
#TOC> 2.2.3 Forward- translate 195
|
||||||
#TOC> 2.2.4 measure effect 211
|
#TOC> 2.2.4 measure effect 213
|
||||||
#TOC> 3 Run the experiment 258
|
#TOC> 3 Run the experiment 260
|
||||||
#TOC> 4 Task solutions 351
|
#TOC> 4 Task solutions 356
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -73,12 +76,11 @@
|
|||||||
# == 2.1 Natural and alternative genetic codes =============================
|
# == 2.1 Natural and alternative genetic codes =============================
|
||||||
|
|
||||||
# Load genetic code tables from the Biostrings package
|
# Load genetic code tables from the Biostrings package
|
||||||
if (! require(Biostrings, quietly=TRUE)) {
|
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||||
if (! exists("biocLite")) {
|
install.packages("BiocManager")
|
||||||
source("https://bioconductor.org/biocLite.R")
|
|
||||||
}
|
}
|
||||||
biocLite("Biostrings")
|
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||||
library(Biostrings)
|
BiocManager::install("Biostrings")
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = Biostrings) # basic information
|
# library(help = Biostrings) # basic information
|
||||||
@ -257,52 +259,55 @@ evalMut <- function(nat, mut) {
|
|||||||
|
|
||||||
# = 3 Run the experiment ==================================================
|
# = 3 Run the experiment ==================================================
|
||||||
|
|
||||||
|
# Fetch the standard Genetic code from Biostrings::
|
||||||
|
|
||||||
|
stdCode <- Biostrings::GENETIC_CODE
|
||||||
|
|
||||||
# Fetch the nucleotide sequence for MBP1:
|
# Fetch the nucleotide sequence for MBP1:
|
||||||
|
|
||||||
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
|
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
|
||||||
myDNA <- paste0(myDNA, collapse = "")
|
myDNA <- paste0(myDNA, collapse = "")
|
||||||
myDNA <- as.character(codons(DNAString(myDNA)))
|
myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
|
||||||
myDNA <- myDNA[-length(myDNA)] # drop the stop codon
|
myDNA <- myDNA[-length(myDNA)] # drop the stop codon
|
||||||
|
|
||||||
myAA <- traFor(myDNA, GENETIC_CODE)
|
myAA <- traFor(myDNA, stdCode)
|
||||||
|
|
||||||
# Mutate and evaluate
|
# Mutate and evaluate
|
||||||
set.seed(112358)
|
set.seed(112358)
|
||||||
x <- randMut(myDNA)
|
x <- randMut(myDNA)
|
||||||
set.seed(NULL)
|
set.seed(NULL)
|
||||||
x <- traFor(x, GENETIC_CODE)
|
x <- traFor(x, stdCode)
|
||||||
evalMut(myAA, x) # 166.4
|
evalMut(myAA, x) # 166.4
|
||||||
|
|
||||||
# Try this 200 times, and see how the values are distributed.
|
# Try this 200 times, and see how the values are distributed.
|
||||||
N <- 200
|
N <- 200
|
||||||
valUGC <- numeric(N)
|
valSTDC <- numeric(N)
|
||||||
|
|
||||||
set.seed(112358) # set RNG seed for repeatable randomness
|
set.seed(112358) # set RNG seed for repeatable randomness
|
||||||
for (i in 1:N) {
|
for (i in 1:N) {
|
||||||
x <- randMut(myDNA) # mutate
|
x <- randMut(myDNA) # mutate
|
||||||
x <- traFor(x, GENETIC_CODE) # translate
|
x <- traFor(x, stdCode) # translate
|
||||||
valUGC[i] <- evalMut(myAA, x) # evaluate
|
valSTDC[i] <- evalMut(myAA, x) # evaluate
|
||||||
}
|
}
|
||||||
set.seed(NULL) # reset the RNG
|
set.seed(NULL) # reset the RNG
|
||||||
|
|
||||||
hist(valUGC,
|
hist(valSTDC,
|
||||||
breaks = 15,
|
breaks = 15,
|
||||||
col = "palegoldenrod",
|
col = "palegoldenrod",
|
||||||
xlim = c(0, 400),
|
xlim = c(0, 400),
|
||||||
ylim = c(0, N/4),
|
ylim = c(0, N/4),
|
||||||
main = "Universal vs. Synthetic Genetic Code",
|
main = "Standard vs. Synthetic Genetic Code",
|
||||||
xlab = "Mutation penalty")
|
xlab = "Mutation penalty")
|
||||||
|
|
||||||
# This looks like a normal distribution. Let's assume the effect of mutations
|
# This looks like a normal distribution. Let's assume the effect of mutations
|
||||||
# under the universal genetic code is the mean of this distribution:
|
# under the standard genetic code is the mean of this distribution:
|
||||||
effectUGC <- mean(valUGC) # 178.1
|
effectSTDC <- mean(valSTDC) # 178.1
|
||||||
|
|
||||||
# Now we can look at the effects of alternate genetic codes:
|
# Now we can look at the effects of alternate genetic codes:
|
||||||
|
|
||||||
set.seed(112358)
|
set.seed(112358)
|
||||||
# choose a new code
|
# choose a new code
|
||||||
GC <- randomGC(GENETIC_CODE)
|
GC <- randomGC(stdCode)
|
||||||
set.seed(NULL)
|
set.seed(NULL)
|
||||||
|
|
||||||
# reverse translate hypothetical sequence according to the new code
|
# reverse translate hypothetical sequence according to the new code
|
||||||
@ -321,7 +326,7 @@ valXGC <- numeric(N)
|
|||||||
|
|
||||||
set.seed(1414214) # set RNG seed for repeatable randomness
|
set.seed(1414214) # set RNG seed for repeatable randomness
|
||||||
for (i in 1:N) {
|
for (i in 1:N) {
|
||||||
GC <- randomGC(GENETIC_CODE) # Choose code
|
GC <- randomGC(stdCode) # Choose code
|
||||||
x <- traRev(myAA, GC) # reverse translate
|
x <- traRev(myAA, GC) # reverse translate
|
||||||
x <- randMut(x) # mutate
|
x <- randMut(x) # mutate
|
||||||
x <- traFor(x, GC) # translate
|
x <- traFor(x, GC) # translate
|
||||||
@ -355,7 +360,7 @@ valSGC <- numeric(N)
|
|||||||
|
|
||||||
set.seed(2718282) # set RNG seed for repeatable randomness
|
set.seed(2718282) # set RNG seed for repeatable randomness
|
||||||
for (i in 1:N) {
|
for (i in 1:N) {
|
||||||
GC <- swappedGC(GENETIC_CODE) # Choose code
|
GC <- swappedGC(stdCode) # Choose code
|
||||||
x <- traRev(myAA, GC) # reverse translate
|
x <- traRev(myAA, GC) # reverse translate
|
||||||
x <- randMut(x) # mutate
|
x <- randMut(x) # mutate
|
||||||
x <- traFor(x, GC) # translate
|
x <- traFor(x, GC) # translate
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0.1
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 - 2018 12
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout,
|
||||||
# 1.0.1 Updates for slightly changed interfaces
|
# 1.0.1 Updates for slightly changed interfaces
|
||||||
# 1.0 First ABC units version
|
# 1.0 First ABC units version
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
@ -29,10 +31,10 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------------------
|
#TOC> ---------------------------------------------------------------------
|
||||||
#TOC> 1 Constructing a POST command from a Web query 44
|
#TOC> 1 Constructing a POST command from a Web query 42
|
||||||
#TOC> 1.1 Task - fetchPrositeFeatures() function 145
|
#TOC> 1.1 Task - fetchPrositeFeatures() function 142
|
||||||
#TOC> 2 Task solutions 153
|
#TOC> 2 Task solutions 150
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -40,9 +42,8 @@
|
|||||||
# = 1 Constructing a POST command from a Web query ========================
|
# = 1 Constructing a POST command from a Web query ========================
|
||||||
|
|
||||||
|
|
||||||
if (! require(httr, quietly=TRUE)) {
|
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||||
install.packages("httr")
|
install.packages("httr")
|
||||||
library(httr)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = httr) # basic information
|
# library(help = httr) # basic information
|
||||||
@ -60,7 +61,7 @@ UniProtID <- "P39678"
|
|||||||
|
|
||||||
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
|
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
|
||||||
|
|
||||||
response <- POST(URL,
|
response <- httr::POST(URL,
|
||||||
body = list(meta = "opt1",
|
body = list(meta = "opt1",
|
||||||
meta1_protein = "opt1",
|
meta1_protein = "opt1",
|
||||||
seq = UniProtID,
|
seq = UniProtID,
|
||||||
@ -70,14 +71,14 @@ response <- POST(URL,
|
|||||||
# Send off this request, and you should have a response in a few
|
# Send off this request, and you should have a response in a few
|
||||||
# seconds. Let's check the status first:
|
# seconds. Let's check the status first:
|
||||||
|
|
||||||
status_code(response) # If this is not 200, something went wrong and it
|
httr::status_code(response) # If this is not 200, something went wrong and it
|
||||||
# makes no sense to continue. If this persists, ask
|
# makes no sense to continue. If this persists, ask
|
||||||
# on the mailing list what to do.
|
# on the mailing list what to do.
|
||||||
|
|
||||||
|
|
||||||
# The text contents of the response is available with the
|
# The text contents of the response is available with the
|
||||||
# content() function:
|
# content() function:
|
||||||
content(response, "text")
|
httr::content(response, "text")
|
||||||
|
|
||||||
# ... should show you the same as the page contents that
|
# ... should show you the same as the page contents that
|
||||||
# you have seen in the browser. The date we need Now we need to extract
|
# you have seen in the browser. The date we need Now we need to extract
|
||||||
@ -86,7 +87,7 @@ content(response, "text")
|
|||||||
# individual lines, since each of our data elements is on
|
# individual lines, since each of our data elements is on
|
||||||
# its own line. We simply split on the "\\n" newline character.
|
# its own line. We simply split on the "\\n" newline character.
|
||||||
|
|
||||||
lines <- unlist(strsplit(content(response, "text"), "\\n"))
|
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
|
||||||
head(lines)
|
head(lines)
|
||||||
|
|
||||||
# Now we define a query pattern for the lines we want:
|
# Now we define a query pattern for the lines we want:
|
||||||
|
76
RPR-SX-PDB.R
76
RPR-SX-PDB.R
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-SX-PDB unit.
|
# R code accompanying the RPR-SX-PDB unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 19
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 1.0 First live version, completely refactores 2016 code
|
# 1.0 First live version, completely refactores 2016 code
|
||||||
# with remarkable speed gains. Added section on x, y, z
|
# with remarkable speed gains. Added section on x, y, z
|
||||||
# (density) plots.
|
# (density) plots.
|
||||||
@ -30,19 +32,19 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------------------
|
#TOC> ----------------------------------------------------------
|
||||||
#TOC> 1 Introduction to the bio3D package 63
|
#TOC> 1 Introduction to the bio3D package 61
|
||||||
#TOC> 2 A Ramachandran plot 151
|
#TOC> 2 A Ramachandran plot 152
|
||||||
#TOC> 3 Density plots 227
|
#TOC> 3 Density plots 228
|
||||||
#TOC> 3.1 Density-based colours 241
|
#TOC> 3.1 Density-based colours 242
|
||||||
#TOC> 3.2 Plotting with smoothScatter() 260
|
#TOC> 3.2 Plotting with smoothScatter() 261
|
||||||
#TOC> 3.3 Plotting hexbins 275
|
#TOC> 3.3 Plotting hexbins 276
|
||||||
#TOC> 3.4 Plotting density contours 299
|
#TOC> 3.4 Plotting density contours 304
|
||||||
#TOC> 3.4.1 ... as overlay on a colored grid 333
|
#TOC> 3.4.1 ... as overlay on a colored grid 337
|
||||||
#TOC> 3.4.2 ... as filled countour 350
|
#TOC> 3.4.2 ... as filled countour 354
|
||||||
#TOC> 3.4.3 ... as a perspective plot 381
|
#TOC> 3.4.3 ... as a perspective plot 385
|
||||||
#TOC> 4 cis-peptide bonds 399
|
#TOC> 4 cis-peptide bonds 403
|
||||||
#TOC> 5 H-bond lengths 414
|
#TOC> 5 H-bond lengths 418
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -59,9 +61,8 @@
|
|||||||
# = 1 Introduction to the bio3D package ===================================
|
# = 1 Introduction to the bio3D package ===================================
|
||||||
|
|
||||||
|
|
||||||
if (! require(bio3d, quietly=TRUE)) {
|
if (! requireNamespace("bio3d", quietly = TRUE)) {
|
||||||
install.packages("bio3d")
|
install.packages("bio3d")
|
||||||
library(bio3d)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = bio3d) # basic information
|
# library(help = bio3d) # basic information
|
||||||
@ -89,8 +90,8 @@ file.show("./data/1BM8.pdb")
|
|||||||
# Are all atoms of the N-terminal residue present?
|
# Are all atoms of the N-terminal residue present?
|
||||||
# Are all atoms of the C-terminal residue present?
|
# Are all atoms of the C-terminal residue present?
|
||||||
|
|
||||||
apses <- read.pdb("1bm8") # load a molecule directly from the PDB via the
|
apses <- bio3d::read.pdb("1bm8") # load a molecule directly from the PDB via
|
||||||
# Internet. (This is not your local version.)
|
# the Internet.
|
||||||
|
|
||||||
# check what we have:
|
# check what we have:
|
||||||
apses
|
apses
|
||||||
@ -121,10 +122,11 @@ apses$atom[apses$atom[,"resno"] == i, ]
|
|||||||
apses$seqres[1:10] # the "A"s here identify chain "A"
|
apses$seqres[1:10] # the "A"s here identify chain "A"
|
||||||
|
|
||||||
# Can we convert this to one letter code?
|
# Can we convert this to one letter code?
|
||||||
aa321(apses$seqres[1:10])
|
bio3d::aa321(apses$seqres[1:10])
|
||||||
|
|
||||||
# Lets get the implicit sequence:
|
# Lets get the implicit sequence:
|
||||||
aa321((apses$atom$resid[apses$calpha])[1:10]) # Do you understand this code?
|
bio3d::aa321((apses$atom$resid[apses$calpha])[1:10])
|
||||||
|
# Do you understand this code?
|
||||||
|
|
||||||
# Do explicit and implicit sequence have the same length?
|
# Do explicit and implicit sequence have the same length?
|
||||||
length(apses$seqres)
|
length(apses$seqres)
|
||||||
@ -140,7 +142,10 @@ apses$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")]
|
|||||||
# The introduction to bio3d tutorial at
|
# The introduction to bio3d tutorial at
|
||||||
# http://thegrantlab.org/bio3d/tutorials/structure-analysis
|
# http://thegrantlab.org/bio3d/tutorials/structure-analysis
|
||||||
# has the following example:
|
# has the following example:
|
||||||
plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor")
|
bio3d::plot.bio3d(apses$atom$b[apses$calpha],
|
||||||
|
sse=apses,
|
||||||
|
typ="l",
|
||||||
|
ylab="B-factor")
|
||||||
|
|
||||||
# Good for now. Let's do some real work.
|
# Good for now. Let's do some real work.
|
||||||
|
|
||||||
@ -149,7 +154,7 @@ plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor")
|
|||||||
# Calculate a Ramachandran plot for the structure. The torsion.pdb() function
|
# Calculate a Ramachandran plot for the structure. The torsion.pdb() function
|
||||||
# calculates all dihedral angles for backbone and sidechain bonds, NA where
|
# calculates all dihedral angles for backbone and sidechain bonds, NA where
|
||||||
# the bond does not exist in an amino acid.
|
# the bond does not exist in an amino acid.
|
||||||
tor <- torsion.pdb(apses)
|
tor <- bio3d::torsion.pdb(apses)
|
||||||
plot(tor$phi, tor$psi,
|
plot(tor$phi, tor$psi,
|
||||||
xlim = c(-180, 180), ylim = c(-180, 180),
|
xlim = c(-180, 180), ylim = c(-180, 180),
|
||||||
main = "Ramachandran plot for 1BM8",
|
main = "Ramachandran plot for 1BM8",
|
||||||
@ -164,7 +169,7 @@ abline(v = 0, lwd = 0.5, col = "#00000044")
|
|||||||
# color the points for glycine residues differently. First, we
|
# color the points for glycine residues differently. First, we
|
||||||
# get a vector of glycine residue indices in the structure:
|
# get a vector of glycine residue indices in the structure:
|
||||||
|
|
||||||
mySeq <- pdbseq(apses)
|
mySeq <- bio3d::pdbseq(apses)
|
||||||
|
|
||||||
# Explore the result object and extract the indices of GLY resiues.
|
# Explore the result object and extract the indices of GLY resiues.
|
||||||
mySeq == "G"
|
mySeq == "G"
|
||||||
@ -210,7 +215,7 @@ for (i in 1:nrow(dat)) {
|
|||||||
points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000")
|
points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000")
|
||||||
text(dat$phi[i],
|
text(dat$phi[i],
|
||||||
dat$psi[i],
|
dat$psi[i],
|
||||||
labels = sprintf("%s%d", aa321(dat$resid[i]), dat$resno[i]),
|
labels = sprintf("%s%d", bio3d::aa321(dat$resid[i]), dat$resno[i]),
|
||||||
pos = 4,
|
pos = 4,
|
||||||
offset = 0.4,
|
offset = 0.4,
|
||||||
cex = 0.7)
|
cex = 0.7)
|
||||||
@ -272,9 +277,8 @@ abline(v = 0, lwd = 0.5, col = "#00000044")
|
|||||||
|
|
||||||
# If we wish to approximate values in a histogram-like fashion, we can use
|
# If we wish to approximate values in a histogram-like fashion, we can use
|
||||||
# hexbin()
|
# hexbin()
|
||||||
if (! require(hexbin, quietly=TRUE)) {
|
if (! requireNamespace("hexbin", quietly = TRUE)) {
|
||||||
install.packages("hexbin")
|
install.packages("hexbin")
|
||||||
library(hexbin)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = hexbin) # basic information
|
# library(help = hexbin) # basic information
|
||||||
@ -285,12 +289,17 @@ if (! require(hexbin, quietly=TRUE)) {
|
|||||||
myColorRamp <- colorRampPalette(c("#EEEEEE",
|
myColorRamp <- colorRampPalette(c("#EEEEEE",
|
||||||
"#3399CC",
|
"#3399CC",
|
||||||
"#2266DD"))
|
"#2266DD"))
|
||||||
plot(hexbin(phi, psi, xbins = 10),
|
hexbin::gplot.hexbin(hexbin::hexbin(phi, psi, xbins = 10),
|
||||||
colramp = myColorRamp,
|
colramp = myColorRamp,
|
||||||
main = "phi-psi Density Bins for 1BM8",
|
main = "phi-psi Density Bins for 1BM8",
|
||||||
xlab = expression(phi),
|
xlab = expression(phi),
|
||||||
ylab = expression(psi))
|
ylab = expression(psi))
|
||||||
|
|
||||||
|
# Note:
|
||||||
|
# Had we loaded hexbin with library(hexbin), the plot function would have
|
||||||
|
# been dispatched by the plot() generic, and we could simply have written:
|
||||||
|
# plot(hexbin(phi, psi, xbins = 10), ...
|
||||||
|
|
||||||
|
|
||||||
# == 3.4 Plotting density contours =========================================
|
# == 3.4 Plotting density contours =========================================
|
||||||
|
|
||||||
@ -305,17 +314,16 @@ plot(hexbin(phi, psi, xbins = 10),
|
|||||||
# distributions. But for 2D data like or phi-psi plots, we need a function from
|
# distributions. But for 2D data like or phi-psi plots, we need a function from
|
||||||
# the MASS package: kde2d()
|
# the MASS package: kde2d()
|
||||||
|
|
||||||
if (! require(MASS, quietly=TRUE)) {
|
if (! requireNamespace("MASS", quietly = TRUE)) {
|
||||||
install.packages("MASS")
|
install.packages("MASS")
|
||||||
library(MASS)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = MASS) # basic information
|
# library(help = MASS) # basic information
|
||||||
# browseVignettes("MASS") # available vignettes
|
# browseVignettes("MASS") # available vignettes
|
||||||
# data(package = "MASS") # available datasets
|
# data(package = "MASS") # available datasets
|
||||||
|
|
||||||
?kde2d
|
?MASS::kde2d
|
||||||
dPhiPsi <-kde2d(phi, psi,
|
dPhiPsi <-MASS::kde2d(phi, psi,
|
||||||
n = 60,
|
n = 60,
|
||||||
lims = c(-180, 180, -180, 180))
|
lims = c(-180, 180, -180, 180))
|
||||||
|
|
||||||
@ -469,7 +477,7 @@ ssSelect <- function(PDB, myChain = "A", ssType, myElety) {
|
|||||||
|
|
||||||
# get id's from PDB
|
# get id's from PDB
|
||||||
|
|
||||||
x <- atom.select(PDB,
|
x <- bio3d::atom.select(PDB,
|
||||||
string = "protein",
|
string = "protein",
|
||||||
type = "ATOM",
|
type = "ATOM",
|
||||||
chain = myChain,
|
chain = myChain,
|
||||||
@ -506,7 +514,7 @@ pairDist <- function(PDB, a, b) {
|
|||||||
|
|
||||||
A <- PDB$atom[a, c("x", "y", "z")]
|
A <- PDB$atom[a, c("x", "y", "z")]
|
||||||
B <- PDB$atom[b, c("x", "y", "z")]
|
B <- PDB$atom[b, c("x", "y", "z")]
|
||||||
dMat <- dist.xyz(A, B)
|
dMat <- bio3d::dist.xyz(A, B)
|
||||||
|
|
||||||
}
|
}
|
||||||
return(dMat)
|
return(dMat)
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 05
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 1.0 First ABC units version
|
# 1.0 First ABC units version
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
@ -28,10 +30,10 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ----------------------------------------------------
|
#TOC> ----------------------------------------------------------
|
||||||
#TOC> 1 UniProt files via GET 44
|
#TOC> 1 UniProt files via GET 41
|
||||||
#TOC> 1.1 Task - fetchUniProtSeq() function 107
|
#TOC> 1.1 Task - fetchUniProtSeq() function 103
|
||||||
#TOC> 2 Task solutions 114
|
#TOC> 2 Task solutions 110
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -48,9 +50,8 @@
|
|||||||
# a Web browser. Since this is a short and simple request, the GET verb is the
|
# a Web browser. Since this is a short and simple request, the GET verb is the
|
||||||
# right tool:
|
# right tool:
|
||||||
|
|
||||||
if (! require(httr, quietly=TRUE)) {
|
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||||
install.packages("httr")
|
install.packages("httr")
|
||||||
library(httr)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = httr) # basic information
|
# library(help = httr) # basic information
|
||||||
@ -69,7 +70,7 @@ UniProtID <- "P39678"
|
|||||||
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
|
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
|
||||||
|
|
||||||
# the GET() function from httr will get the data.
|
# the GET() function from httr will get the data.
|
||||||
response <- GET(URL)
|
response <- httr::GET(URL)
|
||||||
|
|
||||||
str(response) # the response object is a bit complex ...
|
str(response) # the response object is a bit complex ...
|
||||||
as.character(response) # ... but it is easy to pull out the data.
|
as.character(response) # ... but it is easy to pull out the data.
|
||||||
@ -82,21 +83,21 @@ dbSanitizeSequence(x)
|
|||||||
# Simple.
|
# Simple.
|
||||||
# But what happens if there is an error, e.g. the uniprot ID does not exist?
|
# But what happens if there is an error, e.g. the uniprot ID does not exist?
|
||||||
|
|
||||||
response <- GET("http://www.uniprot.org/uniprot/X000000.fasta")
|
response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
|
||||||
as.character(response)
|
as.character(response)
|
||||||
# this is a large HTML page that tells us the URL was not found. So we need to
|
# this is a large HTML page that tells us the URL was not found. So we need to
|
||||||
# check for errors. The Right way to do this is to evaluate the staus code that
|
# check for errors. The Right Way to do this is to evaluate the staus code that
|
||||||
# every Web server returns for every transaction.
|
# every Web server returns for every transaction.
|
||||||
#
|
#
|
||||||
status_code(response) # 404 == Page Not Found
|
httr::status_code(response) # 404 == Page Not Found
|
||||||
|
|
||||||
# There are many possible codes, but the only code we will be happy with
|
# There are many possible codes, but the only code we will be happy with
|
||||||
# is 200 - oK.
|
# is 200 - oK.
|
||||||
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
|
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
|
||||||
|
|
||||||
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
|
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
|
||||||
response <- GET(URL)
|
response <- httr::GET(URL)
|
||||||
status_code(response)
|
httr::status_code(response)
|
||||||
|
|
||||||
|
|
||||||
# == 1.1 Task - fetchUniProtSeq() function =================================
|
# == 1.1 Task - fetchUniProtSeq() function =================================
|
||||||
|
@ -3,12 +3,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Unit_testing unit.
|
# R code accompanying the RPR-Unit_testing unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 15
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace()
|
||||||
# 1.0 New code
|
# 1.0 New code
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
@ -27,10 +28,11 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -------------------------------------------
|
#TOC> -------------------------------------------------
|
||||||
#TOC> 1 Unit Tests with testthat 43
|
#TOC> 1 Unit Tests with testthat 40
|
||||||
#TOC> 2 Organizing your tests 156
|
#TOC> 2 Organizing your tests 159
|
||||||
#TOC> 3 Task solutions 181
|
#TOC> 2.1 Testing scripts 183
|
||||||
|
#TOC> 3 Task solutions 198
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -39,15 +41,22 @@
|
|||||||
|
|
||||||
# The testthat package supports writing and executing unit tests in many ways.
|
# The testthat package supports writing and executing unit tests in many ways.
|
||||||
|
|
||||||
if (! require(testthat, quietly=TRUE)) {
|
if (! requireNamespace("testthat", quietly = TRUE)) {
|
||||||
install.packages("testthat")
|
install.packages("testthat")
|
||||||
library(testthat)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = testthat) # basic information
|
# library(help = testthat) # basic information
|
||||||
# browseVignettes("testthat") # available vignettes
|
# browseVignettes("testthat") # available vignettes
|
||||||
# data(package = "testthat") # available datasets
|
# data(package = "testthat") # available datasets
|
||||||
|
|
||||||
|
# testthat is one of those packages that we either use A LOT in a script,
|
||||||
|
# or not at all. Therfore it's more reasonable to depart from our usual
|
||||||
|
# <package>::<function>() idiom, and load the entire library. In fact, if
|
||||||
|
# we author packages, it is common practice to load testthat in the part
|
||||||
|
# of the package that automates testing.
|
||||||
|
|
||||||
|
library(testthat)
|
||||||
|
|
||||||
# An atomic test consists of an expectation about the bahaviour of a function or
|
# An atomic test consists of an expectation about the bahaviour of a function or
|
||||||
# the existence of an object. testthat provides a number of useful expectations:
|
# the existence of an object. testthat provides a number of useful expectations:
|
||||||
|
|
||||||
@ -171,6 +180,20 @@ test_file("./tests/test_biCode.R")
|
|||||||
# .. or execute all the test files in the directory:
|
# .. or execute all the test files in the directory:
|
||||||
test_dir("./tests")
|
test_dir("./tests")
|
||||||
|
|
||||||
|
# == 2.1 Testing scripts ===================================================
|
||||||
|
|
||||||
|
# Scripts need special consideration since we do not necessarily source() them
|
||||||
|
# entirely. Therefore automated testing is not reasonable. What you can do
|
||||||
|
# instead is to place a conditional block at the end of your script, that
|
||||||
|
# never gets executed - then you can manually execute the code in the block
|
||||||
|
# whenever you wish to test your functions. For example:
|
||||||
|
|
||||||
|
if (FALSE) {
|
||||||
|
# ... your tests go here
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# = 3 Task solutions ======================================================
|
# = 3 Task solutions ======================================================
|
||||||
|
|
||||||
|
@ -3,12 +3,14 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 05
|
# Date: 2017 10 05
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 1.0 First ABC units version
|
# 1.0 First ABC units version
|
||||||
# 0.1 First code copied from 2016 material.
|
# 0.1 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
@ -28,10 +30,10 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------------
|
#TOC> -----------------------------------------------------------
|
||||||
#TOC> 1 Working with NCBI eUtils 44
|
#TOC> 1 Working with NCBI eUtils 41
|
||||||
#TOC> 1.1 Task - fetchNCBItaxData() function 162
|
#TOC> 1.1 Task - fetchNCBItaxData() function 144
|
||||||
#TOC> 2 Task solutions 169
|
#TOC> 2 Task solutions 151
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -40,26 +42,11 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
# To begin, we load some libraries with functions
|
# To begin, we load the xml2 package that contains functions
|
||||||
# we need...
|
# we need to receive and parse html data. NCBI's eUtils send information in
|
||||||
|
# XML format so we need to be able to parse XML.
|
||||||
# ... the package httr, which sends and receives information via the http
|
if (! requireNamespace("xml2", quietly=TRUE)) {
|
||||||
# protocol, just like a Web browser.
|
|
||||||
if (! require(httr, quietly=TRUE)) {
|
|
||||||
install.packages("httr")
|
|
||||||
library(httr)
|
|
||||||
}
|
|
||||||
# Package information:
|
|
||||||
# library(help = httr) # basic information
|
|
||||||
# browseVignettes("httr") # available vignettes
|
|
||||||
# data(package = "httr") # available datasets
|
|
||||||
|
|
||||||
|
|
||||||
# ...plus the package xml2: NCBI's eUtils send information in XML format so we
|
|
||||||
# need to be able to parse XML.
|
|
||||||
if (! require(xml2, quietly=TRUE)) {
|
|
||||||
install.packages("xml2")
|
install.packages("xml2")
|
||||||
library(xml2)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = xml2) # basic information
|
# library(help = xml2) # basic information
|
||||||
@ -91,24 +78,23 @@ URL <- paste(eUtilsBase,
|
|||||||
# what the response should look like.
|
# what the response should look like.
|
||||||
URL
|
URL
|
||||||
|
|
||||||
# To fetch a response in R, we use the function GET() from the httr package
|
# To fetch a response in R, we use the function read_xml()
|
||||||
# with our URL as its argument.
|
# with our URL as its argument.
|
||||||
myXML <- read_xml(URL)
|
(myXML <- xml2::read_xml(URL))
|
||||||
myXML
|
|
||||||
|
|
||||||
# This is XML. We can take the response apart into
|
# This is XML. We can take the response apart into
|
||||||
# its indvidual components with the as_list() function.
|
# its indvidual components with the as_list() function.
|
||||||
|
|
||||||
as_list(myXML)
|
xml2::as_list(myXML)
|
||||||
|
|
||||||
# Note how the XML "tree" is represented as a list of
|
# Note how the XML "tree" is represented as a list of
|
||||||
# lists of lists ...
|
# lists of lists ...
|
||||||
# If we know exactly what elelement we are looking for,
|
# If we know exactly what elelement we are looking for,
|
||||||
# we can extract it from this structure:
|
# we can extract it from this structure:
|
||||||
as_list(myXML)[["IdList"]][["Id"]][[1]]
|
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
|
||||||
|
|
||||||
# But this is not very robust, it would break with the
|
# But this is not very robust, it would break with the
|
||||||
# slightest change that the NCBI makes to their response
|
# slightest change that the NCBI makes to their data format -
|
||||||
# and the NCBI changes things A LOT!
|
# and the NCBI changes things A LOT!
|
||||||
|
|
||||||
# Somewhat more robust is to specify the type of element
|
# Somewhat more robust is to specify the type of element
|
||||||
@ -116,11 +102,12 @@ as_list(myXML)[["IdList"]][["Id"]][[1]]
|
|||||||
# element, and use the XPath XML parsing language to
|
# element, and use the XPath XML parsing language to
|
||||||
# retrieve it.
|
# retrieve it.
|
||||||
|
|
||||||
xml_find_all(myXML, "//Id") # returns a "node set"
|
xml2::xml_find_all(myXML, "//Id") # returns a "node set"
|
||||||
|
|
||||||
xml_text(xml_find_all(myXML, "//Id")) # returns the contents of the node set
|
xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
|
||||||
|
# of the node set
|
||||||
|
|
||||||
# We will need doing this a lot, so we write a function
|
# We will need to do this more than once, so we write a function
|
||||||
# for it...
|
# for it...
|
||||||
node2text <- function(doc, tag) {
|
node2text <- function(doc, tag) {
|
||||||
# an extractor function for the contents of elements
|
# an extractor function for the contents of elements
|
||||||
@ -128,8 +115,8 @@ node2text <- function(doc, tag) {
|
|||||||
# Contents of all matching elements is returned in
|
# Contents of all matching elements is returned in
|
||||||
# a vector of strings.
|
# a vector of strings.
|
||||||
path <- paste0("//", tag)
|
path <- paste0("//", tag)
|
||||||
nodes <- xml_find_all(doc, path)
|
nodes <- xml2::xml_find_all(doc, path)
|
||||||
return(xml_text(nodes))
|
return(xml2::xml_text(nodes))
|
||||||
}
|
}
|
||||||
|
|
||||||
# using node2text() ...
|
# using node2text() ...
|
||||||
@ -145,7 +132,7 @@ URL <- paste0(eUtilsBase,
|
|||||||
"&id=",
|
"&id=",
|
||||||
GID,
|
GID,
|
||||||
"&version=2.0")
|
"&version=2.0")
|
||||||
(myXML <- read_xml(URL))
|
(myXML <- xml2::read_xml(URL))
|
||||||
|
|
||||||
(taxID <- node2text(myXML, "TaxId"))
|
(taxID <- node2text(myXML, "TaxId"))
|
||||||
(organism <- node2text(myXML, "Organism"))
|
(organism <- node2text(myXML, "Organism"))
|
||||||
|
@ -22,17 +22,18 @@ setwd("<your/project/directory>")
|
|||||||
|
|
||||||
|
|
||||||
# ==== PACKAGES ==============================================================
|
# ==== PACKAGES ==============================================================
|
||||||
# Load all required packages.
|
# Check that required packages have been installed. Install if needed.
|
||||||
|
|
||||||
if (! require(seqinr, quietly=TRUE)) {
|
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||||
install.packages("seqinr")
|
install.packages("seqinr")
|
||||||
library(seqinr)
|
|
||||||
}
|
}
|
||||||
# Package information:
|
# Package information:
|
||||||
# library(help = seqinr) # basic information
|
# library(help = seqinr) # basic information
|
||||||
# browseVignettes("seqinr") # available vignettes
|
# browseVignettes("seqinr") # available vignettes
|
||||||
# data(package = "seqinr") # available datasets
|
# data(package = "seqinr") # available datasets
|
||||||
|
|
||||||
|
# Note: use package functions with the :: operator - eg.
|
||||||
|
# seqinr::aaa("K")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -9,21 +9,18 @@
|
|||||||
# ====== PACKAGES ==============================================================
|
# ====== PACKAGES ==============================================================
|
||||||
|
|
||||||
|
|
||||||
if (! require(jsonlite, quietly = TRUE)) {
|
if (! requireNamespace("jsonlite", quietly = TRUE)) {
|
||||||
install.packages("jsonlite")
|
install.packages("jsonlite")
|
||||||
library(jsonlite)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (! require(httr, quietly = TRUE)) {
|
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||||
install.packages("httr")
|
install.packages("httr")
|
||||||
library(httr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (! require(xml2, quietly = TRUE)) {
|
if (! requireNamespace("xml2", quietly = TRUE)) {
|
||||||
install.packages("xml2")
|
install.packages("xml2")
|
||||||
library(xml2)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -226,10 +223,10 @@ dbFetchUniProtSeq <- function(ID) {
|
|||||||
|
|
||||||
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID)
|
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID)
|
||||||
|
|
||||||
response <- GET(URL)
|
response <- httr::GET(URL)
|
||||||
|
|
||||||
mySeq <- character()
|
mySeq <- character()
|
||||||
if (status_code(response) == 200) {
|
if (httr::status_code(response) == 200) {
|
||||||
x <- as.character(response)
|
x <- as.character(response)
|
||||||
x <- strsplit(x, "\n")
|
x <- strsplit(x, "\n")
|
||||||
mySeq <- dbSanitizeSequence(x)
|
mySeq <- dbSanitizeSequence(x)
|
||||||
@ -253,7 +250,7 @@ dbFetchPrositeFeatures <- function(ID) {
|
|||||||
|
|
||||||
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
|
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
|
||||||
|
|
||||||
response <- POST(URL,
|
response <- httr::POST(URL,
|
||||||
body = list(meta = "opt1",
|
body = list(meta = "opt1",
|
||||||
meta1_protein = "opt1",
|
meta1_protein = "opt1",
|
||||||
seq = ID,
|
seq = ID,
|
||||||
@ -261,9 +258,9 @@ dbFetchPrositeFeatures <- function(ID) {
|
|||||||
output = "tabular"))
|
output = "tabular"))
|
||||||
|
|
||||||
myFeatures <- data.frame()
|
myFeatures <- data.frame()
|
||||||
if (status_code(response) == 200) {
|
if (httr::status_code(response) == 200) {
|
||||||
|
|
||||||
lines <- unlist(strsplit(content(response, "text"), "\\n"))
|
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
|
||||||
|
|
||||||
patt <- sprintf("\\|%s\\|", UniProtID)
|
patt <- sprintf("\\|%s\\|", UniProtID)
|
||||||
lines <- lines[grep(patt, lines)]
|
lines <- lines[grep(patt, lines)]
|
||||||
@ -289,8 +286,8 @@ node2text <- function(doc, tag) {
|
|||||||
# Contents of all matching elements is returned in
|
# Contents of all matching elements is returned in
|
||||||
# a vector of strings.
|
# a vector of strings.
|
||||||
path <- paste0("//", tag)
|
path <- paste0("//", tag)
|
||||||
nodes <- xml_find_all(doc, path)
|
nodes <- xml2::xml_find_all(doc, path)
|
||||||
return(xml_text(nodes))
|
return(xml2::xml_text(nodes))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -309,7 +306,7 @@ dbFetchNCBItaxData <- function(ID) {
|
|||||||
"db=protein",
|
"db=protein",
|
||||||
"&term=", ID,
|
"&term=", ID,
|
||||||
sep="")
|
sep="")
|
||||||
myXML <- read_xml(URL)
|
myXML <- xml2::read_xml(URL)
|
||||||
GID <- node2text(myXML, "Id")
|
GID <- node2text(myXML, "Id")
|
||||||
|
|
||||||
URL <- paste0(eUtilsBase,
|
URL <- paste0(eUtilsBase,
|
||||||
@ -318,7 +315,7 @@ dbFetchNCBItaxData <- function(ID) {
|
|||||||
"&id=",
|
"&id=",
|
||||||
GID,
|
GID,
|
||||||
"&version=2.0")
|
"&version=2.0")
|
||||||
myXML <- read_xml(URL)
|
myXML <- xml2::read_xml(URL)
|
||||||
|
|
||||||
x <- as.integer(node2text(myXML, "TaxId"))
|
x <- as.integer(node2text(myXML, "TaxId"))
|
||||||
y <- node2text(myXML, "Organism")
|
y <- node2text(myXML, "Organism")
|
||||||
@ -346,14 +343,14 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
|||||||
# for IDs that are not mapped.
|
# for IDs that are not mapped.
|
||||||
|
|
||||||
URL <- "https://www.uniprot.org/uploadlists/"
|
URL <- "https://www.uniprot.org/uploadlists/"
|
||||||
response <- POST(URL,
|
response <- httr::POST(URL,
|
||||||
body = list(from = mapFrom,
|
body = list(from = mapFrom,
|
||||||
to = mapTo,
|
to = mapTo,
|
||||||
format = "tab",
|
format = "tab",
|
||||||
query = s))
|
query = s))
|
||||||
|
|
||||||
if (status_code(response) == 200) { # 200: oK
|
if (httr::status_code(response) == 200) { # 200: oK
|
||||||
myMap <- read.delim(file = textConnection(content(response)),
|
myMap <- read.delim(file = textConnection(httr::content(response)),
|
||||||
sep = "\t",
|
sep = "\t",
|
||||||
stringsAsFactors = FALSE)
|
stringsAsFactors = FALSE)
|
||||||
myMap <- myMap[ , c(1,3)]
|
myMap <- myMap[ , c(1,3)]
|
||||||
@ -362,12 +359,23 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
|||||||
myMap <- data.frame()
|
myMap <- data.frame()
|
||||||
warning(paste("No uniProt ID mapping returned:",
|
warning(paste("No uniProt ID mapping returned:",
|
||||||
"server sent status",
|
"server sent status",
|
||||||
status_code(response)))
|
httr::status_code(response)))
|
||||||
}
|
}
|
||||||
|
|
||||||
return(myMap)
|
return(myMap)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ====== TESTS =================================================================
|
||||||
|
|
||||||
|
if (FALSE) {
|
||||||
|
if (! requireNamespace("testthat", quietly = TRUE)) {
|
||||||
|
install.packages("testthat")
|
||||||
|
}
|
||||||
|
|
||||||
|
# ToDo: test everything here
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -3,14 +3,16 @@
|
|||||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||||
# Mbp1 homologues.
|
# Mbp1 homologues.
|
||||||
#
|
#
|
||||||
# Version: 1.1.2
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2016 09 - 2017 09
|
# Date: 2016 09 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# V 1.1.2 Moved BLAST.R to ./scripts directory
|
# Versions
|
||||||
# V 1.1 Update 2017
|
# 1.2 Change from require() to requireNamespace()
|
||||||
# V 1.0 First code 2016
|
# 1.1.2 Moved BLAST.R to ./scripts directory
|
||||||
|
# 1.1 Update 2017
|
||||||
|
# 1.0 First code 2016
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
#
|
||||||
@ -31,27 +33,25 @@
|
|||||||
# the respective intermediate results.
|
# the respective intermediate results.
|
||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------
|
#TOC> ---------------------------------------------------------
|
||||||
#TOC> 1 The strategy 54
|
#TOC> 1 The strategy 55
|
||||||
#TOC> 2 GOLD species 66
|
#TOC> 2 GOLD species 67
|
||||||
#TOC> 2.1 Initialize 71
|
#TOC> 2.1 Initialize 72
|
||||||
#TOC> 2.2 Import 77
|
#TOC> 2.2 Import 79
|
||||||
#TOC> 2.3 Unique species 129
|
#TOC> 2.3 Unique species 131
|
||||||
#TOC> 3 BLAST species 171
|
#TOC> 3 BLAST species 173
|
||||||
#TOC> 3.1 find homologous proteins 178
|
#TOC> 3.1 find homologous proteins 180
|
||||||
#TOC> 3.2 Identify species in "hits" 202
|
#TOC> 3.2 Identify species in "hits" 204
|
||||||
#TOC> 4 Intersect GOLD and BLAST species 247
|
#TOC> 4 Intersect GOLD and BLAST species 249
|
||||||
#TOC> 5 Cleanup and finish 265
|
#TOC> 5 Cleanup and finish 267
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
#TOC>
|
|
||||||
#TOC>
|
|
||||||
|
|
||||||
# = 1 The strategy ========================================================
|
# = 1 The strategy ========================================================
|
||||||
|
|
||||||
# This script will create a list of "MYSPE" species and save it in an R object
|
# This script will create a list of "MYSPE" species and save it in an R object
|
||||||
@ -70,9 +70,10 @@
|
|||||||
# (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
|
# (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
|
||||||
|
|
||||||
# == 2.1 Initialize ========================================================
|
# == 2.1 Initialize ========================================================
|
||||||
if (! require(httr)) { # httr provides interfaces to Webservers on the Internet
|
|
||||||
|
# httr provides interfaces to Webservers on the Internet
|
||||||
|
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||||
install.packages("httr")
|
install.packages("httr")
|
||||||
library(httr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# == 2.2 Import ============================================================
|
# == 2.2 Import ============================================================
|
||||||
|
@ -15,12 +15,14 @@
|
|||||||
# Data: (3 mb) https://downloads.yeastgenome.org/curation/literature/go_slim_mapping.tab
|
# Data: (3 mb) https://downloads.yeastgenome.org/curation/literature/go_slim_mapping.tab
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# Version: 1.0
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 10 06
|
# Date: 2017 10 - 2019 01
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 1.0 First code copied from 2016 material.
|
# 1.0 First code copied from 2016 material.
|
||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
@ -28,16 +30,16 @@
|
|||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
if (! require(readr, quietly = TRUE)) {
|
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||||
install.packages("readr")
|
install.packages("readr")
|
||||||
library(readr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# STRING functional interaction data
|
# STRING functional interaction data
|
||||||
|
|
||||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||||
STR <- read_delim("./data/4932.protein.links.full.v10.5.txt", delim = " ")
|
STR <- readr::read_delim("./data/4932.protein.links.full.v10.5.txt",
|
||||||
|
delim = " ")
|
||||||
|
|
||||||
# Subset only IDs and combined_score column
|
# Subset only IDs and combined_score column
|
||||||
STR <- STR[ , c("protein1", "protein2", "combined_score")]
|
STR <- STR[ , c("protein1", "protein2", "combined_score")]
|
||||||
@ -61,7 +63,7 @@ myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene
|
|||||||
#
|
#
|
||||||
# Read GOSlim data (needs to be downloaded from database, see URL in Notes)
|
# Read GOSlim data (needs to be downloaded from database, see URL in Notes)
|
||||||
|
|
||||||
Gsl <- read_tsv("./data/go_slim_mapping.tab",
|
Gsl <- readr::read_tsv("./data/go_slim_mapping.tab",
|
||||||
col_names = c("ID",
|
col_names = c("ID",
|
||||||
"name",
|
"name",
|
||||||
"SGDId",
|
"SGDId",
|
||||||
|
@ -7,11 +7,13 @@
|
|||||||
# https://ncbi.github.io/blast-cloud/dev/api.html
|
# https://ncbi.github.io/blast-cloud/dev/api.html
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# Version: 3
|
# Version: 3.1
|
||||||
# Date: 2016 09 - 2017 11
|
# Date: 2016 09 - 2019 01
|
||||||
# Author: Boris Steipe
|
# Author: Boris Steipe
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 3.1 Change from require() to requireNamespace(),
|
||||||
|
# use <package>::<function>() idiom throughout
|
||||||
# 3 parsing logic had not been fully implemented; Fixed.
|
# 3 parsing logic had not been fully implemented; Fixed.
|
||||||
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
||||||
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
||||||
@ -29,9 +31,8 @@
|
|||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
|
|
||||||
if (! require(httr, quietly = TRUE)) {
|
if (! requireNamespace(httr, quietly = TRUE)) {
|
||||||
install.packages("httr")
|
install.packages("httr")
|
||||||
library(httr)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -92,13 +93,13 @@ BLAST <- function(q,
|
|||||||
}
|
}
|
||||||
|
|
||||||
# send it off ...
|
# send it off ...
|
||||||
response <- GET(results$query)
|
response <- httr::GET(results$query)
|
||||||
if (http_status(response)$category != "Success" ) {
|
if (httr::http_status(response)$category != "Success" ) {
|
||||||
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
|
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
|
||||||
http_status(response)$message))
|
httr::http_status(response)$message))
|
||||||
}
|
}
|
||||||
|
|
||||||
txt <- content(response, "text", encoding = "UTF-8")
|
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||||
|
|
||||||
patt <- "RID = (\\w+)" # match the request id
|
patt <- "RID = (\\w+)" # match the request id
|
||||||
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
|
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
|
||||||
@ -127,13 +128,13 @@ BLAST <- function(q,
|
|||||||
|
|
||||||
while (TRUE) {
|
while (TRUE) {
|
||||||
# Check whether the result is ready
|
# Check whether the result is ready
|
||||||
response <- GET(checkStatus)
|
response <- httr::GET(checkStatus)
|
||||||
if (http_status(response)$category != "Success" ) {
|
if (httr::http_status(response)$category != "Success" ) {
|
||||||
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
|
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
|
||||||
http_status(response)$message))
|
httr::http_status(response)$message))
|
||||||
}
|
}
|
||||||
|
|
||||||
txt <- content(response, "text", encoding = "UTF-8")
|
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||||
|
|
||||||
if (length(grep("Status=WAITING", txt)) > 0) {
|
if (length(grep("Status=WAITING", txt)) > 0) {
|
||||||
myTimeout <- myTimeout - EXTRAWAIT
|
myTimeout <- myTimeout - EXTRAWAIT
|
||||||
@ -184,13 +185,13 @@ BLAST <- function(q,
|
|||||||
"&FORMAT_TYPE=Text",
|
"&FORMAT_TYPE=Text",
|
||||||
sep = "")
|
sep = "")
|
||||||
|
|
||||||
response <- GET(retrieve)
|
response <- httr::GET(retrieve)
|
||||||
if (http_status(response)$category != "Success" ) {
|
if (httr::http_status(response)$category != "Success" ) {
|
||||||
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
|
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
|
||||||
http_status(response)$message))
|
httr::http_status(response)$message))
|
||||||
}
|
}
|
||||||
|
|
||||||
txt <- content(response, "text", encoding = "UTF-8")
|
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||||
|
|
||||||
# txt contains the whole set of results. Process:
|
# txt contains the whole set of results. Process:
|
||||||
|
|
||||||
@ -357,7 +358,7 @@ parseBLASTalignment <- function(hit) {
|
|||||||
# ==== TESTS ===================================================================
|
# ==== TESTS ===================================================================
|
||||||
|
|
||||||
# define query:
|
# define query:
|
||||||
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain sequence
|
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
|
||||||
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
|
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
|
||||||
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
|
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
|
||||||
# sep="")
|
# sep="")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user