Use requireNamespace(), <package>::<function>() idiom,

Biocmanager:: - not biocLite()
This commit is contained in:
hyginn 2019-01-08 17:11:25 +10:00
parent 8cdbb9f4cc
commit cffd803c23
30 changed files with 1185 additions and 1069 deletions

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-BLAST unit.
#
# Version: 1.1
# Version: 1.2
#
# Date: 2017 10 23
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.1 Fixed parsing logic.
# 1.0 First live version 2017.
# 0.1 First code copied from 2016 material.
@ -29,29 +31,15 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------
#TOC> 1 Preparations 41
#TOC> 2 Defining the APSES domain 54
#TOC> 3 Executing the BLAST search 76
#TOC> 4 Analysing results 98
#TOC> ---------------------------------------------------
#TOC> 1 Defining the APSES domain 42
#TOC> 2 Executing the BLAST search 64
#TOC> 3 Analysing results 86
#TOC>
#TOC> ==========================================================================
# = 1 Preparations ========================================================
if (!require(Biostrings, quietly=TRUE)) {
source("https://bioconductor.org/biocLite.R")
biocLite("Biostrings")
library(Biostrings)
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# = 2 Defining the APSES domain ===========================================
# = 1 Defining the APSES domain ===========================================
# Load your protein database
source("makeProteinDB.R")
@ -73,7 +61,7 @@ source("makeProteinDB.R")
# BLAST search.
# = 3 Executing the BLAST search ==========================================
# = 2 Executing the BLAST search ==========================================
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
# through its Web API, and to parse results. Have a look at the script, then
@ -91,11 +79,11 @@ BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence
limits = "txid559292[ORGN]") # S. cerevisiae S288c
length(BLASTresults$hits) # There should be at least one hit there. Ask for advice
# in case this step fails.
length(BLASTresults$hits) # There should be at least one hit there. Ask for
# advice in case this step fails.
# = 4 Analysing results ===================================================
# = 3 Analysing results ===================================================
(topHit <- BLASTresults$hits[[1]]) # Get the top hit

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Dotplot unit.
#
# Version: 0.1
# Version: 0.2
#
# Date: 2017 08 28
# Date: 2019 01 07
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 0.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 0.1 First code copied from 2016 material.
#
#
@ -23,24 +25,37 @@
#
# ==============================================================================
# = 1 ___Section___
# First, we install and load the Biostrings package.
if (!require(Biostrings, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------
#TOC> 1 ___Section___ 39
#TOC> 2 Tasks 187
#TOC>
#TOC> ==========================================================================
# = 1 ___Section___ =======================================================
if (!requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
biocLite("Biostrings")
library(Biostrings)
if (!requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
if (!requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# Let's load BLOSUM62
data(BLOSUM62)
data(BLOSUM62, package = "Biostrings")
# Now let's craft code for a dotplot. That's surprisingly simple. We build a
# matrix that has as many rows as one sequence, as many columns as another. Then
@ -51,10 +66,10 @@ data(BLOSUM62)
# First we fetch our sequences and split them into single characters.
sel <- myDB$protein$name == "MBP1_SACCE"
MBP1_SACCE <- s2c(myDB$protein$sequence[sel])
MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
MBP1_MYSPE <- s2c(myDB$protein$sequence[sel])
MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
# Check that we have two character vectors of the expected length.
str(MBP1_SACCE)
@ -136,7 +151,7 @@ axis(4, at = c(1, seq(10, len, by=10)))
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
# there already is a dotplot function in the seqinr package:
dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Our's
# Which one do you prefer? You can probably see the block patterns that arise
@ -169,7 +184,7 @@ dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
# = 1 Tasks
# = 2 Tasks ===============================================================

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-MSA unit.
#
# Version: 1.1
# Version: 1.2
#
# Date: 2017 10
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.1 Added fetchMSAmotif()
# 1.0 Fully refactored and rewritten for 2017
# 0.1 First code copied from 2016 material.
@ -29,22 +31,22 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------------------
#TOC> 1 Preparations 51
#TOC> 2 Aligning full length MBP1 proteins 99
#TOC> 2.1 Preparing Sequences 110
#TOC> 2.2 Compute the MSA 135
#TOC> 3 Analyzing an MSA 156
#TOC> 4 Comparing MSAs 227
#TOC> 4.1 Importing an alignment to msa 236
#TOC> 4.1.1 importing an .aln file 245
#TOC> 4.1.2 Creating an MsaAAMultipleAlignment object 276
#TOC> 4.2 More alignments 313
#TOC> 4.3 Computing comparison metrics 325
#TOC> 5 Profile-Profile alignments 462
#TOC> 6 Sequence Logos 539
#TOC> 6.1 Subsetting an alignment by motif 548
#TOC> 6.2 Plot a Sequence Logo 591
#TOC> ------------------------------------------------------------------
#TOC> 1 Preparations 54
#TOC> 2 Aligning full length MBP1 proteins 96
#TOC> 2.1 Preparing Sequences 107
#TOC> 2.2 Compute the MSA 132
#TOC> 3 Analyzing an MSA 153
#TOC> 4 Comparing MSAs 224
#TOC> 4.1 Importing an alignment to msa 233
#TOC> 4.1.1 importing an .aln file 242
#TOC> 4.1.2 Creating an MsaAAMultipleAlignment object 273
#TOC> 4.2 More alignments 324
#TOC> 4.3 Computing comparison metrics 336
#TOC> 5 Profile-Profile alignments 473
#TOC> 6 Sequence Logos 546
#TOC> 6.1 Subsetting an alignment by motif 555
#TOC> 6.2 Plot a Sequence Logo 604
#TOC>
#TOC> ==========================================================================
@ -59,28 +61,22 @@
source("makeProteinDB.R")
# Multiple sequence alignment algorithms are provided in
# the Bioconductor msa package.
if (! require(Biostrings, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
if (! requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
biocLite("Biostrings")
library(Biostrings)
if (! requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
# Multiple sequence alignment algorithms are provided in
# the Bioconductor msa package.
if (! require(msa, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("msa")
library(msa)
if (! requireNamespace("msa", quietly=TRUE)) {
BiocManager::install("msa")
}
# Package information:
# library(help=msa) # basic information
@ -115,7 +111,7 @@ help(package = "msa")
# of sequence.
sel <- grep("MBP1", myDB$protein$name)
MBP1set <- AAStringSet(myDB$protein$sequence[sel])
MBP1set <- Biostrings::AAStringSet(myDB$protein$sequence[sel])
# To help us make sense of the alignment we need to add the names for
# the sequences. Names for a seqSet object are held in the ranges slot...
@ -142,10 +138,10 @@ MBP1set
# Let's run an alignment with "Muscle"
(msaM <- msaMuscle( MBP1set, order = "aligned"))
(msaM <- msa::msaMuscle( MBP1set, order = "aligned"))
# ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method):
print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
msa::print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
# You see that the alignment object has sequence strings with hyphens as
@ -173,7 +169,7 @@ print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
data("BLOSUM62") # fetch the BLOSUM62 package from the Biostrings package
msaMScores <- msaConservationScore(msaM, substitutionMatrix = BLOSUM62)
msaMScores <- msa::msaConservationScore(msaM, substitutionMatrix = BLOSUM62)
plot(msaMScores, type = "l", col = "#205C5E", xlab = "Alignment Position")
# That plot shows the well-aligned regions (domains ?) of the sequence, but it
@ -246,17 +242,17 @@ for (i in seq_along(highScoringRanges$lengths)) {
# === 4.1.1 importing an .aln file
# The seqinr package has a function to read CLUSTAL W formatted .aln files ...
if (! require(seqinr, quietly=TRUE)) {
install.packages(seqinr)
library(seqinr)
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# Package information:
# library(help=seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# read the donwloaded file
tmp <- read.alignment("msaT.aln", format = "clustal")
# read the T-coffee aligned file that you donwloaded from the EBI MSA tools
# (cf. http://steipe.biochemistry.utoronto.ca/abc/index.php/BIN-ALI-MSA)
tmp <- seqinr::read.alignment("msaT.aln", format = "clustal")
# read.alignment() returns a list. $seq is a list of strings, one for each
# complete alignment. However, they are converted to lower case.
@ -278,12 +274,12 @@ for (i in seq_along(x)) {
# MsaAAMultipleAlignment objects are S4 objects that contain AAStringSet objects
# in their @unmasked slot, and a few additional items. Rather then build the
# object from scratch, we copy an axisting object, and overwrite the dta in its
# object from scratch, we copy an existing object, and overwrite the data in its
# slots with what we need. Our goal is pragmatic, we want an object that msa's
# functions will accept as input.
# First: convert our named char vector into an AAstringSet
x <- AAStringSet(x)
x <- Biostrings::AAStringSet(x)
# Then: create a new MsaAAMultipleAlignment S4 object. The msa package has
# defined what such an object should look like, with the SetClass() function. To
@ -294,8 +290,22 @@ x <- AAStringSet(x)
str(msaM)
# There is a catch however in the way R makes such operations specific to
# the packages they need them: the function that creates the class is
# defined as a "generic", and when it is called, R looks in the package
# namespace for a more specific function with precise instructions what
# to do. However, we have not loaded the package namespace - we access all
# of the functions directly with the msa:: prefix. This method breaks down
# when generic functions are involved. I.e. - we could make it work, but
# the amount of code we need then is unreasonable. The straightforward
# way is to load the package. We can still use the prefix notation for
# its functions, just to emphasize where the function comes from. But since
# the namespace then exists, we ensure that generics are properly dispatched.
library(msa) # load the msa package namespace
msaT <- new("MsaAAMultipleAlignment", # create new MsaAAMultipleAlignment object
unmasked = x, # "unmasked" slot takes an AASringSet
unmasked = x, # "unmasked" slot takes an AAStringSet
version = "T-Coffee", # "version" slot takes a string
params = list(), # "params" takes a list(), we leave the
# list empty, but we could add the
@ -309,18 +319,18 @@ str(msaT)
msaT # Now we have fabricated an msaAAMultipleAlignment object, and we can
# use the msa package functions on it
msaTScores <- msaConservationScore(msaT, substitutionMatrix = BLOSUM62)
msaTScores <- msa::msaConservationScore(msaT, substitutionMatrix = BLOSUM62)
# == 4.2 More alignments ===================================================
# Next, we calculate alignments with msa's two other alignment options:
# CLUSTAL Omega
(msaO <- msaClustalOmega( MBP1set, order = "aligned"))
msaOScores <- msaConservationScore(msaO, substitutionMatrix = BLOSUM62)
(msaO <- msa::msaClustalOmega( MBP1set, order = "aligned"))
msaOScores <- msa::msaConservationScore(msaO, substitutionMatrix = BLOSUM62)
# CLUSTAL W
(msaW <- msaClustalW( MBP1set, order = "aligned"))
msaWScores <- msaConservationScore(msaW, substitutionMatrix = BLOSUM62)
(msaW <- msa::msaClustalW( MBP1set, order = "aligned"))
msaWScores <- msa::msaConservationScore(msaW, substitutionMatrix = BLOSUM62)
# == 4.3 Computing comparison metrics ======================================
@ -454,7 +464,7 @@ legend("bottomright",
# Your alignment is going to be different from mine, due to the inclusion of
# MYSPE - but what I see is that MUSCLE gives the highest score overall, and
# achieves this with fewer indels then most, and the lowest number of gaps of
# achieves this with fewer indels than most, and the lowest number of gaps of
# all algorithms.
# To actually compare regions of alignments, we need to align alignments.
@ -470,12 +480,8 @@ legend("bottomright",
# to compare two MSAs with each other, by aligning them. The algorithm is
# provided by the DECIPHER package.
if (! require(DECIPHER, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("DECIPHER")
library(DECIPHER)
if (! requireNamespace("DECIPHER", quietly=TRUE)) {
BiocManager::install("DECIPHER")
}
# Package information:
# library(help = DECIPHER) # basic information
@ -484,14 +490,14 @@ if (! require(DECIPHER, quietly=TRUE)) {
# AlignProfiles() takes two AAStringSets as input. Let's compare the MUSCLE and
# CLUSTAL W alignments: we could do this directly ...
AlignProfiles(msaW@unmasked, msaM@unmasked)
DECIPHER::AlignProfiles(msaW@unmasked, msaM@unmasked)
# But for ease of comparison, we'll reorder the sequences of the CLUSTAL W
# alignment into the same order as the MUSCLE alignment:
m <- as.character(msaM)
w <- as.character(msaW)[names(m)]
(ppa <- AlignProfiles(AAStringSet(w), AAStringSet(m)))
(ppa <- DECIPHER::AlignProfiles(msa::AAStringSet(w), msa::AAStringSet(m)))
# Conveniently, AlignProfiles() returns an AAStringSet, so we can use our
# writeALN function to show it. Here is an arbitrary block, from somewhere in
@ -533,8 +539,8 @@ writeALN(ppa2, range = c(800, 960))
# Again, go explore, and get a sense of what's going on. You may find that
# CLUSTAL W has a tendency to insert short gaps all over the alignment, whereas
# MUSCLE keeps indels in blocks. CLUSTAL's behaviour is exactly what I would
# expect from an algorithm that builds alignments from pairwise local
# alignments, without global refinement.
# expect from an algorithm that builds alignments incrementally from pairwise
# local alignments, without global refinement.
# = 6 Sequence Logos ======================================================
@ -602,16 +608,15 @@ writeALN(fetchMSAmotif(msaM, wing))
# ggseqlogo written by by Omar Waghi, a former UofT BCB student who is now at
# the EBI.
if (! require(ggseqlogo, quietly=TRUE)) {
if (! requireNamspace("ggseqlogo", quietly=TRUE)) {
install.packages(("ggseqlogo"))
library(ggseqlogo)
}
# Package information:
# library(help=ggseqlogo) # basic information
# browseVignettes("ggseqlogo") # available vignettes
# data(package = "ggseqlogo") # available datasets
ggseqlogo(as.character(motifAli))
ggseqlogo::ggseqlogo(as.character(motifAli))

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
#
# Version: 1.4
# Version: 1.5
#
# Date: 2017 09 - 2017 11
# Date: 2017 09 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.5 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.4 Pull s2c() from seqinr package, rather then loading the
# entire library.
# 1.3 Updated confirmation task with correct logic
@ -34,27 +36,30 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------------
#TOC> 1 Prepare 52
#TOC> 2 Biostrings Pairwise Alignment 66
#TOC> 2.1 Optimal global alignment 84
#TOC> 2.2 Optimal local alignment 147
#TOC> 3 APSES Domain annotation by alignment 171
#TOC> 4 Update your database script 252
#TOC> 4.1 Preparing an annotation file ... 258
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 260
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 303
#TOC> 4.2 Execute and Validate 327
#TOC> --------------------------------------------------------------------------
#TOC> 1 Prepare 54
#TOC> 2 Biostrings Pairwise Alignment 71
#TOC> 2.1 Optimal global alignment 89
#TOC> 2.2 Optimal local alignment 152
#TOC> 3 APSES Domain annotation by alignment 176
#TOC> 4 Update your database script 257
#TOC> 4.1 Preparing an annotation file ... 263
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 265
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 308
#TOC> 4.2 Execute and Validate 332
#TOC>
#TOC> ==========================================================================
# = 1 Prepare =============================================================
# To simplify code, we pull the function s2c(x) from the seqinr package,
# rather than using the lengthier idiom unlist(strsplit(x, "").
# This assumes that the seqinr package has been installed previously.
s2c <- seqinr::s2c
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
}
# You can get package information with the following commands:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# You need to recreate the protein database that you have constructed in the
@ -66,13 +71,13 @@ source("makeProteinDB.R")
# = 2 Biostrings Pairwise Alignment =======================================
if (!require(Biostrings, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
if (!requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
biocLite("Biostrings")
library(Biostrings)
if (!requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
# browseVignettes("Biostrings") # available vignettes
# data(package = "Biostrings") # available datasets
@ -88,15 +93,15 @@ if (!require(Biostrings, quietly=TRUE)) {
# First: make AAString objects ...
sel <- myDB$protein$name == "MBP1_SACCE"
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])
?pairwiseAlignment
# ... and align.
# Global optimal alignment with end-gap penalties is default.
ali1 <- pairwiseAlignment(
ali1 <- Biostrings::pairwiseAlignment(
aaMBP1_SACCE,
aaMBP1_MYSPE,
substitutionMatrix = "BLOSUM62",
@ -108,7 +113,7 @@ str(ali1) # ... it's complicated
# This is a Biostrings alignment object. But we can use Biostrings functions to
# tame it:
ali1
writePairwiseAlignments(ali1) # That should look familiar
Biostrings::writePairwiseAlignments(ali1) # That should look familiar
# And we can make the internal structure work for us (@ is for classes as
# $ is for lists ...)
@ -147,7 +152,7 @@ percentID(ali1)
# == 2.2 Optimal local alignment ===========================================
# Compare with local optimal alignment (like EMBOSS Water)
ali2 <- pairwiseAlignment(
ali2 <- Biostrings::pairwiseAlignment(
aaMBP1_SACCE,
aaMBP1_MYSPE,
type = "local",
@ -155,9 +160,9 @@ ali2 <- pairwiseAlignment(
gapOpening = 50,
gapExtension = 10)
writePairwiseAlignments(ali2) # This has probably only aligned the N-terminal
# DNA binding domain - but that one has quite
# high sequence identity:
Biostrings::writePairwiseAlignments(ali2)
# This has probably only aligned the N-terminal DNA binding domain - but that
# one has quite high sequence identity:
percentID(ali2)
# == TASK: ==
@ -209,14 +214,14 @@ myDB$annotation[myDB$annotation$ID == proID &
# the sequence, and used the start and end coordinates to extract a substring.
# Let's convert this to an AAstring and assign it:
aaMB1_SACCE_APSES <- AAString(apses)
aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
# Now let's align these two sequences of very different length without end-gap
# penalties using the "overlap" type. "overlap" turns the
# end-gap penalties off and that is crucially important since
# the sequences have very different length.
aliApses <- pairwiseAlignment(
aliApses <- Biostrings::pairwiseAlignment(
aaMB1_SACCE_APSES,
aaMBP1_MYSPE,
type = "overlap",
@ -228,7 +233,7 @@ aliApses <- pairwiseAlignment(
# homologous, and have (almost) no indels. The entire "pattern"
# sequence from QIYSAR ... to ... KPLFDF should be matched
# with the "query". Is this correct?
writePairwiseAlignments(aliApses)
Biostrings::writePairwiseAlignments(aliApses)
# If this is correct, you can extract the matched sequence from
# the alignment object. The syntax is a bit different from what

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-ALI-Similarity unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10 20
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 Refactored for 2017; add aaindex, ternary plot.
# 0.1 First code copied from 2016 material.
#
@ -28,10 +30,10 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------
#TOC> 1 Amino Acid Properties 43
#TOC> 2 Mutation Data matrix 163
#TOC> 3 Background score 205
#TOC> ----------------------------------------------
#TOC> 1 Amino Acid Properties 41
#TOC> 2 Mutation Data matrix 158
#TOC> 3 Background score 199
#TOC>
#TOC> ==========================================================================
@ -41,9 +43,8 @@
# A large collection of amino acid property tables is available via the seqinr
# package:
if (!require(seqinr)) {
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
library(seqinr)
}
# Package information:
# library(help = seqinr) # basic information
@ -127,9 +128,8 @@ text(Y$I, K$I, names(Y$I))
# plots are in general unintuitive and hard to interpret. One alternative is a
# so-called "ternary plot":
if (!require(ggtern)) {
if (! requireNamespace("ggtern", quietly=TRUE)) {
install.packages("ggtern")
library(ggtern)
}
# Package information:
# library(help = ggtern) # basic information
@ -145,12 +145,11 @@ myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
stringsAsFactors = FALSE)
rownames(myDat) <- names(Y$I)
ggtern(data = myDat,
aes(x = vol,
ggtern::ggtern(data = myDat,
ggplot2::aes(x = vol,
y = phi,
z = pK,
label = rownames(myDat))) +
geom_text()
label = rownames(myDat))) + ggplot2::geom_text()
# This results in a mapping of amino acids relative to each other that is
# similar to the Venn diagram you have seen in the notes.
@ -162,12 +161,11 @@ ggtern(data = myDat,
# The Biostrings package contains the most common mutation data matrices.
if (!require(Biostrings, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
if (! requireNamespace("BiocManager", quietly=TRUE)) {
install.packages("BiocManager")
}
biocLite("Biostrings")
library(Biostrings)
if (! requireNamespace("Biostrings", quietly=TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help=Biostrings) # basic information
@ -200,7 +198,9 @@ BLOSUM62["W", "R"]
# = 3 Background score ====================================================
# The mutation data matrix is designed to give high scores to homologous sequences, low scores to non-homologous sequences. What score on average should we expect for a random sequence?
# The mutation data matrix is designed to give high scores to homologous
# sequences, low scores to non-homologous sequences. What score on average
# should we expect for a random sequence?
# If we sample amino acid pairs at random, we will get a score that is the
# average of the individual pairscores in the matrix. Omitting the ambiguity
@ -219,12 +219,12 @@ sum(BLOSUM62[1:20, 1:20])/400
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
# low-complexity amino acid sequence ...
aa3FG7 <- readAAStringSet("./data/3FG7.fa")[[1]]
aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
# with an exceptionally high percentage of hydrophobic residues.
aa2F1C <- readAAStringSet("./data/2F1C.fa")[[1]]
aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
# Here is a function that takes two sequences and
# returns their average pairscore.

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Data_integration unit.
#
# Version: 1.0.1
# Version: 1.1
#
# Date: 2018 10 30
# Date: 2018 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0.1 Bugfix: UniProt ID Mapping service API change
# 1.0 First live version
#
@ -31,8 +33,8 @@
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------
#TOC> 1 Identifier mapping 40
#TOC> 2 Cross-referencing tables 164
#TOC> 1 Identifier mapping 42
#TOC> 2 Cross-referencing tables 165
#TOC>
#TOC> ==========================================================================
@ -54,9 +56,8 @@
# To begin, we load httr, which supports sending and receiving data via the
# http protocol, just like a Web browser.
if (!require(httr, quietly=TRUE)) {
install.packages("httr")
library(httr)
if (! requireNamespace("httpr", quietly=TRUE)) {
install.packages("httpr")
}
# Package information:
# library(help = httr) # basic information
@ -75,22 +76,22 @@ myQueryIDs <- "NP_010227 NP_00000 NP_011036"
# of the request. GET() and POST() are functions from httr.
URL <- "https://www.uniprot.org/mapping/"
response <- POST(URL,
response <- httr::POST(URL,
body = list(from = "P_REFSEQ_AC", # Refseq Protein
to = "ACC", # UniProt ID
format = "tab",
query = myQueryIDs))
cat(content(response))
cat(httr::content(response))
# We need to check the status code - if it is not 200, an error ocurred and we
# can't process the result:
status_code(response)
httr::status_code(response)
# If the query is successful, tabbed text is returned. We can assign that to a
# data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument.
myMappedIDs <- read.delim(file = textConnection(content(response)),
myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
sep = "\t",
stringsAsFactors = FALSE)
myMappedIDs
@ -132,14 +133,14 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
# for IDs that are not mapped.
URL <- "https://www.uniprot.org/uploadlists/"
response <- POST(URL,
response <- httr::POST(URL,
body = list(from = mapFrom,
to = mapTo,
format = "tab",
query = s))
if (status_code(response) == 200) { # 200: oK
myMap <- read.delim(file = textConnection(content(response)),
if (httr::status_code(response) == 200) { # 200: oK
myMap <- read.delim(file = textConnection(httr::content(response)),
sep = "\t",
stringsAsFactors = FALSE)
myMap <- myMap[ , c(1,3)]
@ -148,7 +149,7 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
myMap <- data.frame()
warning(paste("No uniProt ID mapping returned:",
"server sent status",
status_code(response)))
httr::status_code(response)))
}
return(myMap)
@ -168,7 +169,8 @@ myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
# Nomenclature commission. How do we map one set of identifiers to another one?
# The function to use is match().
# Here is a tiny set of identifiers taken from a much larger table to illustrate the principle:
# Here is a tiny set of identifiers taken from a much larger table to
# illustrate the principle:
#
myIDs <- data.frame(uID = c("P38903", "P31383", "P47177", "P47096", "Q07747",

View File

@ -3,12 +3,15 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 11 12
# Date: 2017 11 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0 New code.
#
#
@ -27,59 +30,65 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 39
#TOC> 2 Fetch GO Annotations 89
#TOC> 3 Semantic Similarities 98
#TOC> 4 GO Term Enrichment in Gene Sets 116
#TOC> --------------------------------------------------------------------
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 42
#TOC> 2 Fetch GO Annotations 98
#TOC> 3 Semantic Similarities 107
#TOC> 4 GO Term Enrichment in Gene Sets 125
#TOC>
#TOC> ==========================================================================
# = 1 Preparations: Packages, AnnotationDB, Setup =========================
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
# GOSim is an R-package in the Bioconductor project.
if (! require(GOSim, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("GOSim")
library(GOSim)
if (! requireNamespace("GOSim", quietly = TRUE)) {
BiocManager::install("GOSim")
}
# Package information:
# library(help = GOSim) # basic information
# browseVignettes("GOSim") # available vignettes
# data(package = "GOSim") # available datasets
# GOSim makes extensive assumptions about loaded packages, and many base
# methods are masked. We will thus use library(GOSim) to load it
# in its entirety and with all packages it depends on. We will still use
# the <package>::<function>() syntax in the code below, but this now serves
# more of a didactic purpose, rather than actual syntax requirements.
library(GOSim)
# GOSim loads human annotations by default. We load yeast annotations instead...
if (!require(org.Sc.sgd.db, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
BiocManager::install("org.Sc.sgd.db")
}
biocLite("org.Sc.sgd.db")
# Bioconductor annotation packages won't work stably unless we actually load
# them:
library(org.Sc.sgd.db)
}
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
# databases exist for all model organisms. It's a kind of a fancy data frame
# from which we can get annotations by rows (genes) with the keys() funtion ...
keys(org.Sc.sgd.db)[1500:1510]
AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]
# ... and the types of available annotations with the columns() function
columns(org.Sc.sgd.db)
AnnotationDbi::columns(org.Sc.sgd.db)
# Note that one of the columns is "GO" ... and we load that into the
# datastructures used by GOSim:
# Choose GOterms to use
setEvidenceLevel(evidences="all",
GOSim::setEvidenceLevel(evidences = "all",
organism = org.Sc.sgdORGANISM,
gomap = org.Sc.sgdGO)
# Use Biological Process ontology
setOntology("BP", loadIC=FALSE)
GOSim::setOntology("BP", loadIC = FALSE)
# confirm that we loaded the correct ontology
head(get("gomap", envir = GOSimEnv))
@ -92,7 +101,7 @@ head(get("gomap", envir=GOSimEnv))
# All keys being used here are yeast systematic names.
# Get one set of annotations
getGOInfo(c("YDL056W")) # Mbp1
GOSim::getGOInfo(c("YDL056W")) # Mbp1
# = 3 Semantic Similarities ===============================================
@ -105,30 +114,30 @@ getGOInfo(c("YDL056W")) # Mbp1
# in this package.
# Mbp1 and...
getGeneSim("YDL056W", "YLR182W", similarity = "OA") # Swi6 - MCB complex
getGeneSim("YDL056W", "YER111C", similarity = "OA") # Swi4 - collaborators
getGeneSim("YDL056W", "YBR160W", similarity = "OA") # Cdc28 - mediator
getGeneSim("YDL056W", "YGR108W", similarity = "OA") # Clb1 - antagonist
getGeneSim("YDL056W", "YLR079W", similarity = "OA") # Sic1 - antagonist
getGeneSim("YDL056W", "YJL130C", similarity = "OA") # Pgk1 - Gluconeogenesis
GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex
GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators
GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator
GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist
GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist
GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis
# = 4 GO Term Enrichment in Gene Sets =====================================
# Calculating GO term enrichment in gene sets is done with the topGO package.
if (! require(topGO, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("topGO")
library(topGO)
# Calculating GO term enrichment in gene sets is done with the Bioconductor
# topGO package.
if (! requireNamespace("topGO", quietly = TRUE)) {
BiocManager::install("topGO")
}
# Package information:
# library(help = topGO) # basic information
# browseVignettes("topGO") # available vignettes
# data(package = "topGO") # available datasets
# Once again - assumptions are made by GOsim that require us to load the
# topGO package wholesale:
library(topGO)
# Let's define a gene set: GOterm enrichment for G1/S switch activators:
mySet <- c("YFR028C", # Cdc14
@ -141,7 +150,7 @@ mySet <- c("YFR028C", # Cdc14
"YPL256C", # Cln2
"YAL040C") # Cln3
allGenes <- keys(org.Sc.sgd.db)
allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
allGenes <- allGenes[grep("^Y", allGenes)] # This is the context against which
# we define enrichment
@ -164,7 +173,7 @@ setdiff(fullSet, mySet) # These are annotated to that term but not in mySet.
# What are these genes?
# Select annotations from the annotation database:
select(org.Sc.sgd.db,
AnnotationDbi::select(org.Sc.sgd.db,
keys = setdiff(fullSet, mySet),
columns = c("COMMON", "DESCRIPTION"))

View File

@ -3,12 +3,15 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Data_preparation unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10 31
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0 First 2017 version
# 0.1 First code copied from 2016 material.
#
@ -28,12 +31,12 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------
#TOC> 1 Preparations 41
#TOC> 2 Fetching sequences 78
#TOC> 3 Multiple Sequence Alignment 119
#TOC> 4 Reviewing and Editing Alignments 138
#TOC> 4.1 Masking workflow 154
#TOC> ---------------------------------------------------------
#TOC> 1 Preparations 44
#TOC> 2 Fetching sequences 76
#TOC> 3 Multiple Sequence Alignment 117
#TOC> 4 Reviewing and Editing Alignments 136
#TOC> 4.1 Masking workflow 152
#TOC>
#TOC> ==========================================================================
@ -49,12 +52,11 @@ source("makeProteinDB.R")
# Load packages we need
if (! require(Biostrings, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
biocLite("Biostrings")
library(Biostrings)
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
@ -62,12 +64,8 @@ if (! require(Biostrings, quietly=TRUE)) {
# data(package = "Biostrings") # available datasets
if (! require(msa, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("msa")
library(msa)
if (! requireNamespace("msa", quietly = TRUE)) {
BiocManager::install("msa")
}
# Package information:
# library(help = msa) # basic information
@ -123,8 +121,8 @@ tail(APSI)
# the MSA algorithms in Biostrings.
#
APSESSet <- AAStringSet(APSI)
APSESMsa <- msaMuscle(APSESSet, order = "aligned")
APSESSet <- Biostrings::AAStringSet(APSI)
APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
# that happens in your case, just use msaClustalOmega() instead.

View File

@ -3,12 +3,15 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
#
# Version: 1.0.2
# Version: 1.1
#
# Date: 2017 10 31
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0.2 Typo in variable name, style changes
# 1.0.1 Wrong section heading
# 1.0 First 2017 version
@ -31,11 +34,11 @@
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------
#TOC> 1 Preparation and Tree Plot 43
#TOC> 2 Tree Analysis 82
#TOC> 2.1 Rooting Trees 141
#TOC> 2.2 Rotating Clades 187
#TOC> 2.3 Computing tree distances 234
#TOC> 1 Preparation and Tree Plot 46
#TOC> 2 Tree Analysis 86
#TOC> 2.1 Rooting Trees 145
#TOC> 2.2 Rotating Clades 190
#TOC> 2.3 Computing tree distances 241
#TOC>
#TOC> ==========================================================================
@ -43,19 +46,17 @@
# = 1 Preparation and Tree Plot ===========================================
if (!require(Rphylip, quietly=TRUE)) {
install.packages("Rphylip")
library(Rphylip)
if (! requireNamespace("ape", quietly = TRUE)) {
install.packages("ape")
}
# Package information:
# library(help = Rphylip) # basic information
# browseVignettes("Rphylip") # available vignettes
# data(package = "Rphylip") # available datasets
# library(help = ape) # basic information
# browseVignettes("ape") # available vignettes
# data(package = "ape") # available datasets
# Read the species tree that you have created at the phyloT Website:
fungiTree <- read.tree("fungiTree.txt")
fungiTree <- ape::read.tree("fungiTree.txt")
plot(fungiTree)
@ -73,7 +74,10 @@ for (i in seq_along(fungiTree$tip.label)) {
# Plot the tree
plot(fungiTree, cex = 1.0, root.edge = TRUE, no.margin = TRUE)
nodelabels(text = fungiTree$node.label, cex = 0.6, adj = 0.2, bg = "#D4F2DA")
ape::nodelabels(text = fungiTree$node.label,
cex = 0.6,
adj = 0.2,
bg = "#D4F2DA")
# Note that you can use the arrow buttons in the menu above the plot to scroll
# back to plots you have created earlier - so you can reference back to the
# species tree.
@ -91,10 +95,10 @@ nodelabels(text = fungiTree$node.label, cex = 0.6, adj = 0.2, bg = "#D4F2DA")
# trees in Newick format and visualize them elsewhere.
# The "phylo" class object is one of R's "S3" objects and methods to plot and
# print it have been defined with the Rphylip package, and the package ape that
# Rphylip has loaded. You can simply call plot(<your-tree>) and R knows what to
# do with <your-tree> and how to plot it. The underlying function is
# plot.phylo(), and documentation for its many options can by found by typing:
# print it have been defined with the Rphylip package, and in ape. You can
# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
# to plot it. The underlying function is plot.phylo(), and documentation for its
# many options can by found by typing:
?plot.phylo
@ -125,40 +129,39 @@ apsTree$edge.length
# show the node / edge and tip labels on a plot
plot(apsTree)
nodelabels()
edgelabels()
tiplabels()
ape::nodelabels()
ape::edgelabels()
ape::tiplabels()
# show the number of nodes, edges and tips
Nnode(apsTree)
Nedge(apsTree)
Ntip(apsTree)
ape::Nnode(apsTree)
ape::Nedge(apsTree)
ape::Ntip(apsTree)
# Finally, write the tree to console in Newick format
write.tree(apsTree)
ape::write.tree(apsTree)
# == 2.1 Rooting Trees =====================================================
# In order to analyse the tree, it is helpful to root it first and reorder its
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
is.rooted(apsTree)
ape::is.rooted(apsTree)
# You can root the tree with the command root() from the "ape" package. ape is
# automatically installed and loaded with Rphylip.
# You can root the tree with the command root() from the "ape" package.
plot(apsTree)
# add labels for internal nodes and tips
nodelabels(cex = 0.5, frame = "circle")
tiplabels(cex = 0.5, frame = "rect")
ape::nodelabels(cex = 0.5, frame = "circle")
ape::tiplabels(cex = 0.5, frame = "rect")
# The outgroup of the tree is tip "11" in my sample tree, it may be a different
# number in yours. Substitute the correct node number below for "outgroup".
apsTree <- root(apsTree, outgroup = 11, resolve.root = TRUE)
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
plot(apsTree)
is.rooted(apsTree)
ape::is.rooted(apsTree)
# This tree _looks_ unchanged, beacuse when the root trifurcation was resolved,
# an edge of length zero was added to connect the MRCA (Most Recent Common
@ -172,7 +175,7 @@ apsTree$edge.length
# overlap.
apsTree$edge.length[1] <- 0.1
plot(apsTree, cex = 0.7)
nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
# This procedure does however not assign an actual length to a root edge, and
@ -181,7 +184,7 @@ nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
plot(apsTree, cex = 0.7, root.edge = TRUE)
nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
# == 2.2 Rotating Clades ===================================================
@ -192,9 +195,9 @@ nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
# We can either rotate around individual internal nodes ...
layout(matrix(1:2, 1, 2))
plot(apsTree, no.margin = TRUE, root.edge = TRUE)
nodelabels(node = 17, cex = 0.7, bg = "#ff8866")
plot(rotate(apsTree, node = 17), no.margin = TRUE, root.edge = TRUE)
nodelabels(node = 17, cex = 0.7, bg = "#88ff66")
ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
# Note that the species at the bottom of the clade descending from node
# 17 is now plotted at the top.
layout(matrix(1), widths = 1.0, heights = 1.0)
@ -211,11 +214,15 @@ nOrg <- length(apsTree$tip.label)
layout(matrix(1:2, 1, 2))
plot(fungiTree,
no.margin = TRUE, root.edge = TRUE)
nodelabels(text = fungiTree$node.label, cex = 0.5, adj = 0.2, bg = "#D4F2DA")
ape::nodelabels(text = fungiTree$node.label,
cex = 0.5,
adj = 0.2,
bg = "#D4F2DA")
plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
no.margin = TRUE, root.edge = TRUE)
add.scale.bar(length = 0.5)
plot(ape::rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
no.margin = TRUE,
root.edge = TRUE)
ape::add.scale.bar(length = 0.5)
layout(matrix(1), widths = 1.0, heights = 1.0)
# Task: Study the two trees and consider their similarities and differences.
@ -236,9 +243,8 @@ layout(matrix(1), widths = 1.0, heights = 1.0)
# Many superb phylogeny tools are contributed by the phangorn package.
if (!require(phangorn, quietly=TRUE)) {
if (! requireNamespace("phangorn", quietly = TRUE)) {
install.packages("phangorn")
library(phangorn)
}
# Package information:
# library(help = phangorn) # basic information
@ -253,14 +259,14 @@ apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
# phangorn provides several functions to compute tree-differences (and there
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
# "symmetric difference"
treedist(fungiTree, apsTree2, check.labels = TRUE)
phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
# Numbers. What do they mean? How much more similar is our apsTree to the
# (presumably) ground truth of fungiTree than a random tree would be?
# The ape package (which was loaded with RPhylip) provides the function rtree()
# The ape package provides the function rtree()
# to compute random trees.
rtree(n = length(apsTree2$tip.label), # number of tips
ape::rtree(n = length(apsTree2$tip.label), # number of tips
rooted = TRUE, # we rooted the tree above,
# and fungiTree is rooted anyway
tip.label = apsTree2$tip.label, # use the apsTree2 labels
@ -278,17 +284,17 @@ colnames(myTreeDistances) <- c("symm", "path")
set.seed(112358)
for (i in 1:N) {
xTree <- rtree(n = length(apsTree2$tip.label),
xTree <- ape::rtree(n = length(apsTree2$tip.label),
rooted = TRUE,
tip.label = apsTree2$tip.label,
br = NULL)
myTreeDistances[i, ] <- treedist(fungiTree, xTree)
myTreeDistances[i, ] <- phangorn::treedist(fungiTree, xTree)
}
set.seed(NULL) # reset the random number generator
table(myTreeDistances[, "symm"])
(symmObs <- treedist(fungiTree, apsTree2)[1])
(symmObs <- phangorn::treedist(fungiTree, apsTree2)[1])
# Random events less-or-equal to observation, divided by total number of
# events gives us the empirical p-value.
@ -298,7 +304,7 @@ cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n
hist(myTreeDistances[, "path"],
col = "aliceblue",
main = "Distances of random Trees to fungiTree")
(pathObs <- treedist(fungiTree, apsTree2)[2])
(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
abline(v = pathObs, col = "chartreuse")
# Random events less-or-equal to observation, divided by total number of

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PHYLO-Tree_building unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10. 31
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# 1.0 First 2017 version
# 0.1 First code copied from 2016 material.
#
@ -29,14 +31,14 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------------
#TOC> 1 Calculating Trees 43
#TOC> 1.1 PROMLPATH ... 64
#TOC> 1.1.1 ... on the Mac 69
#TOC> 1.1.2 ... on Windows 80
#TOC> 1.1.3 ... on Linux 94
#TOC> 1.1.4 Confirming PROMLPATH 99
#TOC> 1.2 Building a maximum likelihood tree 108
#TOC> -----------------------------------------------------------
#TOC> 1 Calculating Trees 46
#TOC> 1.1 PROMLPATH ... 66
#TOC> 1.1.1 ... on the Mac 71
#TOC> 1.1.2 ... on Windows 82
#TOC> 1.1.3 ... on Linux 96
#TOC> 1.1.4 Confirming PROMLPATH 101
#TOC> 1.2 Building a maximum likelihood tree 110
#TOC>
#TOC> ==========================================================================
@ -50,9 +52,8 @@
# After you have installed Phylip on your computer, install the R package that
# provides an interface to the Phylip functions.
if (!require(Rphylip, quietly=TRUE)) {
if (! requireNamespace("Rphylip", quietly = TRUE)) {
install.packages("Rphylip")
library(Rphylip)
}
# Package information:
# library(help = Rphylip) # basic information
@ -110,7 +111,7 @@ list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
# as a "proseq" object with the read.protein() function of the RPhylip package:
apsIn <- read.protein("APSESphyloSet.mfa")
apsIn <- Rphylip::read.protein("APSESphyloSet.mfa")
# ... and you are ready to build a tree.
@ -125,7 +126,7 @@ apsIn <- read.protein("APSESphyloSet.mfa")
# process will take us about 5 to 10 minutes. Run this, and anjoy a good cup
# of coffee while you are waiting.
apsTree <- Rproml(apsIn, path=PROMLPATH)
apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
# A quick first look:

View File

@ -3,12 +3,15 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-PPI-Analysis unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 08 - 2017 11
# Date: 2017 08 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0 First live version
# 0.1 First code copied from 2016 material.
#
@ -29,13 +32,13 @@
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------
#TOC> 1 Setup and data 43
#TOC> 2 Functional Edges in the Human Proteome 80
#TOC> 2.1 Cliques 123
#TOC> 2.2 Communities 164
#TOC> 2.3 Betweenness Centrality 178
#TOC> 3 biomaRt 224
#TOC> 4 Task for submission 295
#TOC> 1 Setup and data 46
#TOC> 2 Functional Edges in the Human Proteome 82
#TOC> 2.1 Cliques 125
#TOC> 2.2 Communities 166
#TOC> 2.3 Betweenness Centrality 180
#TOC> 3 biomaRt 226
#TOC> 4 Task for submission 296
#TOC>
#TOC> ==========================================================================
@ -45,9 +48,8 @@
# Not surprisingly, the analysis of PPI networks needs iGraph:
if (!require(igraph, quietly=TRUE)) {
if (! requireNamespace("igraph", quietly = TRUE)) {
install.packages("igraph")
library(igraph)
}
# Package information:
# library(help = igraph) # basic information
@ -88,9 +90,9 @@ head(STRINGedges)
# Make a graph from this dataframe
?graph_from_data_frame
?igraph::graph_from_data_frame
gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE)
gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
# CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
# layout of such large graphs is possible, but requires specialized code. Google
@ -99,13 +101,13 @@ gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE)
# Of course simple computations on this graph are reasonably fast:
compSTR <- components(gSTR)
compSTR <- igraph::components(gSTR)
summary(compSTR) # our graph is fully connected!
hist(log(degree(gSTR)), col="#FEE0AF")
hist(log(igraph::degree(gSTR)), col="#FEE0AF")
# this actually does look rather scale-free
(freqRank <- table(degree(gSTR)))
(freqRank <- table(igraph::degree(gSTR)))
plot(log10(as.numeric(names(freqRank)) + 1),
log10(as.numeric(freqRank)), type = "b",
pch = 21, bg = "#FEE0AF",
@ -126,29 +128,29 @@ abline(regressionLine, col = "firebrick")
# subgraph, i.e. a subgraph in which every node is connected to every other.
# Biological complexes often appear as cliques in interaction graphs.
clique_num(gSTR)
igraph::clique_num(gSTR)
# The largest clique has 63 members.
(C <- largest_cliques(gSTR)[[1]])
(C <- igraph::largest_cliques(gSTR)[[1]])
# Pick one of the proteins and find out what this fully connected cluster of 63
# proteins is (you can simply Google for any of the IDs). Is this expected?
# Plot this ...
R <- induced_subgraph(gSTR, C) # makes a graph from a selected set of vertices
R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
# color the vertices along a color spectrum
vCol <- rainbow(gorder(R)) # gorder(): order of a graph = number of nodes
vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
# color the edges to have the same color as the originating node
eCol <- character()
for (i in seq_along(vCol)) {
eCol <- c(eCol, rep(vCol[i], gorder(R)))
eCol <- c(eCol, rep(vCol[i], igraph::gorder(R)))
}
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(R,
layout = layout_in_circle(R),
layout = igraph::layout_in_circle(R),
vertex.size = 3,
vertex.color = vCol,
edge.color = eCol,
@ -164,14 +166,14 @@ par(oPar)
# == 2.2 Communities =======================================================
set.seed(112358) # set RNG seed for repeatable randomness
gSTRclusters <- cluster_infomap(gSTR)
gSTRclusters <- igraph::cluster_infomap(gSTR)
set.seed(NULL) # reset the RNG
modularity(gSTRclusters) # ... measures how separated the different membership
# types are from each other
tMem <- table(membership(gSTRclusters))
igraph::modularity(gSTRclusters) # ... measures how separated the different
# membership types are from each other
tMem <- table(igraph::membership(gSTRclusters))
length(tMem) # More than 2000 communities identified
hist(tMem, breaks = 50) # most clusters are small ...
hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ...
range(tMem) # ... but one has > 100 members
@ -179,14 +181,14 @@ range(tMem) # ... but one has > 100 members
# Let's find the nodes with the 10 - highest betweenness centralities.
#
BC <- centr_betw(gSTR)
BC <- igraph::centr_betw(gSTR)
# remember: BC$res contains the results
head(BC$res)
BC$res[1] # betweeness centrality of node 1 in the graph ...
# ... which one is node 1?
V(gSTR)[1]
igraph::V(gSTR)[1]
# to get the ten-highest nodes, we simply label the elements of BC with their
# index ...
@ -203,7 +205,7 @@ head(sBC)
# We can use the first ten labels to subset the nodes in gSTR and fetch the
# IDs...
(ENSPsel <- names(V(gSTR)[BCsel]))
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
# We are going to use these IDs to produce some output for a submitted task:
# so I need you to personalize ENSPsel with the following
@ -231,12 +233,11 @@ set.seed(NULL) # reset the random number generator
# day), simply a few lines of sample code to get you started on the specific use
# case of retrieving descriptions for ensembl protein IDs.
if (!require(biomaRt, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
biocLite("biomaRt")
library(biomaRt)
if (! requireNamespace("biomaRt", quietly = TRUE)) {
BiocManager::install("biomaRt")
}
# Package information:
# library(help = biomaRt) # basic information
@ -244,14 +245,14 @@ if (!require(biomaRt, quietly=TRUE)) {
# data(package = "biomaRt") # available datasets
# define which dataset to use ...
myMart <- useMart("ensembl", dataset="hsapiens_gene_ensembl")
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
# what filters are defined?
(filters <- listFilters(myMart))
(filters <- biomaRt::listFilters(myMart))
# and what attributes can we filter for?
(attributes <- listAttributes(myMart))
(attributes <- biomaRt::listAttributes(myMart))
# Soooo many options - let's look for the correct name of filters that are
@ -264,7 +265,7 @@ attributes[grep("description", attributes$description, ignore.case=TRUE), ]
# ... so we can put this together: here is a syntax example:
getBM(filters = "ensembl_peptide_id",
biomaRt::getBM(filters = "ensembl_peptide_id",
attributes = c("hgnc_symbol",
"wikigene_description",
"interpro_description",
@ -279,7 +280,7 @@ CPdefs <- list() # Since we don't know how many matches one of our queries
# will return, we'll put the result dataframes into a list.
for (ID in ENSPsel) {
CPdefs[[ID]] <- getBM(filters = "ensembl_peptide_id",
CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
attributes = c("hgnc_symbol",
"wikigene_description",
"interpro_description",

View File

@ -3,13 +3,17 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-SEQA-Comparison unit
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 11 17
# Date: 2017 11 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.0 First live version 2017
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# Versions:
# 1.0 First live version 2017
# 0.1 First code copied from BCH441_A03_makeYFOlist.R
#
# TODO:
#
@ -27,24 +31,23 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 Preparation 41
#TOC> 2 Aggregate properties 63
#TOC> 3 Sequence Composition Enrichment 106
#TOC> 3.1 Barplot, and side-by-side barplot 129
#TOC> 3.2 Plotting ratios 164
#TOC> 3.3 Plotting log ratios 180
#TOC> 3.4 Sort by frequency 195
#TOC> 3.5 Color by amino acid type 210
#TOC> ----------------------------------------------------------
#TOC> 1 Preparation 47
#TOC> 2 Aggregate properties 68
#TOC> 3 Sequence Composition Enrichment 111
#TOC> 3.1 Barplot, and side-by-side barplot 134
#TOC> 3.2 Plotting ratios 169
#TOC> 3.3 Plotting log ratios 185
#TOC> 3.4 Sort by frequency 200
#TOC> 3.5 Color by amino acid type 215
#TOC>
#TOC> ==========================================================================
# = 1 Preparation =========================================================
if (!require(seqinr, quietly=TRUE)) {
if (! requireNamespace("seqinr", quietly = TRUE)) {
install.packages("seqinr")
library(seqinr)
}
# Package information:
# library(help = seqinr) # basic information
@ -66,7 +69,7 @@ if (!require(seqinr, quietly=TRUE)) {
# Let's try a simple function from seqinr: computing the pI of the sequence
?computePI
?seqinr::computePI
# This takes as input a vector of upper-case AA codes
@ -82,12 +85,12 @@ s <- unlist(s) # strsplit() returns a list! Why?
# the function s2c() to convert strings into
# character vectors (and c2s to convert them back).
s2c(mySeq)
seqinr::s2c(mySeq)
computePI(s2c(mySeq)) # isoelectric point
pmw(s2c(mySeq)) # molecular weight
AAstat(s2c(mySeq)) # This also plots the distribution of
seqinr::computePI(s2c(mySeq)) # isoelectric point
seqinr::pmw(s2c(mySeq)) # molecular weight
seqinr::AAstat(s2c(mySeq)) # This also plots the distribution of
# values along the sequence
# A true Labor of Love has gone into the
@ -117,7 +120,7 @@ aaindex[[459]]$D
# with the amino acid counts in our sequence.
(refData <- aaindex[[459]]$I) # reference frequencies in %
names(refData) <- a(names(refData)) # change names to single-letter
names(refData) <- seqinr::a(names(refData)) # change names to single-letter
# code using seqinr's "a()" function
sum(refData)
refData # ... in %

View File

@ -3,12 +3,15 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the BIN-Sequence unit.
#
# Version: 1.3
# Version: 1.4
#
# Date: 2017 09 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.4 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.3 Update set.seed() usage
# 1.2 Removed irrelevant task. How did that even get in there? smh
# 1.1 Add chartr()
@ -30,23 +33,23 @@
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 Prepare 60
#TOC> 2 Storing Sequence 78
#TOC> 3 String properties 107
#TOC> 4 Substrings 114
#TOC> 5 Creating strings: sprintf() 135
#TOC> 6 Changing strings 170
#TOC> 6.1.1 Changing case 172
#TOC> 6.1.2 Reverse 177
#TOC> 6.1.3 Change characters 181
#TOC> 6.1.4 Substitute characters 209
#TOC> 6.2 stringi and stringr 229
#TOC> 6.3 dbSanitizeSequence() 239
#TOC> 7 Permuting and sampling 251
#TOC> 7.1 Permutations 258
#TOC> 7.2 Sampling 304
#TOC> 7.2.1 Equiprobable characters 306
#TOC> 7.2.2 Defined probability vector 348
#TOC> 1 Prepare 63
#TOC> 2 Storing Sequence 80
#TOC> 3 String properties 109
#TOC> 4 Substrings 116
#TOC> 5 Creating strings: sprintf() 137
#TOC> 6 Changing strings 172
#TOC> 6.1.1 Changing case 174
#TOC> 6.1.2 Reverse 179
#TOC> 6.1.3 Change characters 183
#TOC> 6.1.4 Substitute characters 211
#TOC> 6.2 stringi and stringr 231
#TOC> 6.3 dbSanitizeSequence() 241
#TOC> 7 Permuting and sampling 253
#TOC> 7.1 Permutations 260
#TOC> 7.2 Sampling 306
#TOC> 7.2.1 Equiprobable characters 308
#TOC> 7.2.2 Defined probability vector 350
#TOC>
#TOC> ==========================================================================
@ -62,12 +65,11 @@
# Much basic sequence handling is supported by the Bioconductor package
# Biostrings.
if (! require(Biostrings, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
biocLite("Biostrings")
library(Biostrings)
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
@ -86,7 +88,7 @@ if (! require(Biostrings, quietly=TRUE)) {
# ... or as more complex objects with rich metadata e.g. as a Biostrings
# DNAstring, RNAstring, AAString, etc.
(a <- AAString("DIVMTQ"))
(a <- Biostrings::AAString("DIVMTQ"))
# ... and all of these representations can be interconverted:
@ -314,6 +316,7 @@ N <- 100
set.seed(16818) # set RNG seed for repeatable randomness
v <- sample(nuc, N, replace = TRUE)
set.seed(NULL) # reset the RNG
(mySeq <- paste(v, collapse = ""))
# What's the GC content?
@ -323,9 +326,8 @@ sum(table(v)[c("G", "C")]) # 51 is close to expected
# What's the number of CpG motifs? Easy to check with the stringi
# stri_match_all() function
if (! require(stringi, quietly=TRUE)) {
if (! requireNamespace("stringi", quietly = TRUE)) {
install.packages("stringi")
library(stringi)
}
# Package information:
# library(help = stringi) # basic information

View File

@ -29,27 +29,27 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------------------
#TOC> 1 A Relational Datamodel in R: review 62
#TOC> 1.1 Building a sample database structure 102
#TOC> 1.1.1 completing the database 213
#TOC> 1.2 Querying the database 248
#TOC> 1.3 Task: submit for credit (part 1/2) 277
#TOC> 2 Implementing the protein datamodel 289
#TOC> 2.1 JSON formatted source data 315
#TOC> 2.2 "Sanitizing" sequence data 355
#TOC> 2.3 Create a protein table for our data model 375
#TOC> 2.3.1 Initialize the database 377
#TOC> 2.3.2 Add data 389
#TOC> 2.4 Complete the database 409
#TOC> 2.4.1 Examples of navigating the database 436
#TOC> 2.5 Updating the database 468
#TOC> 3 Add your own data 480
#TOC> 3.1 Find a protein 488
#TOC> 3.2 Put the information into JSON files 517
#TOC> 3.3 Create an R script to create your own database 540
#TOC> 3.3.1 Check and validate 560
#TOC> 3.4 Task: submit for credit (part 2/2) 601
#TOC> -----------------------------------------------------------------------
#TOC> 1 A Relational Datamodel in R: review 57
#TOC> 1.1 Building a sample database structure 97
#TOC> 1.1.1 completing the database 208
#TOC> 1.2 Querying the database 243
#TOC> 1.3 Task: submit for credit (part 1/2) 272
#TOC> 2 Implementing the protein datamodel 284
#TOC> 2.1 JSON formatted source data 310
#TOC> 2.2 "Sanitizing" sequence data 350
#TOC> 2.3 Create a protein table for our data model 370
#TOC> 2.3.1 Initialize the database 372
#TOC> 2.3.2 Add data 384
#TOC> 2.4 Complete the database 404
#TOC> 2.4.1 Examples of navigating the database 431
#TOC> 2.5 Updating the database 463
#TOC> 3 Add your own data 475
#TOC> 3.1 Find a protein 483
#TOC> 3.2 Put the information into JSON files 512
#TOC> 3.3 Create an R script to create your own database 535
#TOC> 3.3.1 Check and validate 555
#TOC> 3.4 Task: submit for credit (part 2/2) 596
#TOC>
#TOC> ==========================================================================
@ -328,11 +328,11 @@ file.show("./data/MBP1_SACCE.json")
# sanitize the sequence at some point. But since we need to do that
# anyway, it is easier to see the whole sequence if we store it in chunks.
# Let's load the "jsonlite" package and have a look at how it reads this data.
# Let's make sure the "jsonlite" package exists on your computer, then we'll
# explore how it reads this data.
if (! require(jsonlite, quietly=TRUE)) {
if (! requireNamespace("jsonlite", quietly = TRUE)) {
install.packages("jsonlite")
library(jsonlite)
}
# Package information:
# library(help = jsonlite) # basic information
@ -340,7 +340,7 @@ if (! require(jsonlite, quietly=TRUE)) {
# data(package = "jsonlite") # available datasets
x <- fromJSON("./data/MBP1_SACCE.json")
x <- jsonlite::fromJSON("./data/MBP1_SACCE.json")
str(x)
x$name
@ -389,7 +389,7 @@ str(myDB)
dbAddProtein
myDB <- dbAddProtein(myDB, fromJSON("./data/MBP1_SACCE.json"))
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
str(myDB)
# Lets check that the 833 amino acids of the yeast MBP1 sequence have

View File

@ -3,12 +3,15 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-Genetic_code unit.
#
# Version: 1.0.1
# Version: 1.1
#
# Date: 2017 10 12
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0.1 Comment on "incomplete final line" warning in FASTA
# 1.0 First live version
#
@ -27,14 +30,14 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------------
#TOC> 1 Storing the genetic code 47
#TOC> 1.1 Genetic code in Biostrings 65
#TOC> 2 Working with the genetic code 97
#TOC> 2.1 Translate a sequence. 126
#TOC> 3 An alternative representation: 3D array 208
#TOC> 3.1 Print a Genetic code table 241
#TOC> 4 Tasks 267
#TOC> ----------------------------------------------------------------
#TOC> 1 Storing the genetic code 45
#TOC> 1.1 Genetic code in Biostrings 63
#TOC> 2 Working with the genetic code 94
#TOC> 2.1 Translate a sequence. 129
#TOC> 3 An alternative representation: 3D array 212
#TOC> 3.1 Print a Genetic code table 246
#TOC> 4 Tasks 272
#TOC>
#TOC> ==========================================================================
@ -63,12 +66,11 @@ x["TAA"]
# available in the Bioconductor "Biostrings" package:
if (! require(Biostrings, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
biocLite("Biostrings")
library(Biostrings)
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
@ -77,45 +79,51 @@ if (! require(Biostrings, quietly=TRUE)) {
# The standard genetic code vector
GENETIC_CODE
Biostrings::GENETIC_CODE
# The table of genetic codes. This information corresponds to this page
# at the NCBI:
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
GENETIC_CODE_TABLE
Biostrings::GENETIC_CODE_TABLE
# Most of the alternative codes are mitochondrial codes. The id of the
# Alternative Yeast Nuclear code is "12"
getGeneticCode("12") # Alternative Yeast Nuclear
Biostrings::getGeneticCode("12") # Alternative Yeast Nuclear
# = 2 Working with the genetic code =======================================
# GENETIC_CODE is a "named vector"
# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
# to a "local" variable, rather than retrieving it from the package all the
# time.
str(GENETIC_CODE)
genCode <- Biostrings::GENETIC_CODE
# This is a named vector of characters ...
str(genCode)
# ... which also stores the alternative initiation codons TTG and CTG in
# an attribute of the vector. (Alternative initiation codons sometimes are
# used instead of ATG to intiate translation, if if not ATG they are translated
# with fMet.)
attr(GENETIC_CODE, "alt_init_codons")
attr(genCode, "alt_init_codons")
# But the key to use this vector is in the "names" which we use for subsetting
# the list of amino acids in whatever way we need.
names(GENETIC_CODE)
names(genCode)
# The translation of "TGG" ...
GENETIC_CODE["TGG"]
genCode["TGG"]
# All stop codons
names(GENETIC_CODE)[GENETIC_CODE == "*"]
names(genCode)[genCode == "*"]
# All start codons
names(GENETIC_CODE)[GENETIC_CODE == "M"] # ... or
c(names(GENETIC_CODE)[GENETIC_CODE == "M"],
attr(GENETIC_CODE, "alt_init_codons"))
names(genCode)[genCode == "M"] # ... or
c(names(genCode)[genCode == "M"],
attr(genCode, "alt_init_codons"))
# == 2.1 Translate a sequence. =============================================
@ -165,7 +173,7 @@ nchar(mbp1)/3
# attributes that are useful for Biostrings. Thus we convert the sequence first
# with DNAstring(), then split it up, then convert it into a plain
# character vector.
mbp1Codons <- as.character(codons(DNAString(mbp1)))
mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
head(mbp1Codons)
@ -173,7 +181,7 @@ head(mbp1Codons)
mbp1AA <- character(834)
for (i in seq_along(mbp1Codons)) {
mbp1AA[i] <- GENETIC_CODE[mbp1Codons[i]]
mbp1AA[i] <- genCode[mbp1Codons[i]]
}
head(mbp1Codons)
@ -196,7 +204,8 @@ sort(table(mbp1AA), decreasing = TRUE)
mbp1AA <- mbp1AA[-(length(mbp1AA))]
tail(mbp1AA) # Note the stop is gone!
# paste it together, collapsing the elements without separation-character
# paste it together, collapsing the elements using an empty string as the
# separation-character (i.e.: nothing)
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
@ -204,14 +213,15 @@ tail(mbp1AA) # Note the stop is gone!
# We don't use 3D arrays often - usually just 2D tables and data frames, so
# here is a good opportunity to review the syntax with a genetic code cube:
# here is a good opportunity to review the syntax of 3D arrays with a
# genetic code cube:
# Initialize, using A C G T as the names of the elements in each dimension
# Initialize, using A G C T as the names of the elements in each dimension
cCube <- array(data = character(64),
dim = c(4, 4, 4),
dimnames = list(c("A", "C", "G", "T"),
c("A", "C", "G", "T"),
c("A", "C", "G", "T")))
dimnames = list(c("A", "G", "C", "T"),
c("A", "G", "C", "T"),
c("A", "G", "C", "T")))
# fill it with amino acid codes using three nested loops
for (i in 1:4) {
@ -222,7 +232,7 @@ for (i in 1:4) {
dimnames(cCube)[[3]][k],
sep = "",
collapse = "")
cCube[i, j, k] <- GENETIC_CODE[myCodon]
cCube[i, j, k] <- genCode[myCodon]
}
}
}
@ -291,14 +301,14 @@ for (i in nuc) {
# Solution:
# Fetch the code
GENETIC_CODE_TABLE
GENETIC_CODE_TABLE$name[GENETIC_CODE_TABLE$id == "12"]
altYcode <- getGeneticCode("12")
Biostrings::GENETIC_CODE_TABLE
Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
altYcode <- Biostrings::getGeneticCode("12")
# what's the difference?
(delta <- which(GENETIC_CODE != altYcode))
(delta <- which(Biostrings::GENETIC_CODE != altYcode))
GENETIC_CODE[delta]
Biostrings::GENETIC_CODE[delta]
altYcode[delta]
# translate
@ -319,7 +329,7 @@ for (i in nuc) {
#
#
# Solution:
table(table(GENETIC_CODE))
table(table(Biostrings::GENETIC_CODE))
# [END]

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-MAT-Graphs_and_networks unit.
#
# Version: 1.1
# Version: 1.2
#
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.1 Update set.seed() usage
# 1.0 First final version for learning units.
# 0.1 First code copied from 2016 material.
@ -30,17 +32,17 @@
#TOC>
#TOC> Section Title Line
#TOC> ------------------------------------------------------------
#TOC> 1 Review 48
#TOC> 2 DEGREE DISTRIBUTIONS 201
#TOC> 2.1 Random graph 207
#TOC> 2.2 scale-free graph (Barabasi-Albert) 255
#TOC> 2.3 Random geometric graph 320
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 442
#TOC> 3.1 Basics 445
#TOC> 3.2 Components 517
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 536
#TOC> 4.1 Diameter 573
#TOC> 5 GRAPH CLUSTERING 641
#TOC> 1 Review 50
#TOC> 2 DEGREE DISTRIBUTIONS 204
#TOC> 2.1 Random graph 210
#TOC> 2.2 scale-free graph (Barabasi-Albert) 258
#TOC> 2.3 Random geometric graph 323
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 445
#TOC> 3.1 Basics 448
#TOC> 3.2 Components 520
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 539
#TOC> 4.1 Diameter 576
#TOC> 5 GRAPH CLUSTERING 645
#TOC>
#TOC> ==========================================================================
@ -123,9 +125,8 @@ set.seed(NULL) # reset the RNG
# standard package for work with graphs in r is "igraph". We'll go into more
# details of the igraph package a bit later, for now we just use it to plot:
if (! require(igraph, quietly=TRUE)) {
if (! requireNamespace("igraph", quietly = TRUE)) {
install.packages("igraph")
library(igraph)
}
# Package information:
# library(help = igraph) # basic information
@ -133,10 +134,12 @@ if (! require(igraph, quietly=TRUE)) {
# data(package = "igraph") # available datasets
myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected")
myG <- igraph::graph_from_adjacency_matrix(myRandAM, mode = "undirected")
set.seed(112358) # set RNG seed for repeatable randomness
myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates
# calculate layout coordinates
myGxy <- igraph::layout_with_graphopt(myG,
charge=0.0012)
set.seed(NULL) # reset the RNG
@ -157,9 +160,9 @@ plot(myG,
rescale = FALSE,
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
vertex.size = 1600 + (300 * degree(myG)),
vertex.label = sprintf("%s(%i)", names(V(myG)), degree(myG)),
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
vertex.size = 1600 + (300 * igraph::degree(myG)),
vertex.label = sprintf("%s(%i)", names(igraph::V(myG)), igraph::degree(myG)),
vertex.label.family = "sans",
vertex.label.cex = 0.7)
par(oPar) # reset plot window
@ -179,10 +182,10 @@ sum(myRandAM)
rowSums(myRandAM) + colSums(myRandAM) # check this against the plot!
# The function degree() gives the same values
degree(myG)
igraph::degree(myG)
# Let's plot the degree distribution in a histogram:
degG <- degree(myG)
degG <- igraph::degree(myG)
brk <- seq(min(degG)-0.5, max(degG)+0.5, by=1) # define histogram breaks
hist(degG, breaks=brk, col="#A5CCF5",
xlim = c(-1,8), xaxt = "n",
@ -212,8 +215,8 @@ set.seed(31415927) # set RNG seed for repeatable randomness
my200AM <- makeRandomAM(as.character(1:N), p = 0.015)
set.seed(NULL) # reset the RNG
myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected")
myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout
myG200 <- igraph::graph_from_adjacency_matrix(my200AM, mode = "undirected")
myGxy <- igraph::layout_with_graphopt(myG200, charge=0.0001) # calculate layout
# coordinates
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
@ -222,8 +225,8 @@ plot(myG200,
rescale = FALSE,
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1],
vertex.size = 150 + (60 * degree(myG200)),
vertex.color=heat.colors(max(igraph::degree(myG200)+1))[igraph::degree(myG200)+1],
vertex.size = 150 + (60 * igraph::degree(myG200)),
vertex.label = NA)
par(oPar) # restore graphics state
@ -231,7 +234,7 @@ par(oPar) # restore graphics state
# biological graphs look approximately like this.
# Calculate degree distributions
dg <- degree(myG200)
dg <- igraph::degree(myG200)
brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1)
hist(dg, breaks=brk, col="#A5F5CC",
xlim = c(-1,11), xaxt = "n",
@ -263,10 +266,10 @@ plot(log10(as.numeric(names(freqRank)) + 1),
N <- 200
set.seed(31415927) # set RNG seed for repeatable randomness
GBA <- sample_pa(N, power = 0.8, directed = FALSE)
GBA <- igraph::sample_pa(N, power = 0.8, directed = FALSE)
set.seed(NULL) # reset the RNG
GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates
GBAxy <- igraph::layout_with_graphopt(GBA, charge=0.0001)
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
plot(GBA,
@ -274,8 +277,8 @@ plot(GBA,
rescale = FALSE,
xlim = c(min(GBAxy[,1]) * 0.99, max(GBAxy[,1]) * 1.01),
ylim = c(min(GBAxy[,2]) * 0.99, max(GBAxy[,2]) * 1.01),
vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1],
vertex.size = 200 + (30 * degree(GBA)),
vertex.color=heat.colors(max(igraph::degree(GBA)+1))[igraph::degree(GBA)+1],
vertex.size = 200 + (30 * igraph::degree(GBA)),
vertex.label = NA)
par(oPar) # restore grphics state
@ -287,7 +290,7 @@ par(oPar) # restore grphics state
# singletons.
# What's the degree distribution of this graph?
(dg <- degree(GBA))
(dg <- igraph::degree(GBA))
brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1)
hist(dg, breaks=brk, col="#DCF5B5",
xlim = c(0,max(dg)+1), xaxt = "n",
@ -307,8 +310,8 @@ plot(log10(as.numeric(names(freqRank)) + 1),
# Sort-of linear, but many of the higher ranked nodes have a frequency of only
# one. That behaviour smooths out in larger graphs:
#
X <- sample_pa(100000, power = 0.8, directed = FALSE) # 100,000 nodes
freqRank <- table(degree(X))
X <- igraph::sample_pa(1e5, power = 0.8, directed = FALSE) # 100,000 nodes
freqRank <- table(igraph::degree(X))
plot(log10(as.numeric(names(freqRank)) + 1),
log10(as.numeric(freqRank)), type = "b",
xlab = "log(Rank)", ylab = "log(frequency)",
@ -404,7 +407,7 @@ rGAM <- makeRandomGeometricAM(as.character(1:N), t = 0.4)
set.seed(NULL) # reset the RNG
myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
myGRG <- igraph::graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(myGRG,
@ -412,13 +415,13 @@ plot(myGRG,
rescale = FALSE,
xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1),
ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1),
vertex.color=heat.colors(max(degree(myGRG)+1))[degree(myGRG)+1],
vertex.size = 0.1 + (0.2 * degree(myGRG)),
vertex.color=heat.colors(max(igraph::degree(myGRG)+1))[igraph::degree(myGRG)+1],
vertex.size = 0.1 + (0.2 * igraph::degree(myGRG)),
vertex.label = NA)
par(oPar)
# degree distribution:
(dg <- degree(myGRG))
(dg <- igraph::degree(myGRG))
brk <- seq(min(dg) - 0.5, max(dg) + 0.5, by = 1)
hist(dg, breaks = brk, col = "#FCC6D2",
xlim = c(0, 25), xaxt = "n",
@ -450,7 +453,7 @@ summary(myG)
# This output means: this is an IGRAPH graph, with U = UN-directed edges
# and N = named nodes, that has 20 nodes and 20 edges. For details, see
?print.igraph
?igraph::print.igraph
mode(myG)
class(myG)
@ -463,11 +466,11 @@ class(myG)
# recipes, called _games_ in this package.
# Two basic functions retrieve nodes "Vertices", and "Edges":
V(myG)
E(myG)
igraph::V(myG)
igraph::E(myG)
# additional properties can be retrieved from the Vertices ...
V(myG)$name
igraph::V(myG)$name
# As with many R objects, loading the package provides special functions that
@ -487,12 +490,12 @@ plot(myG) # this is the result of default plot parameters
# Plot with some customizing parameters
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(myG,
layout = layout_with_fr(myG),
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
vertex.size = 9 + (2 * degree(myG)),
vertex.label.cex = 0.5 + (0.05 * degree(myG)),
layout = igraph::layout_with_fr(myG),
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
vertex.size = 9 + (2 * igraph::degree(myG)),
vertex.label.cex = 0.5 + (0.05 * igraph::degree(myG)),
edge.width = 2,
vertex.label = V(myG)$name,
vertex.label = igraph::V(myG)$name,
vertex.label.family = "sans",
vertex.label.cex = 0.9)
par(oPar)
@ -500,12 +503,12 @@ par(oPar)
# ... or with a different layout:
oPar <- par(mar= rep(0,4)) # Turn margins off
plot(myG,
layout = layout_in_circle(myG),
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
vertex.size = 9 + (2 * degree(myG)),
vertex.label.cex = 0.5 + (0.05 * degree(myG)),
layout = igraph::layout_in_circle(myG),
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
vertex.size = 9 + (2 * igraph::degree(myG)),
vertex.label.cex = 0.5 + (0.05 * igraph::degree(myG)),
edge.width = 2,
vertex.label = V(myG)$name,
vertex.label = igraph::V(myG)$name,
vertex.label.family = "sans",
vertex.label.cex = 0.9)
par(oPar)
@ -518,18 +521,18 @@ par(oPar)
# The igraph function components() tells us whether there are components of the
# graph in which there is no path to other components.
components(myG)
igraph::components(myG)
# In the _membership_ vector, nodes are annotated with the index of the
# component they are part of. Sui7 is the only node of component 2, Cyj1 is in
# the third component etc. This is perhaps more clear if we sort by component
# index
sort(components(myG)$membership, decreasing = TRUE)
sort(igraph::components(myG)$membership, decreasing = TRUE)
# Retrieving e.g. the members of the first component from the list can be done by subsetting:
(sel <- components(myG)$membership == 1) # boolean vector ..
(c1 <- components(myG)$membership[sel])
(sel <- igraph::components(myG)$membership == 1) # boolean vector ..
(c1 <- igraph::components(myG)$membership[sel])
names(c1)
@ -542,9 +545,9 @@ names(c1)
# preferential-attachment ... but igraph has ways to simulate the basic ones
# (and we could easily simulate our own). Look at the following help pages:
?sample_gnm # see also sample_gnp for the Erdös-Rényi models
?sample_smallworld # for the Watts & Strogatz model
?sample_pa # for the Barabasi-Albert model
?igraph::sample_gnm # see also sample_gnp for the Erdös-Rényi models
?igraph::sample_smallworld # for the Watts & Strogatz model
?igraph::sample_pa # for the Barabasi-Albert model
# But note that there are many more sample_ functions. Check out the docs!
@ -554,7 +557,7 @@ names(c1)
# layout drawas them, obviously.
set.seed(112358) # set RNG seed for repeatable randomness
myGxy <- layout_with_fr(myG) # calculate layout coordinates
myGxy <- igraph::layout_with_fr(myG) # calculate layout coordinates
set.seed(NULL) # reset the RNG
oPar <- par(mar = rep(0, 4)) # turn margins off, save graphics state
@ -563,30 +566,31 @@ plot(myG,
rescale = FALSE,
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
vertex.color=heat.colors(max(degree(myG) + 1))[degree(myG) + 1],
vertex.size = 20 + (10 * degree(myG)),
vertex.label = V(myG)$name,
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
vertex.size = 20 + (10 * igraph::degree(myG)),
vertex.label = igraph::V(myG)$name,
vertex.label.family = "sans",
vertex.label.cex = 0.8)
par(oPar) # restore graphics state
# == 4.1 Diameter ==========================================================
diameter(myG) # The diameter of a graph is its maximum length shortest path.
igraph::diameter(myG) # The diameter of a graph is its maximum length
# shortest path.
# let's plot this path: here are the nodes ...
get_diameter(myG)
igraph::get_diameter(myG)
# ... and we can get the x, y coordinates from iGxy by subsetting with the node
# names. The we draw the diameter-path with a transparent, thick pink line:
lines(myGxy[get_diameter(myG),], lwd=10, col="#ff63a788")
lines(myGxy[igraph::get_diameter(myG),], lwd=10, col="#ff63a788")
# == Centralization scores
?centralize
?igraph::centralize
# replot our graph, and color by log_betweenness:
bC <- centr_betw(myG) # calculate betweenness centrality
bC <- igraph::centr_betw(myG) # calculate betweenness centrality
nodeBetw <- bC$res
nodeBetw <- round(log(nodeBetw +1)) + 1
@ -597,8 +601,8 @@ plot(myG,
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
vertex.color=heat.colors(max(nodeBetw))[nodeBetw],
vertex.size = 20 + (10 * degree(myG)),
vertex.label = V(myG)$name,
vertex.size = 20 + (10 * igraph::degree(myG)),
vertex.label = igraph::V(myG)$name,
vertex.label.family = "sans",
vertex.label.cex = 0.7)
par(oPar)
@ -613,7 +617,7 @@ par(oPar)
#
# Lets plot betweenness centrality for our random geometric graph:
bCmyGRG <- centr_betw(myGRG) # calculate betweenness centrality
bCmyGRG <- igraph::centr_betw(myGRG) # calculate betweenness centrality
nodeBetw <- bCmyGRG$res
nodeBetw <- round((log(nodeBetw +1))^2.5) + 1
@ -630,9 +634,9 @@ plot(myGRG,
vertex.label = NA)
par(oPar)
diameter(myGRG)
lines(rGAM$x[get_diameter(myGRG)],
rGAM$y[get_diameter(myGRG)],
igraph::diameter(myGRG)
lines(rGAM$x[igraph::get_diameter(myGRG)],
rGAM$y[igraph::get_diameter(myGRG)],
lwd = 10,
col = "#ff335533")
@ -648,11 +652,11 @@ lines(rGAM$x[get_diameter(myGRG)],
# http://www.ncbi.nlm.nih.gov/pubmed/18216267 and htttp://www.mapequation.org
myGRGclusters <- cluster_infomap(myGRG)
modularity(myGRGclusters) # ... measures how separated the different membership
# types are from each other
membership(myGRGclusters) # which nodes are in what cluster?
table(membership(myGRGclusters)) # how large are the clusters?
myGRGclusters <- igraph::cluster_infomap(myGRG)
igraph::modularity(myGRGclusters) # ... measures how separated the different
# membership types are from each other
igraph::membership(myGRGclusters) # which nodes are in what cluster?
table(igraph::membership(myGRGclusters)) # how large are the clusters?
# The largest cluster has 48 members, the second largest has 25, etc.
@ -661,7 +665,7 @@ table(membership(myGRGclusters)) # how large are the clusters?
# their cluster membership:
# first, make a vector with as many grey colors as we have communities ...
commColors <- rep("#f1eef6", max(membership(myGRGclusters)))
commColors <- rep("#f1eef6", max(igraph::membership(myGRGclusters)))
# ... then overwrite the first five with "real colors" - something like rust,
# lilac, pink, and mauve or so.
commColors[1:5] <- c("#980043", "#dd1c77", "#df65b0", "#c994c7", "#d4b9da")
@ -673,8 +677,8 @@ plot(myGRG,
rescale = FALSE,
xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1),
ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1),
vertex.color=commColors[membership(myGRGclusters)],
vertex.size = 0.1 + (0.1 * degree(myGRG)),
vertex.color=commColors[igraph::membership(myGRGclusters)],
vertex.size = 0.1 + (0.1 * igraph::degree(myGRG)),
vertex.label = NA)
par(oPar)

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the FND-STA-Probability_distribution unit.
#
# Version: 1.2
# Version: 1.3
#
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.3 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# 1.2 Update set.seed() usage
# 1.1 Corrected empirical p-value
# 1.0 First code live version
@ -28,21 +30,21 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------------------------------------
#TOC> 1 Introduction 50
#TOC> 2 Three fundamental distributions 113
#TOC> 2.1 The Poisson Distribution 116
#TOC> 2.2 The uniform distribution 170
#TOC> 2.3 The Normal Distribution 190
#TOC> 3 quantile-quantile comparison 231
#TOC> 3.1 qqnorm() 241
#TOC> 3.2 qqplot() 307
#TOC> 4 Quantifying the difference 324
#TOC> 4.1 Chi2 test for discrete distributions 359
#TOC> 4.2 Kullback-Leibler divergence 451
#TOC> 4.2.1 An example from tossing dice 462
#TOC> 4.2.2 An example from lognormal distributions 585
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 628
#TOC> -----------------------------------------------------------------------------
#TOC> 1 Introduction 52
#TOC> 2 Three fundamental distributions 115
#TOC> 2.1 The Poisson Distribution 118
#TOC> 2.2 The uniform distribution 172
#TOC> 2.3 The Normal Distribution 192
#TOC> 3 quantile-quantile comparison 233
#TOC> 3.1 qqnorm() 243
#TOC> 3.2 qqplot() 309
#TOC> 4 Quantifying the difference 326
#TOC> 4.1 Chi2 test for discrete distributions 361
#TOC> 4.2 Kullback-Leibler divergence 452
#TOC> 4.2.1 An example from tossing dice 463
#TOC> 4.2.2 An example from lognormal distributions 586
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 629
#TOC>
#TOC> ==========================================================================
@ -385,9 +387,8 @@ hist(rG1.5, breaks = myBreaks, col = myCols[4])
# package information - plotrix has _many_ useful utilities to enhance
# plots or produce informative visualizations.
if (! require(plotrix, quietly=TRUE)) {
if (! requireNamespace("plotrix", quietly = TRUE)) {
install.packages("plotrix")
library(plotrix)
}
# Package information:
# library(help = plotrix) # basic information
@ -395,7 +396,7 @@ if (! require(plotrix, quietly=TRUE)) {
# data(package = "plotrix") # available datasets
h <- multhist(list(rL1, rL2, rG1.2, rG1.5, rG1.9 ),
h <- plotrix::multhist(list(rL1, rL2, rG1.2, rG1.5, rG1.9 ),
breaks = myBreaks,
col = myCols)
legend("topright",

View File

@ -3,12 +3,15 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Biostrings unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10 20
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.0 2017 Revisions
# 0.1 First code copied from 2016 material.
#
@ -28,19 +31,19 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------
#TOC> 1 The Biostrings Package 52
#TOC> 2 Getting Data into Biostrings Objects 85
#TOC> 3 Working with Biostrings Objects 106
#TOC> 3.1 Properties 109
#TOC> 3.2 Subsetting 146
#TOC> 3.3 Operators 158
#TOC> 3.4 Transformations 165
#TOC> 4 Getting Data out of Biostrings Objects 172
#TOC> 5 More 181
#TOC> 5.1 Views 183
#TOC> 5.2 Iranges 195
#TOC> 5.3 StringSets 201
#TOC> ---------------------------------------------------------------
#TOC> 1 The Biostrings Package 55
#TOC> 2 Getting Data into Biostrings Objects 86
#TOC> 3 Working with Biostrings Objects 108
#TOC> 3.1 Properties 125
#TOC> 3.2 Subsetting 163
#TOC> 3.3 Operators 175
#TOC> 3.4 Transformations 182
#TOC> 4 Getting Data out of Biostrings Objects 189
#TOC> 5 More 198
#TOC> 5.1 Views 200
#TOC> 5.2 Iranges 214
#TOC> 5.3 StringSets 220
#TOC>
#TOC> ==========================================================================
@ -54,14 +57,12 @@
# First, we install and load the Biostrings package from bioconductor
if (! require(Biostrings, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
biocLite("Biostrings")
library(Biostrings)
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Examine the package information:
library(help = Biostrings) # basic information
browseVignettes("Biostrings") # available vignettes
@ -72,72 +73,88 @@ data(package = "Biostrings") # available datasets
# of a "class" in R as a special kind of list), that can take on particular
# flavours for RNA, DNA or amino acid sequence information.
class(RNAString("AUG"))
class(DNAString("ATG"))
class(AAString("M"))
class(Biostrings::RNAString("AUG"))
class(Biostrings::DNAString("ATG"))
class(Biostrings::AAString("M"))
# An essential property of Biostrings objects is that they only allow letters
# from the applicable IUPAC alphabet:
RNAString("AUG")
DNAString("AUG") # Error! No "U" in IUPAC DNA codes
Biostrings::RNAString("AUG")
Biostrings::DNAString("AUG") # Error! No "U" in IUPAC DNA codes
# = 2 Getting Data into Biostrings Objects ================================
# Example: read FASTA. Extract sequence. Convert to DNAString object.
x <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
x <- dbSanitizeSequence(x)
myDNAseq <- DNAString(x) # takes the nucleotide sequence and converts into a
# object of class DNAstring
rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
rawSeq <- dbSanitizeSequence(rawSeq)
biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
# into an object of class DNAstring
# Multi FASTA files can be read directly as a "XStringSet) ...
(myDNASet <- readDNAStringSet("./data/S288C_YDL056W_MBP1_coding.fsa"))
rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
# ... and if you subset one sequence from the set, you get an XString object
# back again.
(Xseq <- myDNASet[[1]])
(Xseq <- biosDNASet[[1]])
myDNAseq == Xseq # the comparison evaluates to TRUE ...
identical(myDNAseq, Xseq) # ... and indeed the objects are deemed identical.
biosDNAseq == Xseq # the comparison evaluates to TRUE ...
identical(biosDNAseq, Xseq) # ... and indeed the objects are deemed identical.
# = 3 Working with Biostrings Objects =====================================
# Biostrings is a highly engineered package that is tightly integrated into
# the Bioconductor world - unfortunately that brings with it a somewhat
# undesirable level of computational overhead and dependencies. Using the
# package as we normally do - i.e. calling required functions with their
# explicit package prefix is therefore not advisable. There are generics
# that won't be propery dispatched. If you only need a small number of
# functions for a very specific context, you will probably get away with
# Biostrings::<function>() - but even in the demonstration code of this script
# not everything works out of the box. We'll therefore load the library,
# but we'll (redundantly) use the prefix anyway so as to emphasize where
# the functions come from.
library(Biostrings)
# == 3.1 Properties ========================================================
str(myDNAseq)
length(myDNAseq) # This gives you the _number of nucleotides_!
# By comparison ...
length(x) # ... is 1: one string only. To get the number of
# characters in a string, you need nchar().
nchar(x) # However ...
nchar(myDNAseq) # ... also works.
str(rawSeq)
str(biosDNAseq)
uniqueLetters(myDNAseq)
length(rawSeq) # ... is 1: one string only. To get the number of
# characters in a string, you need nchar().
length(biosDNAseq) # but the length of a "Bstring" is the number of elements
nchar(rawSeq)
nchar(biosDNAseq) # ... but nchar() works too.
(uL <- Biostrings::uniqueLetters(biosDNAseq))
# Count frequencies - with strings, you would strsplit() into a character
# vector and then use table(). biost
alphabetFrequency(myDNAseq)
Biostrings::alphabetFrequency(biosDNAseq)
# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
# returns.
letterFrequency(myDNAseq, uniqueLetters(myDNAseq))
Biostrings::letterFrequency(biosDNAseq, uL)
sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
length(biosDNAseq) # GC contents
sum(letterFrequency(myDNAseq, c("G", "C"))) / length(myDNAseq) # GC contents
Biostrings::dinucleotideFrequency(biosDNAseq)
barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
dinucleotideFrequency(myDNAseq)
barplot(sort(dinucleotideFrequency(myDNAseq)), cex.names = 0.5)
(triNuc <- trinucleotideFrequency(myDNAseq))
(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
barplot(sort(triNuc), col="#4499EE33")
triNuc[triNuc == max(triNuc)]
triNuc[triNuc == min(triNuc)]
max(triNuc) / min(triNuc) # AAA is more than 13 times as frequent as CGT
# compare to a shuffled sequence:
(triNuc <- trinucleotideFrequency(sample(myDNAseq)))
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
# Interpret this plot.
@ -146,34 +163,34 @@ barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
# == 3.2 Subsetting ========================================================
# Subsetting any XString object works as expected:
myDNAseq[4:15]
biosDNAseq[4:15]
# ... well - maybe not expected, because x[4:15] would not work.
# ... well - maybe not expected, because rawSeq[4:15] would not work.
# Alternatively to the "[" operator, use the subseq() function - especially for
# long sequences. This is far more efficient.
subseq(myDNAseq, start = 1, end = 30)
Biostrings::subseq(biosDNAseq, start = 1, end = 30)
# == 3.3 Operators =========================================================
# RNAstring() and DNAstring() objects compare U and T as equals!
RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
# == 3.4 Transformations ===================================================
myDNAseq[4:15]
reverseComplement(myDNAseq[4:15])
translate(myDNAseq[4:15])
biosDNAseq[4:15]
Biostrings::reverseComplement(biosDNAseq[4:15])
Biostrings::translate(biosDNAseq[4:15])
# = 4 Getting Data out of Biostrings Objects ==============================
# If you need a character object, use toString():
toString(myDNAseq[4:15])
Biostrings::toString(biosDNAseq[4:15])
# save() and load() works like on all other R objects.
@ -185,7 +202,9 @@ toString(myDNAseq[4:15])
# Biostring "Views" are objects that store multiple substrings of one
# Biostring object.
(myView <- Views(myDNAseq, start = c(1, 19, 37), end = c(15, 30, 45)))
(myView <- Biostrings::Views(biosDNAseq,
start = c(1, 19, 37),
end = c(15, 30, 45)))
# Views are convenient to store feature annotations
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
@ -202,20 +221,20 @@ cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
# Biostring "StringSets" store multiple sequences.
#
ompA <- AAString("MKKTAIAIAVALAGFATVAQA")
ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
sample(ompA) # sample can work directly on a Biostring object to shuffle it
x[1] <- toString(ompA)
x <- Biostrings::toString(ompA)
for (i in 2:10) {
x[i] <- toString(sample(ompA))
x[i] <- Biostrings::toString(sample(ompA))
}
shuffledPeptideSet <- AAStringSet(x)
shuffledPeptideSet <- Biostrings::AAStringSet(x)
names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
shuffledPeptideSet
length(shuffledPeptideSet)
width(shuffledPeptideSet)
alphabetFrequency(shuffledPeptideSet)
Biostrings::width(shuffledPeptideSet)
Biostrings::alphabetFrequency(shuffledPeptideSet)
# [END]

View File

@ -3,12 +3,15 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR_GEO2R unit.
#
# Version: 1.1
# Version: 1.2
#
# Date: 2017 09 - 2018 01
# Date: 2017 09 - 2019 01
# Author: Boris Steipe <boris.steipe@utoronto.ca>
#
# Versions:
# 1.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.1 Add section on GPL annotations
# 1.0 Updates for BCH441 2017
# 0.1 First code copied from 2016 material.
@ -33,19 +36,19 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------------
#TOC> 1 Preparations 53
#TOC> 2 Loading a GEO Dataset 84
#TOC> 3 Column wise analysis - time points 154
#TOC> 3.1 Task - Comparison of experiments 160
#TOC> 3.2 Grouped Samples 207
#TOC> 4 Row-wise Analysis: Expression Profiles 242
#TOC> 4.1 Task - Read a table of features 277
#TOC> 4.2 Selected Expression profiles 325
#TOC> 5 Differential Expression 366
#TOC> 5.1 Final task: Gene descriptions 490
#TOC> 6 Improving on Discovery by Differential Expression 495
#TOC> 7 Annotation data 577
#TOC> --------------------------------------------------------------------------
#TOC> 1 Preparations 56
#TOC> 2 Loading a GEO Dataset 82
#TOC> 3 Column wise analysis - time points 152
#TOC> 3.1 Task - Comparison of experiments 158
#TOC> 3.2 Grouped Samples 205
#TOC> 4 Row-wise Analysis: Expression Profiles 240
#TOC> 4.1 Task - Read a table of features 275
#TOC> 4.2 Selected Expression profiles 323
#TOC> 5 Differential Expression 364
#TOC> 5.1 Final task: Gene descriptions 504
#TOC> 6 Improving on Discovery by Differential Expression 510
#TOC> 7 Annotation data 594
#TOC>
#TOC> ==========================================================================
@ -55,12 +58,11 @@
# To load and analyze GEO datasets we use a number of Bioconductor packages:
if (! require(Biobase, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
biocLite("Biobase")
library(Biobase)
if (! requireNamespace("Biobase", quietly = TRUE)) {
BiocManager::install("Biobase")
}
# Package information:
# library(help = Biobase) # basic information
@ -68,12 +70,8 @@ if (! require(Biobase, quietly=TRUE)) {
# data(package = "Biobase") # available datasets
if (! require(GEOquery, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("GEOquery")
library(GEOquery)
if (! requireNamespace("GEOquery", quietly = TRUE)) {
BiocManager::install("GEOquery")
}
# Package information:
# library(help = GEOquery) # basic information
@ -94,7 +92,7 @@ if (! require(GEOquery, quietly=TRUE)) {
# I have experienced outages over several hours. If the command below does
# not work for you, skip ahead to the fallback procedure.
GSE3635 <- getGEO("GSE3635", GSEMatrix =TRUE, getGPL=FALSE)
GSE3635 <- GEOquery::getGEO("GSE3635", GSEMatrix =TRUE, getGPL=FALSE)
# Note: GEO2R scripts call the expression data set
# "gset" throughout ... in this script I give
# it the name "GSE3635" for clarity.
@ -136,14 +134,14 @@ help("ExpressionSet-class")
GSE3635
# Access contents via methods:
featureNames(GSE3635)[1:20] # Rows. What are these features?
sampleNames(GSE3635)[1:10] # Columns. What are these columns?
Biobase::featureNames(GSE3635)[1:20] # Rows. What are these features?
Biobase::sampleNames(GSE3635)[1:10] # Columns. What are these columns?
# Access contents by subsetting:
( tmp <- GSE3635[12:17, 1:6] )
# Access data
exprs(tmp) # exprs() gives us the actual expression values.
Biobase::exprs(tmp) # exprs() gives us the actual expression values.
#TASK> What are the data:
@ -160,9 +158,9 @@ exprs(tmp) # exprs() gives us the actual expression values.
# == 3.1 Task - Comparison of experiments ==================================
# Get an overview of the distribution of data values in individual columns
summary(exprs(GSE3635)[ , 1])
summary(exprs(GSE3635)[ , 4])
summary(exprs(GSE3635)[ , 7])
summary(Biobase::exprs(GSE3635)[ , 1])
summary(Biobase::exprs(GSE3635)[ , 4])
summary(Biobase::exprs(GSE3635)[ , 7])
# as a boxplot
cyclicPalette <- colorRampPalette(c("#00AAFF",
@ -173,7 +171,7 @@ cyclicPalette <- colorRampPalette(c("#00AAFF",
"#FFAA00",
"#00AAFF"))
tCols <- cyclicPalette(13)
boxplot(exprs(GSE3635), col = tCols)
boxplot(Biobase::exprs(GSE3635), col = tCols)
#TASK> Study this boxplot. What's going on? Are these expression values?
@ -181,11 +179,11 @@ boxplot(exprs(GSE3635), col = tCols)
# Lets plot the distributions of values in a more fine-grained manner:
hT0 <- hist(exprs(GSE3635)[ , 1], breaks = 100)
hT3 <- hist(exprs(GSE3635)[ , 4], breaks = 100)
hT6 <- hist(exprs(GSE3635)[ , 7], breaks = 100)
hT9 <- hist(exprs(GSE3635)[ , 10], breaks = 100)
hT12 <- hist(exprs(GSE3635)[ , 13], breaks = 100)
hT0 <- hist(Biobase::exprs(GSE3635)[ , 1], breaks = 100)
hT3 <- hist(Biobase::exprs(GSE3635)[ , 4], breaks = 100)
hT6 <- hist(Biobase::exprs(GSE3635)[ , 7], breaks = 100)
hT9 <- hist(Biobase::exprs(GSE3635)[ , 10], breaks = 100)
hT12 <- hist(Biobase::exprs(GSE3635)[ , 13], breaks = 100)
plot( hT0$mids, hT0$counts, type = "l", col = tCols[1], xlim = c(-0.5, 0.5))
@ -218,7 +216,7 @@ for (i in 1:nchar(gsms)) {
sml <- paste("G", sml, sep="") # set group names
# order samples by group
ex <- exprs(GSE3635)[ , order(sml)]
ex <- Biobase::exprs(GSE3635)[ , order(sml)]
sml <- sml[order(sml)]
fl <- as.factor(sml)
labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we
@ -231,8 +229,8 @@ labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we
GEOcols <- c("#dfeaf4", "#f4dfdf", "#f2cb98", "#dcdaa5",
"#dff4e4", "#f4dff4", "#AABBCC")
dev.new(width = 4 + dim(GSE3635)[[2]] / 5, height = 6) # plot into a new window
par(mar = c(2 + round(max(nchar(sampleNames(GSE3635))) / 2), 4, 2, 1))
title <- paste ("GSE3635", '/', annotation(GSE3635),
par(mar = c(2 + round(max(nchar(Biobase::sampleNames(GSE3635))) / 2), 4, 2, 1))
title <- paste ("GSE3635", '/', Biobase::annotation(GSE3635),
" grouped samples", sep ='')
boxplot(ex, boxwex = 0.6, notch = TRUE, main = title, outline=FALSE,
las = 2, col = GEOcols[fl])
@ -331,7 +329,7 @@ gName <- "MBP1"
(iFeature <- which(SGD_features$name == gName))
(iExprs <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature]))
plot(seq(0, 120, by = 10),
exprs(GSE3635)[iExprs, ],
Biobase::exprs(GSE3635)[iExprs, ],
main = paste("Expression profile for", gName),
xlab = "time (min)",
ylab = "expression",
@ -368,12 +366,8 @@ SGD_features$description[iFeature]
# GEO2R discovers the top differentially expressed expressed genes by
# using functions in the Bioconductor limma package.
if (! require(limma, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
}
biocLite("limma")
library(limma)
if (! requireNamespace("limma", quietly = TRUE)) {
BiocManager::install("limma")
}
# Package information:
# library(help = limma) # basic information
@ -392,6 +386,20 @@ if (! require(limma, quietly=TRUE)) {
# the groups
# 4. Format results.
# Biobase is a highly engineered package that is tightly integrated into
# the Bioconductor world - unfortunately that brings with it a somewhat
# undesirable level of computational overhead and dependencies. Using the
# package as we normally do - i.e. calling required functions with their
# explicit package prefix is therefore not advisable. There are generics
# that won't be propery dispatched. If you only need a small number of
# functions for a very specific context, you will probably get away with
# Biobase::<function>() - but even in the demonstration code of this script
# not everything works out of the box. We'll therefore load the library,
# but we'll (redundantly) use the prefix anyway so as to emphasize where
# the functions come from.
library(Biobase)
# We are recapitulating the experiment in which we assigned the 0, 10, 60 and
# 70 minute samples to one group, the 30, 40, 90 and 100 minute samples to
# another group, and calculated differential expression values between these
@ -415,15 +423,15 @@ myDesign
# Now we can calculate the fit of all rows to a linear model that depends
# on the two groups as specified in the design:
myFit <- lmFit(mySet, myDesign)
myFit <- limma::lmFit(mySet, myDesign)
# Next we calculate the contrasts, given the fit ...
myCont.matrix <- makeContrasts(A - B, levels = myDesign)
myFit2 <- contrasts.fit(myFit, myCont.matrix)
myCont.matrix <- limma::makeContrasts(A - B, levels = myDesign)
myFit2 <- limma::contrasts.fit(myFit, myCont.matrix)
# ... compute appropriate probabilites from a modified t-test
# (empirical Bayes) ...
myFit2 <- eBayes(myFit2, 0.01)
myFit2 <- limma::eBayes(myFit2, 0.01)
# ... add the gene names to the fit - object ...
myFit2$genes <- featureNames(mySet)
@ -433,7 +441,10 @@ myFit2$genes <- featureNames(mySet)
# gave us only the top 250 genes, but we might as well do 1000, just so we
# can be reasonable sure that our gens of interest are included.
N <- 1000
myTable <- topTable(myFit2, adjust.method = "fdr", sort.by = "B", number = N)
myTable <- limma::topTable(myFit2,
adjust.method = "fdr",
sort.by = "B",
number = N)
str(myTable)
# The gene names are now in the $ID column
@ -461,7 +472,7 @@ abline(h = 0, col = "#00000055")
for (i in 1:10) {
thisID <- myTable$ID[i]
points(seq(0, 120, by = 10), exprs(GSE3635)[thisID, ], type = "b")
points(seq(0, 120, by = 10), Biobase::exprs(GSE3635)[thisID, ], type = "b")
}
# Our guess that we might discover interesting genes be selecting groups A and B
@ -480,7 +491,10 @@ for (i in 1:10) {
myControls <- c("Cdc14", "Mbp1", "Swi6", "Swi4", "Whi5", "Cln1", "Cln2", "Cln3")
for (name in toupper(myControls)) {
thisID <- SGD_features$sysName[which(SGD_features$name == name)]
points(seq(0, 120, by=10), exprs(GSE3635)[thisID, ], type="b", col="#AA0000")
points(seq(0, 120, by=10),
Biobase::exprs(GSE3635)[thisID, ],
type="b",
col="#AA0000")
}
# Indeed, the discovered gene profiles look much "cleaner" than the real cycle
@ -504,7 +518,7 @@ for (name in toupper(myControls)) {
gName <- "CLN2"
(iFeature <- which(SGD_features$name == gName))
(iExprs <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature]))
Cln2Profile <- exprs(GSE3635)[iExprs, ]
Cln2Profile <- Biobase::exprs(GSE3635)[iExprs, ]
plot(seq(0, 120, by = 10),
Cln2Profile,
ylim = c(-1, 1),
@ -519,16 +533,16 @@ abline(v = 60, col = "#00000055")
# Set up a vector of correlation values
myCorrelations <- numeric(nrow(exprs(GSE3635)))
names(myCorrelations) <- featureNames(GSE3635)
myCorrelations <- numeric(nrow(Biobase::exprs(GSE3635)))
names(myCorrelations) <- Biobase::featureNames(GSE3635)
for (i in 1:length(myCorrelations)) {
myCorrelations[i] <- cor(Cln2Profile, exprs(GSE3635)[i, ])
myCorrelations[i] <- cor(Cln2Profile, Biobase::exprs(GSE3635)[i, ])
}
myTopC <- order(myCorrelations, decreasing = TRUE)[1:10] # top ten
# Number 1
(ID <- featureNames(GSE3635)[myTopC[1]])
(ID <- Biobase::featureNames(GSE3635)[myTopC[1]])
# Get information
SGD_features[which(SGD_features$sysName == ID), ]
@ -537,12 +551,13 @@ SGD_features[which(SGD_features$sysName == ID), ]
# Let's plot the rest
for (i in 2:length(myTopC)) {
ID <- featureNames(GSE3635)[myTopC[i]]
ID <- Biobase::featureNames(GSE3635)[myTopC[i]]
points(seq(0, 120, by = 10),
exprs(GSE3635)[ID, ],
Biobase::exprs(GSE3635)[ID, ],
type = "b",
col= "chartreuse")
print(SGD_features[which(SGD_features$sysName == ID), c("name", "description")])
print(SGD_features[which(SGD_features$sysName == ID),
c("name", "description")])
}
# Note that all of these genes are highly correlated with a known cell cycle
@ -554,12 +569,13 @@ for (i in 2:length(myTopC)) {
# And we haven't even looked at the anticorrelated genes yet...
myBottomC <- order(myCorrelations, decreasing = FALSE)[1:10] # bottom ten
for (i in 1:length(myBottomC)) {
ID <- featureNames(GSE3635)[myBottomC[i]]
ID <- Biobase::featureNames(GSE3635)[myBottomC[i]]
points(seq(0, 120, by = 10),
exprs(GSE3635)[ID, ],
Biobase::exprs(GSE3635)[ID, ],
type = "b",
col= "coral")
print(SGD_features[which(SGD_features$sysName == ID), c("name", "description")])
print(SGD_features[which(SGD_features$sysName == ID),
c("name", "description")])
}
# ... which are very interesting in their own right.
@ -583,7 +599,7 @@ for (i in 1:length(myBottomC)) {
# we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL
# annotations were not loaded. We could use getGPL = TRUE instead ...
GSE3635annot <- getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE)
GSE3635annot <- GEOquery::getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE)
GSE3635annot <- GSE3635annot[[1]]
# ... and the feature data is then available in the GSE3635@featureData@data
@ -597,13 +613,8 @@ GSE3635annot@featureData@data[ 1:20 , ]
myAnnot <- GSE3635annot@featureData@data[ , c("SPOT_ID", "Gene")]
str(myAnnot)
# ... Note that this is a data frame, but - oy veh - the gene symbols are
# factors. Really, we need to fix this! To convert a factor into a string,
# we need to cast it to character.
myAnnot$Gene <- as.character(myAnnot$Gene)
# ... whereupon we can find things we might be looking for ...
# ... Note that this is a data frame and it is easy to find things we
# might be looking for ...
myAnnot[which(myAnnot$Gene == "MBP1"), ]
# ... or identify rows that might give us trouble, such as probes that
@ -614,13 +625,11 @@ myAnnot[which(myAnnot$Gene == "MBP1"), ]
GSE3635@annotation # "GPL1914"
# ... and downloaded it directly from NCBI:
GPL1914 <- getGEO("GPL1914")
GPL1914 <- GEOquery::getGEO("GPL1914")
str(GPL1914)
# ... from which we can get the data - which is however NOT necessarily
# matched to the rows of our expression dataset. Note that here too: the
# majority of data elements are factors and will likely have to be converted
# before use.
# matched to the rows of our expression dataset.
# [END]

View File

@ -3,12 +3,15 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Genetic_code_optimality unit.
#
# Version: 1.1
# Version: 1.2
#
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.2 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# use Biocmanager:: not biocLite()
# 1.1 Update set.seed() usage
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
# 1.0 New material.
@ -30,16 +33,16 @@
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------
#TOC> 1 Designing a computational experiment 54
#TOC> 2 Setting up the tools 70
#TOC> 2.1 Natural and alternative genetic codes 73
#TOC> 2.2 Effect of mutations 132
#TOC> 2.2.1 reverse-translate 143
#TOC> 2.2.2 Randomly mutate 168
#TOC> 2.2.3 Forward- translate 193
#TOC> 2.2.4 measure effect 211
#TOC> 3 Run the experiment 258
#TOC> 4 Task solutions 351
#TOC> 1 Designing a computational experiment 57
#TOC> 2 Setting up the tools 73
#TOC> 2.1 Natural and alternative genetic codes 76
#TOC> 2.2 Effect of mutations 134
#TOC> 2.2.1 reverse-translate 145
#TOC> 2.2.2 Randomly mutate 170
#TOC> 2.2.3 Forward- translate 195
#TOC> 2.2.4 measure effect 213
#TOC> 3 Run the experiment 260
#TOC> 4 Task solutions 356
#TOC>
#TOC> ==========================================================================
@ -73,12 +76,11 @@
# == 2.1 Natural and alternative genetic codes =============================
# Load genetic code tables from the Biostrings package
if (! require(Biostrings, quietly=TRUE)) {
if (! exists("biocLite")) {
source("https://bioconductor.org/biocLite.R")
if (! requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
biocLite("Biostrings")
library(Biostrings)
if (! requireNamespace("Biostrings", quietly = TRUE)) {
BiocManager::install("Biostrings")
}
# Package information:
# library(help = Biostrings) # basic information
@ -257,52 +259,55 @@ evalMut <- function(nat, mut) {
# = 3 Run the experiment ==================================================
# Fetch the standard Genetic code from Biostrings::
stdCode <- Biostrings::GENETIC_CODE
# Fetch the nucleotide sequence for MBP1:
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
myDNA <- paste0(myDNA, collapse = "")
myDNA <- as.character(codons(DNAString(myDNA)))
myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
myDNA <- myDNA[-length(myDNA)] # drop the stop codon
myAA <- traFor(myDNA, GENETIC_CODE)
myAA <- traFor(myDNA, stdCode)
# Mutate and evaluate
set.seed(112358)
x <- randMut(myDNA)
set.seed(NULL)
x <- traFor(x, GENETIC_CODE)
x <- traFor(x, stdCode)
evalMut(myAA, x) # 166.4
# Try this 200 times, and see how the values are distributed.
N <- 200
valUGC <- numeric(N)
valSTDC <- numeric(N)
set.seed(112358) # set RNG seed for repeatable randomness
for (i in 1:N) {
x <- randMut(myDNA) # mutate
x <- traFor(x, GENETIC_CODE) # translate
valUGC[i] <- evalMut(myAA, x) # evaluate
x <- traFor(x, stdCode) # translate
valSTDC[i] <- evalMut(myAA, x) # evaluate
}
set.seed(NULL) # reset the RNG
hist(valUGC,
hist(valSTDC,
breaks = 15,
col = "palegoldenrod",
xlim = c(0, 400),
ylim = c(0, N/4),
main = "Universal vs. Synthetic Genetic Code",
main = "Standard vs. Synthetic Genetic Code",
xlab = "Mutation penalty")
# This looks like a normal distribution. Let's assume the effect of mutations
# under the universal genetic code is the mean of this distribution:
effectUGC <- mean(valUGC) # 178.1
# under the standard genetic code is the mean of this distribution:
effectSTDC <- mean(valSTDC) # 178.1
# Now we can look at the effects of alternate genetic codes:
set.seed(112358)
# choose a new code
GC <- randomGC(GENETIC_CODE)
GC <- randomGC(stdCode)
set.seed(NULL)
# reverse translate hypothetical sequence according to the new code
@ -321,7 +326,7 @@ valXGC <- numeric(N)
set.seed(1414214) # set RNG seed for repeatable randomness
for (i in 1:N) {
GC <- randomGC(GENETIC_CODE) # Choose code
GC <- randomGC(stdCode) # Choose code
x <- traRev(myAA, GC) # reverse translate
x <- randMut(x) # mutate
x <- traFor(x, GC) # translate
@ -355,7 +360,7 @@ valSGC <- numeric(N)
set.seed(2718282) # set RNG seed for repeatable randomness
for (i in 1:N) {
GC <- swappedGC(GENETIC_CODE) # Choose code
GC <- swappedGC(stdCode) # Choose code
x <- traRev(myAA, GC) # reverse translate
x <- randMut(x) # mutate
x <- traFor(x, GC) # translate

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version: 1.0.1
# Version: 1.1
#
# Date: 2017 10 - 2018 12
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout,
# 1.0.1 Updates for slightly changed interfaces
# 1.0 First ABC units version
# 0.1 First code copied from 2016 material.
@ -29,10 +31,10 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------------------
#TOC> 1 Constructing a POST command from a Web query 44
#TOC> 1.1 Task - fetchPrositeFeatures() function 145
#TOC> 2 Task solutions 153
#TOC> ---------------------------------------------------------------------
#TOC> 1 Constructing a POST command from a Web query 42
#TOC> 1.1 Task - fetchPrositeFeatures() function 142
#TOC> 2 Task solutions 150
#TOC>
#TOC> ==========================================================================
@ -40,9 +42,8 @@
# = 1 Constructing a POST command from a Web query ========================
if (! require(httr, quietly=TRUE)) {
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
library(httr)
}
# Package information:
# library(help = httr) # basic information
@ -60,7 +61,7 @@ UniProtID <- "P39678"
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
response <- POST(URL,
response <- httr::POST(URL,
body = list(meta = "opt1",
meta1_protein = "opt1",
seq = UniProtID,
@ -70,14 +71,14 @@ response <- POST(URL,
# Send off this request, and you should have a response in a few
# seconds. Let's check the status first:
status_code(response) # If this is not 200, something went wrong and it
httr::status_code(response) # If this is not 200, something went wrong and it
# makes no sense to continue. If this persists, ask
# on the mailing list what to do.
# The text contents of the response is available with the
# content() function:
content(response, "text")
httr::content(response, "text")
# ... should show you the same as the page contents that
# you have seen in the browser. The date we need Now we need to extract
@ -86,7 +87,7 @@ content(response, "text")
# individual lines, since each of our data elements is on
# its own line. We simply split on the "\\n" newline character.
lines <- unlist(strsplit(content(response, "text"), "\\n"))
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
head(lines)
# Now we define a query pattern for the lines we want:

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-SX-PDB unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10 19
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 First live version, completely refactores 2016 code
# with remarkable speed gains. Added section on x, y, z
# (density) plots.
@ -30,19 +32,19 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 Introduction to the bio3D package 63
#TOC> 2 A Ramachandran plot 151
#TOC> 3 Density plots 227
#TOC> 3.1 Density-based colours 241
#TOC> 3.2 Plotting with smoothScatter() 260
#TOC> 3.3 Plotting hexbins 275
#TOC> 3.4 Plotting density contours 299
#TOC> 3.4.1 ... as overlay on a colored grid 333
#TOC> 3.4.2 ... as filled countour 350
#TOC> 3.4.3 ... as a perspective plot 381
#TOC> 4 cis-peptide bonds 399
#TOC> 5 H-bond lengths 414
#TOC> ----------------------------------------------------------
#TOC> 1 Introduction to the bio3D package 61
#TOC> 2 A Ramachandran plot 152
#TOC> 3 Density plots 228
#TOC> 3.1 Density-based colours 242
#TOC> 3.2 Plotting with smoothScatter() 261
#TOC> 3.3 Plotting hexbins 276
#TOC> 3.4 Plotting density contours 304
#TOC> 3.4.1 ... as overlay on a colored grid 337
#TOC> 3.4.2 ... as filled countour 354
#TOC> 3.4.3 ... as a perspective plot 385
#TOC> 4 cis-peptide bonds 403
#TOC> 5 H-bond lengths 418
#TOC>
#TOC> ==========================================================================
@ -59,9 +61,8 @@
# = 1 Introduction to the bio3D package ===================================
if (! require(bio3d, quietly=TRUE)) {
if (! requireNamespace("bio3d", quietly = TRUE)) {
install.packages("bio3d")
library(bio3d)
}
# Package information:
# library(help = bio3d) # basic information
@ -89,8 +90,8 @@ file.show("./data/1BM8.pdb")
# Are all atoms of the N-terminal residue present?
# Are all atoms of the C-terminal residue present?
apses <- read.pdb("1bm8") # load a molecule directly from the PDB via the
# Internet. (This is not your local version.)
apses <- bio3d::read.pdb("1bm8") # load a molecule directly from the PDB via
# the Internet.
# check what we have:
apses
@ -121,10 +122,11 @@ apses$atom[apses$atom[,"resno"] == i, ]
apses$seqres[1:10] # the "A"s here identify chain "A"
# Can we convert this to one letter code?
aa321(apses$seqres[1:10])
bio3d::aa321(apses$seqres[1:10])
# Lets get the implicit sequence:
aa321((apses$atom$resid[apses$calpha])[1:10]) # Do you understand this code?
bio3d::aa321((apses$atom$resid[apses$calpha])[1:10])
# Do you understand this code?
# Do explicit and implicit sequence have the same length?
length(apses$seqres)
@ -140,7 +142,10 @@ apses$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")]
# The introduction to bio3d tutorial at
# http://thegrantlab.org/bio3d/tutorials/structure-analysis
# has the following example:
plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor")
bio3d::plot.bio3d(apses$atom$b[apses$calpha],
sse=apses,
typ="l",
ylab="B-factor")
# Good for now. Let's do some real work.
@ -149,7 +154,7 @@ plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor")
# Calculate a Ramachandran plot for the structure. The torsion.pdb() function
# calculates all dihedral angles for backbone and sidechain bonds, NA where
# the bond does not exist in an amino acid.
tor <- torsion.pdb(apses)
tor <- bio3d::torsion.pdb(apses)
plot(tor$phi, tor$psi,
xlim = c(-180, 180), ylim = c(-180, 180),
main = "Ramachandran plot for 1BM8",
@ -164,7 +169,7 @@ abline(v = 0, lwd = 0.5, col = "#00000044")
# color the points for glycine residues differently. First, we
# get a vector of glycine residue indices in the structure:
mySeq <- pdbseq(apses)
mySeq <- bio3d::pdbseq(apses)
# Explore the result object and extract the indices of GLY resiues.
mySeq == "G"
@ -210,7 +215,7 @@ for (i in 1:nrow(dat)) {
points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000")
text(dat$phi[i],
dat$psi[i],
labels = sprintf("%s%d", aa321(dat$resid[i]), dat$resno[i]),
labels = sprintf("%s%d", bio3d::aa321(dat$resid[i]), dat$resno[i]),
pos = 4,
offset = 0.4,
cex = 0.7)
@ -272,9 +277,8 @@ abline(v = 0, lwd = 0.5, col = "#00000044")
# If we wish to approximate values in a histogram-like fashion, we can use
# hexbin()
if (! require(hexbin, quietly=TRUE)) {
if (! requireNamespace("hexbin", quietly = TRUE)) {
install.packages("hexbin")
library(hexbin)
}
# Package information:
# library(help = hexbin) # basic information
@ -285,12 +289,17 @@ if (! require(hexbin, quietly=TRUE)) {
myColorRamp <- colorRampPalette(c("#EEEEEE",
"#3399CC",
"#2266DD"))
plot(hexbin(phi, psi, xbins = 10),
hexbin::gplot.hexbin(hexbin::hexbin(phi, psi, xbins = 10),
colramp = myColorRamp,
main = "phi-psi Density Bins for 1BM8",
xlab = expression(phi),
ylab = expression(psi))
# Note:
# Had we loaded hexbin with library(hexbin), the plot function would have
# been dispatched by the plot() generic, and we could simply have written:
# plot(hexbin(phi, psi, xbins = 10), ...
# == 3.4 Plotting density contours =========================================
@ -305,17 +314,16 @@ plot(hexbin(phi, psi, xbins = 10),
# distributions. But for 2D data like or phi-psi plots, we need a function from
# the MASS package: kde2d()
if (! require(MASS, quietly=TRUE)) {
if (! requireNamespace("MASS", quietly = TRUE)) {
install.packages("MASS")
library(MASS)
}
# Package information:
# library(help = MASS) # basic information
# browseVignettes("MASS") # available vignettes
# data(package = "MASS") # available datasets
?kde2d
dPhiPsi <-kde2d(phi, psi,
?MASS::kde2d
dPhiPsi <-MASS::kde2d(phi, psi,
n = 60,
lims = c(-180, 180, -180, 180))
@ -469,7 +477,7 @@ ssSelect <- function(PDB, myChain = "A", ssType, myElety) {
# get id's from PDB
x <- atom.select(PDB,
x <- bio3d::atom.select(PDB,
string = "protein",
type = "ATOM",
chain = myChain,
@ -506,7 +514,7 @@ pairDist <- function(PDB, a, b) {
A <- PDB$atom[a, c("x", "y", "z")]
B <- PDB$atom[b, c("x", "y", "z")]
dMat <- dist.xyz(A, B)
dMat <- bio3d::dist.xyz(A, B)
}
return(dMat)

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10 05
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 First ABC units version
# 0.1 First code copied from 2016 material.
#
@ -28,10 +30,10 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ----------------------------------------------------
#TOC> 1 UniProt files via GET 44
#TOC> 1.1 Task - fetchUniProtSeq() function 107
#TOC> 2 Task solutions 114
#TOC> ----------------------------------------------------------
#TOC> 1 UniProt files via GET 41
#TOC> 1.1 Task - fetchUniProtSeq() function 103
#TOC> 2 Task solutions 110
#TOC>
#TOC> ==========================================================================
@ -48,9 +50,8 @@
# a Web browser. Since this is a short and simple request, the GET verb is the
# right tool:
if (! require(httr, quietly=TRUE)) {
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
library(httr)
}
# Package information:
# library(help = httr) # basic information
@ -69,7 +70,7 @@ UniProtID <- "P39678"
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
# the GET() function from httr will get the data.
response <- GET(URL)
response <- httr::GET(URL)
str(response) # the response object is a bit complex ...
as.character(response) # ... but it is easy to pull out the data.
@ -82,21 +83,21 @@ dbSanitizeSequence(x)
# Simple.
# But what happens if there is an error, e.g. the uniprot ID does not exist?
response <- GET("http://www.uniprot.org/uniprot/X000000.fasta")
response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
as.character(response)
# this is a large HTML page that tells us the URL was not found. So we need to
# check for errors. The Right way to do this is to evaluate the staus code that
# check for errors. The Right Way to do this is to evaluate the staus code that
# every Web server returns for every transaction.
#
status_code(response) # 404 == Page Not Found
httr::status_code(response) # 404 == Page Not Found
# There are many possible codes, but the only code we will be happy with
# is 200 - oK.
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
response <- GET(URL)
status_code(response)
response <- httr::GET(URL)
httr::status_code(response)
# == 1.1 Task - fetchUniProtSeq() function =================================

View File

@ -3,12 +3,13 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Unit_testing unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10 15
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace()
# 1.0 New code
#
#
@ -27,10 +28,11 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -------------------------------------------
#TOC> 1 Unit Tests with testthat 43
#TOC> 2 Organizing your tests 156
#TOC> 3 Task solutions 181
#TOC> -------------------------------------------------
#TOC> 1 Unit Tests with testthat 40
#TOC> 2 Organizing your tests 159
#TOC> 2.1 Testing scripts 183
#TOC> 3 Task solutions 198
#TOC>
#TOC> ==========================================================================
@ -39,15 +41,22 @@
# The testthat package supports writing and executing unit tests in many ways.
if (! require(testthat, quietly=TRUE)) {
if (! requireNamespace("testthat", quietly = TRUE)) {
install.packages("testthat")
library(testthat)
}
# Package information:
# library(help = testthat) # basic information
# browseVignettes("testthat") # available vignettes
# data(package = "testthat") # available datasets
# testthat is one of those packages that we either use A LOT in a script,
# or not at all. Therfore it's more reasonable to depart from our usual
# <package>::<function>() idiom, and load the entire library. In fact, if
# we author packages, it is common practice to load testthat in the part
# of the package that automates testing.
library(testthat)
# An atomic test consists of an expectation about the bahaviour of a function or
# the existence of an object. testthat provides a number of useful expectations:
@ -171,6 +180,20 @@ test_file("./tests/test_biCode.R")
# .. or execute all the test files in the directory:
test_dir("./tests")
# == 2.1 Testing scripts ===================================================
# Scripts need special consideration since we do not necessarily source() them
# entirely. Therefore automated testing is not reasonable. What you can do
# instead is to place a conditional block at the end of your script, that
# never gets executed - then you can manually execute the code in the block
# whenever you wish to test your functions. For example:
if (FALSE) {
# ... your tests go here
}
# = 3 Task solutions ======================================================

View File

@ -3,12 +3,14 @@
# Purpose: A Bioinformatics Course:
# R code accompanying the RPR-Scripting_data_downloads unit.
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10 05
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 First ABC units version
# 0.1 First code copied from 2016 material.
#
@ -28,10 +30,10 @@
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> -----------------------------------------------------
#TOC> 1 Working with NCBI eUtils 44
#TOC> 1.1 Task - fetchNCBItaxData() function 162
#TOC> 2 Task solutions 169
#TOC> -----------------------------------------------------------
#TOC> 1 Working with NCBI eUtils 41
#TOC> 1.1 Task - fetchNCBItaxData() function 144
#TOC> 2 Task solutions 151
#TOC>
#TOC> ==========================================================================
@ -40,26 +42,11 @@
# To begin, we load some libraries with functions
# we need...
# ... the package httr, which sends and receives information via the http
# protocol, just like a Web browser.
if (! require(httr, quietly=TRUE)) {
install.packages("httr")
library(httr)
}
# Package information:
# library(help = httr) # basic information
# browseVignettes("httr") # available vignettes
# data(package = "httr") # available datasets
# ...plus the package xml2: NCBI's eUtils send information in XML format so we
# need to be able to parse XML.
if (! require(xml2, quietly=TRUE)) {
# To begin, we load the xml2 package that contains functions
# we need to receive and parse html data. NCBI's eUtils send information in
# XML format so we need to be able to parse XML.
if (! requireNamespace("xml2", quietly=TRUE)) {
install.packages("xml2")
library(xml2)
}
# Package information:
# library(help = xml2) # basic information
@ -91,24 +78,23 @@ URL <- paste(eUtilsBase,
# what the response should look like.
URL
# To fetch a response in R, we use the function GET() from the httr package
# To fetch a response in R, we use the function read_xml()
# with our URL as its argument.
myXML <- read_xml(URL)
myXML
(myXML <- xml2::read_xml(URL))
# This is XML. We can take the response apart into
# its indvidual components with the as_list() function.
as_list(myXML)
xml2::as_list(myXML)
# Note how the XML "tree" is represented as a list of
# lists of lists ...
# If we know exactly what elelement we are looking for,
# we can extract it from this structure:
as_list(myXML)[["IdList"]][["Id"]][[1]]
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
# But this is not very robust, it would break with the
# slightest change that the NCBI makes to their response
# slightest change that the NCBI makes to their data format -
# and the NCBI changes things A LOT!
# Somewhat more robust is to specify the type of element
@ -116,11 +102,12 @@ as_list(myXML)[["IdList"]][["Id"]][[1]]
# element, and use the XPath XML parsing language to
# retrieve it.
xml_find_all(myXML, "//Id") # returns a "node set"
xml2::xml_find_all(myXML, "//Id") # returns a "node set"
xml_text(xml_find_all(myXML, "//Id")) # returns the contents of the node set
xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
# of the node set
# We will need doing this a lot, so we write a function
# We will need to do this more than once, so we write a function
# for it...
node2text <- function(doc, tag) {
# an extractor function for the contents of elements
@ -128,8 +115,8 @@ node2text <- function(doc, tag) {
# Contents of all matching elements is returned in
# a vector of strings.
path <- paste0("//", tag)
nodes <- xml_find_all(doc, path)
return(xml_text(nodes))
nodes <- xml2::xml_find_all(doc, path)
return(xml2::xml_text(nodes))
}
# using node2text() ...
@ -145,7 +132,7 @@ URL <- paste0(eUtilsBase,
"&id=",
GID,
"&version=2.0")
(myXML <- read_xml(URL))
(myXML <- xml2::read_xml(URL))
(taxID <- node2text(myXML, "TaxId"))
(organism <- node2text(myXML, "Organism"))

View File

@ -22,17 +22,18 @@ setwd("<your/project/directory>")
# ==== PACKAGES ==============================================================
# Load all required packages.
# Check that required packages have been installed. Install if needed.
if (! require(seqinr, quietly=TRUE)) {
if (! requireNamespace("seqinr", quietly=TRUE)) {
install.packages("seqinr")
library(seqinr)
}
# Package information:
# library(help = seqinr) # basic information
# browseVignettes("seqinr") # available vignettes
# data(package = "seqinr") # available datasets
# Note: use package functions with the :: operator - eg.
# seqinr::aaa("K")

View File

@ -9,21 +9,18 @@
# ====== PACKAGES ==============================================================
if (! require(jsonlite, quietly = TRUE)) {
if (! requireNamespace("jsonlite", quietly = TRUE)) {
install.packages("jsonlite")
library(jsonlite)
}
if (! require(httr, quietly = TRUE)) {
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
library(httr)
}
if (! require(xml2, quietly = TRUE)) {
if (! requireNamespace("xml2", quietly = TRUE)) {
install.packages("xml2")
library(xml2)
}
@ -226,10 +223,10 @@ dbFetchUniProtSeq <- function(ID) {
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID)
response <- GET(URL)
response <- httr::GET(URL)
mySeq <- character()
if (status_code(response) == 200) {
if (httr::status_code(response) == 200) {
x <- as.character(response)
x <- strsplit(x, "\n")
mySeq <- dbSanitizeSequence(x)
@ -253,7 +250,7 @@ dbFetchPrositeFeatures <- function(ID) {
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
response <- POST(URL,
response <- httr::POST(URL,
body = list(meta = "opt1",
meta1_protein = "opt1",
seq = ID,
@ -261,9 +258,9 @@ dbFetchPrositeFeatures <- function(ID) {
output = "tabular"))
myFeatures <- data.frame()
if (status_code(response) == 200) {
if (httr::status_code(response) == 200) {
lines <- unlist(strsplit(content(response, "text"), "\\n"))
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
patt <- sprintf("\\|%s\\|", UniProtID)
lines <- lines[grep(patt, lines)]
@ -289,8 +286,8 @@ node2text <- function(doc, tag) {
# Contents of all matching elements is returned in
# a vector of strings.
path <- paste0("//", tag)
nodes <- xml_find_all(doc, path)
return(xml_text(nodes))
nodes <- xml2::xml_find_all(doc, path)
return(xml2::xml_text(nodes))
}
@ -309,7 +306,7 @@ dbFetchNCBItaxData <- function(ID) {
"db=protein",
"&term=", ID,
sep="")
myXML <- read_xml(URL)
myXML <- xml2::read_xml(URL)
GID <- node2text(myXML, "Id")
URL <- paste0(eUtilsBase,
@ -318,7 +315,7 @@ dbFetchNCBItaxData <- function(ID) {
"&id=",
GID,
"&version=2.0")
myXML <- read_xml(URL)
myXML <- xml2::read_xml(URL)
x <- as.integer(node2text(myXML, "TaxId"))
y <- node2text(myXML, "Organism")
@ -346,14 +343,14 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
# for IDs that are not mapped.
URL <- "https://www.uniprot.org/uploadlists/"
response <- POST(URL,
response <- httr::POST(URL,
body = list(from = mapFrom,
to = mapTo,
format = "tab",
query = s))
if (status_code(response) == 200) { # 200: oK
myMap <- read.delim(file = textConnection(content(response)),
if (httr::status_code(response) == 200) { # 200: oK
myMap <- read.delim(file = textConnection(httr::content(response)),
sep = "\t",
stringsAsFactors = FALSE)
myMap <- myMap[ , c(1,3)]
@ -362,12 +359,23 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
myMap <- data.frame()
warning(paste("No uniProt ID mapping returned:",
"server sent status",
status_code(response)))
httr::status_code(response)))
}
return(myMap)
}
# ====== TESTS =================================================================
if (FALSE) {
if (! requireNamespace("testthat", quietly = TRUE)) {
install.packages("testthat")
}
# ToDo: test everything here
}
# [END]

View File

@ -3,14 +3,16 @@
# Purpose: Create a list of genome sequenced fungi with protein annotations and
# Mbp1 homologues.
#
# Version: 1.1.2
# Version: 1.2
#
# Date: 2016 09 - 2017 09
# Date: 2016 09 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 1.1.2 Moved BLAST.R to ./scripts directory
# V 1.1 Update 2017
# V 1.0 First code 2016
# Versions
# 1.2 Change from require() to requireNamespace()
# 1.1.2 Moved BLAST.R to ./scripts directory
# 1.1 Update 2017
# 1.0 First code 2016
#
# TODO:
#
@ -31,27 +33,25 @@
# the respective intermediate results.
#
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> ---------------------------------------------------
#TOC> 1 The strategy 54
#TOC> 2 GOLD species 66
#TOC> 2.1 Initialize 71
#TOC> 2.2 Import 77
#TOC> 2.3 Unique species 129
#TOC> 3 BLAST species 171
#TOC> 3.1 find homologous proteins 178
#TOC> 3.2 Identify species in "hits" 202
#TOC> 4 Intersect GOLD and BLAST species 247
#TOC> 5 Cleanup and finish 265
#TOC> ---------------------------------------------------------
#TOC> 1 The strategy 55
#TOC> 2 GOLD species 67
#TOC> 2.1 Initialize 72
#TOC> 2.2 Import 79
#TOC> 2.3 Unique species 131
#TOC> 3 BLAST species 173
#TOC> 3.1 find homologous proteins 180
#TOC> 3.2 Identify species in "hits" 204
#TOC> 4 Intersect GOLD and BLAST species 249
#TOC> 5 Cleanup and finish 267
#TOC>
#TOC> ==========================================================================
#TOC>
#TOC>
# = 1 The strategy ========================================================
# This script will create a list of "MYSPE" species and save it in an R object
@ -70,9 +70,10 @@
# (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
# == 2.1 Initialize ========================================================
if (! require(httr)) { # httr provides interfaces to Webservers on the Internet
# httr provides interfaces to Webservers on the Internet
if (! requireNamespace("httr", quietly = TRUE)) {
install.packages("httr")
library(httr)
}
# == 2.2 Import ============================================================

View File

@ -15,12 +15,14 @@
# Data: (3 mb) https://downloads.yeastgenome.org/curation/literature/go_slim_mapping.tab
#
#
# Version: 1.0
# Version: 1.1
#
# Date: 2017 10 06
# Date: 2017 10 - 2019 01
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions:
# 1.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 1.0 First code copied from 2016 material.
#
# TODO:
@ -28,16 +30,16 @@
#
# ==============================================================================
if (! require(readr, quietly = TRUE)) {
if (! requireNamespace("readr", quietly = TRUE)) {
install.packages("readr")
library(readr)
}
# STRING functional interaction data
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
STR <- read_delim("./data/4932.protein.links.full.v10.5.txt", delim = " ")
STR <- readr::read_delim("./data/4932.protein.links.full.v10.5.txt",
delim = " ")
# Subset only IDs and combined_score column
STR <- STR[ , c("protein1", "protein2", "combined_score")]
@ -61,7 +63,7 @@ myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene
#
# Read GOSlim data (needs to be downloaded from database, see URL in Notes)
Gsl <- read_tsv("./data/go_slim_mapping.tab",
Gsl <- readr::read_tsv("./data/go_slim_mapping.tab",
col_names = c("ID",
"name",
"SGDId",

View File

@ -7,11 +7,13 @@
# https://ncbi.github.io/blast-cloud/dev/api.html
#
#
# Version: 3
# Date: 2016 09 - 2017 11
# Version: 3.1
# Date: 2016 09 - 2019 01
# Author: Boris Steipe
#
# Versions:
# 3.1 Change from require() to requireNamespace(),
# use <package>::<function>() idiom throughout
# 3 parsing logic had not been fully implemented; Fixed.
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
# refactored parseBLASTalignment() to handle lists with multiple hits.
@ -29,9 +31,8 @@
# ==============================================================================
if (! require(httr, quietly = TRUE)) {
if (! requireNamespace(httr, quietly = TRUE)) {
install.packages("httr")
library(httr)
}
@ -92,13 +93,13 @@ BLAST <- function(q,
}
# send it off ...
response <- GET(results$query)
if (http_status(response)$category != "Success" ) {
response <- httr::GET(results$query)
if (httr::http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
http_status(response)$message))
httr::http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
txt <- httr::content(response, "text", encoding = "UTF-8")
patt <- "RID = (\\w+)" # match the request id
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
@ -127,13 +128,13 @@ BLAST <- function(q,
while (TRUE) {
# Check whether the result is ready
response <- GET(checkStatus)
if (http_status(response)$category != "Success" ) {
response <- httr::GET(checkStatus)
if (httr::http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
http_status(response)$message))
httr::http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
txt <- httr::content(response, "text", encoding = "UTF-8")
if (length(grep("Status=WAITING", txt)) > 0) {
myTimeout <- myTimeout - EXTRAWAIT
@ -184,13 +185,13 @@ BLAST <- function(q,
"&FORMAT_TYPE=Text",
sep = "")
response <- GET(retrieve)
if (http_status(response)$category != "Success" ) {
response <- httr::GET(retrieve)
if (httr::http_status(response)$category != "Success" ) {
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
http_status(response)$message))
httr::http_status(response)$message))
}
txt <- content(response, "text", encoding = "UTF-8")
txt <- httr::content(response, "text", encoding = "UTF-8")
# txt contains the whole set of results. Process:
@ -357,7 +358,7 @@ parseBLASTalignment <- function(hit) {
# ==== TESTS ===================================================================
# define query:
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain sequence
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
# sep="")