Use requireNamespace(), <package>::<function>() idiom,
Biocmanager:: - not biocLite()
This commit is contained in:
parent
8cdbb9f4cc
commit
cffd803c23
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-BLAST unit.
|
||||
#
|
||||
# Version: 1.1
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017 10 23
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.1 Fixed parsing logic.
|
||||
# 1.0 First live version 2017.
|
||||
# 0.1 First code copied from 2016 material.
|
||||
@ -29,29 +31,15 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------
|
||||
#TOC> 1 Preparations 41
|
||||
#TOC> 2 Defining the APSES domain 54
|
||||
#TOC> 3 Executing the BLAST search 76
|
||||
#TOC> 4 Analysing results 98
|
||||
#TOC> ---------------------------------------------------
|
||||
#TOC> 1 Defining the APSES domain 42
|
||||
#TOC> 2 Executing the BLAST search 64
|
||||
#TOC> 3 Analysing results 86
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparations ========================================================
|
||||
|
||||
if (!require(Biostrings, quietly=TRUE)) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
biocLite("Biostrings")
|
||||
library(Biostrings)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
# = 2 Defining the APSES domain ===========================================
|
||||
# = 1 Defining the APSES domain ===========================================
|
||||
|
||||
# Load your protein database
|
||||
source("makeProteinDB.R")
|
||||
@ -73,7 +61,7 @@ source("makeProteinDB.R")
|
||||
# BLAST search.
|
||||
|
||||
|
||||
# = 3 Executing the BLAST search ==========================================
|
||||
# = 2 Executing the BLAST search ==========================================
|
||||
|
||||
# The ./scripts/BLAST.R code defines two functions to access the BLAST interface
|
||||
# through its Web API, and to parse results. Have a look at the script, then
|
||||
@ -91,11 +79,11 @@ BLASTresults <- BLAST(apses, # MYSPE APSES domain sequence
|
||||
limits = "txid559292[ORGN]") # S. cerevisiae S288c
|
||||
|
||||
|
||||
length(BLASTresults$hits) # There should be at least one hit there. Ask for advice
|
||||
# in case this step fails.
|
||||
length(BLASTresults$hits) # There should be at least one hit there. Ask for
|
||||
# advice in case this step fails.
|
||||
|
||||
|
||||
# = 4 Analysing results ===================================================
|
||||
# = 3 Analysing results ===================================================
|
||||
|
||||
(topHit <- BLASTresults$hits[[1]]) # Get the top hit
|
||||
|
||||
|
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-Dotplot unit.
|
||||
#
|
||||
# Version: 0.1
|
||||
# Version: 0.2
|
||||
#
|
||||
# Date: 2017 08 28
|
||||
# Date: 2019 01 07
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 0.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
#
|
||||
@ -23,24 +25,37 @@
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
# = 1 ___Section___
|
||||
|
||||
# First, we install and load the Biostrings package.
|
||||
if (!require(Biostrings, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------
|
||||
#TOC> 1 ___Section___ 39
|
||||
#TOC> 2 Tasks 187
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 ___Section___ =======================================================
|
||||
|
||||
if (!requireNamespace("BiocManager", quietly=TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
biocLite("Biostrings")
|
||||
library(Biostrings)
|
||||
if (!requireNamespace("Biostrings", quietly=TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
if (!requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
|
||||
|
||||
# Let's load BLOSUM62
|
||||
data(BLOSUM62)
|
||||
data(BLOSUM62, package = "Biostrings")
|
||||
|
||||
# Now let's craft code for a dotplot. That's surprisingly simple. We build a
|
||||
# matrix that has as many rows as one sequence, as many columns as another. Then
|
||||
@ -51,10 +66,10 @@ data(BLOSUM62)
|
||||
|
||||
# First we fetch our sequences and split them into single characters.
|
||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||
MBP1_SACCE <- s2c(myDB$protein$sequence[sel])
|
||||
MBP1_SACCE <- seqinr::s2c(myDB$protein$sequence[sel])
|
||||
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||
MBP1_MYSPE <- s2c(myDB$protein$sequence[sel])
|
||||
MBP1_MYSPE <- seqinr::s2c(myDB$protein$sequence[sel])
|
||||
|
||||
# Check that we have two character vectors of the expected length.
|
||||
str(MBP1_SACCE)
|
||||
@ -136,7 +151,7 @@ axis(4, at = c(1, seq(10, len, by=10)))
|
||||
# utilities file and called it dotPlot2(). Why not dotPlot() ... that's because
|
||||
# there already is a dotplot function in the seqinr package:
|
||||
|
||||
dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
|
||||
seqinr::dotPlot(MBP1_SACCE, MBP1_MYSPE) # seqinr
|
||||
dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE") # Our's
|
||||
|
||||
# Which one do you prefer? You can probably see the block patterns that arise
|
||||
@ -169,7 +184,7 @@ dotPlot2(MBP1_SACCE, MBP1_MYSPE, xlab = "SACCE", ylab = "MYSPE", f = myFilter)
|
||||
|
||||
|
||||
|
||||
# = 1 Tasks
|
||||
# = 2 Tasks ===============================================================
|
||||
|
||||
|
||||
|
||||
|
131
BIN-ALI-MSA.R
131
BIN-ALI-MSA.R
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-MSA unit.
|
||||
#
|
||||
# Version: 1.1
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017 10
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.1 Added fetchMSAmotif()
|
||||
# 1.0 Fully refactored and rewritten for 2017
|
||||
# 0.1 First code copied from 2016 material.
|
||||
@ -29,22 +31,22 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ------------------------------------------------------------
|
||||
#TOC> 1 Preparations 51
|
||||
#TOC> 2 Aligning full length MBP1 proteins 99
|
||||
#TOC> 2.1 Preparing Sequences 110
|
||||
#TOC> 2.2 Compute the MSA 135
|
||||
#TOC> 3 Analyzing an MSA 156
|
||||
#TOC> 4 Comparing MSAs 227
|
||||
#TOC> 4.1 Importing an alignment to msa 236
|
||||
#TOC> 4.1.1 importing an .aln file 245
|
||||
#TOC> 4.1.2 Creating an MsaAAMultipleAlignment object 276
|
||||
#TOC> 4.2 More alignments 313
|
||||
#TOC> 4.3 Computing comparison metrics 325
|
||||
#TOC> 5 Profile-Profile alignments 462
|
||||
#TOC> 6 Sequence Logos 539
|
||||
#TOC> 6.1 Subsetting an alignment by motif 548
|
||||
#TOC> 6.2 Plot a Sequence Logo 591
|
||||
#TOC> ------------------------------------------------------------------
|
||||
#TOC> 1 Preparations 54
|
||||
#TOC> 2 Aligning full length MBP1 proteins 96
|
||||
#TOC> 2.1 Preparing Sequences 107
|
||||
#TOC> 2.2 Compute the MSA 132
|
||||
#TOC> 3 Analyzing an MSA 153
|
||||
#TOC> 4 Comparing MSAs 224
|
||||
#TOC> 4.1 Importing an alignment to msa 233
|
||||
#TOC> 4.1.1 importing an .aln file 242
|
||||
#TOC> 4.1.2 Creating an MsaAAMultipleAlignment object 273
|
||||
#TOC> 4.2 More alignments 324
|
||||
#TOC> 4.3 Computing comparison metrics 336
|
||||
#TOC> 5 Profile-Profile alignments 473
|
||||
#TOC> 6 Sequence Logos 546
|
||||
#TOC> 6.1 Subsetting an alignment by motif 555
|
||||
#TOC> 6.2 Plot a Sequence Logo 604
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -59,28 +61,22 @@
|
||||
source("makeProteinDB.R")
|
||||
|
||||
|
||||
# Multiple sequence alignment algorithms are provided in
|
||||
# the Bioconductor msa package.
|
||||
|
||||
if (! require(Biostrings, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
if (! requireNamespace("BiocManager", quietly=TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
biocLite("Biostrings")
|
||||
library(Biostrings)
|
||||
if (! requireNamespace("Biostrings", quietly=TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
# Multiple sequence alignment algorithms are provided in
|
||||
# the Bioconductor msa package.
|
||||
|
||||
if (! require(msa, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
}
|
||||
biocLite("msa")
|
||||
library(msa)
|
||||
if (! requireNamespace("msa", quietly=TRUE)) {
|
||||
BiocManager::install("msa")
|
||||
}
|
||||
# Package information:
|
||||
# library(help=msa) # basic information
|
||||
@ -115,7 +111,7 @@ help(package = "msa")
|
||||
# of sequence.
|
||||
|
||||
sel <- grep("MBP1", myDB$protein$name)
|
||||
MBP1set <- AAStringSet(myDB$protein$sequence[sel])
|
||||
MBP1set <- Biostrings::AAStringSet(myDB$protein$sequence[sel])
|
||||
|
||||
# To help us make sense of the alignment we need to add the names for
|
||||
# the sequences. Names for a seqSet object are held in the ranges slot...
|
||||
@ -142,10 +138,10 @@ MBP1set
|
||||
|
||||
|
||||
# Let's run an alignment with "Muscle"
|
||||
(msaM <- msaMuscle( MBP1set, order = "aligned"))
|
||||
(msaM <- msa::msaMuscle( MBP1set, order = "aligned"))
|
||||
|
||||
# ... or to see the whole thing (cf. ?MsaAAMultipleAlignment ... print method):
|
||||
print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
|
||||
msa::print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
|
||||
|
||||
|
||||
# You see that the alignment object has sequence strings with hyphens as
|
||||
@ -173,7 +169,7 @@ print(msaM, show=c("alignment", "complete"), showConsensus=FALSE)
|
||||
|
||||
data("BLOSUM62") # fetch the BLOSUM62 package from the Biostrings package
|
||||
|
||||
msaMScores <- msaConservationScore(msaM, substitutionMatrix = BLOSUM62)
|
||||
msaMScores <- msa::msaConservationScore(msaM, substitutionMatrix = BLOSUM62)
|
||||
plot(msaMScores, type = "l", col = "#205C5E", xlab = "Alignment Position")
|
||||
|
||||
# That plot shows the well-aligned regions (domains ?) of the sequence, but it
|
||||
@ -246,17 +242,17 @@ for (i in seq_along(highScoringRanges$lengths)) {
|
||||
# === 4.1.1 importing an .aln file
|
||||
|
||||
# The seqinr package has a function to read CLUSTAL W formatted .aln files ...
|
||||
if (! require(seqinr, quietly=TRUE)) {
|
||||
install.packages(seqinr)
|
||||
library(seqinr)
|
||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help=seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
# read the donwloaded file
|
||||
tmp <- read.alignment("msaT.aln", format = "clustal")
|
||||
# read the T-coffee aligned file that you donwloaded from the EBI MSA tools
|
||||
# (cf. http://steipe.biochemistry.utoronto.ca/abc/index.php/BIN-ALI-MSA)
|
||||
tmp <- seqinr::read.alignment("msaT.aln", format = "clustal")
|
||||
|
||||
# read.alignment() returns a list. $seq is a list of strings, one for each
|
||||
# complete alignment. However, they are converted to lower case.
|
||||
@ -278,12 +274,12 @@ for (i in seq_along(x)) {
|
||||
|
||||
# MsaAAMultipleAlignment objects are S4 objects that contain AAStringSet objects
|
||||
# in their @unmasked slot, and a few additional items. Rather then build the
|
||||
# object from scratch, we copy an axisting object, and overwrite the dta in its
|
||||
# object from scratch, we copy an existing object, and overwrite the data in its
|
||||
# slots with what we need. Our goal is pragmatic, we want an object that msa's
|
||||
# functions will accept as input.
|
||||
|
||||
# First: convert our named char vector into an AAstringSet
|
||||
x <- AAStringSet(x)
|
||||
x <- Biostrings::AAStringSet(x)
|
||||
|
||||
# Then: create a new MsaAAMultipleAlignment S4 object. The msa package has
|
||||
# defined what such an object should look like, with the SetClass() function. To
|
||||
@ -294,8 +290,22 @@ x <- AAStringSet(x)
|
||||
|
||||
str(msaM)
|
||||
|
||||
# There is a catch however in the way R makes such operations specific to
|
||||
# the packages they need them: the function that creates the class is
|
||||
# defined as a "generic", and when it is called, R looks in the package
|
||||
# namespace for a more specific function with precise instructions what
|
||||
# to do. However, we have not loaded the package namespace - we access all
|
||||
# of the functions directly with the msa:: prefix. This method breaks down
|
||||
# when generic functions are involved. I.e. - we could make it work, but
|
||||
# the amount of code we need then is unreasonable. The straightforward
|
||||
# way is to load the package. We can still use the prefix notation for
|
||||
# its functions, just to emphasize where the function comes from. But since
|
||||
# the namespace then exists, we ensure that generics are properly dispatched.
|
||||
|
||||
library(msa) # load the msa package namespace
|
||||
|
||||
msaT <- new("MsaAAMultipleAlignment", # create new MsaAAMultipleAlignment object
|
||||
unmasked = x, # "unmasked" slot takes an AASringSet
|
||||
unmasked = x, # "unmasked" slot takes an AAStringSet
|
||||
version = "T-Coffee", # "version" slot takes a string
|
||||
params = list(), # "params" takes a list(), we leave the
|
||||
# list empty, but we could add the
|
||||
@ -309,18 +319,18 @@ str(msaT)
|
||||
msaT # Now we have fabricated an msaAAMultipleAlignment object, and we can
|
||||
# use the msa package functions on it
|
||||
|
||||
msaTScores <- msaConservationScore(msaT, substitutionMatrix = BLOSUM62)
|
||||
msaTScores <- msa::msaConservationScore(msaT, substitutionMatrix = BLOSUM62)
|
||||
|
||||
# == 4.2 More alignments ===================================================
|
||||
|
||||
# Next, we calculate alignments with msa's two other alignment options:
|
||||
# CLUSTAL Omega
|
||||
(msaO <- msaClustalOmega( MBP1set, order = "aligned"))
|
||||
msaOScores <- msaConservationScore(msaO, substitutionMatrix = BLOSUM62)
|
||||
(msaO <- msa::msaClustalOmega( MBP1set, order = "aligned"))
|
||||
msaOScores <- msa::msaConservationScore(msaO, substitutionMatrix = BLOSUM62)
|
||||
|
||||
# CLUSTAL W
|
||||
(msaW <- msaClustalW( MBP1set, order = "aligned"))
|
||||
msaWScores <- msaConservationScore(msaW, substitutionMatrix = BLOSUM62)
|
||||
(msaW <- msa::msaClustalW( MBP1set, order = "aligned"))
|
||||
msaWScores <- msa::msaConservationScore(msaW, substitutionMatrix = BLOSUM62)
|
||||
|
||||
|
||||
# == 4.3 Computing comparison metrics ======================================
|
||||
@ -454,7 +464,7 @@ legend("bottomright",
|
||||
|
||||
# Your alignment is going to be different from mine, due to the inclusion of
|
||||
# MYSPE - but what I see is that MUSCLE gives the highest score overall, and
|
||||
# achieves this with fewer indels then most, and the lowest number of gaps of
|
||||
# achieves this with fewer indels than most, and the lowest number of gaps of
|
||||
# all algorithms.
|
||||
|
||||
# To actually compare regions of alignments, we need to align alignments.
|
||||
@ -470,12 +480,8 @@ legend("bottomright",
|
||||
# to compare two MSAs with each other, by aligning them. The algorithm is
|
||||
# provided by the DECIPHER package.
|
||||
|
||||
if (! require(DECIPHER, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
}
|
||||
biocLite("DECIPHER")
|
||||
library(DECIPHER)
|
||||
if (! requireNamespace("DECIPHER", quietly=TRUE)) {
|
||||
BiocManager::install("DECIPHER")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = DECIPHER) # basic information
|
||||
@ -484,14 +490,14 @@ if (! require(DECIPHER, quietly=TRUE)) {
|
||||
|
||||
# AlignProfiles() takes two AAStringSets as input. Let's compare the MUSCLE and
|
||||
# CLUSTAL W alignments: we could do this directly ...
|
||||
AlignProfiles(msaW@unmasked, msaM@unmasked)
|
||||
DECIPHER::AlignProfiles(msaW@unmasked, msaM@unmasked)
|
||||
|
||||
# But for ease of comparison, we'll reorder the sequences of the CLUSTAL W
|
||||
# alignment into the same order as the MUSCLE alignment:
|
||||
m <- as.character(msaM)
|
||||
w <- as.character(msaW)[names(m)]
|
||||
|
||||
(ppa <- AlignProfiles(AAStringSet(w), AAStringSet(m)))
|
||||
(ppa <- DECIPHER::AlignProfiles(msa::AAStringSet(w), msa::AAStringSet(m)))
|
||||
|
||||
# Conveniently, AlignProfiles() returns an AAStringSet, so we can use our
|
||||
# writeALN function to show it. Here is an arbitrary block, from somewhere in
|
||||
@ -533,8 +539,8 @@ writeALN(ppa2, range = c(800, 960))
|
||||
# Again, go explore, and get a sense of what's going on. You may find that
|
||||
# CLUSTAL W has a tendency to insert short gaps all over the alignment, whereas
|
||||
# MUSCLE keeps indels in blocks. CLUSTAL's behaviour is exactly what I would
|
||||
# expect from an algorithm that builds alignments from pairwise local
|
||||
# alignments, without global refinement.
|
||||
# expect from an algorithm that builds alignments incrementally from pairwise
|
||||
# local alignments, without global refinement.
|
||||
|
||||
|
||||
# = 6 Sequence Logos ======================================================
|
||||
@ -602,16 +608,15 @@ writeALN(fetchMSAmotif(msaM, wing))
|
||||
# ggseqlogo written by by Omar Waghi, a former UofT BCB student who is now at
|
||||
# the EBI.
|
||||
|
||||
if (! require(ggseqlogo, quietly=TRUE)) {
|
||||
if (! requireNamspace("ggseqlogo", quietly=TRUE)) {
|
||||
install.packages(("ggseqlogo"))
|
||||
library(ggseqlogo)
|
||||
}
|
||||
# Package information:
|
||||
# library(help=ggseqlogo) # basic information
|
||||
# browseVignettes("ggseqlogo") # available vignettes
|
||||
# data(package = "ggseqlogo") # available datasets
|
||||
|
||||
ggseqlogo(as.character(motifAli))
|
||||
ggseqlogo::ggseqlogo(as.character(motifAli))
|
||||
|
||||
|
||||
|
||||
|
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-Optimal_sequence_alignment unit.
|
||||
#
|
||||
# Version: 1.4
|
||||
# Version: 1.5
|
||||
#
|
||||
# Date: 2017 09 - 2017 11
|
||||
# Date: 2017 09 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.5 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.4 Pull s2c() from seqinr package, rather then loading the
|
||||
# entire library.
|
||||
# 1.3 Updated confirmation task with correct logic
|
||||
@ -34,27 +36,30 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------------------
|
||||
#TOC> 1 Prepare 52
|
||||
#TOC> 2 Biostrings Pairwise Alignment 66
|
||||
#TOC> 2.1 Optimal global alignment 84
|
||||
#TOC> 2.2 Optimal local alignment 147
|
||||
#TOC> 3 APSES Domain annotation by alignment 171
|
||||
#TOC> 4 Update your database script 252
|
||||
#TOC> 4.1 Preparing an annotation file ... 258
|
||||
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 260
|
||||
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 303
|
||||
#TOC> 4.2 Execute and Validate 327
|
||||
#TOC> --------------------------------------------------------------------------
|
||||
#TOC> 1 Prepare 54
|
||||
#TOC> 2 Biostrings Pairwise Alignment 71
|
||||
#TOC> 2.1 Optimal global alignment 89
|
||||
#TOC> 2.2 Optimal local alignment 152
|
||||
#TOC> 3 APSES Domain annotation by alignment 176
|
||||
#TOC> 4 Update your database script 257
|
||||
#TOC> 4.1 Preparing an annotation file ... 263
|
||||
#TOC> 4.1.1 If you HAVE NOT done the BIN-FUNC-Annotation unit 265
|
||||
#TOC> 4.1.2 If you HAVE done the BIN-FUNC-Annotation unit 308
|
||||
#TOC> 4.2 Execute and Validate 332
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Prepare =============================================================
|
||||
|
||||
# To simplify code, we pull the function s2c(x) from the seqinr package,
|
||||
# rather than using the lengthier idiom unlist(strsplit(x, "").
|
||||
# This assumes that the seqinr package has been installed previously.
|
||||
s2c <- seqinr::s2c
|
||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
}
|
||||
# You can get package information with the following commands:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
|
||||
# You need to recreate the protein database that you have constructed in the
|
||||
@ -66,13 +71,13 @@ source("makeProteinDB.R")
|
||||
# = 2 Biostrings Pairwise Alignment =======================================
|
||||
|
||||
|
||||
if (!require(Biostrings, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
if (!requireNamespace("BiocManager", quietly=TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
biocLite("Biostrings")
|
||||
library(Biostrings)
|
||||
if (!requireNamespace("Biostrings", quietly=TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
# browseVignettes("Biostrings") # available vignettes
|
||||
# data(package = "Biostrings") # available datasets
|
||||
@ -88,15 +93,15 @@ if (!require(Biostrings, quietly=TRUE)) {
|
||||
|
||||
# First: make AAString objects ...
|
||||
sel <- myDB$protein$name == "MBP1_SACCE"
|
||||
aaMBP1_SACCE <- AAString(myDB$protein$sequence[sel])
|
||||
aaMBP1_SACCE <- Biostrings::AAString(myDB$protein$sequence[sel])
|
||||
|
||||
sel <- myDB$protein$name == paste("MBP1_", biCode(MYSPE), sep = "")
|
||||
aaMBP1_MYSPE <- AAString(myDB$protein$sequence[sel])
|
||||
aaMBP1_MYSPE <- Biostrings::AAString(myDB$protein$sequence[sel])
|
||||
|
||||
?pairwiseAlignment
|
||||
# ... and align.
|
||||
# Global optimal alignment with end-gap penalties is default.
|
||||
ali1 <- pairwiseAlignment(
|
||||
ali1 <- Biostrings::pairwiseAlignment(
|
||||
aaMBP1_SACCE,
|
||||
aaMBP1_MYSPE,
|
||||
substitutionMatrix = "BLOSUM62",
|
||||
@ -108,7 +113,7 @@ str(ali1) # ... it's complicated
|
||||
# This is a Biostrings alignment object. But we can use Biostrings functions to
|
||||
# tame it:
|
||||
ali1
|
||||
writePairwiseAlignments(ali1) # That should look familiar
|
||||
Biostrings::writePairwiseAlignments(ali1) # That should look familiar
|
||||
|
||||
# And we can make the internal structure work for us (@ is for classes as
|
||||
# $ is for lists ...)
|
||||
@ -147,7 +152,7 @@ percentID(ali1)
|
||||
# == 2.2 Optimal local alignment ===========================================
|
||||
|
||||
# Compare with local optimal alignment (like EMBOSS Water)
|
||||
ali2 <- pairwiseAlignment(
|
||||
ali2 <- Biostrings::pairwiseAlignment(
|
||||
aaMBP1_SACCE,
|
||||
aaMBP1_MYSPE,
|
||||
type = "local",
|
||||
@ -155,9 +160,9 @@ ali2 <- pairwiseAlignment(
|
||||
gapOpening = 50,
|
||||
gapExtension = 10)
|
||||
|
||||
writePairwiseAlignments(ali2) # This has probably only aligned the N-terminal
|
||||
# DNA binding domain - but that one has quite
|
||||
# high sequence identity:
|
||||
Biostrings::writePairwiseAlignments(ali2)
|
||||
# This has probably only aligned the N-terminal DNA binding domain - but that
|
||||
# one has quite high sequence identity:
|
||||
percentID(ali2)
|
||||
|
||||
# == TASK: ==
|
||||
@ -209,14 +214,14 @@ myDB$annotation[myDB$annotation$ID == proID &
|
||||
# the sequence, and used the start and end coordinates to extract a substring.
|
||||
|
||||
# Let's convert this to an AAstring and assign it:
|
||||
aaMB1_SACCE_APSES <- AAString(apses)
|
||||
aaMB1_SACCE_APSES <- Biostrings::AAString(apses)
|
||||
|
||||
# Now let's align these two sequences of very different length without end-gap
|
||||
# penalties using the "overlap" type. "overlap" turns the
|
||||
# end-gap penalties off and that is crucially important since
|
||||
# the sequences have very different length.
|
||||
|
||||
aliApses <- pairwiseAlignment(
|
||||
aliApses <- Biostrings::pairwiseAlignment(
|
||||
aaMB1_SACCE_APSES,
|
||||
aaMBP1_MYSPE,
|
||||
type = "overlap",
|
||||
@ -228,7 +233,7 @@ aliApses <- pairwiseAlignment(
|
||||
# homologous, and have (almost) no indels. The entire "pattern"
|
||||
# sequence from QIYSAR ... to ... KPLFDF should be matched
|
||||
# with the "query". Is this correct?
|
||||
writePairwiseAlignments(aliApses)
|
||||
Biostrings::writePairwiseAlignments(aliApses)
|
||||
|
||||
# If this is correct, you can extract the matched sequence from
|
||||
# the alignment object. The syntax is a bit different from what
|
||||
|
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-ALI-Similarity unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 20
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 Refactored for 2017; add aaindex, ternary plot.
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
@ -28,10 +30,10 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------
|
||||
#TOC> 1 Amino Acid Properties 43
|
||||
#TOC> 2 Mutation Data matrix 163
|
||||
#TOC> 3 Background score 205
|
||||
#TOC> ----------------------------------------------
|
||||
#TOC> 1 Amino Acid Properties 41
|
||||
#TOC> 2 Mutation Data matrix 158
|
||||
#TOC> 3 Background score 199
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -41,9 +43,8 @@
|
||||
# A large collection of amino acid property tables is available via the seqinr
|
||||
# package:
|
||||
|
||||
if (!require(seqinr)) {
|
||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
library(seqinr)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = seqinr) # basic information
|
||||
@ -127,9 +128,8 @@ text(Y$I, K$I, names(Y$I))
|
||||
# plots are in general unintuitive and hard to interpret. One alternative is a
|
||||
# so-called "ternary plot":
|
||||
|
||||
if (!require(ggtern)) {
|
||||
if (! requireNamespace("ggtern", quietly=TRUE)) {
|
||||
install.packages("ggtern")
|
||||
library(ggtern)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = ggtern) # basic information
|
||||
@ -145,12 +145,11 @@ myDat <- data.frame("phi" = 0.9*(((Y$I-min(Y$I))/(max(Y$I)-min(Y$I))))+0.05,
|
||||
stringsAsFactors = FALSE)
|
||||
rownames(myDat) <- names(Y$I)
|
||||
|
||||
ggtern(data = myDat,
|
||||
aes(x = vol,
|
||||
ggtern::ggtern(data = myDat,
|
||||
ggplot2::aes(x = vol,
|
||||
y = phi,
|
||||
z = pK,
|
||||
label = rownames(myDat))) +
|
||||
geom_text()
|
||||
label = rownames(myDat))) + ggplot2::geom_text()
|
||||
|
||||
# This results in a mapping of amino acids relative to each other that is
|
||||
# similar to the Venn diagram you have seen in the notes.
|
||||
@ -162,12 +161,11 @@ ggtern(data = myDat,
|
||||
|
||||
# The Biostrings package contains the most common mutation data matrices.
|
||||
|
||||
if (!require(Biostrings, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
if (! requireNamespace("BiocManager", quietly=TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
biocLite("Biostrings")
|
||||
library(Biostrings)
|
||||
if (! requireNamespace("Biostrings", quietly=TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help=Biostrings) # basic information
|
||||
@ -200,7 +198,9 @@ BLOSUM62["W", "R"]
|
||||
|
||||
# = 3 Background score ====================================================
|
||||
|
||||
# The mutation data matrix is designed to give high scores to homologous sequences, low scores to non-homologous sequences. What score on average should we expect for a random sequence?
|
||||
# The mutation data matrix is designed to give high scores to homologous
|
||||
# sequences, low scores to non-homologous sequences. What score on average
|
||||
# should we expect for a random sequence?
|
||||
|
||||
# If we sample amino acid pairs at random, we will get a score that is the
|
||||
# average of the individual pairscores in the matrix. Omitting the ambiguity
|
||||
@ -219,12 +219,12 @@ sum(BLOSUM62[1:20, 1:20])/400
|
||||
# PDB ID 3FG7 - a villin headpiece structure with a large amount of
|
||||
# low-complexity amino acid sequence ...
|
||||
|
||||
aa3FG7 <- readAAStringSet("./data/3FG7.fa")[[1]]
|
||||
aa3FG7 <- Biostrings::readAAStringSet("./data/3FG7.fa")[[1]]
|
||||
|
||||
# ... and the FASTA file for the E. coli OmpG outer membrane porin (PDB: 2F1C)
|
||||
# with an exceptionally high percentage of hydrophobic residues.
|
||||
|
||||
aa2F1C <- readAAStringSet("./data/2F1C.fa")[[1]]
|
||||
aa2F1C <- Biostrings::readAAStringSet("./data/2F1C.fa")[[1]]
|
||||
|
||||
# Here is a function that takes two sequences and
|
||||
# returns their average pairscore.
|
||||
|
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-Data_integration unit.
|
||||
#
|
||||
# Version: 1.0.1
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2018 10 30
|
||||
# Date: 2018 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0.1 Bugfix: UniProt ID Mapping service API change
|
||||
# 1.0 First live version
|
||||
#
|
||||
@ -31,8 +33,8 @@
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------
|
||||
#TOC> 1 Identifier mapping 40
|
||||
#TOC> 2 Cross-referencing tables 164
|
||||
#TOC> 1 Identifier mapping 42
|
||||
#TOC> 2 Cross-referencing tables 165
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -54,9 +56,8 @@
|
||||
|
||||
# To begin, we load httr, which supports sending and receiving data via the
|
||||
# http protocol, just like a Web browser.
|
||||
if (!require(httr, quietly=TRUE)) {
|
||||
install.packages("httr")
|
||||
library(httr)
|
||||
if (! requireNamespace("httpr", quietly=TRUE)) {
|
||||
install.packages("httpr")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
@ -75,22 +76,22 @@ myQueryIDs <- "NP_010227 NP_00000 NP_011036"
|
||||
# of the request. GET() and POST() are functions from httr.
|
||||
|
||||
URL <- "https://www.uniprot.org/mapping/"
|
||||
response <- POST(URL,
|
||||
response <- httr::POST(URL,
|
||||
body = list(from = "P_REFSEQ_AC", # Refseq Protein
|
||||
to = "ACC", # UniProt ID
|
||||
format = "tab",
|
||||
query = myQueryIDs))
|
||||
|
||||
cat(content(response))
|
||||
cat(httr::content(response))
|
||||
|
||||
# We need to check the status code - if it is not 200, an error ocurred and we
|
||||
# can't process the result:
|
||||
status_code(response)
|
||||
httr::status_code(response)
|
||||
|
||||
# If the query is successful, tabbed text is returned. We can assign that to a
|
||||
# data frame. Note that we use textConnection() to read data directly from a char object, which can go in the spot where read.delim() expects a file-name argument.
|
||||
|
||||
myMappedIDs <- read.delim(file = textConnection(content(response)),
|
||||
myMappedIDs <- read.delim(file = textConnection(httr::content(response)),
|
||||
sep = "\t",
|
||||
stringsAsFactors = FALSE)
|
||||
myMappedIDs
|
||||
@ -132,14 +133,14 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
||||
# for IDs that are not mapped.
|
||||
|
||||
URL <- "https://www.uniprot.org/uploadlists/"
|
||||
response <- POST(URL,
|
||||
response <- httr::POST(URL,
|
||||
body = list(from = mapFrom,
|
||||
to = mapTo,
|
||||
format = "tab",
|
||||
query = s))
|
||||
|
||||
if (status_code(response) == 200) { # 200: oK
|
||||
myMap <- read.delim(file = textConnection(content(response)),
|
||||
if (httr::status_code(response) == 200) { # 200: oK
|
||||
myMap <- read.delim(file = textConnection(httr::content(response)),
|
||||
sep = "\t",
|
||||
stringsAsFactors = FALSE)
|
||||
myMap <- myMap[ , c(1,3)]
|
||||
@ -148,7 +149,7 @@ myIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
||||
myMap <- data.frame()
|
||||
warning(paste("No uniProt ID mapping returned:",
|
||||
"server sent status",
|
||||
status_code(response)))
|
||||
httr::status_code(response)))
|
||||
}
|
||||
|
||||
return(myMap)
|
||||
@ -168,7 +169,8 @@ myIDmap("NP_010227 NP_011036 NP_012881 NP_013729 NP_012165")
|
||||
# Nomenclature commission. How do we map one set of identifiers to another one?
|
||||
|
||||
# The function to use is match().
|
||||
# Here is a tiny set of identifiers taken from a much larger table to illustrate the principle:
|
||||
# Here is a tiny set of identifiers taken from a much larger table to
|
||||
# illustrate the principle:
|
||||
#
|
||||
|
||||
myIDs <- data.frame(uID = c("P38903", "P31383", "P47177", "P47096", "Q07747",
|
||||
|
@ -3,12 +3,15 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-FUNC_Semantic_similarity unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 11 12
|
||||
# Date: 2017 11 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.0 New code.
|
||||
#
|
||||
#
|
||||
@ -27,59 +30,65 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------------
|
||||
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 39
|
||||
#TOC> 2 Fetch GO Annotations 89
|
||||
#TOC> 3 Semantic Similarities 98
|
||||
#TOC> 4 GO Term Enrichment in Gene Sets 116
|
||||
#TOC> --------------------------------------------------------------------
|
||||
#TOC> 1 Preparations: Packages, AnnotationDB, Setup 42
|
||||
#TOC> 2 Fetch GO Annotations 98
|
||||
#TOC> 3 Semantic Similarities 107
|
||||
#TOC> 4 GO Term Enrichment in Gene Sets 125
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparations: Packages, AnnotationDB, Setup =========================
|
||||
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
|
||||
# GOSim is an R-package in the Bioconductor project.
|
||||
if (! require(GOSim, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
}
|
||||
biocLite("GOSim")
|
||||
library(GOSim)
|
||||
if (! requireNamespace("GOSim", quietly = TRUE)) {
|
||||
BiocManager::install("GOSim")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = GOSim) # basic information
|
||||
# browseVignettes("GOSim") # available vignettes
|
||||
# data(package = "GOSim") # available datasets
|
||||
|
||||
# GOSim makes extensive assumptions about loaded packages, and many base
|
||||
# methods are masked. We will thus use library(GOSim) to load it
|
||||
# in its entirety and with all packages it depends on. We will still use
|
||||
# the <package>::<function>() syntax in the code below, but this now serves
|
||||
# more of a didactic purpose, rather than actual syntax requirements.
|
||||
|
||||
library(GOSim)
|
||||
|
||||
# GOSim loads human annotations by default. We load yeast annotations instead...
|
||||
if (!require(org.Sc.sgd.db, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
if (! requireNamespace("org.Sc.sgd.db", quietly = TRUE)) {
|
||||
BiocManager::install("org.Sc.sgd.db")
|
||||
}
|
||||
biocLite("org.Sc.sgd.db")
|
||||
|
||||
# Bioconductor annotation packages won't work stably unless we actually load
|
||||
# them:
|
||||
library(org.Sc.sgd.db)
|
||||
}
|
||||
|
||||
# org.Sc.sgd.db is a Bioconductor annotation database curated by SGD. Such
|
||||
# databases exist for all model organisms. It's a kind of a fancy data frame
|
||||
# from which we can get annotations by rows (genes) with the keys() funtion ...
|
||||
keys(org.Sc.sgd.db)[1500:1510]
|
||||
AnnotationDbi::keys(org.Sc.sgd.db)[1500:1510]
|
||||
|
||||
# ... and the types of available annotations with the columns() function
|
||||
columns(org.Sc.sgd.db)
|
||||
AnnotationDbi::columns(org.Sc.sgd.db)
|
||||
|
||||
# Note that one of the columns is "GO" ... and we load that into the
|
||||
# datastructures used by GOSim:
|
||||
|
||||
# Choose GOterms to use
|
||||
setEvidenceLevel(evidences="all",
|
||||
GOSim::setEvidenceLevel(evidences = "all",
|
||||
organism = org.Sc.sgdORGANISM,
|
||||
gomap = org.Sc.sgdGO)
|
||||
|
||||
# Use Biological Process ontology
|
||||
setOntology("BP", loadIC=FALSE)
|
||||
GOSim::setOntology("BP", loadIC = FALSE)
|
||||
|
||||
# confirm that we loaded the correct ontology
|
||||
head(get("gomap", envir = GOSimEnv))
|
||||
@ -92,7 +101,7 @@ head(get("gomap", envir=GOSimEnv))
|
||||
# All keys being used here are yeast systematic names.
|
||||
|
||||
# Get one set of annotations
|
||||
getGOInfo(c("YDL056W")) # Mbp1
|
||||
GOSim::getGOInfo(c("YDL056W")) # Mbp1
|
||||
|
||||
|
||||
# = 3 Semantic Similarities ===============================================
|
||||
@ -105,30 +114,30 @@ getGOInfo(c("YDL056W")) # Mbp1
|
||||
# in this package.
|
||||
|
||||
# Mbp1 and...
|
||||
getGeneSim("YDL056W", "YLR182W", similarity = "OA") # Swi6 - MCB complex
|
||||
getGeneSim("YDL056W", "YER111C", similarity = "OA") # Swi4 - collaborators
|
||||
getGeneSim("YDL056W", "YBR160W", similarity = "OA") # Cdc28 - mediator
|
||||
getGeneSim("YDL056W", "YGR108W", similarity = "OA") # Clb1 - antagonist
|
||||
getGeneSim("YDL056W", "YLR079W", similarity = "OA") # Sic1 - antagonist
|
||||
getGeneSim("YDL056W", "YJL130C", similarity = "OA") # Pgk1 - Gluconeogenesis
|
||||
GOSim::getGeneSim("YDL056W","YLR182W",similarity = "OA") # Swi6 - MCB complex
|
||||
GOSim::getGeneSim("YDL056W","YER111C",similarity = "OA") # Swi4 - collaborators
|
||||
GOSim::getGeneSim("YDL056W","YBR160W",similarity = "OA") # Cdc28 - mediator
|
||||
GOSim::getGeneSim("YDL056W","YGR108W",similarity = "OA") # Clb1 - antagonist
|
||||
GOSim::getGeneSim("YDL056W","YLR079W",similarity = "OA") # Sic1 - antagonist
|
||||
GOSim::getGeneSim("YDL056W","YJL130C",similarity = "OA") # Pgk1 - Gluconeogenesis
|
||||
|
||||
|
||||
# = 4 GO Term Enrichment in Gene Sets =====================================
|
||||
|
||||
|
||||
# Calculating GO term enrichment in gene sets is done with the topGO package.
|
||||
if (! require(topGO, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
}
|
||||
biocLite("topGO")
|
||||
library(topGO)
|
||||
# Calculating GO term enrichment in gene sets is done with the Bioconductor
|
||||
# topGO package.
|
||||
if (! requireNamespace("topGO", quietly = TRUE)) {
|
||||
BiocManager::install("topGO")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = topGO) # basic information
|
||||
# browseVignettes("topGO") # available vignettes
|
||||
# data(package = "topGO") # available datasets
|
||||
|
||||
# Once again - assumptions are made by GOsim that require us to load the
|
||||
# topGO package wholesale:
|
||||
library(topGO)
|
||||
|
||||
# Let's define a gene set: GOterm enrichment for G1/S switch activators:
|
||||
mySet <- c("YFR028C", # Cdc14
|
||||
@ -141,7 +150,7 @@ mySet <- c("YFR028C", # Cdc14
|
||||
"YPL256C", # Cln2
|
||||
"YAL040C") # Cln3
|
||||
|
||||
allGenes <- keys(org.Sc.sgd.db)
|
||||
allGenes <- AnnotationDbi::keys(org.Sc.sgd.db)
|
||||
allGenes <- allGenes[grep("^Y", allGenes)] # This is the context against which
|
||||
# we define enrichment
|
||||
|
||||
@ -164,7 +173,7 @@ setdiff(fullSet, mySet) # These are annotated to that term but not in mySet.
|
||||
|
||||
# What are these genes?
|
||||
# Select annotations from the annotation database:
|
||||
select(org.Sc.sgd.db,
|
||||
AnnotationDbi::select(org.Sc.sgd.db,
|
||||
keys = setdiff(fullSet, mySet),
|
||||
columns = c("COMMON", "DESCRIPTION"))
|
||||
|
||||
|
@ -3,12 +3,15 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PHYLO-Data_preparation unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 31
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.0 First 2017 version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
@ -28,12 +31,12 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------
|
||||
#TOC> 1 Preparations 41
|
||||
#TOC> 2 Fetching sequences 78
|
||||
#TOC> 3 Multiple Sequence Alignment 119
|
||||
#TOC> 4 Reviewing and Editing Alignments 138
|
||||
#TOC> 4.1 Masking workflow 154
|
||||
#TOC> ---------------------------------------------------------
|
||||
#TOC> 1 Preparations 44
|
||||
#TOC> 2 Fetching sequences 76
|
||||
#TOC> 3 Multiple Sequence Alignment 117
|
||||
#TOC> 4 Reviewing and Editing Alignments 136
|
||||
#TOC> 4.1 Masking workflow 152
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -49,12 +52,11 @@ source("makeProteinDB.R")
|
||||
|
||||
# Load packages we need
|
||||
|
||||
if (! require(Biostrings, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
biocLite("Biostrings")
|
||||
library(Biostrings)
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
@ -62,12 +64,8 @@ if (! require(Biostrings, quietly=TRUE)) {
|
||||
# data(package = "Biostrings") # available datasets
|
||||
|
||||
|
||||
if (! require(msa, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
}
|
||||
biocLite("msa")
|
||||
library(msa)
|
||||
if (! requireNamespace("msa", quietly = TRUE)) {
|
||||
BiocManager::install("msa")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = msa) # basic information
|
||||
@ -123,8 +121,8 @@ tail(APSI)
|
||||
# the MSA algorithms in Biostrings.
|
||||
#
|
||||
|
||||
APSESSet <- AAStringSet(APSI)
|
||||
APSESMsa <- msaMuscle(APSESSet, order = "aligned")
|
||||
APSESSet <- Biostrings::AAStringSet(APSI)
|
||||
APSESMsa <- msa::msaMuscle(APSESSet, order = "aligned")
|
||||
|
||||
# Nb. msaMuscle() sometimes fails - reproducibly, but I am not sure why. If
|
||||
# that happens in your case, just use msaClustalOmega() instead.
|
||||
|
@ -3,12 +3,15 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PHYLO-Tree_analysis unit.
|
||||
#
|
||||
# Version: 1.0.2
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 31
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.0.2 Typo in variable name, style changes
|
||||
# 1.0.1 Wrong section heading
|
||||
# 1.0 First 2017 version
|
||||
@ -31,11 +34,11 @@
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------
|
||||
#TOC> 1 Preparation and Tree Plot 43
|
||||
#TOC> 2 Tree Analysis 82
|
||||
#TOC> 2.1 Rooting Trees 141
|
||||
#TOC> 2.2 Rotating Clades 187
|
||||
#TOC> 2.3 Computing tree distances 234
|
||||
#TOC> 1 Preparation and Tree Plot 46
|
||||
#TOC> 2 Tree Analysis 86
|
||||
#TOC> 2.1 Rooting Trees 145
|
||||
#TOC> 2.2 Rotating Clades 190
|
||||
#TOC> 2.3 Computing tree distances 241
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -43,19 +46,17 @@
|
||||
# = 1 Preparation and Tree Plot ===========================================
|
||||
|
||||
|
||||
if (!require(Rphylip, quietly=TRUE)) {
|
||||
install.packages("Rphylip")
|
||||
library(Rphylip)
|
||||
if (! requireNamespace("ape", quietly = TRUE)) {
|
||||
install.packages("ape")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Rphylip) # basic information
|
||||
# browseVignettes("Rphylip") # available vignettes
|
||||
# data(package = "Rphylip") # available datasets
|
||||
|
||||
# library(help = ape) # basic information
|
||||
# browseVignettes("ape") # available vignettes
|
||||
# data(package = "ape") # available datasets
|
||||
|
||||
|
||||
# Read the species tree that you have created at the phyloT Website:
|
||||
fungiTree <- read.tree("fungiTree.txt")
|
||||
fungiTree <- ape::read.tree("fungiTree.txt")
|
||||
|
||||
plot(fungiTree)
|
||||
|
||||
@ -73,7 +74,10 @@ for (i in seq_along(fungiTree$tip.label)) {
|
||||
|
||||
# Plot the tree
|
||||
plot(fungiTree, cex = 1.0, root.edge = TRUE, no.margin = TRUE)
|
||||
nodelabels(text = fungiTree$node.label, cex = 0.6, adj = 0.2, bg = "#D4F2DA")
|
||||
ape::nodelabels(text = fungiTree$node.label,
|
||||
cex = 0.6,
|
||||
adj = 0.2,
|
||||
bg = "#D4F2DA")
|
||||
# Note that you can use the arrow buttons in the menu above the plot to scroll
|
||||
# back to plots you have created earlier - so you can reference back to the
|
||||
# species tree.
|
||||
@ -91,10 +95,10 @@ nodelabels(text = fungiTree$node.label, cex = 0.6, adj = 0.2, bg = "#D4F2DA")
|
||||
# trees in Newick format and visualize them elsewhere.
|
||||
|
||||
# The "phylo" class object is one of R's "S3" objects and methods to plot and
|
||||
# print it have been defined with the Rphylip package, and the package ape that
|
||||
# Rphylip has loaded. You can simply call plot(<your-tree>) and R knows what to
|
||||
# do with <your-tree> and how to plot it. The underlying function is
|
||||
# plot.phylo(), and documentation for its many options can by found by typing:
|
||||
# print it have been defined with the Rphylip package, and in ape. You can
|
||||
# simply call plot(<your-tree>) and R knows what to do with <your-tree> and how
|
||||
# to plot it. The underlying function is plot.phylo(), and documentation for its
|
||||
# many options can by found by typing:
|
||||
|
||||
?plot.phylo
|
||||
|
||||
@ -125,40 +129,39 @@ apsTree$edge.length
|
||||
|
||||
# show the node / edge and tip labels on a plot
|
||||
plot(apsTree)
|
||||
nodelabels()
|
||||
edgelabels()
|
||||
tiplabels()
|
||||
ape::nodelabels()
|
||||
ape::edgelabels()
|
||||
ape::tiplabels()
|
||||
|
||||
# show the number of nodes, edges and tips
|
||||
Nnode(apsTree)
|
||||
Nedge(apsTree)
|
||||
Ntip(apsTree)
|
||||
ape::Nnode(apsTree)
|
||||
ape::Nedge(apsTree)
|
||||
ape::Ntip(apsTree)
|
||||
|
||||
|
||||
# Finally, write the tree to console in Newick format
|
||||
write.tree(apsTree)
|
||||
ape::write.tree(apsTree)
|
||||
|
||||
# == 2.1 Rooting Trees =====================================================
|
||||
|
||||
# In order to analyse the tree, it is helpful to root it first and reorder its
|
||||
# clades. Contrary to documentation, Rproml() returns an unrooted tree.
|
||||
|
||||
is.rooted(apsTree)
|
||||
ape::is.rooted(apsTree)
|
||||
|
||||
# You can root the tree with the command root() from the "ape" package. ape is
|
||||
# automatically installed and loaded with Rphylip.
|
||||
# You can root the tree with the command root() from the "ape" package.
|
||||
|
||||
plot(apsTree)
|
||||
|
||||
# add labels for internal nodes and tips
|
||||
nodelabels(cex = 0.5, frame = "circle")
|
||||
tiplabels(cex = 0.5, frame = "rect")
|
||||
ape::nodelabels(cex = 0.5, frame = "circle")
|
||||
ape::tiplabels(cex = 0.5, frame = "rect")
|
||||
|
||||
# The outgroup of the tree is tip "11" in my sample tree, it may be a different
|
||||
# number in yours. Substitute the correct node number below for "outgroup".
|
||||
apsTree <- root(apsTree, outgroup = 11, resolve.root = TRUE)
|
||||
apsTree <- ape::root(apsTree, outgroup = 11, resolve.root = TRUE)
|
||||
plot(apsTree)
|
||||
is.rooted(apsTree)
|
||||
ape::is.rooted(apsTree)
|
||||
|
||||
# This tree _looks_ unchanged, beacuse when the root trifurcation was resolved,
|
||||
# an edge of length zero was added to connect the MRCA (Most Recent Common
|
||||
@ -172,7 +175,7 @@ apsTree$edge.length
|
||||
# overlap.
|
||||
apsTree$edge.length[1] <- 0.1
|
||||
plot(apsTree, cex = 0.7)
|
||||
nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
|
||||
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
|
||||
|
||||
|
||||
# This procedure does however not assign an actual length to a root edge, and
|
||||
@ -181,7 +184,7 @@ nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.1, bg = "#ff8866")
|
||||
|
||||
apsTree$root.edge <- mean(apsTree$edge.length) * 1.5
|
||||
plot(apsTree, cex = 0.7, root.edge = TRUE)
|
||||
nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
|
||||
ape::nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
|
||||
|
||||
|
||||
# == 2.2 Rotating Clades ===================================================
|
||||
@ -192,9 +195,9 @@ nodelabels(text = "MRCA", node = 12, cex = 0.5, adj = 0.8, bg = "#ff8866")
|
||||
# We can either rotate around individual internal nodes ...
|
||||
layout(matrix(1:2, 1, 2))
|
||||
plot(apsTree, no.margin = TRUE, root.edge = TRUE)
|
||||
nodelabels(node = 17, cex = 0.7, bg = "#ff8866")
|
||||
plot(rotate(apsTree, node = 17), no.margin = TRUE, root.edge = TRUE)
|
||||
nodelabels(node = 17, cex = 0.7, bg = "#88ff66")
|
||||
ape::nodelabels(node = 13, cex = 0.7, bg = "#ff8866")
|
||||
plot(ape::rotate(apsTree, node = 13), no.margin = TRUE, root.edge = TRUE)
|
||||
ape::nodelabels(node = 13, cex = 0.7, bg = "#88ff66")
|
||||
# Note that the species at the bottom of the clade descending from node
|
||||
# 17 is now plotted at the top.
|
||||
layout(matrix(1), widths = 1.0, heights = 1.0)
|
||||
@ -211,11 +214,15 @@ nOrg <- length(apsTree$tip.label)
|
||||
layout(matrix(1:2, 1, 2))
|
||||
plot(fungiTree,
|
||||
no.margin = TRUE, root.edge = TRUE)
|
||||
nodelabels(text = fungiTree$node.label, cex = 0.5, adj = 0.2, bg = "#D4F2DA")
|
||||
ape::nodelabels(text = fungiTree$node.label,
|
||||
cex = 0.5,
|
||||
adj = 0.2,
|
||||
bg = "#D4F2DA")
|
||||
|
||||
plot(rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
|
||||
no.margin = TRUE, root.edge = TRUE)
|
||||
add.scale.bar(length = 0.5)
|
||||
plot(ape::rotateConstr(apsTree, apsTree$tip.label[nOrg:1]),
|
||||
no.margin = TRUE,
|
||||
root.edge = TRUE)
|
||||
ape::add.scale.bar(length = 0.5)
|
||||
layout(matrix(1), widths = 1.0, heights = 1.0)
|
||||
|
||||
# Task: Study the two trees and consider their similarities and differences.
|
||||
@ -236,9 +243,8 @@ layout(matrix(1), widths = 1.0, heights = 1.0)
|
||||
|
||||
# Many superb phylogeny tools are contributed by the phangorn package.
|
||||
|
||||
if (!require(phangorn, quietly=TRUE)) {
|
||||
if (! requireNamespace("phangorn", quietly = TRUE)) {
|
||||
install.packages("phangorn")
|
||||
library(phangorn)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = phangorn) # basic information
|
||||
@ -253,14 +259,14 @@ apsTree2$tip.label <- gsub("(MBP1_)|(KILA_)", "", apsTree2$tip.label)
|
||||
# phangorn provides several functions to compute tree-differences (and there
|
||||
# is a _whole_ lot of theory on how to compare trees). treedist() returns the
|
||||
# "symmetric difference"
|
||||
treedist(fungiTree, apsTree2, check.labels = TRUE)
|
||||
phangorn::treedist(fungiTree, apsTree2, check.labels = TRUE)
|
||||
|
||||
# Numbers. What do they mean? How much more similar is our apsTree to the
|
||||
# (presumably) ground truth of fungiTree than a random tree would be?
|
||||
# The ape package (which was loaded with RPhylip) provides the function rtree()
|
||||
# The ape package provides the function rtree()
|
||||
# to compute random trees.
|
||||
|
||||
rtree(n = length(apsTree2$tip.label), # number of tips
|
||||
ape::rtree(n = length(apsTree2$tip.label), # number of tips
|
||||
rooted = TRUE, # we rooted the tree above,
|
||||
# and fungiTree is rooted anyway
|
||||
tip.label = apsTree2$tip.label, # use the apsTree2 labels
|
||||
@ -278,17 +284,17 @@ colnames(myTreeDistances) <- c("symm", "path")
|
||||
|
||||
set.seed(112358)
|
||||
for (i in 1:N) {
|
||||
xTree <- rtree(n = length(apsTree2$tip.label),
|
||||
xTree <- ape::rtree(n = length(apsTree2$tip.label),
|
||||
rooted = TRUE,
|
||||
tip.label = apsTree2$tip.label,
|
||||
br = NULL)
|
||||
myTreeDistances[i, ] <- treedist(fungiTree, xTree)
|
||||
myTreeDistances[i, ] <- phangorn::treedist(fungiTree, xTree)
|
||||
}
|
||||
set.seed(NULL) # reset the random number generator
|
||||
|
||||
table(myTreeDistances[, "symm"])
|
||||
|
||||
(symmObs <- treedist(fungiTree, apsTree2)[1])
|
||||
(symmObs <- phangorn::treedist(fungiTree, apsTree2)[1])
|
||||
|
||||
# Random events less-or-equal to observation, divided by total number of
|
||||
# events gives us the empirical p-value.
|
||||
@ -298,7 +304,7 @@ cat(sprintf("\nEmpirical p-value for symmetric diff. of observed tree is %1.4f\n
|
||||
hist(myTreeDistances[, "path"],
|
||||
col = "aliceblue",
|
||||
main = "Distances of random Trees to fungiTree")
|
||||
(pathObs <- treedist(fungiTree, apsTree2)[2])
|
||||
(pathObs <- phangorn::treedist(fungiTree, apsTree2)[2])
|
||||
abline(v = pathObs, col = "chartreuse")
|
||||
|
||||
# Random events less-or-equal to observation, divided by total number of
|
||||
|
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PHYLO-Tree_building unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10. 31
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# 1.0 First 2017 version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
@ -29,14 +31,14 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------------
|
||||
#TOC> 1 Calculating Trees 43
|
||||
#TOC> 1.1 PROMLPATH ... 64
|
||||
#TOC> 1.1.1 ... on the Mac 69
|
||||
#TOC> 1.1.2 ... on Windows 80
|
||||
#TOC> 1.1.3 ... on Linux 94
|
||||
#TOC> 1.1.4 Confirming PROMLPATH 99
|
||||
#TOC> 1.2 Building a maximum likelihood tree 108
|
||||
#TOC> -----------------------------------------------------------
|
||||
#TOC> 1 Calculating Trees 46
|
||||
#TOC> 1.1 PROMLPATH ... 66
|
||||
#TOC> 1.1.1 ... on the Mac 71
|
||||
#TOC> 1.1.2 ... on Windows 82
|
||||
#TOC> 1.1.3 ... on Linux 96
|
||||
#TOC> 1.1.4 Confirming PROMLPATH 101
|
||||
#TOC> 1.2 Building a maximum likelihood tree 110
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -50,9 +52,8 @@
|
||||
# After you have installed Phylip on your computer, install the R package that
|
||||
# provides an interface to the Phylip functions.
|
||||
|
||||
if (!require(Rphylip, quietly=TRUE)) {
|
||||
if (! requireNamespace("Rphylip", quietly = TRUE)) {
|
||||
install.packages("Rphylip")
|
||||
library(Rphylip)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Rphylip) # basic information
|
||||
@ -110,7 +111,7 @@ list.files(PROMLPATH) # lists the files [1] "proml" "proml.command"
|
||||
# Now read the mfa file you have saved in the BIB-PHYLO-Data_preparation unit,
|
||||
# as a "proseq" object with the read.protein() function of the RPhylip package:
|
||||
|
||||
apsIn <- read.protein("APSESphyloSet.mfa")
|
||||
apsIn <- Rphylip::read.protein("APSESphyloSet.mfa")
|
||||
|
||||
# ... and you are ready to build a tree.
|
||||
|
||||
@ -125,7 +126,7 @@ apsIn <- read.protein("APSESphyloSet.mfa")
|
||||
# process will take us about 5 to 10 minutes. Run this, and anjoy a good cup
|
||||
# of coffee while you are waiting.
|
||||
|
||||
apsTree <- Rproml(apsIn, path=PROMLPATH)
|
||||
apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
|
||||
|
||||
# A quick first look:
|
||||
|
||||
|
@ -3,12 +3,15 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-PPI-Analysis unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 08 - 2017 11
|
||||
# Date: 2017 08 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.0 First live version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
@ -29,13 +32,13 @@
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 Setup and data 43
|
||||
#TOC> 2 Functional Edges in the Human Proteome 80
|
||||
#TOC> 2.1 Cliques 123
|
||||
#TOC> 2.2 Communities 164
|
||||
#TOC> 2.3 Betweenness Centrality 178
|
||||
#TOC> 3 biomaRt 224
|
||||
#TOC> 4 Task for submission 295
|
||||
#TOC> 1 Setup and data 46
|
||||
#TOC> 2 Functional Edges in the Human Proteome 82
|
||||
#TOC> 2.1 Cliques 125
|
||||
#TOC> 2.2 Communities 166
|
||||
#TOC> 2.3 Betweenness Centrality 180
|
||||
#TOC> 3 biomaRt 226
|
||||
#TOC> 4 Task for submission 296
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -45,9 +48,8 @@
|
||||
|
||||
# Not surprisingly, the analysis of PPI networks needs iGraph:
|
||||
|
||||
if (!require(igraph, quietly=TRUE)) {
|
||||
if (! requireNamespace("igraph", quietly = TRUE)) {
|
||||
install.packages("igraph")
|
||||
library(igraph)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = igraph) # basic information
|
||||
@ -88,9 +90,9 @@ head(STRINGedges)
|
||||
|
||||
|
||||
# Make a graph from this dataframe
|
||||
?graph_from_data_frame
|
||||
?igraph::graph_from_data_frame
|
||||
|
||||
gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE)
|
||||
gSTR <- igraph::graph_from_data_frame(STRINGedges, directed = FALSE)
|
||||
|
||||
# CAUTION you DON'T want to plot a graph with 6,500 nodes and 50,000 edges -
|
||||
# layout of such large graphs is possible, but requires specialized code. Google
|
||||
@ -99,13 +101,13 @@ gSTR <- graph_from_data_frame(STRINGedges, directed = FALSE)
|
||||
|
||||
# Of course simple computations on this graph are reasonably fast:
|
||||
|
||||
compSTR <- components(gSTR)
|
||||
compSTR <- igraph::components(gSTR)
|
||||
summary(compSTR) # our graph is fully connected!
|
||||
|
||||
hist(log(degree(gSTR)), col="#FEE0AF")
|
||||
hist(log(igraph::degree(gSTR)), col="#FEE0AF")
|
||||
# this actually does look rather scale-free
|
||||
|
||||
(freqRank <- table(degree(gSTR)))
|
||||
(freqRank <- table(igraph::degree(gSTR)))
|
||||
plot(log10(as.numeric(names(freqRank)) + 1),
|
||||
log10(as.numeric(freqRank)), type = "b",
|
||||
pch = 21, bg = "#FEE0AF",
|
||||
@ -126,29 +128,29 @@ abline(regressionLine, col = "firebrick")
|
||||
# subgraph, i.e. a subgraph in which every node is connected to every other.
|
||||
# Biological complexes often appear as cliques in interaction graphs.
|
||||
|
||||
clique_num(gSTR)
|
||||
igraph::clique_num(gSTR)
|
||||
# The largest clique has 63 members.
|
||||
|
||||
(C <- largest_cliques(gSTR)[[1]])
|
||||
(C <- igraph::largest_cliques(gSTR)[[1]])
|
||||
|
||||
# Pick one of the proteins and find out what this fully connected cluster of 63
|
||||
# proteins is (you can simply Google for any of the IDs). Is this expected?
|
||||
|
||||
# Plot this ...
|
||||
R <- induced_subgraph(gSTR, C) # makes a graph from a selected set of vertices
|
||||
R <- igraph::induced_subgraph(gSTR, C) # a graph from a selected set of vertices
|
||||
|
||||
# color the vertices along a color spectrum
|
||||
vCol <- rainbow(gorder(R)) # gorder(): order of a graph = number of nodes
|
||||
vCol <- rainbow(igraph::gorder(R)) # "order" of a graph == number of nodes
|
||||
|
||||
# color the edges to have the same color as the originating node
|
||||
eCol <- character()
|
||||
for (i in seq_along(vCol)) {
|
||||
eCol <- c(eCol, rep(vCol[i], gorder(R)))
|
||||
eCol <- c(eCol, rep(vCol[i], igraph::gorder(R)))
|
||||
}
|
||||
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||
plot(R,
|
||||
layout = layout_in_circle(R),
|
||||
layout = igraph::layout_in_circle(R),
|
||||
vertex.size = 3,
|
||||
vertex.color = vCol,
|
||||
edge.color = eCol,
|
||||
@ -164,14 +166,14 @@ par(oPar)
|
||||
# == 2.2 Communities =======================================================
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
gSTRclusters <- cluster_infomap(gSTR)
|
||||
gSTRclusters <- igraph::cluster_infomap(gSTR)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
modularity(gSTRclusters) # ... measures how separated the different membership
|
||||
# types are from each other
|
||||
tMem <- table(membership(gSTRclusters))
|
||||
igraph::modularity(gSTRclusters) # ... measures how separated the different
|
||||
# membership types are from each other
|
||||
tMem <- table(igraph::membership(gSTRclusters))
|
||||
length(tMem) # More than 2000 communities identified
|
||||
hist(tMem, breaks = 50) # most clusters are small ...
|
||||
hist(tMem, breaks = 50, col = "skyblue") # most clusters are small ...
|
||||
range(tMem) # ... but one has > 100 members
|
||||
|
||||
|
||||
@ -179,14 +181,14 @@ range(tMem) # ... but one has > 100 members
|
||||
|
||||
# Let's find the nodes with the 10 - highest betweenness centralities.
|
||||
#
|
||||
BC <- centr_betw(gSTR)
|
||||
BC <- igraph::centr_betw(gSTR)
|
||||
|
||||
# remember: BC$res contains the results
|
||||
head(BC$res)
|
||||
|
||||
BC$res[1] # betweeness centrality of node 1 in the graph ...
|
||||
# ... which one is node 1?
|
||||
V(gSTR)[1]
|
||||
igraph::V(gSTR)[1]
|
||||
|
||||
# to get the ten-highest nodes, we simply label the elements of BC with their
|
||||
# index ...
|
||||
@ -203,7 +205,7 @@ head(sBC)
|
||||
|
||||
# We can use the first ten labels to subset the nodes in gSTR and fetch the
|
||||
# IDs...
|
||||
(ENSPsel <- names(V(gSTR)[BCsel]))
|
||||
(ENSPsel <- names(igraph::V(gSTR)[BCsel]))
|
||||
|
||||
# We are going to use these IDs to produce some output for a submitted task:
|
||||
# so I need you to personalize ENSPsel with the following
|
||||
@ -231,12 +233,11 @@ set.seed(NULL) # reset the random number generator
|
||||
# day), simply a few lines of sample code to get you started on the specific use
|
||||
# case of retrieving descriptions for ensembl protein IDs.
|
||||
|
||||
if (!require(biomaRt, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
biocLite("biomaRt")
|
||||
library(biomaRt)
|
||||
if (! requireNamespace("biomaRt", quietly = TRUE)) {
|
||||
BiocManager::install("biomaRt")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = biomaRt) # basic information
|
||||
@ -244,14 +245,14 @@ if (!require(biomaRt, quietly=TRUE)) {
|
||||
# data(package = "biomaRt") # available datasets
|
||||
|
||||
# define which dataset to use ...
|
||||
myMart <- useMart("ensembl", dataset="hsapiens_gene_ensembl")
|
||||
myMart <- biomaRt::useMart("ensembl", dataset="hsapiens_gene_ensembl")
|
||||
|
||||
# what filters are defined?
|
||||
(filters <- listFilters(myMart))
|
||||
(filters <- biomaRt::listFilters(myMart))
|
||||
|
||||
|
||||
# and what attributes can we filter for?
|
||||
(attributes <- listAttributes(myMart))
|
||||
(attributes <- biomaRt::listAttributes(myMart))
|
||||
|
||||
|
||||
# Soooo many options - let's look for the correct name of filters that are
|
||||
@ -264,7 +265,7 @@ attributes[grep("description", attributes$description, ignore.case=TRUE), ]
|
||||
|
||||
|
||||
# ... so we can put this together: here is a syntax example:
|
||||
getBM(filters = "ensembl_peptide_id",
|
||||
biomaRt::getBM(filters = "ensembl_peptide_id",
|
||||
attributes = c("hgnc_symbol",
|
||||
"wikigene_description",
|
||||
"interpro_description",
|
||||
@ -279,7 +280,7 @@ CPdefs <- list() # Since we don't know how many matches one of our queries
|
||||
# will return, we'll put the result dataframes into a list.
|
||||
|
||||
for (ID in ENSPsel) {
|
||||
CPdefs[[ID]] <- getBM(filters = "ensembl_peptide_id",
|
||||
CPdefs[[ID]] <- biomaRt::getBM(filters = "ensembl_peptide_id",
|
||||
attributes = c("hgnc_symbol",
|
||||
"wikigene_description",
|
||||
"interpro_description",
|
||||
|
@ -3,13 +3,17 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-SEQA-Comparison unit
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 11 17
|
||||
# Date: 2017 11 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.0 First live version 2017
|
||||
# V 0.1 First code copied from BCH441_A03_makeYFOlist.R
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# Versions:
|
||||
# 1.0 First live version 2017
|
||||
# 0.1 First code copied from BCH441_A03_makeYFOlist.R
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
@ -27,24 +31,23 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 Preparation 41
|
||||
#TOC> 2 Aggregate properties 63
|
||||
#TOC> 3 Sequence Composition Enrichment 106
|
||||
#TOC> 3.1 Barplot, and side-by-side barplot 129
|
||||
#TOC> 3.2 Plotting ratios 164
|
||||
#TOC> 3.3 Plotting log ratios 180
|
||||
#TOC> 3.4 Sort by frequency 195
|
||||
#TOC> 3.5 Color by amino acid type 210
|
||||
#TOC> ----------------------------------------------------------
|
||||
#TOC> 1 Preparation 47
|
||||
#TOC> 2 Aggregate properties 68
|
||||
#TOC> 3 Sequence Composition Enrichment 111
|
||||
#TOC> 3.1 Barplot, and side-by-side barplot 134
|
||||
#TOC> 3.2 Plotting ratios 169
|
||||
#TOC> 3.3 Plotting log ratios 185
|
||||
#TOC> 3.4 Sort by frequency 200
|
||||
#TOC> 3.5 Color by amino acid type 215
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Preparation =========================================================
|
||||
|
||||
if (!require(seqinr, quietly=TRUE)) {
|
||||
if (! requireNamespace("seqinr", quietly = TRUE)) {
|
||||
install.packages("seqinr")
|
||||
library(seqinr)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = seqinr) # basic information
|
||||
@ -66,7 +69,7 @@ if (!require(seqinr, quietly=TRUE)) {
|
||||
|
||||
|
||||
# Let's try a simple function from seqinr: computing the pI of the sequence
|
||||
?computePI
|
||||
?seqinr::computePI
|
||||
|
||||
# This takes as input a vector of upper-case AA codes
|
||||
|
||||
@ -82,12 +85,12 @@ s <- unlist(s) # strsplit() returns a list! Why?
|
||||
# the function s2c() to convert strings into
|
||||
# character vectors (and c2s to convert them back).
|
||||
|
||||
s2c(mySeq)
|
||||
seqinr::s2c(mySeq)
|
||||
|
||||
|
||||
computePI(s2c(mySeq)) # isoelectric point
|
||||
pmw(s2c(mySeq)) # molecular weight
|
||||
AAstat(s2c(mySeq)) # This also plots the distribution of
|
||||
seqinr::computePI(s2c(mySeq)) # isoelectric point
|
||||
seqinr::pmw(s2c(mySeq)) # molecular weight
|
||||
seqinr::AAstat(s2c(mySeq)) # This also plots the distribution of
|
||||
# values along the sequence
|
||||
|
||||
# A true Labor of Love has gone into the
|
||||
@ -117,7 +120,7 @@ aaindex[[459]]$D
|
||||
# with the amino acid counts in our sequence.
|
||||
|
||||
(refData <- aaindex[[459]]$I) # reference frequencies in %
|
||||
names(refData) <- a(names(refData)) # change names to single-letter
|
||||
names(refData) <- seqinr::a(names(refData)) # change names to single-letter
|
||||
# code using seqinr's "a()" function
|
||||
sum(refData)
|
||||
refData # ... in %
|
||||
|
@ -3,12 +3,15 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the BIN-Sequence unit.
|
||||
#
|
||||
# Version: 1.3
|
||||
# Version: 1.4
|
||||
#
|
||||
# Date: 2017 09 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.4 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.3 Update set.seed() usage
|
||||
# 1.2 Removed irrelevant task. How did that even get in there? smh
|
||||
# 1.1 Add chartr()
|
||||
@ -30,23 +33,23 @@
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 Prepare 60
|
||||
#TOC> 2 Storing Sequence 78
|
||||
#TOC> 3 String properties 107
|
||||
#TOC> 4 Substrings 114
|
||||
#TOC> 5 Creating strings: sprintf() 135
|
||||
#TOC> 6 Changing strings 170
|
||||
#TOC> 6.1.1 Changing case 172
|
||||
#TOC> 6.1.2 Reverse 177
|
||||
#TOC> 6.1.3 Change characters 181
|
||||
#TOC> 6.1.4 Substitute characters 209
|
||||
#TOC> 6.2 stringi and stringr 229
|
||||
#TOC> 6.3 dbSanitizeSequence() 239
|
||||
#TOC> 7 Permuting and sampling 251
|
||||
#TOC> 7.1 Permutations 258
|
||||
#TOC> 7.2 Sampling 304
|
||||
#TOC> 7.2.1 Equiprobable characters 306
|
||||
#TOC> 7.2.2 Defined probability vector 348
|
||||
#TOC> 1 Prepare 63
|
||||
#TOC> 2 Storing Sequence 80
|
||||
#TOC> 3 String properties 109
|
||||
#TOC> 4 Substrings 116
|
||||
#TOC> 5 Creating strings: sprintf() 137
|
||||
#TOC> 6 Changing strings 172
|
||||
#TOC> 6.1.1 Changing case 174
|
||||
#TOC> 6.1.2 Reverse 179
|
||||
#TOC> 6.1.3 Change characters 183
|
||||
#TOC> 6.1.4 Substitute characters 211
|
||||
#TOC> 6.2 stringi and stringr 231
|
||||
#TOC> 6.3 dbSanitizeSequence() 241
|
||||
#TOC> 7 Permuting and sampling 253
|
||||
#TOC> 7.1 Permutations 260
|
||||
#TOC> 7.2 Sampling 306
|
||||
#TOC> 7.2.1 Equiprobable characters 308
|
||||
#TOC> 7.2.2 Defined probability vector 350
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -62,12 +65,11 @@
|
||||
# Much basic sequence handling is supported by the Bioconductor package
|
||||
# Biostrings.
|
||||
|
||||
if (! require(Biostrings, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
biocLite("Biostrings")
|
||||
library(Biostrings)
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
@ -86,7 +88,7 @@ if (! require(Biostrings, quietly=TRUE)) {
|
||||
|
||||
# ... or as more complex objects with rich metadata e.g. as a Biostrings
|
||||
# DNAstring, RNAstring, AAString, etc.
|
||||
(a <- AAString("DIVMTQ"))
|
||||
(a <- Biostrings::AAString("DIVMTQ"))
|
||||
|
||||
# ... and all of these representations can be interconverted:
|
||||
|
||||
@ -314,6 +316,7 @@ N <- 100
|
||||
set.seed(16818) # set RNG seed for repeatable randomness
|
||||
v <- sample(nuc, N, replace = TRUE)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
(mySeq <- paste(v, collapse = ""))
|
||||
|
||||
# What's the GC content?
|
||||
@ -323,9 +326,8 @@ sum(table(v)[c("G", "C")]) # 51 is close to expected
|
||||
# What's the number of CpG motifs? Easy to check with the stringi
|
||||
# stri_match_all() function
|
||||
|
||||
if (! require(stringi, quietly=TRUE)) {
|
||||
if (! requireNamespace("stringi", quietly = TRUE)) {
|
||||
install.packages("stringi")
|
||||
library(stringi)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = stringi) # basic information
|
||||
|
@ -29,27 +29,27 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------------------
|
||||
#TOC> 1 A Relational Datamodel in R: review 62
|
||||
#TOC> 1.1 Building a sample database structure 102
|
||||
#TOC> 1.1.1 completing the database 213
|
||||
#TOC> 1.2 Querying the database 248
|
||||
#TOC> 1.3 Task: submit for credit (part 1/2) 277
|
||||
#TOC> 2 Implementing the protein datamodel 289
|
||||
#TOC> 2.1 JSON formatted source data 315
|
||||
#TOC> 2.2 "Sanitizing" sequence data 355
|
||||
#TOC> 2.3 Create a protein table for our data model 375
|
||||
#TOC> 2.3.1 Initialize the database 377
|
||||
#TOC> 2.3.2 Add data 389
|
||||
#TOC> 2.4 Complete the database 409
|
||||
#TOC> 2.4.1 Examples of navigating the database 436
|
||||
#TOC> 2.5 Updating the database 468
|
||||
#TOC> 3 Add your own data 480
|
||||
#TOC> 3.1 Find a protein 488
|
||||
#TOC> 3.2 Put the information into JSON files 517
|
||||
#TOC> 3.3 Create an R script to create your own database 540
|
||||
#TOC> 3.3.1 Check and validate 560
|
||||
#TOC> 3.4 Task: submit for credit (part 2/2) 601
|
||||
#TOC> -----------------------------------------------------------------------
|
||||
#TOC> 1 A Relational Datamodel in R: review 57
|
||||
#TOC> 1.1 Building a sample database structure 97
|
||||
#TOC> 1.1.1 completing the database 208
|
||||
#TOC> 1.2 Querying the database 243
|
||||
#TOC> 1.3 Task: submit for credit (part 1/2) 272
|
||||
#TOC> 2 Implementing the protein datamodel 284
|
||||
#TOC> 2.1 JSON formatted source data 310
|
||||
#TOC> 2.2 "Sanitizing" sequence data 350
|
||||
#TOC> 2.3 Create a protein table for our data model 370
|
||||
#TOC> 2.3.1 Initialize the database 372
|
||||
#TOC> 2.3.2 Add data 384
|
||||
#TOC> 2.4 Complete the database 404
|
||||
#TOC> 2.4.1 Examples of navigating the database 431
|
||||
#TOC> 2.5 Updating the database 463
|
||||
#TOC> 3 Add your own data 475
|
||||
#TOC> 3.1 Find a protein 483
|
||||
#TOC> 3.2 Put the information into JSON files 512
|
||||
#TOC> 3.3 Create an R script to create your own database 535
|
||||
#TOC> 3.3.1 Check and validate 555
|
||||
#TOC> 3.4 Task: submit for credit (part 2/2) 596
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -328,11 +328,11 @@ file.show("./data/MBP1_SACCE.json")
|
||||
# sanitize the sequence at some point. But since we need to do that
|
||||
# anyway, it is easier to see the whole sequence if we store it in chunks.
|
||||
|
||||
# Let's load the "jsonlite" package and have a look at how it reads this data.
|
||||
# Let's make sure the "jsonlite" package exists on your computer, then we'll
|
||||
# explore how it reads this data.
|
||||
|
||||
if (! require(jsonlite, quietly=TRUE)) {
|
||||
if (! requireNamespace("jsonlite", quietly = TRUE)) {
|
||||
install.packages("jsonlite")
|
||||
library(jsonlite)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = jsonlite) # basic information
|
||||
@ -340,7 +340,7 @@ if (! require(jsonlite, quietly=TRUE)) {
|
||||
# data(package = "jsonlite") # available datasets
|
||||
|
||||
|
||||
x <- fromJSON("./data/MBP1_SACCE.json")
|
||||
x <- jsonlite::fromJSON("./data/MBP1_SACCE.json")
|
||||
str(x)
|
||||
|
||||
x$name
|
||||
@ -389,7 +389,7 @@ str(myDB)
|
||||
|
||||
dbAddProtein
|
||||
|
||||
myDB <- dbAddProtein(myDB, fromJSON("./data/MBP1_SACCE.json"))
|
||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
|
||||
str(myDB)
|
||||
|
||||
# Lets check that the 833 amino acids of the yeast MBP1 sequence have
|
||||
|
@ -3,12 +3,15 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-Genetic_code unit.
|
||||
#
|
||||
# Version: 1.0.1
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 12
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.0.1 Comment on "incomplete final line" warning in FASTA
|
||||
# 1.0 First live version
|
||||
#
|
||||
@ -27,14 +30,14 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------------
|
||||
#TOC> 1 Storing the genetic code 47
|
||||
#TOC> 1.1 Genetic code in Biostrings 65
|
||||
#TOC> 2 Working with the genetic code 97
|
||||
#TOC> 2.1 Translate a sequence. 126
|
||||
#TOC> 3 An alternative representation: 3D array 208
|
||||
#TOC> 3.1 Print a Genetic code table 241
|
||||
#TOC> 4 Tasks 267
|
||||
#TOC> ----------------------------------------------------------------
|
||||
#TOC> 1 Storing the genetic code 45
|
||||
#TOC> 1.1 Genetic code in Biostrings 63
|
||||
#TOC> 2 Working with the genetic code 94
|
||||
#TOC> 2.1 Translate a sequence. 129
|
||||
#TOC> 3 An alternative representation: 3D array 212
|
||||
#TOC> 3.1 Print a Genetic code table 246
|
||||
#TOC> 4 Tasks 272
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -63,12 +66,11 @@ x["TAA"]
|
||||
# available in the Bioconductor "Biostrings" package:
|
||||
|
||||
|
||||
if (! require(Biostrings, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
biocLite("Biostrings")
|
||||
library(Biostrings)
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
@ -77,45 +79,51 @@ if (! require(Biostrings, quietly=TRUE)) {
|
||||
|
||||
|
||||
# The standard genetic code vector
|
||||
GENETIC_CODE
|
||||
Biostrings::GENETIC_CODE
|
||||
|
||||
# The table of genetic codes. This information corresponds to this page
|
||||
# at the NCBI:
|
||||
# https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=tgencodes
|
||||
GENETIC_CODE_TABLE
|
||||
Biostrings::GENETIC_CODE_TABLE
|
||||
|
||||
# Most of the alternative codes are mitochondrial codes. The id of the
|
||||
# Alternative Yeast Nuclear code is "12"
|
||||
getGeneticCode("12") # Alternative Yeast Nuclear
|
||||
Biostrings::getGeneticCode("12") # Alternative Yeast Nuclear
|
||||
|
||||
|
||||
# = 2 Working with the genetic code =======================================
|
||||
|
||||
# GENETIC_CODE is a "named vector"
|
||||
# We'll use Biostrings::GENETIC_CODE a lot in this script, so we'll assign it
|
||||
# to a "local" variable, rather than retrieving it from the package all the
|
||||
# time.
|
||||
|
||||
str(GENETIC_CODE)
|
||||
genCode <- Biostrings::GENETIC_CODE
|
||||
|
||||
# This is a named vector of characters ...
|
||||
|
||||
str(genCode)
|
||||
|
||||
# ... which also stores the alternative initiation codons TTG and CTG in
|
||||
# an attribute of the vector. (Alternative initiation codons sometimes are
|
||||
# used instead of ATG to intiate translation, if if not ATG they are translated
|
||||
# with fMet.)
|
||||
|
||||
attr(GENETIC_CODE, "alt_init_codons")
|
||||
attr(genCode, "alt_init_codons")
|
||||
|
||||
# But the key to use this vector is in the "names" which we use for subsetting
|
||||
# the list of amino acids in whatever way we need.
|
||||
names(GENETIC_CODE)
|
||||
names(genCode)
|
||||
|
||||
# The translation of "TGG" ...
|
||||
GENETIC_CODE["TGG"]
|
||||
genCode["TGG"]
|
||||
|
||||
# All stop codons
|
||||
names(GENETIC_CODE)[GENETIC_CODE == "*"]
|
||||
names(genCode)[genCode == "*"]
|
||||
|
||||
# All start codons
|
||||
names(GENETIC_CODE)[GENETIC_CODE == "M"] # ... or
|
||||
c(names(GENETIC_CODE)[GENETIC_CODE == "M"],
|
||||
attr(GENETIC_CODE, "alt_init_codons"))
|
||||
names(genCode)[genCode == "M"] # ... or
|
||||
c(names(genCode)[genCode == "M"],
|
||||
attr(genCode, "alt_init_codons"))
|
||||
|
||||
|
||||
# == 2.1 Translate a sequence. =============================================
|
||||
@ -165,7 +173,7 @@ nchar(mbp1)/3
|
||||
# attributes that are useful for Biostrings. Thus we convert the sequence first
|
||||
# with DNAstring(), then split it up, then convert it into a plain
|
||||
# character vector.
|
||||
mbp1Codons <- as.character(codons(DNAString(mbp1)))
|
||||
mbp1Codons <- as.character(Biostrings::codons(Biostrings::DNAString(mbp1)))
|
||||
|
||||
head(mbp1Codons)
|
||||
|
||||
@ -173,7 +181,7 @@ head(mbp1Codons)
|
||||
|
||||
mbp1AA <- character(834)
|
||||
for (i in seq_along(mbp1Codons)) {
|
||||
mbp1AA[i] <- GENETIC_CODE[mbp1Codons[i]]
|
||||
mbp1AA[i] <- genCode[mbp1Codons[i]]
|
||||
}
|
||||
|
||||
head(mbp1Codons)
|
||||
@ -196,7 +204,8 @@ sort(table(mbp1AA), decreasing = TRUE)
|
||||
mbp1AA <- mbp1AA[-(length(mbp1AA))]
|
||||
tail(mbp1AA) # Note the stop is gone!
|
||||
|
||||
# paste it together, collapsing the elements without separation-character
|
||||
# paste it together, collapsing the elements using an empty string as the
|
||||
# separation-character (i.e.: nothing)
|
||||
(Mbp1 <- paste(mbp1AA, sep = "", collapse = ""))
|
||||
|
||||
|
||||
@ -204,14 +213,15 @@ tail(mbp1AA) # Note the stop is gone!
|
||||
|
||||
|
||||
# We don't use 3D arrays often - usually just 2D tables and data frames, so
|
||||
# here is a good opportunity to review the syntax with a genetic code cube:
|
||||
# here is a good opportunity to review the syntax of 3D arrays with a
|
||||
# genetic code cube:
|
||||
|
||||
# Initialize, using A C G T as the names of the elements in each dimension
|
||||
# Initialize, using A G C T as the names of the elements in each dimension
|
||||
cCube <- array(data = character(64),
|
||||
dim = c(4, 4, 4),
|
||||
dimnames = list(c("A", "C", "G", "T"),
|
||||
c("A", "C", "G", "T"),
|
||||
c("A", "C", "G", "T")))
|
||||
dimnames = list(c("A", "G", "C", "T"),
|
||||
c("A", "G", "C", "T"),
|
||||
c("A", "G", "C", "T")))
|
||||
|
||||
# fill it with amino acid codes using three nested loops
|
||||
for (i in 1:4) {
|
||||
@ -222,7 +232,7 @@ for (i in 1:4) {
|
||||
dimnames(cCube)[[3]][k],
|
||||
sep = "",
|
||||
collapse = "")
|
||||
cCube[i, j, k] <- GENETIC_CODE[myCodon]
|
||||
cCube[i, j, k] <- genCode[myCodon]
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -291,14 +301,14 @@ for (i in nuc) {
|
||||
# Solution:
|
||||
|
||||
# Fetch the code
|
||||
GENETIC_CODE_TABLE
|
||||
GENETIC_CODE_TABLE$name[GENETIC_CODE_TABLE$id == "12"]
|
||||
altYcode <- getGeneticCode("12")
|
||||
Biostrings::GENETIC_CODE_TABLE
|
||||
Biostrings::GENETIC_CODE_TABLE$name[Biostrings::GENETIC_CODE_TABLE$id=="12"]
|
||||
altYcode <- Biostrings::getGeneticCode("12")
|
||||
|
||||
# what's the difference?
|
||||
(delta <- which(GENETIC_CODE != altYcode))
|
||||
(delta <- which(Biostrings::GENETIC_CODE != altYcode))
|
||||
|
||||
GENETIC_CODE[delta]
|
||||
Biostrings::GENETIC_CODE[delta]
|
||||
altYcode[delta]
|
||||
|
||||
# translate
|
||||
@ -319,7 +329,7 @@ for (i in nuc) {
|
||||
#
|
||||
#
|
||||
# Solution:
|
||||
table(table(GENETIC_CODE))
|
||||
table(table(Biostrings::GENETIC_CODE))
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-MAT-Graphs_and_networks unit.
|
||||
#
|
||||
# Version: 1.1
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.1 Update set.seed() usage
|
||||
# 1.0 First final version for learning units.
|
||||
# 0.1 First code copied from 2016 material.
|
||||
@ -30,17 +32,17 @@
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ------------------------------------------------------------
|
||||
#TOC> 1 Review 48
|
||||
#TOC> 2 DEGREE DISTRIBUTIONS 201
|
||||
#TOC> 2.1 Random graph 207
|
||||
#TOC> 2.2 scale-free graph (Barabasi-Albert) 255
|
||||
#TOC> 2.3 Random geometric graph 320
|
||||
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 442
|
||||
#TOC> 3.1 Basics 445
|
||||
#TOC> 3.2 Components 517
|
||||
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 536
|
||||
#TOC> 4.1 Diameter 573
|
||||
#TOC> 5 GRAPH CLUSTERING 641
|
||||
#TOC> 1 Review 50
|
||||
#TOC> 2 DEGREE DISTRIBUTIONS 204
|
||||
#TOC> 2.1 Random graph 210
|
||||
#TOC> 2.2 scale-free graph (Barabasi-Albert) 258
|
||||
#TOC> 2.3 Random geometric graph 323
|
||||
#TOC> 3 A CLOSER LOOK AT THE igraph PACKAGE 445
|
||||
#TOC> 3.1 Basics 448
|
||||
#TOC> 3.2 Components 520
|
||||
#TOC> 4 RANDOM GRAPHS AND GRAPH METRICS 539
|
||||
#TOC> 4.1 Diameter 576
|
||||
#TOC> 5 GRAPH CLUSTERING 645
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -123,9 +125,8 @@ set.seed(NULL) # reset the RNG
|
||||
# standard package for work with graphs in r is "igraph". We'll go into more
|
||||
# details of the igraph package a bit later, for now we just use it to plot:
|
||||
|
||||
if (! require(igraph, quietly=TRUE)) {
|
||||
if (! requireNamespace("igraph", quietly = TRUE)) {
|
||||
install.packages("igraph")
|
||||
library(igraph)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = igraph) # basic information
|
||||
@ -133,10 +134,12 @@ if (! require(igraph, quietly=TRUE)) {
|
||||
# data(package = "igraph") # available datasets
|
||||
|
||||
|
||||
myG <- graph_from_adjacency_matrix(myRandAM, mode = "undirected")
|
||||
myG <- igraph::graph_from_adjacency_matrix(myRandAM, mode = "undirected")
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
myGxy <- layout_with_graphopt(myG, charge=0.0012) # calculate layout coordinates
|
||||
# calculate layout coordinates
|
||||
myGxy <- igraph::layout_with_graphopt(myG,
|
||||
charge=0.0012)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
|
||||
@ -157,9 +160,9 @@ plot(myG,
|
||||
rescale = FALSE,
|
||||
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
||||
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
||||
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
|
||||
vertex.size = 1600 + (300 * degree(myG)),
|
||||
vertex.label = sprintf("%s(%i)", names(V(myG)), degree(myG)),
|
||||
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
|
||||
vertex.size = 1600 + (300 * igraph::degree(myG)),
|
||||
vertex.label = sprintf("%s(%i)", names(igraph::V(myG)), igraph::degree(myG)),
|
||||
vertex.label.family = "sans",
|
||||
vertex.label.cex = 0.7)
|
||||
par(oPar) # reset plot window
|
||||
@ -179,10 +182,10 @@ sum(myRandAM)
|
||||
rowSums(myRandAM) + colSums(myRandAM) # check this against the plot!
|
||||
|
||||
# The function degree() gives the same values
|
||||
degree(myG)
|
||||
igraph::degree(myG)
|
||||
|
||||
# Let's plot the degree distribution in a histogram:
|
||||
degG <- degree(myG)
|
||||
degG <- igraph::degree(myG)
|
||||
brk <- seq(min(degG)-0.5, max(degG)+0.5, by=1) # define histogram breaks
|
||||
hist(degG, breaks=brk, col="#A5CCF5",
|
||||
xlim = c(-1,8), xaxt = "n",
|
||||
@ -212,8 +215,8 @@ set.seed(31415927) # set RNG seed for repeatable randomness
|
||||
my200AM <- makeRandomAM(as.character(1:N), p = 0.015)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
myG200 <- graph_from_adjacency_matrix(my200AM, mode = "undirected")
|
||||
myGxy <- layout_with_graphopt(myG200, charge=0.0001) # calculate layout
|
||||
myG200 <- igraph::graph_from_adjacency_matrix(my200AM, mode = "undirected")
|
||||
myGxy <- igraph::layout_with_graphopt(myG200, charge=0.0001) # calculate layout
|
||||
# coordinates
|
||||
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
|
||||
@ -222,8 +225,8 @@ plot(myG200,
|
||||
rescale = FALSE,
|
||||
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
||||
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
||||
vertex.color=heat.colors(max(degree(myG200)+1))[degree(myG200)+1],
|
||||
vertex.size = 150 + (60 * degree(myG200)),
|
||||
vertex.color=heat.colors(max(igraph::degree(myG200)+1))[igraph::degree(myG200)+1],
|
||||
vertex.size = 150 + (60 * igraph::degree(myG200)),
|
||||
vertex.label = NA)
|
||||
par(oPar) # restore graphics state
|
||||
|
||||
@ -231,7 +234,7 @@ par(oPar) # restore graphics state
|
||||
# biological graphs look approximately like this.
|
||||
|
||||
# Calculate degree distributions
|
||||
dg <- degree(myG200)
|
||||
dg <- igraph::degree(myG200)
|
||||
brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1)
|
||||
hist(dg, breaks=brk, col="#A5F5CC",
|
||||
xlim = c(-1,11), xaxt = "n",
|
||||
@ -263,10 +266,10 @@ plot(log10(as.numeric(names(freqRank)) + 1),
|
||||
N <- 200
|
||||
|
||||
set.seed(31415927) # set RNG seed for repeatable randomness
|
||||
GBA <- sample_pa(N, power = 0.8, directed = FALSE)
|
||||
GBA <- igraph::sample_pa(N, power = 0.8, directed = FALSE)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
GBAxy <- layout_with_graphopt(GBA, charge=0.0001) # calculate layout coordinates
|
||||
GBAxy <- igraph::layout_with_graphopt(GBA, charge=0.0001)
|
||||
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off, save graphics state
|
||||
plot(GBA,
|
||||
@ -274,8 +277,8 @@ plot(GBA,
|
||||
rescale = FALSE,
|
||||
xlim = c(min(GBAxy[,1]) * 0.99, max(GBAxy[,1]) * 1.01),
|
||||
ylim = c(min(GBAxy[,2]) * 0.99, max(GBAxy[,2]) * 1.01),
|
||||
vertex.color=heat.colors(max(degree(GBA)+1))[degree(GBA)+1],
|
||||
vertex.size = 200 + (30 * degree(GBA)),
|
||||
vertex.color=heat.colors(max(igraph::degree(GBA)+1))[igraph::degree(GBA)+1],
|
||||
vertex.size = 200 + (30 * igraph::degree(GBA)),
|
||||
vertex.label = NA)
|
||||
par(oPar) # restore grphics state
|
||||
|
||||
@ -287,7 +290,7 @@ par(oPar) # restore grphics state
|
||||
# singletons.
|
||||
|
||||
# What's the degree distribution of this graph?
|
||||
(dg <- degree(GBA))
|
||||
(dg <- igraph::degree(GBA))
|
||||
brk <- seq(min(dg)-0.5, max(dg)+0.5, by=1)
|
||||
hist(dg, breaks=brk, col="#DCF5B5",
|
||||
xlim = c(0,max(dg)+1), xaxt = "n",
|
||||
@ -307,8 +310,8 @@ plot(log10(as.numeric(names(freqRank)) + 1),
|
||||
# Sort-of linear, but many of the higher ranked nodes have a frequency of only
|
||||
# one. That behaviour smooths out in larger graphs:
|
||||
#
|
||||
X <- sample_pa(100000, power = 0.8, directed = FALSE) # 100,000 nodes
|
||||
freqRank <- table(degree(X))
|
||||
X <- igraph::sample_pa(1e5, power = 0.8, directed = FALSE) # 100,000 nodes
|
||||
freqRank <- table(igraph::degree(X))
|
||||
plot(log10(as.numeric(names(freqRank)) + 1),
|
||||
log10(as.numeric(freqRank)), type = "b",
|
||||
xlab = "log(Rank)", ylab = "log(frequency)",
|
||||
@ -404,7 +407,7 @@ rGAM <- makeRandomGeometricAM(as.character(1:N), t = 0.4)
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
|
||||
myGRG <- graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
|
||||
myGRG <- igraph::graph_from_adjacency_matrix(rGAM$mat, mode = "undirected")
|
||||
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||
plot(myGRG,
|
||||
@ -412,13 +415,13 @@ plot(myGRG,
|
||||
rescale = FALSE,
|
||||
xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1),
|
||||
ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1),
|
||||
vertex.color=heat.colors(max(degree(myGRG)+1))[degree(myGRG)+1],
|
||||
vertex.size = 0.1 + (0.2 * degree(myGRG)),
|
||||
vertex.color=heat.colors(max(igraph::degree(myGRG)+1))[igraph::degree(myGRG)+1],
|
||||
vertex.size = 0.1 + (0.2 * igraph::degree(myGRG)),
|
||||
vertex.label = NA)
|
||||
par(oPar)
|
||||
|
||||
# degree distribution:
|
||||
(dg <- degree(myGRG))
|
||||
(dg <- igraph::degree(myGRG))
|
||||
brk <- seq(min(dg) - 0.5, max(dg) + 0.5, by = 1)
|
||||
hist(dg, breaks = brk, col = "#FCC6D2",
|
||||
xlim = c(0, 25), xaxt = "n",
|
||||
@ -450,7 +453,7 @@ summary(myG)
|
||||
|
||||
# This output means: this is an IGRAPH graph, with U = UN-directed edges
|
||||
# and N = named nodes, that has 20 nodes and 20 edges. For details, see
|
||||
?print.igraph
|
||||
?igraph::print.igraph
|
||||
|
||||
mode(myG)
|
||||
class(myG)
|
||||
@ -463,11 +466,11 @@ class(myG)
|
||||
# recipes, called _games_ in this package.
|
||||
|
||||
# Two basic functions retrieve nodes "Vertices", and "Edges":
|
||||
V(myG)
|
||||
E(myG)
|
||||
igraph::V(myG)
|
||||
igraph::E(myG)
|
||||
|
||||
# additional properties can be retrieved from the Vertices ...
|
||||
V(myG)$name
|
||||
igraph::V(myG)$name
|
||||
|
||||
|
||||
# As with many R objects, loading the package provides special functions that
|
||||
@ -487,12 +490,12 @@ plot(myG) # this is the result of default plot parameters
|
||||
# Plot with some customizing parameters
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||
plot(myG,
|
||||
layout = layout_with_fr(myG),
|
||||
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
|
||||
vertex.size = 9 + (2 * degree(myG)),
|
||||
vertex.label.cex = 0.5 + (0.05 * degree(myG)),
|
||||
layout = igraph::layout_with_fr(myG),
|
||||
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
|
||||
vertex.size = 9 + (2 * igraph::degree(myG)),
|
||||
vertex.label.cex = 0.5 + (0.05 * igraph::degree(myG)),
|
||||
edge.width = 2,
|
||||
vertex.label = V(myG)$name,
|
||||
vertex.label = igraph::V(myG)$name,
|
||||
vertex.label.family = "sans",
|
||||
vertex.label.cex = 0.9)
|
||||
par(oPar)
|
||||
@ -500,12 +503,12 @@ par(oPar)
|
||||
# ... or with a different layout:
|
||||
oPar <- par(mar= rep(0,4)) # Turn margins off
|
||||
plot(myG,
|
||||
layout = layout_in_circle(myG),
|
||||
vertex.color=heat.colors(max(degree(myG)+1))[degree(myG)+1],
|
||||
vertex.size = 9 + (2 * degree(myG)),
|
||||
vertex.label.cex = 0.5 + (0.05 * degree(myG)),
|
||||
layout = igraph::layout_in_circle(myG),
|
||||
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
|
||||
vertex.size = 9 + (2 * igraph::degree(myG)),
|
||||
vertex.label.cex = 0.5 + (0.05 * igraph::degree(myG)),
|
||||
edge.width = 2,
|
||||
vertex.label = V(myG)$name,
|
||||
vertex.label = igraph::V(myG)$name,
|
||||
vertex.label.family = "sans",
|
||||
vertex.label.cex = 0.9)
|
||||
par(oPar)
|
||||
@ -518,18 +521,18 @@ par(oPar)
|
||||
|
||||
# The igraph function components() tells us whether there are components of the
|
||||
# graph in which there is no path to other components.
|
||||
components(myG)
|
||||
igraph::components(myG)
|
||||
|
||||
# In the _membership_ vector, nodes are annotated with the index of the
|
||||
# component they are part of. Sui7 is the only node of component 2, Cyj1 is in
|
||||
# the third component etc. This is perhaps more clear if we sort by component
|
||||
# index
|
||||
sort(components(myG)$membership, decreasing = TRUE)
|
||||
sort(igraph::components(myG)$membership, decreasing = TRUE)
|
||||
|
||||
# Retrieving e.g. the members of the first component from the list can be done by subsetting:
|
||||
|
||||
(sel <- components(myG)$membership == 1) # boolean vector ..
|
||||
(c1 <- components(myG)$membership[sel])
|
||||
(sel <- igraph::components(myG)$membership == 1) # boolean vector ..
|
||||
(c1 <- igraph::components(myG)$membership[sel])
|
||||
names(c1)
|
||||
|
||||
|
||||
@ -542,9 +545,9 @@ names(c1)
|
||||
# preferential-attachment ... but igraph has ways to simulate the basic ones
|
||||
# (and we could easily simulate our own). Look at the following help pages:
|
||||
|
||||
?sample_gnm # see also sample_gnp for the Erdös-Rényi models
|
||||
?sample_smallworld # for the Watts & Strogatz model
|
||||
?sample_pa # for the Barabasi-Albert model
|
||||
?igraph::sample_gnm # see also sample_gnp for the Erdös-Rényi models
|
||||
?igraph::sample_smallworld # for the Watts & Strogatz model
|
||||
?igraph::sample_pa # for the Barabasi-Albert model
|
||||
|
||||
# But note that there are many more sample_ functions. Check out the docs!
|
||||
|
||||
@ -554,7 +557,7 @@ names(c1)
|
||||
# layout drawas them, obviously.
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
myGxy <- layout_with_fr(myG) # calculate layout coordinates
|
||||
myGxy <- igraph::layout_with_fr(myG) # calculate layout coordinates
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
oPar <- par(mar = rep(0, 4)) # turn margins off, save graphics state
|
||||
@ -563,30 +566,31 @@ plot(myG,
|
||||
rescale = FALSE,
|
||||
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
||||
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
||||
vertex.color=heat.colors(max(degree(myG) + 1))[degree(myG) + 1],
|
||||
vertex.size = 20 + (10 * degree(myG)),
|
||||
vertex.label = V(myG)$name,
|
||||
vertex.color=heat.colors(max(igraph::degree(myG)+1))[igraph::degree(myG)+1],
|
||||
vertex.size = 20 + (10 * igraph::degree(myG)),
|
||||
vertex.label = igraph::V(myG)$name,
|
||||
vertex.label.family = "sans",
|
||||
vertex.label.cex = 0.8)
|
||||
par(oPar) # restore graphics state
|
||||
|
||||
# == 4.1 Diameter ==========================================================
|
||||
|
||||
diameter(myG) # The diameter of a graph is its maximum length shortest path.
|
||||
igraph::diameter(myG) # The diameter of a graph is its maximum length
|
||||
# shortest path.
|
||||
|
||||
# let's plot this path: here are the nodes ...
|
||||
get_diameter(myG)
|
||||
igraph::get_diameter(myG)
|
||||
|
||||
# ... and we can get the x, y coordinates from iGxy by subsetting with the node
|
||||
# names. The we draw the diameter-path with a transparent, thick pink line:
|
||||
lines(myGxy[get_diameter(myG),], lwd=10, col="#ff63a788")
|
||||
lines(myGxy[igraph::get_diameter(myG),], lwd=10, col="#ff63a788")
|
||||
|
||||
# == Centralization scores
|
||||
|
||||
?centralize
|
||||
?igraph::centralize
|
||||
# replot our graph, and color by log_betweenness:
|
||||
|
||||
bC <- centr_betw(myG) # calculate betweenness centrality
|
||||
bC <- igraph::centr_betw(myG) # calculate betweenness centrality
|
||||
nodeBetw <- bC$res
|
||||
nodeBetw <- round(log(nodeBetw +1)) + 1
|
||||
|
||||
@ -597,8 +601,8 @@ plot(myG,
|
||||
xlim = c(min(myGxy[,1]) * 0.99, max(myGxy[,1]) * 1.01),
|
||||
ylim = c(min(myGxy[,2]) * 0.99, max(myGxy[,2]) * 1.01),
|
||||
vertex.color=heat.colors(max(nodeBetw))[nodeBetw],
|
||||
vertex.size = 20 + (10 * degree(myG)),
|
||||
vertex.label = V(myG)$name,
|
||||
vertex.size = 20 + (10 * igraph::degree(myG)),
|
||||
vertex.label = igraph::V(myG)$name,
|
||||
vertex.label.family = "sans",
|
||||
vertex.label.cex = 0.7)
|
||||
par(oPar)
|
||||
@ -613,7 +617,7 @@ par(oPar)
|
||||
#
|
||||
# Lets plot betweenness centrality for our random geometric graph:
|
||||
|
||||
bCmyGRG <- centr_betw(myGRG) # calculate betweenness centrality
|
||||
bCmyGRG <- igraph::centr_betw(myGRG) # calculate betweenness centrality
|
||||
|
||||
nodeBetw <- bCmyGRG$res
|
||||
nodeBetw <- round((log(nodeBetw +1))^2.5) + 1
|
||||
@ -630,9 +634,9 @@ plot(myGRG,
|
||||
vertex.label = NA)
|
||||
par(oPar)
|
||||
|
||||
diameter(myGRG)
|
||||
lines(rGAM$x[get_diameter(myGRG)],
|
||||
rGAM$y[get_diameter(myGRG)],
|
||||
igraph::diameter(myGRG)
|
||||
lines(rGAM$x[igraph::get_diameter(myGRG)],
|
||||
rGAM$y[igraph::get_diameter(myGRG)],
|
||||
lwd = 10,
|
||||
col = "#ff335533")
|
||||
|
||||
@ -648,11 +652,11 @@ lines(rGAM$x[get_diameter(myGRG)],
|
||||
# http://www.ncbi.nlm.nih.gov/pubmed/18216267 and htttp://www.mapequation.org
|
||||
|
||||
|
||||
myGRGclusters <- cluster_infomap(myGRG)
|
||||
modularity(myGRGclusters) # ... measures how separated the different membership
|
||||
# types are from each other
|
||||
membership(myGRGclusters) # which nodes are in what cluster?
|
||||
table(membership(myGRGclusters)) # how large are the clusters?
|
||||
myGRGclusters <- igraph::cluster_infomap(myGRG)
|
||||
igraph::modularity(myGRGclusters) # ... measures how separated the different
|
||||
# membership types are from each other
|
||||
igraph::membership(myGRGclusters) # which nodes are in what cluster?
|
||||
table(igraph::membership(myGRGclusters)) # how large are the clusters?
|
||||
|
||||
# The largest cluster has 48 members, the second largest has 25, etc.
|
||||
|
||||
@ -661,7 +665,7 @@ table(membership(myGRGclusters)) # how large are the clusters?
|
||||
# their cluster membership:
|
||||
|
||||
# first, make a vector with as many grey colors as we have communities ...
|
||||
commColors <- rep("#f1eef6", max(membership(myGRGclusters)))
|
||||
commColors <- rep("#f1eef6", max(igraph::membership(myGRGclusters)))
|
||||
# ... then overwrite the first five with "real colors" - something like rust,
|
||||
# lilac, pink, and mauve or so.
|
||||
commColors[1:5] <- c("#980043", "#dd1c77", "#df65b0", "#c994c7", "#d4b9da")
|
||||
@ -673,8 +677,8 @@ plot(myGRG,
|
||||
rescale = FALSE,
|
||||
xlim = c(min(rGAM$x) * 0.9, max(rGAM$x) * 1.1),
|
||||
ylim = c(min(rGAM$y) * 0.9, max(rGAM$y) * 1.1),
|
||||
vertex.color=commColors[membership(myGRGclusters)],
|
||||
vertex.size = 0.1 + (0.1 * degree(myGRG)),
|
||||
vertex.color=commColors[igraph::membership(myGRGclusters)],
|
||||
vertex.size = 0.1 + (0.1 * igraph::degree(myGRG)),
|
||||
vertex.label = NA)
|
||||
par(oPar)
|
||||
|
||||
|
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the FND-STA-Probability_distribution unit.
|
||||
#
|
||||
# Version: 1.2
|
||||
# Version: 1.3
|
||||
#
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.3 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# 1.2 Update set.seed() usage
|
||||
# 1.1 Corrected empirical p-value
|
||||
# 1.0 First code live version
|
||||
@ -28,21 +30,21 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------------------------------
|
||||
#TOC> 1 Introduction 50
|
||||
#TOC> 2 Three fundamental distributions 113
|
||||
#TOC> 2.1 The Poisson Distribution 116
|
||||
#TOC> 2.2 The uniform distribution 170
|
||||
#TOC> 2.3 The Normal Distribution 190
|
||||
#TOC> 3 quantile-quantile comparison 231
|
||||
#TOC> 3.1 qqnorm() 241
|
||||
#TOC> 3.2 qqplot() 307
|
||||
#TOC> 4 Quantifying the difference 324
|
||||
#TOC> 4.1 Chi2 test for discrete distributions 359
|
||||
#TOC> 4.2 Kullback-Leibler divergence 451
|
||||
#TOC> 4.2.1 An example from tossing dice 462
|
||||
#TOC> 4.2.2 An example from lognormal distributions 585
|
||||
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 628
|
||||
#TOC> -----------------------------------------------------------------------------
|
||||
#TOC> 1 Introduction 52
|
||||
#TOC> 2 Three fundamental distributions 115
|
||||
#TOC> 2.1 The Poisson Distribution 118
|
||||
#TOC> 2.2 The uniform distribution 172
|
||||
#TOC> 2.3 The Normal Distribution 192
|
||||
#TOC> 3 quantile-quantile comparison 233
|
||||
#TOC> 3.1 qqnorm() 243
|
||||
#TOC> 3.2 qqplot() 309
|
||||
#TOC> 4 Quantifying the difference 326
|
||||
#TOC> 4.1 Chi2 test for discrete distributions 361
|
||||
#TOC> 4.2 Kullback-Leibler divergence 452
|
||||
#TOC> 4.2.1 An example from tossing dice 463
|
||||
#TOC> 4.2.2 An example from lognormal distributions 586
|
||||
#TOC> 4.3 Kolmogorov-Smirnov test for continuous distributions 629
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -385,9 +387,8 @@ hist(rG1.5, breaks = myBreaks, col = myCols[4])
|
||||
# package information - plotrix has _many_ useful utilities to enhance
|
||||
# plots or produce informative visualizations.
|
||||
|
||||
if (! require(plotrix, quietly=TRUE)) {
|
||||
if (! requireNamespace("plotrix", quietly = TRUE)) {
|
||||
install.packages("plotrix")
|
||||
library(plotrix)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = plotrix) # basic information
|
||||
@ -395,7 +396,7 @@ if (! require(plotrix, quietly=TRUE)) {
|
||||
# data(package = "plotrix") # available datasets
|
||||
|
||||
|
||||
h <- multhist(list(rL1, rL2, rG1.2, rG1.5, rG1.9 ),
|
||||
h <- plotrix::multhist(list(rL1, rL2, rG1.2, rG1.5, rG1.9 ),
|
||||
breaks = myBreaks,
|
||||
col = myCols)
|
||||
legend("topright",
|
||||
|
151
RPR-Biostrings.R
151
RPR-Biostrings.R
@ -3,12 +3,15 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Biostrings unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 20
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.0 2017 Revisions
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
@ -28,19 +31,19 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------
|
||||
#TOC> 1 The Biostrings Package 52
|
||||
#TOC> 2 Getting Data into Biostrings Objects 85
|
||||
#TOC> 3 Working with Biostrings Objects 106
|
||||
#TOC> 3.1 Properties 109
|
||||
#TOC> 3.2 Subsetting 146
|
||||
#TOC> 3.3 Operators 158
|
||||
#TOC> 3.4 Transformations 165
|
||||
#TOC> 4 Getting Data out of Biostrings Objects 172
|
||||
#TOC> 5 More 181
|
||||
#TOC> 5.1 Views 183
|
||||
#TOC> 5.2 Iranges 195
|
||||
#TOC> 5.3 StringSets 201
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 The Biostrings Package 55
|
||||
#TOC> 2 Getting Data into Biostrings Objects 86
|
||||
#TOC> 3 Working with Biostrings Objects 108
|
||||
#TOC> 3.1 Properties 125
|
||||
#TOC> 3.2 Subsetting 163
|
||||
#TOC> 3.3 Operators 175
|
||||
#TOC> 3.4 Transformations 182
|
||||
#TOC> 4 Getting Data out of Biostrings Objects 189
|
||||
#TOC> 5 More 198
|
||||
#TOC> 5.1 Views 200
|
||||
#TOC> 5.2 Iranges 214
|
||||
#TOC> 5.3 StringSets 220
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -54,14 +57,12 @@
|
||||
|
||||
# First, we install and load the Biostrings package from bioconductor
|
||||
|
||||
if (! require(Biostrings, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
biocLite("Biostrings")
|
||||
library(Biostrings)
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
|
||||
# Examine the package information:
|
||||
library(help = Biostrings) # basic information
|
||||
browseVignettes("Biostrings") # available vignettes
|
||||
@ -72,72 +73,88 @@ data(package = "Biostrings") # available datasets
|
||||
# of a "class" in R as a special kind of list), that can take on particular
|
||||
# flavours for RNA, DNA or amino acid sequence information.
|
||||
|
||||
class(RNAString("AUG"))
|
||||
class(DNAString("ATG"))
|
||||
class(AAString("M"))
|
||||
class(Biostrings::RNAString("AUG"))
|
||||
class(Biostrings::DNAString("ATG"))
|
||||
class(Biostrings::AAString("M"))
|
||||
|
||||
# An essential property of Biostrings objects is that they only allow letters
|
||||
# from the applicable IUPAC alphabet:
|
||||
RNAString("AUG")
|
||||
DNAString("AUG") # Error! No "U" in IUPAC DNA codes
|
||||
Biostrings::RNAString("AUG")
|
||||
Biostrings::DNAString("AUG") # Error! No "U" in IUPAC DNA codes
|
||||
|
||||
|
||||
# = 2 Getting Data into Biostrings Objects ================================
|
||||
|
||||
|
||||
# Example: read FASTA. Extract sequence. Convert to DNAString object.
|
||||
x <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||
x <- dbSanitizeSequence(x)
|
||||
myDNAseq <- DNAString(x) # takes the nucleotide sequence and converts into a
|
||||
# object of class DNAstring
|
||||
rawSeq <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")
|
||||
rawSeq <- dbSanitizeSequence(rawSeq)
|
||||
biosDNAseq <- Biostrings::DNAString(rawSeq) # converts the nucleotide sequence
|
||||
# into an object of class DNAstring
|
||||
|
||||
# Multi FASTA files can be read directly as a "XStringSet) ...
|
||||
(myDNASet <- readDNAStringSet("./data/S288C_YDL056W_MBP1_coding.fsa"))
|
||||
rawMFAfile <- "./data/S288C_YDL056W_MBP1_coding.fsa"
|
||||
(biosDNASet <- Biostrings::readDNAStringSet(rawMFAfile))
|
||||
|
||||
# ... and if you subset one sequence from the set, you get an XString object
|
||||
# back again.
|
||||
(Xseq <- myDNASet[[1]])
|
||||
(Xseq <- biosDNASet[[1]])
|
||||
|
||||
myDNAseq == Xseq # the comparison evaluates to TRUE ...
|
||||
identical(myDNAseq, Xseq) # ... and indeed the objects are deemed identical.
|
||||
biosDNAseq == Xseq # the comparison evaluates to TRUE ...
|
||||
identical(biosDNAseq, Xseq) # ... and indeed the objects are deemed identical.
|
||||
|
||||
|
||||
|
||||
# = 3 Working with Biostrings Objects =====================================
|
||||
|
||||
# Biostrings is a highly engineered package that is tightly integrated into
|
||||
# the Bioconductor world - unfortunately that brings with it a somewhat
|
||||
# undesirable level of computational overhead and dependencies. Using the
|
||||
# package as we normally do - i.e. calling required functions with their
|
||||
# explicit package prefix is therefore not advisable. There are generics
|
||||
# that won't be propery dispatched. If you only need a small number of
|
||||
# functions for a very specific context, you will probably get away with
|
||||
# Biostrings::<function>() - but even in the demonstration code of this script
|
||||
# not everything works out of the box. We'll therefore load the library,
|
||||
# but we'll (redundantly) use the prefix anyway so as to emphasize where
|
||||
# the functions come from.
|
||||
|
||||
library(Biostrings)
|
||||
|
||||
|
||||
# == 3.1 Properties ========================================================
|
||||
str(myDNAseq)
|
||||
length(myDNAseq) # This gives you the _number of nucleotides_!
|
||||
# By comparison ...
|
||||
length(x) # ... is 1: one string only. To get the number of
|
||||
# characters in a string, you need nchar().
|
||||
nchar(x) # However ...
|
||||
nchar(myDNAseq) # ... also works.
|
||||
str(rawSeq)
|
||||
str(biosDNAseq)
|
||||
|
||||
uniqueLetters(myDNAseq)
|
||||
length(rawSeq) # ... is 1: one string only. To get the number of
|
||||
# characters in a string, you need nchar().
|
||||
length(biosDNAseq) # but the length of a "Bstring" is the number of elements
|
||||
nchar(rawSeq)
|
||||
nchar(biosDNAseq) # ... but nchar() works too.
|
||||
|
||||
(uL <- Biostrings::uniqueLetters(biosDNAseq))
|
||||
|
||||
# Count frequencies - with strings, you would strsplit() into a character
|
||||
# vector and then use table(). biost
|
||||
alphabetFrequency(myDNAseq)
|
||||
Biostrings::alphabetFrequency(biosDNAseq)
|
||||
|
||||
# letterFrequency() works with a defined alphabet - such as what uniqueLetters()
|
||||
# returns.
|
||||
letterFrequency(myDNAseq, uniqueLetters(myDNAseq))
|
||||
Biostrings::letterFrequency(biosDNAseq, uL)
|
||||
sum(Biostrings::letterFrequency(biosDNAseq, c("G", "C"))) /
|
||||
length(biosDNAseq) # GC contents
|
||||
|
||||
sum(letterFrequency(myDNAseq, c("G", "C"))) / length(myDNAseq) # GC contents
|
||||
Biostrings::dinucleotideFrequency(biosDNAseq)
|
||||
barplot(sort(Biostrings::dinucleotideFrequency(biosDNAseq)), cex.names = 0.5)
|
||||
|
||||
dinucleotideFrequency(myDNAseq)
|
||||
barplot(sort(dinucleotideFrequency(myDNAseq)), cex.names = 0.5)
|
||||
|
||||
(triNuc <- trinucleotideFrequency(myDNAseq))
|
||||
(triNuc <- Biostrings::trinucleotideFrequency(biosDNAseq))
|
||||
barplot(sort(triNuc), col="#4499EE33")
|
||||
triNuc[triNuc == max(triNuc)]
|
||||
triNuc[triNuc == min(triNuc)]
|
||||
max(triNuc) / min(triNuc) # AAA is more than 13 times as frequent as CGT
|
||||
|
||||
# compare to a shuffled sequence:
|
||||
(triNuc <- trinucleotideFrequency(sample(myDNAseq)))
|
||||
(triNuc <- Biostrings::trinucleotideFrequency(sample(biosDNAseq)))
|
||||
barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
|
||||
|
||||
# Interpret this plot.
|
||||
@ -146,34 +163,34 @@ barplot(sort(triNuc), col="#EEEE4433", add = TRUE)
|
||||
# == 3.2 Subsetting ========================================================
|
||||
|
||||
# Subsetting any XString object works as expected:
|
||||
myDNAseq[4:15]
|
||||
biosDNAseq[4:15]
|
||||
|
||||
# ... well - maybe not expected, because x[4:15] would not work.
|
||||
# ... well - maybe not expected, because rawSeq[4:15] would not work.
|
||||
|
||||
# Alternatively to the "[" operator, use the subseq() function - especially for
|
||||
# long sequences. This is far more efficient.
|
||||
subseq(myDNAseq, start = 1, end = 30)
|
||||
Biostrings::subseq(biosDNAseq, start = 1, end = 30)
|
||||
|
||||
|
||||
# == 3.3 Operators =========================================================
|
||||
|
||||
# RNAstring() and DNAstring() objects compare U and T as equals!
|
||||
RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
|
||||
DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
|
||||
Biostrings::RNAString("AUGUCUAACCAAAUAUACUCAGCGAGAUAU") ==
|
||||
Biostrings::DNAString("ATGTCTAACCAAATATACTCAGCGAGATAT")
|
||||
|
||||
|
||||
# == 3.4 Transformations ===================================================
|
||||
|
||||
myDNAseq[4:15]
|
||||
reverseComplement(myDNAseq[4:15])
|
||||
translate(myDNAseq[4:15])
|
||||
biosDNAseq[4:15]
|
||||
Biostrings::reverseComplement(biosDNAseq[4:15])
|
||||
Biostrings::translate(biosDNAseq[4:15])
|
||||
|
||||
|
||||
# = 4 Getting Data out of Biostrings Objects ==============================
|
||||
|
||||
# If you need a character object, use toString():
|
||||
|
||||
toString(myDNAseq[4:15])
|
||||
Biostrings::toString(biosDNAseq[4:15])
|
||||
|
||||
# save() and load() works like on all other R objects.
|
||||
|
||||
@ -185,7 +202,9 @@ toString(myDNAseq[4:15])
|
||||
# Biostring "Views" are objects that store multiple substrings of one
|
||||
# Biostring object.
|
||||
|
||||
(myView <- Views(myDNAseq, start = c(1, 19, 37), end = c(15, 30, 45)))
|
||||
(myView <- Biostrings::Views(biosDNAseq,
|
||||
start = c(1, 19, 37),
|
||||
end = c(15, 30, 45)))
|
||||
|
||||
# Views are convenient to store feature annotations
|
||||
names(myView) <- c("Feature-A", "Feature-B", "Feature-C")
|
||||
@ -202,20 +221,20 @@ cat(sprintf("\n%s\t(%d)\t%s", names(myView), width(myView), myView ))
|
||||
|
||||
# Biostring "StringSets" store multiple sequences.
|
||||
#
|
||||
ompA <- AAString("MKKTAIAIAVALAGFATVAQA")
|
||||
ompA <- Biostrings::AAString("MKKTAIAIAVALAGFATVAQA")
|
||||
sample(ompA) # sample can work directly on a Biostring object to shuffle it
|
||||
|
||||
x[1] <- toString(ompA)
|
||||
x <- Biostrings::toString(ompA)
|
||||
for (i in 2:10) {
|
||||
x[i] <- toString(sample(ompA))
|
||||
x[i] <- Biostrings::toString(sample(ompA))
|
||||
}
|
||||
shuffledPeptideSet <- AAStringSet(x)
|
||||
shuffledPeptideSet <- Biostrings::AAStringSet(x)
|
||||
names(shuffledPeptideSet) <- c("ompA", paste("shuffle.", 1:9, sep=""))
|
||||
shuffledPeptideSet
|
||||
|
||||
length(shuffledPeptideSet)
|
||||
width(shuffledPeptideSet)
|
||||
alphabetFrequency(shuffledPeptideSet)
|
||||
Biostrings::width(shuffledPeptideSet)
|
||||
Biostrings::alphabetFrequency(shuffledPeptideSet)
|
||||
|
||||
|
||||
# [END]
|
||||
|
167
RPR-GEO2R.R
167
RPR-GEO2R.R
@ -3,12 +3,15 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR_GEO2R unit.
|
||||
#
|
||||
# Version: 1.1
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017 09 - 2018 01
|
||||
# Date: 2017 09 - 2019 01
|
||||
# Author: Boris Steipe <boris.steipe@utoronto.ca>
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.1 Add section on GPL annotations
|
||||
# 1.0 Updates for BCH441 2017
|
||||
# 0.1 First code copied from 2016 material.
|
||||
@ -33,19 +36,19 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------------------
|
||||
#TOC> 1 Preparations 53
|
||||
#TOC> 2 Loading a GEO Dataset 84
|
||||
#TOC> 3 Column wise analysis - time points 154
|
||||
#TOC> 3.1 Task - Comparison of experiments 160
|
||||
#TOC> 3.2 Grouped Samples 207
|
||||
#TOC> 4 Row-wise Analysis: Expression Profiles 242
|
||||
#TOC> 4.1 Task - Read a table of features 277
|
||||
#TOC> 4.2 Selected Expression profiles 325
|
||||
#TOC> 5 Differential Expression 366
|
||||
#TOC> 5.1 Final task: Gene descriptions 490
|
||||
#TOC> 6 Improving on Discovery by Differential Expression 495
|
||||
#TOC> 7 Annotation data 577
|
||||
#TOC> --------------------------------------------------------------------------
|
||||
#TOC> 1 Preparations 56
|
||||
#TOC> 2 Loading a GEO Dataset 82
|
||||
#TOC> 3 Column wise analysis - time points 152
|
||||
#TOC> 3.1 Task - Comparison of experiments 158
|
||||
#TOC> 3.2 Grouped Samples 205
|
||||
#TOC> 4 Row-wise Analysis: Expression Profiles 240
|
||||
#TOC> 4.1 Task - Read a table of features 275
|
||||
#TOC> 4.2 Selected Expression profiles 323
|
||||
#TOC> 5 Differential Expression 364
|
||||
#TOC> 5.1 Final task: Gene descriptions 504
|
||||
#TOC> 6 Improving on Discovery by Differential Expression 510
|
||||
#TOC> 7 Annotation data 594
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -55,12 +58,11 @@
|
||||
# To load and analyze GEO datasets we use a number of Bioconductor packages:
|
||||
|
||||
|
||||
if (! require(Biobase, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
biocLite("Biobase")
|
||||
library(Biobase)
|
||||
if (! requireNamespace("Biobase", quietly = TRUE)) {
|
||||
BiocManager::install("Biobase")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biobase) # basic information
|
||||
@ -68,12 +70,8 @@ if (! require(Biobase, quietly=TRUE)) {
|
||||
# data(package = "Biobase") # available datasets
|
||||
|
||||
|
||||
if (! require(GEOquery, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
}
|
||||
biocLite("GEOquery")
|
||||
library(GEOquery)
|
||||
if (! requireNamespace("GEOquery", quietly = TRUE)) {
|
||||
BiocManager::install("GEOquery")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = GEOquery) # basic information
|
||||
@ -94,7 +92,7 @@ if (! require(GEOquery, quietly=TRUE)) {
|
||||
# I have experienced outages over several hours. If the command below does
|
||||
# not work for you, skip ahead to the fallback procedure.
|
||||
|
||||
GSE3635 <- getGEO("GSE3635", GSEMatrix =TRUE, getGPL=FALSE)
|
||||
GSE3635 <- GEOquery::getGEO("GSE3635", GSEMatrix =TRUE, getGPL=FALSE)
|
||||
# Note: GEO2R scripts call the expression data set
|
||||
# "gset" throughout ... in this script I give
|
||||
# it the name "GSE3635" for clarity.
|
||||
@ -136,14 +134,14 @@ help("ExpressionSet-class")
|
||||
GSE3635
|
||||
|
||||
# Access contents via methods:
|
||||
featureNames(GSE3635)[1:20] # Rows. What are these features?
|
||||
sampleNames(GSE3635)[1:10] # Columns. What are these columns?
|
||||
Biobase::featureNames(GSE3635)[1:20] # Rows. What are these features?
|
||||
Biobase::sampleNames(GSE3635)[1:10] # Columns. What are these columns?
|
||||
|
||||
# Access contents by subsetting:
|
||||
( tmp <- GSE3635[12:17, 1:6] )
|
||||
|
||||
# Access data
|
||||
exprs(tmp) # exprs() gives us the actual expression values.
|
||||
Biobase::exprs(tmp) # exprs() gives us the actual expression values.
|
||||
|
||||
|
||||
#TASK> What are the data:
|
||||
@ -160,9 +158,9 @@ exprs(tmp) # exprs() gives us the actual expression values.
|
||||
# == 3.1 Task - Comparison of experiments ==================================
|
||||
|
||||
# Get an overview of the distribution of data values in individual columns
|
||||
summary(exprs(GSE3635)[ , 1])
|
||||
summary(exprs(GSE3635)[ , 4])
|
||||
summary(exprs(GSE3635)[ , 7])
|
||||
summary(Biobase::exprs(GSE3635)[ , 1])
|
||||
summary(Biobase::exprs(GSE3635)[ , 4])
|
||||
summary(Biobase::exprs(GSE3635)[ , 7])
|
||||
|
||||
# as a boxplot
|
||||
cyclicPalette <- colorRampPalette(c("#00AAFF",
|
||||
@ -173,7 +171,7 @@ cyclicPalette <- colorRampPalette(c("#00AAFF",
|
||||
"#FFAA00",
|
||||
"#00AAFF"))
|
||||
tCols <- cyclicPalette(13)
|
||||
boxplot(exprs(GSE3635), col = tCols)
|
||||
boxplot(Biobase::exprs(GSE3635), col = tCols)
|
||||
|
||||
|
||||
#TASK> Study this boxplot. What's going on? Are these expression values?
|
||||
@ -181,11 +179,11 @@ boxplot(exprs(GSE3635), col = tCols)
|
||||
|
||||
|
||||
# Lets plot the distributions of values in a more fine-grained manner:
|
||||
hT0 <- hist(exprs(GSE3635)[ , 1], breaks = 100)
|
||||
hT3 <- hist(exprs(GSE3635)[ , 4], breaks = 100)
|
||||
hT6 <- hist(exprs(GSE3635)[ , 7], breaks = 100)
|
||||
hT9 <- hist(exprs(GSE3635)[ , 10], breaks = 100)
|
||||
hT12 <- hist(exprs(GSE3635)[ , 13], breaks = 100)
|
||||
hT0 <- hist(Biobase::exprs(GSE3635)[ , 1], breaks = 100)
|
||||
hT3 <- hist(Biobase::exprs(GSE3635)[ , 4], breaks = 100)
|
||||
hT6 <- hist(Biobase::exprs(GSE3635)[ , 7], breaks = 100)
|
||||
hT9 <- hist(Biobase::exprs(GSE3635)[ , 10], breaks = 100)
|
||||
hT12 <- hist(Biobase::exprs(GSE3635)[ , 13], breaks = 100)
|
||||
|
||||
|
||||
plot( hT0$mids, hT0$counts, type = "l", col = tCols[1], xlim = c(-0.5, 0.5))
|
||||
@ -218,7 +216,7 @@ for (i in 1:nchar(gsms)) {
|
||||
sml <- paste("G", sml, sep="") # set group names
|
||||
|
||||
# order samples by group
|
||||
ex <- exprs(GSE3635)[ , order(sml)]
|
||||
ex <- Biobase::exprs(GSE3635)[ , order(sml)]
|
||||
sml <- sml[order(sml)]
|
||||
fl <- as.factor(sml)
|
||||
labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we
|
||||
@ -231,8 +229,8 @@ labels <- c("t0","t10","t20","t30","t40","t50") # these are the labels we
|
||||
GEOcols <- c("#dfeaf4", "#f4dfdf", "#f2cb98", "#dcdaa5",
|
||||
"#dff4e4", "#f4dff4", "#AABBCC")
|
||||
dev.new(width = 4 + dim(GSE3635)[[2]] / 5, height = 6) # plot into a new window
|
||||
par(mar = c(2 + round(max(nchar(sampleNames(GSE3635))) / 2), 4, 2, 1))
|
||||
title <- paste ("GSE3635", '/', annotation(GSE3635),
|
||||
par(mar = c(2 + round(max(nchar(Biobase::sampleNames(GSE3635))) / 2), 4, 2, 1))
|
||||
title <- paste ("GSE3635", '/', Biobase::annotation(GSE3635),
|
||||
" grouped samples", sep ='')
|
||||
boxplot(ex, boxwex = 0.6, notch = TRUE, main = title, outline=FALSE,
|
||||
las = 2, col = GEOcols[fl])
|
||||
@ -331,7 +329,7 @@ gName <- "MBP1"
|
||||
(iFeature <- which(SGD_features$name == gName))
|
||||
(iExprs <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature]))
|
||||
plot(seq(0, 120, by = 10),
|
||||
exprs(GSE3635)[iExprs, ],
|
||||
Biobase::exprs(GSE3635)[iExprs, ],
|
||||
main = paste("Expression profile for", gName),
|
||||
xlab = "time (min)",
|
||||
ylab = "expression",
|
||||
@ -368,12 +366,8 @@ SGD_features$description[iFeature]
|
||||
# GEO2R discovers the top differentially expressed expressed genes by
|
||||
# using functions in the Bioconductor limma package.
|
||||
|
||||
if (! require(limma, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
}
|
||||
biocLite("limma")
|
||||
library(limma)
|
||||
if (! requireNamespace("limma", quietly = TRUE)) {
|
||||
BiocManager::install("limma")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = limma) # basic information
|
||||
@ -392,6 +386,20 @@ if (! require(limma, quietly=TRUE)) {
|
||||
# the groups
|
||||
# 4. Format results.
|
||||
|
||||
# Biobase is a highly engineered package that is tightly integrated into
|
||||
# the Bioconductor world - unfortunately that brings with it a somewhat
|
||||
# undesirable level of computational overhead and dependencies. Using the
|
||||
# package as we normally do - i.e. calling required functions with their
|
||||
# explicit package prefix is therefore not advisable. There are generics
|
||||
# that won't be propery dispatched. If you only need a small number of
|
||||
# functions for a very specific context, you will probably get away with
|
||||
# Biobase::<function>() - but even in the demonstration code of this script
|
||||
# not everything works out of the box. We'll therefore load the library,
|
||||
# but we'll (redundantly) use the prefix anyway so as to emphasize where
|
||||
# the functions come from.
|
||||
|
||||
library(Biobase)
|
||||
|
||||
# We are recapitulating the experiment in which we assigned the 0, 10, 60 and
|
||||
# 70 minute samples to one group, the 30, 40, 90 and 100 minute samples to
|
||||
# another group, and calculated differential expression values between these
|
||||
@ -415,15 +423,15 @@ myDesign
|
||||
|
||||
# Now we can calculate the fit of all rows to a linear model that depends
|
||||
# on the two groups as specified in the design:
|
||||
myFit <- lmFit(mySet, myDesign)
|
||||
myFit <- limma::lmFit(mySet, myDesign)
|
||||
|
||||
# Next we calculate the contrasts, given the fit ...
|
||||
myCont.matrix <- makeContrasts(A - B, levels = myDesign)
|
||||
myFit2 <- contrasts.fit(myFit, myCont.matrix)
|
||||
myCont.matrix <- limma::makeContrasts(A - B, levels = myDesign)
|
||||
myFit2 <- limma::contrasts.fit(myFit, myCont.matrix)
|
||||
|
||||
# ... compute appropriate probabilites from a modified t-test
|
||||
# (empirical Bayes) ...
|
||||
myFit2 <- eBayes(myFit2, 0.01)
|
||||
myFit2 <- limma::eBayes(myFit2, 0.01)
|
||||
|
||||
# ... add the gene names to the fit - object ...
|
||||
myFit2$genes <- featureNames(mySet)
|
||||
@ -433,7 +441,10 @@ myFit2$genes <- featureNames(mySet)
|
||||
# gave us only the top 250 genes, but we might as well do 1000, just so we
|
||||
# can be reasonable sure that our gens of interest are included.
|
||||
N <- 1000
|
||||
myTable <- topTable(myFit2, adjust.method = "fdr", sort.by = "B", number = N)
|
||||
myTable <- limma::topTable(myFit2,
|
||||
adjust.method = "fdr",
|
||||
sort.by = "B",
|
||||
number = N)
|
||||
|
||||
str(myTable)
|
||||
# The gene names are now in the $ID column
|
||||
@ -461,7 +472,7 @@ abline(h = 0, col = "#00000055")
|
||||
|
||||
for (i in 1:10) {
|
||||
thisID <- myTable$ID[i]
|
||||
points(seq(0, 120, by = 10), exprs(GSE3635)[thisID, ], type = "b")
|
||||
points(seq(0, 120, by = 10), Biobase::exprs(GSE3635)[thisID, ], type = "b")
|
||||
}
|
||||
|
||||
# Our guess that we might discover interesting genes be selecting groups A and B
|
||||
@ -480,7 +491,10 @@ for (i in 1:10) {
|
||||
myControls <- c("Cdc14", "Mbp1", "Swi6", "Swi4", "Whi5", "Cln1", "Cln2", "Cln3")
|
||||
for (name in toupper(myControls)) {
|
||||
thisID <- SGD_features$sysName[which(SGD_features$name == name)]
|
||||
points(seq(0, 120, by=10), exprs(GSE3635)[thisID, ], type="b", col="#AA0000")
|
||||
points(seq(0, 120, by=10),
|
||||
Biobase::exprs(GSE3635)[thisID, ],
|
||||
type="b",
|
||||
col="#AA0000")
|
||||
}
|
||||
|
||||
# Indeed, the discovered gene profiles look much "cleaner" than the real cycle
|
||||
@ -504,7 +518,7 @@ for (name in toupper(myControls)) {
|
||||
gName <- "CLN2"
|
||||
(iFeature <- which(SGD_features$name == gName))
|
||||
(iExprs <- which(featureNames(GSE3635) == SGD_features$sysName[iFeature]))
|
||||
Cln2Profile <- exprs(GSE3635)[iExprs, ]
|
||||
Cln2Profile <- Biobase::exprs(GSE3635)[iExprs, ]
|
||||
plot(seq(0, 120, by = 10),
|
||||
Cln2Profile,
|
||||
ylim = c(-1, 1),
|
||||
@ -519,16 +533,16 @@ abline(v = 60, col = "#00000055")
|
||||
# Set up a vector of correlation values
|
||||
|
||||
|
||||
myCorrelations <- numeric(nrow(exprs(GSE3635)))
|
||||
names(myCorrelations) <- featureNames(GSE3635)
|
||||
myCorrelations <- numeric(nrow(Biobase::exprs(GSE3635)))
|
||||
names(myCorrelations) <- Biobase::featureNames(GSE3635)
|
||||
for (i in 1:length(myCorrelations)) {
|
||||
myCorrelations[i] <- cor(Cln2Profile, exprs(GSE3635)[i, ])
|
||||
myCorrelations[i] <- cor(Cln2Profile, Biobase::exprs(GSE3635)[i, ])
|
||||
}
|
||||
|
||||
myTopC <- order(myCorrelations, decreasing = TRUE)[1:10] # top ten
|
||||
|
||||
# Number 1
|
||||
(ID <- featureNames(GSE3635)[myTopC[1]])
|
||||
(ID <- Biobase::featureNames(GSE3635)[myTopC[1]])
|
||||
|
||||
# Get information
|
||||
SGD_features[which(SGD_features$sysName == ID), ]
|
||||
@ -537,12 +551,13 @@ SGD_features[which(SGD_features$sysName == ID), ]
|
||||
|
||||
# Let's plot the rest
|
||||
for (i in 2:length(myTopC)) {
|
||||
ID <- featureNames(GSE3635)[myTopC[i]]
|
||||
ID <- Biobase::featureNames(GSE3635)[myTopC[i]]
|
||||
points(seq(0, 120, by = 10),
|
||||
exprs(GSE3635)[ID, ],
|
||||
Biobase::exprs(GSE3635)[ID, ],
|
||||
type = "b",
|
||||
col= "chartreuse")
|
||||
print(SGD_features[which(SGD_features$sysName == ID), c("name", "description")])
|
||||
print(SGD_features[which(SGD_features$sysName == ID),
|
||||
c("name", "description")])
|
||||
}
|
||||
|
||||
# Note that all of these genes are highly correlated with a known cell cycle
|
||||
@ -554,12 +569,13 @@ for (i in 2:length(myTopC)) {
|
||||
# And we haven't even looked at the anticorrelated genes yet...
|
||||
myBottomC <- order(myCorrelations, decreasing = FALSE)[1:10] # bottom ten
|
||||
for (i in 1:length(myBottomC)) {
|
||||
ID <- featureNames(GSE3635)[myBottomC[i]]
|
||||
ID <- Biobase::featureNames(GSE3635)[myBottomC[i]]
|
||||
points(seq(0, 120, by = 10),
|
||||
exprs(GSE3635)[ID, ],
|
||||
Biobase::exprs(GSE3635)[ID, ],
|
||||
type = "b",
|
||||
col= "coral")
|
||||
print(SGD_features[which(SGD_features$sysName == ID), c("name", "description")])
|
||||
print(SGD_features[which(SGD_features$sysName == ID),
|
||||
c("name", "description")])
|
||||
}
|
||||
# ... which are very interesting in their own right.
|
||||
|
||||
@ -583,7 +599,7 @@ for (i in 1:length(myBottomC)) {
|
||||
# we used getGEO("GSE3635", GSEMatrix = TRUE, getGPL = FALSE), and the GPL
|
||||
# annotations were not loaded. We could use getGPL = TRUE instead ...
|
||||
|
||||
GSE3635annot <- getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE)
|
||||
GSE3635annot <- GEOquery::getGEO("GSE3635", GSEMatrix = TRUE, getGPL = TRUE)
|
||||
GSE3635annot <- GSE3635annot[[1]]
|
||||
|
||||
# ... and the feature data is then available in the GSE3635@featureData@data
|
||||
@ -597,13 +613,8 @@ GSE3635annot@featureData@data[ 1:20 , ]
|
||||
myAnnot <- GSE3635annot@featureData@data[ , c("SPOT_ID", "Gene")]
|
||||
str(myAnnot)
|
||||
|
||||
# ... Note that this is a data frame, but - oy veh - the gene symbols are
|
||||
# factors. Really, we need to fix this! To convert a factor into a string,
|
||||
# we need to cast it to character.
|
||||
|
||||
myAnnot$Gene <- as.character(myAnnot$Gene)
|
||||
|
||||
# ... whereupon we can find things we might be looking for ...
|
||||
# ... Note that this is a data frame and it is easy to find things we
|
||||
# might be looking for ...
|
||||
myAnnot[which(myAnnot$Gene == "MBP1"), ]
|
||||
|
||||
# ... or identify rows that might give us trouble, such as probes that
|
||||
@ -614,13 +625,11 @@ myAnnot[which(myAnnot$Gene == "MBP1"), ]
|
||||
GSE3635@annotation # "GPL1914"
|
||||
|
||||
# ... and downloaded it directly from NCBI:
|
||||
GPL1914 <- getGEO("GPL1914")
|
||||
GPL1914 <- GEOquery::getGEO("GPL1914")
|
||||
str(GPL1914)
|
||||
|
||||
# ... from which we can get the data - which is however NOT necessarily
|
||||
# matched to the rows of our expression dataset. Note that here too: the
|
||||
# majority of data elements are factors and will likely have to be converted
|
||||
# before use.
|
||||
# matched to the rows of our expression dataset.
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -3,12 +3,15 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Genetic_code_optimality unit.
|
||||
#
|
||||
# Version: 1.1
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# use Biocmanager:: not biocLite()
|
||||
# 1.1 Update set.seed() usage
|
||||
# 1.0.1 Fixed two bugs discovered by Suan Chin Yeo.
|
||||
# 1.0 New material.
|
||||
@ -30,16 +33,16 @@
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------------
|
||||
#TOC> 1 Designing a computational experiment 54
|
||||
#TOC> 2 Setting up the tools 70
|
||||
#TOC> 2.1 Natural and alternative genetic codes 73
|
||||
#TOC> 2.2 Effect of mutations 132
|
||||
#TOC> 2.2.1 reverse-translate 143
|
||||
#TOC> 2.2.2 Randomly mutate 168
|
||||
#TOC> 2.2.3 Forward- translate 193
|
||||
#TOC> 2.2.4 measure effect 211
|
||||
#TOC> 3 Run the experiment 258
|
||||
#TOC> 4 Task solutions 351
|
||||
#TOC> 1 Designing a computational experiment 57
|
||||
#TOC> 2 Setting up the tools 73
|
||||
#TOC> 2.1 Natural and alternative genetic codes 76
|
||||
#TOC> 2.2 Effect of mutations 134
|
||||
#TOC> 2.2.1 reverse-translate 145
|
||||
#TOC> 2.2.2 Randomly mutate 170
|
||||
#TOC> 2.2.3 Forward- translate 195
|
||||
#TOC> 2.2.4 measure effect 213
|
||||
#TOC> 3 Run the experiment 260
|
||||
#TOC> 4 Task solutions 356
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -73,12 +76,11 @@
|
||||
# == 2.1 Natural and alternative genetic codes =============================
|
||||
|
||||
# Load genetic code tables from the Biostrings package
|
||||
if (! require(Biostrings, quietly=TRUE)) {
|
||||
if (! exists("biocLite")) {
|
||||
source("https://bioconductor.org/biocLite.R")
|
||||
if (! requireNamespace("BiocManager", quietly = TRUE)) {
|
||||
install.packages("BiocManager")
|
||||
}
|
||||
biocLite("Biostrings")
|
||||
library(Biostrings)
|
||||
if (! requireNamespace("Biostrings", quietly = TRUE)) {
|
||||
BiocManager::install("Biostrings")
|
||||
}
|
||||
# Package information:
|
||||
# library(help = Biostrings) # basic information
|
||||
@ -257,52 +259,55 @@ evalMut <- function(nat, mut) {
|
||||
|
||||
# = 3 Run the experiment ==================================================
|
||||
|
||||
# Fetch the standard Genetic code from Biostrings::
|
||||
|
||||
stdCode <- Biostrings::GENETIC_CODE
|
||||
|
||||
# Fetch the nucleotide sequence for MBP1:
|
||||
|
||||
myDNA <- readLines("./data/S288C_YDL056W_MBP1_coding.fsa")[-1]
|
||||
myDNA <- paste0(myDNA, collapse = "")
|
||||
myDNA <- as.character(codons(DNAString(myDNA)))
|
||||
myDNA <- as.character(Biostrings::codons(Biostrings::DNAString(myDNA)))
|
||||
myDNA <- myDNA[-length(myDNA)] # drop the stop codon
|
||||
|
||||
myAA <- traFor(myDNA, GENETIC_CODE)
|
||||
myAA <- traFor(myDNA, stdCode)
|
||||
|
||||
# Mutate and evaluate
|
||||
set.seed(112358)
|
||||
x <- randMut(myDNA)
|
||||
set.seed(NULL)
|
||||
x <- traFor(x, GENETIC_CODE)
|
||||
x <- traFor(x, stdCode)
|
||||
evalMut(myAA, x) # 166.4
|
||||
|
||||
# Try this 200 times, and see how the values are distributed.
|
||||
N <- 200
|
||||
valUGC <- numeric(N)
|
||||
valSTDC <- numeric(N)
|
||||
|
||||
set.seed(112358) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
x <- randMut(myDNA) # mutate
|
||||
x <- traFor(x, GENETIC_CODE) # translate
|
||||
valUGC[i] <- evalMut(myAA, x) # evaluate
|
||||
x <- traFor(x, stdCode) # translate
|
||||
valSTDC[i] <- evalMut(myAA, x) # evaluate
|
||||
}
|
||||
set.seed(NULL) # reset the RNG
|
||||
|
||||
hist(valUGC,
|
||||
hist(valSTDC,
|
||||
breaks = 15,
|
||||
col = "palegoldenrod",
|
||||
xlim = c(0, 400),
|
||||
ylim = c(0, N/4),
|
||||
main = "Universal vs. Synthetic Genetic Code",
|
||||
main = "Standard vs. Synthetic Genetic Code",
|
||||
xlab = "Mutation penalty")
|
||||
|
||||
# This looks like a normal distribution. Let's assume the effect of mutations
|
||||
# under the universal genetic code is the mean of this distribution:
|
||||
effectUGC <- mean(valUGC) # 178.1
|
||||
# under the standard genetic code is the mean of this distribution:
|
||||
effectSTDC <- mean(valSTDC) # 178.1
|
||||
|
||||
# Now we can look at the effects of alternate genetic codes:
|
||||
|
||||
set.seed(112358)
|
||||
# choose a new code
|
||||
GC <- randomGC(GENETIC_CODE)
|
||||
GC <- randomGC(stdCode)
|
||||
set.seed(NULL)
|
||||
|
||||
# reverse translate hypothetical sequence according to the new code
|
||||
@ -321,7 +326,7 @@ valXGC <- numeric(N)
|
||||
|
||||
set.seed(1414214) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
GC <- randomGC(GENETIC_CODE) # Choose code
|
||||
GC <- randomGC(stdCode) # Choose code
|
||||
x <- traRev(myAA, GC) # reverse translate
|
||||
x <- randMut(x) # mutate
|
||||
x <- traFor(x, GC) # translate
|
||||
@ -355,7 +360,7 @@ valSGC <- numeric(N)
|
||||
|
||||
set.seed(2718282) # set RNG seed for repeatable randomness
|
||||
for (i in 1:N) {
|
||||
GC <- swappedGC(GENETIC_CODE) # Choose code
|
||||
GC <- swappedGC(stdCode) # Choose code
|
||||
x <- traRev(myAA, GC) # reverse translate
|
||||
x <- randMut(x) # mutate
|
||||
x <- traFor(x, GC) # translate
|
||||
|
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.0.1
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 - 2018 12
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout,
|
||||
# 1.0.1 Updates for slightly changed interfaces
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
@ -29,10 +31,10 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 Constructing a POST command from a Web query 44
|
||||
#TOC> 1.1 Task - fetchPrositeFeatures() function 145
|
||||
#TOC> 2 Task solutions 153
|
||||
#TOC> ---------------------------------------------------------------------
|
||||
#TOC> 1 Constructing a POST command from a Web query 42
|
||||
#TOC> 1.1 Task - fetchPrositeFeatures() function 142
|
||||
#TOC> 2 Task solutions 150
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -40,9 +42,8 @@
|
||||
# = 1 Constructing a POST command from a Web query ========================
|
||||
|
||||
|
||||
if (! require(httr, quietly=TRUE)) {
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
library(httr)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
@ -60,7 +61,7 @@ UniProtID <- "P39678"
|
||||
|
||||
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
|
||||
|
||||
response <- POST(URL,
|
||||
response <- httr::POST(URL,
|
||||
body = list(meta = "opt1",
|
||||
meta1_protein = "opt1",
|
||||
seq = UniProtID,
|
||||
@ -70,14 +71,14 @@ response <- POST(URL,
|
||||
# Send off this request, and you should have a response in a few
|
||||
# seconds. Let's check the status first:
|
||||
|
||||
status_code(response) # If this is not 200, something went wrong and it
|
||||
httr::status_code(response) # If this is not 200, something went wrong and it
|
||||
# makes no sense to continue. If this persists, ask
|
||||
# on the mailing list what to do.
|
||||
|
||||
|
||||
# The text contents of the response is available with the
|
||||
# content() function:
|
||||
content(response, "text")
|
||||
httr::content(response, "text")
|
||||
|
||||
# ... should show you the same as the page contents that
|
||||
# you have seen in the browser. The date we need Now we need to extract
|
||||
@ -86,7 +87,7 @@ content(response, "text")
|
||||
# individual lines, since each of our data elements is on
|
||||
# its own line. We simply split on the "\\n" newline character.
|
||||
|
||||
lines <- unlist(strsplit(content(response, "text"), "\\n"))
|
||||
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
|
||||
head(lines)
|
||||
|
||||
# Now we define a query pattern for the lines we want:
|
||||
|
76
RPR-SX-PDB.R
76
RPR-SX-PDB.R
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-SX-PDB unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 19
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First live version, completely refactores 2016 code
|
||||
# with remarkable speed gains. Added section on x, y, z
|
||||
# (density) plots.
|
||||
@ -30,19 +32,19 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 Introduction to the bio3D package 63
|
||||
#TOC> 2 A Ramachandran plot 151
|
||||
#TOC> 3 Density plots 227
|
||||
#TOC> 3.1 Density-based colours 241
|
||||
#TOC> 3.2 Plotting with smoothScatter() 260
|
||||
#TOC> 3.3 Plotting hexbins 275
|
||||
#TOC> 3.4 Plotting density contours 299
|
||||
#TOC> 3.4.1 ... as overlay on a colored grid 333
|
||||
#TOC> 3.4.2 ... as filled countour 350
|
||||
#TOC> 3.4.3 ... as a perspective plot 381
|
||||
#TOC> 4 cis-peptide bonds 399
|
||||
#TOC> 5 H-bond lengths 414
|
||||
#TOC> ----------------------------------------------------------
|
||||
#TOC> 1 Introduction to the bio3D package 61
|
||||
#TOC> 2 A Ramachandran plot 152
|
||||
#TOC> 3 Density plots 228
|
||||
#TOC> 3.1 Density-based colours 242
|
||||
#TOC> 3.2 Plotting with smoothScatter() 261
|
||||
#TOC> 3.3 Plotting hexbins 276
|
||||
#TOC> 3.4 Plotting density contours 304
|
||||
#TOC> 3.4.1 ... as overlay on a colored grid 337
|
||||
#TOC> 3.4.2 ... as filled countour 354
|
||||
#TOC> 3.4.3 ... as a perspective plot 385
|
||||
#TOC> 4 cis-peptide bonds 403
|
||||
#TOC> 5 H-bond lengths 418
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -59,9 +61,8 @@
|
||||
# = 1 Introduction to the bio3D package ===================================
|
||||
|
||||
|
||||
if (! require(bio3d, quietly=TRUE)) {
|
||||
if (! requireNamespace("bio3d", quietly = TRUE)) {
|
||||
install.packages("bio3d")
|
||||
library(bio3d)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = bio3d) # basic information
|
||||
@ -89,8 +90,8 @@ file.show("./data/1BM8.pdb")
|
||||
# Are all atoms of the N-terminal residue present?
|
||||
# Are all atoms of the C-terminal residue present?
|
||||
|
||||
apses <- read.pdb("1bm8") # load a molecule directly from the PDB via the
|
||||
# Internet. (This is not your local version.)
|
||||
apses <- bio3d::read.pdb("1bm8") # load a molecule directly from the PDB via
|
||||
# the Internet.
|
||||
|
||||
# check what we have:
|
||||
apses
|
||||
@ -121,10 +122,11 @@ apses$atom[apses$atom[,"resno"] == i, ]
|
||||
apses$seqres[1:10] # the "A"s here identify chain "A"
|
||||
|
||||
# Can we convert this to one letter code?
|
||||
aa321(apses$seqres[1:10])
|
||||
bio3d::aa321(apses$seqres[1:10])
|
||||
|
||||
# Lets get the implicit sequence:
|
||||
aa321((apses$atom$resid[apses$calpha])[1:10]) # Do you understand this code?
|
||||
bio3d::aa321((apses$atom$resid[apses$calpha])[1:10])
|
||||
# Do you understand this code?
|
||||
|
||||
# Do explicit and implicit sequence have the same length?
|
||||
length(apses$seqres)
|
||||
@ -140,7 +142,10 @@ apses$atom[sel, c("eleno", "elety", "resid", "chain", "resno", "insert")]
|
||||
# The introduction to bio3d tutorial at
|
||||
# http://thegrantlab.org/bio3d/tutorials/structure-analysis
|
||||
# has the following example:
|
||||
plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor")
|
||||
bio3d::plot.bio3d(apses$atom$b[apses$calpha],
|
||||
sse=apses,
|
||||
typ="l",
|
||||
ylab="B-factor")
|
||||
|
||||
# Good for now. Let's do some real work.
|
||||
|
||||
@ -149,7 +154,7 @@ plot.bio3d(apses$atom$b[apses$calpha], sse=apses, typ="l", ylab="B-factor")
|
||||
# Calculate a Ramachandran plot for the structure. The torsion.pdb() function
|
||||
# calculates all dihedral angles for backbone and sidechain bonds, NA where
|
||||
# the bond does not exist in an amino acid.
|
||||
tor <- torsion.pdb(apses)
|
||||
tor <- bio3d::torsion.pdb(apses)
|
||||
plot(tor$phi, tor$psi,
|
||||
xlim = c(-180, 180), ylim = c(-180, 180),
|
||||
main = "Ramachandran plot for 1BM8",
|
||||
@ -164,7 +169,7 @@ abline(v = 0, lwd = 0.5, col = "#00000044")
|
||||
# color the points for glycine residues differently. First, we
|
||||
# get a vector of glycine residue indices in the structure:
|
||||
|
||||
mySeq <- pdbseq(apses)
|
||||
mySeq <- bio3d::pdbseq(apses)
|
||||
|
||||
# Explore the result object and extract the indices of GLY resiues.
|
||||
mySeq == "G"
|
||||
@ -210,7 +215,7 @@ for (i in 1:nrow(dat)) {
|
||||
points(dat$phi[i], dat$psi[i], pch=21, cex=0.9, bg="#CC0000")
|
||||
text(dat$phi[i],
|
||||
dat$psi[i],
|
||||
labels = sprintf("%s%d", aa321(dat$resid[i]), dat$resno[i]),
|
||||
labels = sprintf("%s%d", bio3d::aa321(dat$resid[i]), dat$resno[i]),
|
||||
pos = 4,
|
||||
offset = 0.4,
|
||||
cex = 0.7)
|
||||
@ -272,9 +277,8 @@ abline(v = 0, lwd = 0.5, col = "#00000044")
|
||||
|
||||
# If we wish to approximate values in a histogram-like fashion, we can use
|
||||
# hexbin()
|
||||
if (! require(hexbin, quietly=TRUE)) {
|
||||
if (! requireNamespace("hexbin", quietly = TRUE)) {
|
||||
install.packages("hexbin")
|
||||
library(hexbin)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = hexbin) # basic information
|
||||
@ -285,12 +289,17 @@ if (! require(hexbin, quietly=TRUE)) {
|
||||
myColorRamp <- colorRampPalette(c("#EEEEEE",
|
||||
"#3399CC",
|
||||
"#2266DD"))
|
||||
plot(hexbin(phi, psi, xbins = 10),
|
||||
hexbin::gplot.hexbin(hexbin::hexbin(phi, psi, xbins = 10),
|
||||
colramp = myColorRamp,
|
||||
main = "phi-psi Density Bins for 1BM8",
|
||||
xlab = expression(phi),
|
||||
ylab = expression(psi))
|
||||
|
||||
# Note:
|
||||
# Had we loaded hexbin with library(hexbin), the plot function would have
|
||||
# been dispatched by the plot() generic, and we could simply have written:
|
||||
# plot(hexbin(phi, psi, xbins = 10), ...
|
||||
|
||||
|
||||
# == 3.4 Plotting density contours =========================================
|
||||
|
||||
@ -305,17 +314,16 @@ plot(hexbin(phi, psi, xbins = 10),
|
||||
# distributions. But for 2D data like or phi-psi plots, we need a function from
|
||||
# the MASS package: kde2d()
|
||||
|
||||
if (! require(MASS, quietly=TRUE)) {
|
||||
if (! requireNamespace("MASS", quietly = TRUE)) {
|
||||
install.packages("MASS")
|
||||
library(MASS)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = MASS) # basic information
|
||||
# browseVignettes("MASS") # available vignettes
|
||||
# data(package = "MASS") # available datasets
|
||||
|
||||
?kde2d
|
||||
dPhiPsi <-kde2d(phi, psi,
|
||||
?MASS::kde2d
|
||||
dPhiPsi <-MASS::kde2d(phi, psi,
|
||||
n = 60,
|
||||
lims = c(-180, 180, -180, 180))
|
||||
|
||||
@ -469,7 +477,7 @@ ssSelect <- function(PDB, myChain = "A", ssType, myElety) {
|
||||
|
||||
# get id's from PDB
|
||||
|
||||
x <- atom.select(PDB,
|
||||
x <- bio3d::atom.select(PDB,
|
||||
string = "protein",
|
||||
type = "ATOM",
|
||||
chain = myChain,
|
||||
@ -506,7 +514,7 @@ pairDist <- function(PDB, a, b) {
|
||||
|
||||
A <- PDB$atom[a, c("x", "y", "z")]
|
||||
B <- PDB$atom[b, c("x", "y", "z")]
|
||||
dMat <- dist.xyz(A, B)
|
||||
dMat <- bio3d::dist.xyz(A, B)
|
||||
|
||||
}
|
||||
return(dMat)
|
||||
|
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 05
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
@ -28,10 +30,10 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ----------------------------------------------------
|
||||
#TOC> 1 UniProt files via GET 44
|
||||
#TOC> 1.1 Task - fetchUniProtSeq() function 107
|
||||
#TOC> 2 Task solutions 114
|
||||
#TOC> ----------------------------------------------------------
|
||||
#TOC> 1 UniProt files via GET 41
|
||||
#TOC> 1.1 Task - fetchUniProtSeq() function 103
|
||||
#TOC> 2 Task solutions 110
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -48,9 +50,8 @@
|
||||
# a Web browser. Since this is a short and simple request, the GET verb is the
|
||||
# right tool:
|
||||
|
||||
if (! require(httr, quietly=TRUE)) {
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
library(httr)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
@ -69,7 +70,7 @@ UniProtID <- "P39678"
|
||||
(URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID))
|
||||
|
||||
# the GET() function from httr will get the data.
|
||||
response <- GET(URL)
|
||||
response <- httr::GET(URL)
|
||||
|
||||
str(response) # the response object is a bit complex ...
|
||||
as.character(response) # ... but it is easy to pull out the data.
|
||||
@ -82,21 +83,21 @@ dbSanitizeSequence(x)
|
||||
# Simple.
|
||||
# But what happens if there is an error, e.g. the uniprot ID does not exist?
|
||||
|
||||
response <- GET("http://www.uniprot.org/uniprot/X000000.fasta")
|
||||
response <- httr::GET("http://www.uniprot.org/uniprot/X000000.fasta")
|
||||
as.character(response)
|
||||
# this is a large HTML page that tells us the URL was not found. So we need to
|
||||
# check for errors. The Right way to do this is to evaluate the staus code that
|
||||
# check for errors. The Right Way to do this is to evaluate the staus code that
|
||||
# every Web server returns for every transaction.
|
||||
#
|
||||
status_code(response) # 404 == Page Not Found
|
||||
httr::status_code(response) # 404 == Page Not Found
|
||||
|
||||
# There are many possible codes, but the only code we will be happy with
|
||||
# is 200 - oK.
|
||||
# (cf. https://en.wikipedia.org/wiki/List_of_HTTP_status_codes )
|
||||
|
||||
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", UniProtID)
|
||||
response <- GET(URL)
|
||||
status_code(response)
|
||||
response <- httr::GET(URL)
|
||||
httr::status_code(response)
|
||||
|
||||
|
||||
# == 1.1 Task - fetchUniProtSeq() function =================================
|
||||
|
@ -3,12 +3,13 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Unit_testing unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 15
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace()
|
||||
# 1.0 New code
|
||||
#
|
||||
#
|
||||
@ -27,10 +28,11 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------
|
||||
#TOC> 1 Unit Tests with testthat 43
|
||||
#TOC> 2 Organizing your tests 156
|
||||
#TOC> 3 Task solutions 181
|
||||
#TOC> -------------------------------------------------
|
||||
#TOC> 1 Unit Tests with testthat 40
|
||||
#TOC> 2 Organizing your tests 159
|
||||
#TOC> 2.1 Testing scripts 183
|
||||
#TOC> 3 Task solutions 198
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -39,15 +41,22 @@
|
||||
|
||||
# The testthat package supports writing and executing unit tests in many ways.
|
||||
|
||||
if (! require(testthat, quietly=TRUE)) {
|
||||
if (! requireNamespace("testthat", quietly = TRUE)) {
|
||||
install.packages("testthat")
|
||||
library(testthat)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = testthat) # basic information
|
||||
# browseVignettes("testthat") # available vignettes
|
||||
# data(package = "testthat") # available datasets
|
||||
|
||||
# testthat is one of those packages that we either use A LOT in a script,
|
||||
# or not at all. Therfore it's more reasonable to depart from our usual
|
||||
# <package>::<function>() idiom, and load the entire library. In fact, if
|
||||
# we author packages, it is common practice to load testthat in the part
|
||||
# of the package that automates testing.
|
||||
|
||||
library(testthat)
|
||||
|
||||
# An atomic test consists of an expectation about the bahaviour of a function or
|
||||
# the existence of an object. testthat provides a number of useful expectations:
|
||||
|
||||
@ -171,6 +180,20 @@ test_file("./tests/test_biCode.R")
|
||||
# .. or execute all the test files in the directory:
|
||||
test_dir("./tests")
|
||||
|
||||
# == 2.1 Testing scripts ===================================================
|
||||
|
||||
# Scripts need special consideration since we do not necessarily source() them
|
||||
# entirely. Therefore automated testing is not reasonable. What you can do
|
||||
# instead is to place a conditional block at the end of your script, that
|
||||
# never gets executed - then you can manually execute the code in the block
|
||||
# whenever you wish to test your functions. For example:
|
||||
|
||||
if (FALSE) {
|
||||
# ... your tests go here
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
# = 3 Task solutions ======================================================
|
||||
|
||||
|
@ -3,12 +3,14 @@
|
||||
# Purpose: A Bioinformatics Course:
|
||||
# R code accompanying the RPR-Scripting_data_downloads unit.
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 05
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First ABC units version
|
||||
# 0.1 First code copied from 2016 material.
|
||||
#
|
||||
@ -28,10 +30,10 @@
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -----------------------------------------------------
|
||||
#TOC> 1 Working with NCBI eUtils 44
|
||||
#TOC> 1.1 Task - fetchNCBItaxData() function 162
|
||||
#TOC> 2 Task solutions 169
|
||||
#TOC> -----------------------------------------------------------
|
||||
#TOC> 1 Working with NCBI eUtils 41
|
||||
#TOC> 1.1 Task - fetchNCBItaxData() function 144
|
||||
#TOC> 2 Task solutions 151
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
@ -40,26 +42,11 @@
|
||||
|
||||
|
||||
|
||||
# To begin, we load some libraries with functions
|
||||
# we need...
|
||||
|
||||
# ... the package httr, which sends and receives information via the http
|
||||
# protocol, just like a Web browser.
|
||||
if (! require(httr, quietly=TRUE)) {
|
||||
install.packages("httr")
|
||||
library(httr)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = httr) # basic information
|
||||
# browseVignettes("httr") # available vignettes
|
||||
# data(package = "httr") # available datasets
|
||||
|
||||
|
||||
# ...plus the package xml2: NCBI's eUtils send information in XML format so we
|
||||
# need to be able to parse XML.
|
||||
if (! require(xml2, quietly=TRUE)) {
|
||||
# To begin, we load the xml2 package that contains functions
|
||||
# we need to receive and parse html data. NCBI's eUtils send information in
|
||||
# XML format so we need to be able to parse XML.
|
||||
if (! requireNamespace("xml2", quietly=TRUE)) {
|
||||
install.packages("xml2")
|
||||
library(xml2)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = xml2) # basic information
|
||||
@ -91,24 +78,23 @@ URL <- paste(eUtilsBase,
|
||||
# what the response should look like.
|
||||
URL
|
||||
|
||||
# To fetch a response in R, we use the function GET() from the httr package
|
||||
# To fetch a response in R, we use the function read_xml()
|
||||
# with our URL as its argument.
|
||||
myXML <- read_xml(URL)
|
||||
myXML
|
||||
(myXML <- xml2::read_xml(URL))
|
||||
|
||||
# This is XML. We can take the response apart into
|
||||
# its indvidual components with the as_list() function.
|
||||
|
||||
as_list(myXML)
|
||||
xml2::as_list(myXML)
|
||||
|
||||
# Note how the XML "tree" is represented as a list of
|
||||
# lists of lists ...
|
||||
# If we know exactly what elelement we are looking for,
|
||||
# we can extract it from this structure:
|
||||
as_list(myXML)[["IdList"]][["Id"]][[1]]
|
||||
xml2::as_list(myXML)[["eSearchResult"]][["IdList"]][["Id"]][[1]]
|
||||
|
||||
# But this is not very robust, it would break with the
|
||||
# slightest change that the NCBI makes to their response
|
||||
# slightest change that the NCBI makes to their data format -
|
||||
# and the NCBI changes things A LOT!
|
||||
|
||||
# Somewhat more robust is to specify the type of element
|
||||
@ -116,11 +102,12 @@ as_list(myXML)[["IdList"]][["Id"]][[1]]
|
||||
# element, and use the XPath XML parsing language to
|
||||
# retrieve it.
|
||||
|
||||
xml_find_all(myXML, "//Id") # returns a "node set"
|
||||
xml2::xml_find_all(myXML, "//Id") # returns a "node set"
|
||||
|
||||
xml_text(xml_find_all(myXML, "//Id")) # returns the contents of the node set
|
||||
xml2::xml_text(xml2::xml_find_all(myXML, "//Id")) # returns the contents
|
||||
# of the node set
|
||||
|
||||
# We will need doing this a lot, so we write a function
|
||||
# We will need to do this more than once, so we write a function
|
||||
# for it...
|
||||
node2text <- function(doc, tag) {
|
||||
# an extractor function for the contents of elements
|
||||
@ -128,8 +115,8 @@ node2text <- function(doc, tag) {
|
||||
# Contents of all matching elements is returned in
|
||||
# a vector of strings.
|
||||
path <- paste0("//", tag)
|
||||
nodes <- xml_find_all(doc, path)
|
||||
return(xml_text(nodes))
|
||||
nodes <- xml2::xml_find_all(doc, path)
|
||||
return(xml2::xml_text(nodes))
|
||||
}
|
||||
|
||||
# using node2text() ...
|
||||
@ -145,7 +132,7 @@ URL <- paste0(eUtilsBase,
|
||||
"&id=",
|
||||
GID,
|
||||
"&version=2.0")
|
||||
(myXML <- read_xml(URL))
|
||||
(myXML <- xml2::read_xml(URL))
|
||||
|
||||
(taxID <- node2text(myXML, "TaxId"))
|
||||
(organism <- node2text(myXML, "Organism"))
|
||||
|
@ -22,17 +22,18 @@ setwd("<your/project/directory>")
|
||||
|
||||
|
||||
# ==== PACKAGES ==============================================================
|
||||
# Load all required packages.
|
||||
# Check that required packages have been installed. Install if needed.
|
||||
|
||||
if (! require(seqinr, quietly=TRUE)) {
|
||||
if (! requireNamespace("seqinr", quietly=TRUE)) {
|
||||
install.packages("seqinr")
|
||||
library(seqinr)
|
||||
}
|
||||
# Package information:
|
||||
# library(help = seqinr) # basic information
|
||||
# browseVignettes("seqinr") # available vignettes
|
||||
# data(package = "seqinr") # available datasets
|
||||
|
||||
# Note: use package functions with the :: operator - eg.
|
||||
# seqinr::aaa("K")
|
||||
|
||||
|
||||
|
||||
|
@ -9,21 +9,18 @@
|
||||
# ====== PACKAGES ==============================================================
|
||||
|
||||
|
||||
if (! require(jsonlite, quietly = TRUE)) {
|
||||
if (! requireNamespace("jsonlite", quietly = TRUE)) {
|
||||
install.packages("jsonlite")
|
||||
library(jsonlite)
|
||||
}
|
||||
|
||||
|
||||
if (! require(httr, quietly = TRUE)) {
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
library(httr)
|
||||
}
|
||||
|
||||
|
||||
if (! require(xml2, quietly = TRUE)) {
|
||||
if (! requireNamespace("xml2", quietly = TRUE)) {
|
||||
install.packages("xml2")
|
||||
library(xml2)
|
||||
}
|
||||
|
||||
|
||||
@ -226,10 +223,10 @@ dbFetchUniProtSeq <- function(ID) {
|
||||
|
||||
URL <- sprintf("http://www.uniprot.org/uniprot/%s.fasta", ID)
|
||||
|
||||
response <- GET(URL)
|
||||
response <- httr::GET(URL)
|
||||
|
||||
mySeq <- character()
|
||||
if (status_code(response) == 200) {
|
||||
if (httr::status_code(response) == 200) {
|
||||
x <- as.character(response)
|
||||
x <- strsplit(x, "\n")
|
||||
mySeq <- dbSanitizeSequence(x)
|
||||
@ -253,7 +250,7 @@ dbFetchPrositeFeatures <- function(ID) {
|
||||
|
||||
URL <- "https://prosite.expasy.org/cgi-bin/prosite/PSScan.cgi"
|
||||
|
||||
response <- POST(URL,
|
||||
response <- httr::POST(URL,
|
||||
body = list(meta = "opt1",
|
||||
meta1_protein = "opt1",
|
||||
seq = ID,
|
||||
@ -261,9 +258,9 @@ dbFetchPrositeFeatures <- function(ID) {
|
||||
output = "tabular"))
|
||||
|
||||
myFeatures <- data.frame()
|
||||
if (status_code(response) == 200) {
|
||||
if (httr::status_code(response) == 200) {
|
||||
|
||||
lines <- unlist(strsplit(content(response, "text"), "\\n"))
|
||||
lines <- unlist(strsplit(httr::content(response, "text"), "\\n"))
|
||||
|
||||
patt <- sprintf("\\|%s\\|", UniProtID)
|
||||
lines <- lines[grep(patt, lines)]
|
||||
@ -289,8 +286,8 @@ node2text <- function(doc, tag) {
|
||||
# Contents of all matching elements is returned in
|
||||
# a vector of strings.
|
||||
path <- paste0("//", tag)
|
||||
nodes <- xml_find_all(doc, path)
|
||||
return(xml_text(nodes))
|
||||
nodes <- xml2::xml_find_all(doc, path)
|
||||
return(xml2::xml_text(nodes))
|
||||
}
|
||||
|
||||
|
||||
@ -309,7 +306,7 @@ dbFetchNCBItaxData <- function(ID) {
|
||||
"db=protein",
|
||||
"&term=", ID,
|
||||
sep="")
|
||||
myXML <- read_xml(URL)
|
||||
myXML <- xml2::read_xml(URL)
|
||||
GID <- node2text(myXML, "Id")
|
||||
|
||||
URL <- paste0(eUtilsBase,
|
||||
@ -318,7 +315,7 @@ dbFetchNCBItaxData <- function(ID) {
|
||||
"&id=",
|
||||
GID,
|
||||
"&version=2.0")
|
||||
myXML <- read_xml(URL)
|
||||
myXML <- xml2::read_xml(URL)
|
||||
|
||||
x <- as.integer(node2text(myXML, "TaxId"))
|
||||
y <- node2text(myXML, "Organism")
|
||||
@ -346,14 +343,14 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
||||
# for IDs that are not mapped.
|
||||
|
||||
URL <- "https://www.uniprot.org/uploadlists/"
|
||||
response <- POST(URL,
|
||||
response <- httr::POST(URL,
|
||||
body = list(from = mapFrom,
|
||||
to = mapTo,
|
||||
format = "tab",
|
||||
query = s))
|
||||
|
||||
if (status_code(response) == 200) { # 200: oK
|
||||
myMap <- read.delim(file = textConnection(content(response)),
|
||||
if (httr::status_code(response) == 200) { # 200: oK
|
||||
myMap <- read.delim(file = textConnection(httr::content(response)),
|
||||
sep = "\t",
|
||||
stringsAsFactors = FALSE)
|
||||
myMap <- myMap[ , c(1,3)]
|
||||
@ -362,12 +359,23 @@ UniProtIDmap <- function (s, mapFrom = "P_REFSEQ_AC", mapTo = "ACC") {
|
||||
myMap <- data.frame()
|
||||
warning(paste("No uniProt ID mapping returned:",
|
||||
"server sent status",
|
||||
status_code(response)))
|
||||
httr::status_code(response)))
|
||||
}
|
||||
|
||||
return(myMap)
|
||||
}
|
||||
|
||||
|
||||
# ====== TESTS =================================================================
|
||||
|
||||
if (FALSE) {
|
||||
if (! requireNamespace("testthat", quietly = TRUE)) {
|
||||
install.packages("testthat")
|
||||
}
|
||||
|
||||
# ToDo: test everything here
|
||||
|
||||
}
|
||||
|
||||
|
||||
# [END]
|
||||
|
@ -3,14 +3,16 @@
|
||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||
# Mbp1 homologues.
|
||||
#
|
||||
# Version: 1.1.2
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2016 09 - 2017 09
|
||||
# Date: 2016 09 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# V 1.1.2 Moved BLAST.R to ./scripts directory
|
||||
# V 1.1 Update 2017
|
||||
# V 1.0 First code 2016
|
||||
# Versions
|
||||
# 1.2 Change from require() to requireNamespace()
|
||||
# 1.1.2 Moved BLAST.R to ./scripts directory
|
||||
# 1.1 Update 2017
|
||||
# 1.0 First code 2016
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
@ -31,27 +33,25 @@
|
||||
# the respective intermediate results.
|
||||
#
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------
|
||||
#TOC> 1 The strategy 54
|
||||
#TOC> 2 GOLD species 66
|
||||
#TOC> 2.1 Initialize 71
|
||||
#TOC> 2.2 Import 77
|
||||
#TOC> 2.3 Unique species 129
|
||||
#TOC> 3 BLAST species 171
|
||||
#TOC> 3.1 find homologous proteins 178
|
||||
#TOC> 3.2 Identify species in "hits" 202
|
||||
#TOC> 4 Intersect GOLD and BLAST species 247
|
||||
#TOC> 5 Cleanup and finish 265
|
||||
#TOC> ---------------------------------------------------------
|
||||
#TOC> 1 The strategy 55
|
||||
#TOC> 2 GOLD species 67
|
||||
#TOC> 2.1 Initialize 72
|
||||
#TOC> 2.2 Import 79
|
||||
#TOC> 2.3 Unique species 131
|
||||
#TOC> 3 BLAST species 173
|
||||
#TOC> 3.1 find homologous proteins 180
|
||||
#TOC> 3.2 Identify species in "hits" 204
|
||||
#TOC> 4 Intersect GOLD and BLAST species 249
|
||||
#TOC> 5 Cleanup and finish 267
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
#TOC>
|
||||
#TOC>
|
||||
|
||||
# = 1 The strategy ========================================================
|
||||
|
||||
# This script will create a list of "MYSPE" species and save it in an R object
|
||||
@ -70,9 +70,10 @@
|
||||
# (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
|
||||
|
||||
# == 2.1 Initialize ========================================================
|
||||
if (! require(httr)) { # httr provides interfaces to Webservers on the Internet
|
||||
|
||||
# httr provides interfaces to Webservers on the Internet
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
library(httr)
|
||||
}
|
||||
|
||||
# == 2.2 Import ============================================================
|
||||
|
@ -15,12 +15,14 @@
|
||||
# Data: (3 mb) https://downloads.yeastgenome.org/curation/literature/go_slim_mapping.tab
|
||||
#
|
||||
#
|
||||
# Version: 1.0
|
||||
# Version: 1.1
|
||||
#
|
||||
# Date: 2017 10 06
|
||||
# Date: 2017 10 - 2019 01
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First code copied from 2016 material.
|
||||
#
|
||||
# TODO:
|
||||
@ -28,16 +30,16 @@
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
if (! require(readr, quietly = TRUE)) {
|
||||
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||
install.packages("readr")
|
||||
library(readr)
|
||||
}
|
||||
|
||||
|
||||
# STRING functional interaction data
|
||||
|
||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||
STR <- read_delim("./data/4932.protein.links.full.v10.5.txt", delim = " ")
|
||||
STR <- readr::read_delim("./data/4932.protein.links.full.v10.5.txt",
|
||||
delim = " ")
|
||||
|
||||
# Subset only IDs and combined_score column
|
||||
STR <- STR[ , c("protein1", "protein2", "combined_score")]
|
||||
@ -61,7 +63,7 @@ myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene
|
||||
#
|
||||
# Read GOSlim data (needs to be downloaded from database, see URL in Notes)
|
||||
|
||||
Gsl <- read_tsv("./data/go_slim_mapping.tab",
|
||||
Gsl <- readr::read_tsv("./data/go_slim_mapping.tab",
|
||||
col_names = c("ID",
|
||||
"name",
|
||||
"SGDId",
|
||||
|
@ -7,11 +7,13 @@
|
||||
# https://ncbi.github.io/blast-cloud/dev/api.html
|
||||
#
|
||||
#
|
||||
# Version: 3
|
||||
# Date: 2016 09 - 2017 11
|
||||
# Version: 3.1
|
||||
# Date: 2016 09 - 2019 01
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
# Versions:
|
||||
# 3.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 3 parsing logic had not been fully implemented; Fixed.
|
||||
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
||||
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
||||
@ -29,9 +31,8 @@
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
if (! require(httr, quietly = TRUE)) {
|
||||
if (! requireNamespace(httr, quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
library(httr)
|
||||
}
|
||||
|
||||
|
||||
@ -92,13 +93,13 @@ BLAST <- function(q,
|
||||
}
|
||||
|
||||
# send it off ...
|
||||
response <- GET(results$query)
|
||||
if (http_status(response)$category != "Success" ) {
|
||||
response <- httr::GET(results$query)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
|
||||
http_status(response)$message))
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- content(response, "text", encoding = "UTF-8")
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
patt <- "RID = (\\w+)" # match the request id
|
||||
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
|
||||
@ -127,13 +128,13 @@ BLAST <- function(q,
|
||||
|
||||
while (TRUE) {
|
||||
# Check whether the result is ready
|
||||
response <- GET(checkStatus)
|
||||
if (http_status(response)$category != "Success" ) {
|
||||
response <- httr::GET(checkStatus)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
|
||||
http_status(response)$message))
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- content(response, "text", encoding = "UTF-8")
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
if (length(grep("Status=WAITING", txt)) > 0) {
|
||||
myTimeout <- myTimeout - EXTRAWAIT
|
||||
@ -184,13 +185,13 @@ BLAST <- function(q,
|
||||
"&FORMAT_TYPE=Text",
|
||||
sep = "")
|
||||
|
||||
response <- GET(retrieve)
|
||||
if (http_status(response)$category != "Success" ) {
|
||||
response <- httr::GET(retrieve)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
|
||||
http_status(response)$message))
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- content(response, "text", encoding = "UTF-8")
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
# txt contains the whole set of results. Process:
|
||||
|
||||
@ -357,7 +358,7 @@ parseBLASTalignment <- function(hit) {
|
||||
# ==== TESTS ===================================================================
|
||||
|
||||
# define query:
|
||||
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain sequence
|
||||
# q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
|
||||
# "LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
|
||||
# "GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
|
||||
# sep="")
|
||||
|
Loading…
Reference in New Issue
Block a user