Updated MYSPE data and entire workflow. Changed all .RData to .rds
This commit is contained in:
parent
3d91337e70
commit
7536473c5d
@ -6,9 +6,6 @@
|
|||||||
# UofT eMail address.
|
# UofT eMail address.
|
||||||
# myStudentNumber numeric Your UofT student number. Take care to have this
|
# myStudentNumber numeric Your UofT student number. Take care to have this
|
||||||
# correct.
|
# correct.
|
||||||
# MYSPE char A string with the species name of a genome-
|
|
||||||
# sequenced fungus. Which species this is will be
|
|
||||||
# determined in the BIN-MYSPE unit.
|
|
||||||
#
|
#
|
||||||
# NOTE:
|
# NOTE:
|
||||||
# After you have updated this script, save the file in your "myScripts" folder.
|
# After you have updated this script, save the file in your "myScripts" folder.
|
||||||
@ -18,6 +15,5 @@
|
|||||||
|
|
||||||
myEMail <- "<your-e-mail-address-here>" # e.g. "u.franklin@utoronto.ca"
|
myEMail <- "<your-e-mail-address-here>" # e.g. "u.franklin@utoronto.ca"
|
||||||
myStudentNumber <- <your-student-number-here> # e.g. 1003141592
|
myStudentNumber <- <your-student-number-here> # e.g. 1003141592
|
||||||
MYSPE <- "<your-adopted-species-here>" # e.g. "Neurospora crassa OR74A"
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
128
.utilities.R
128
.utilities.R
@ -1,11 +1,12 @@
|
|||||||
# .utilities.R
|
# tocID <- "./.utilities.R"
|
||||||
#
|
#
|
||||||
# Miscellaneous R code to suppport the project
|
# Miscellaneous R code to suppport the project
|
||||||
#
|
#
|
||||||
# Version: 1.3.1
|
# Version: 1.4
|
||||||
# Date: 2017 09 - 2019 11
|
# Date: 2017-09 - 2020-09
|
||||||
# Author: Boris Steipe
|
# Author: Boris Steipe
|
||||||
#
|
#
|
||||||
|
# V 1.4 Maintenance
|
||||||
# V 1.3.1 prefix Biostrings:: to subseq()
|
# V 1.3.1 prefix Biostrings:: to subseq()
|
||||||
# V 1.3 load msa support functions
|
# V 1.3 load msa support functions
|
||||||
# V 1.2 update database utilities to support 2017 version of JSON sources
|
# V 1.2 update database utilities to support 2017 version of JSON sources
|
||||||
@ -17,15 +18,39 @@
|
|||||||
#
|
#
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
# ====== SCRIPTS =============================================================
|
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
#TOC>
|
||||||
|
#TOC> Section Title Line
|
||||||
|
#TOC> -----------------------------------------------------------
|
||||||
|
#TOC> 1 SCRIPTS TO SOURCE 42
|
||||||
|
#TOC> 2 SUPPORT FUNCTIONS 49
|
||||||
|
#TOC> 2.1 objectInfo() 52
|
||||||
|
#TOC> 2.2 biCode() 80
|
||||||
|
#TOC> 2.3 pBar() 114
|
||||||
|
#TOC> 2.4 waitTimer() 136
|
||||||
|
#TOC> 2.5 fetchMSAmotif() 164
|
||||||
|
#TOC> 2.6 H() (Shannon entropy) 208
|
||||||
|
#TOC> 3 DATA 222
|
||||||
|
#TOC> 3.1 REFspecies 224
|
||||||
|
#TOC> 4 FUNCTIONS TO CUSTOMIZE ASSIGNMENTS 239
|
||||||
|
#TOC> 4.1 getMYSPE() 242
|
||||||
|
#TOC> 4.2 selectPDBrep() 251
|
||||||
|
#TOC>
|
||||||
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
# = 1 SCRIPTS TO SOURCE ===================================================
|
||||||
|
|
||||||
source("./scripts/ABC-dbUtilities.R")
|
source("./scripts/ABC-dbUtilities.R")
|
||||||
source("./scripts/ABC-writeALN.R")
|
source("./scripts/ABC-writeALN.R")
|
||||||
source("./scripts/ABC-writeMFA.R")
|
source("./scripts/ABC-writeMFA.R")
|
||||||
|
|
||||||
|
|
||||||
# ====== SUPPORT FUNCTIONS =====================================================
|
# = 2 SUPPORT FUNCTIONS ===================================================
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.1 objectInfo() ======================================================
|
||||||
objectInfo <- function(x) {
|
objectInfo <- function(x) {
|
||||||
# Function to combine various information items about R objects
|
# Function to combine various information items about R objects
|
||||||
#
|
#
|
||||||
@ -53,6 +78,7 @@ objectInfo <- function(x) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.2 biCode() ==========================================================
|
||||||
biCode <- function(s) {
|
biCode <- function(s) {
|
||||||
# Make a 5 character "biCode" from a binomial name by concatening
|
# Make a 5 character "biCode" from a binomial name by concatening
|
||||||
# the uppercased first three letter of the first word and the first
|
# the uppercased first three letter of the first word and the first
|
||||||
@ -86,6 +112,7 @@ biCode <- function(s) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.3 pBar() ============================================================
|
||||||
pBar <- function(i, l, nCh = 50) {
|
pBar <- function(i, l, nCh = 50) {
|
||||||
# Draw a progress bar in the console
|
# Draw a progress bar in the console
|
||||||
# i: the current iteration
|
# i: the current iteration
|
||||||
@ -107,6 +134,7 @@ pBar <- function(i, l, nCh = 50) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.4 waitTimer() =======================================================
|
||||||
waitTimer <- function(t, nIntervals = 50) {
|
waitTimer <- function(t, nIntervals = 50) {
|
||||||
# pause and wait for t seconds and display a progress bar as
|
# pause and wait for t seconds and display a progress bar as
|
||||||
# you are waiting
|
# you are waiting
|
||||||
@ -134,6 +162,7 @@ waitTimer <- function(t, nIntervals = 50) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.5 fetchMSAmotif() ===================================================
|
||||||
fetchMSAmotif <- function(ali, mot) {
|
fetchMSAmotif <- function(ali, mot) {
|
||||||
# Retrieve a subset from ali that spans the sequence in mot.
|
# Retrieve a subset from ali that spans the sequence in mot.
|
||||||
# Biostrings package must be installed.
|
# Biostrings package must be installed.
|
||||||
@ -177,40 +206,23 @@ fetchMSAmotif <- function(ali, mot) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 2.6 H() (Shannon entropy) =============================================
|
||||||
|
H <- function(x, N) {
|
||||||
# ====== PDB ID selection ======================================================
|
# calculate the Shannon entropy of the vector x given N possible states
|
||||||
|
# (in bits).
|
||||||
selectPDBrep <- function(n, seed = as.numeric(Sys.time())) {
|
# H(x) = - sum_i(P(x_i) * log2(P(x_i)); 0 * log(0) == 0
|
||||||
# Select n PDB IDs from a list of high-resolution, non-homologous, single
|
t <- table(x)
|
||||||
# domain, single chain structure files that represent a CATH topology
|
if (missing(N)) { N <- length(t) }
|
||||||
# group.
|
if (length(t) > N ) { stop("N can't be smaller than observed states.") }
|
||||||
# Parameters:
|
h <- sum(- (t / length(x)) * log2(t / length(x)))
|
||||||
# n num number of IDs to return
|
return(h)
|
||||||
# seed num a seed for the RNG
|
|
||||||
#
|
|
||||||
# Value: char PDB IDs
|
|
||||||
#
|
|
||||||
# Note: the list is loaded from an RData file in the "./data" directory.
|
|
||||||
# If you use this function for a course submissio, it MUST be invoked as:
|
|
||||||
#
|
|
||||||
# selectPDBrep(n, seed = myStudentNumber)
|
|
||||||
#
|
|
||||||
# ... and myStudentNumber MUST be correctly initialized
|
|
||||||
|
|
||||||
load("./data/pdbRep.RData") # loads pdbRep
|
|
||||||
if (n > length(pdbRep)) {
|
|
||||||
stop(sprintf("There are only %d PDB IDs in the table to choose from.",
|
|
||||||
length(pdbRep)))
|
|
||||||
}
|
|
||||||
set.seed(seed)
|
|
||||||
return(sample(pdbRep, n))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# ====== DATA ==================================================================
|
|
||||||
|
|
||||||
|
# = 3 DATA ================================================================
|
||||||
|
|
||||||
|
# == 3.1 REFspecies ========================================================
|
||||||
# 10 species of fungi for reference analysis.
|
# 10 species of fungi for reference analysis.
|
||||||
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
||||||
REFspecies <- c("Aspergillus nidulans",
|
REFspecies <- c("Aspergillus nidulans",
|
||||||
@ -222,8 +234,52 @@ REFspecies <- c("Aspergillus nidulans",
|
|||||||
"Saccharomyces cerevisiae",
|
"Saccharomyces cerevisiae",
|
||||||
"Schizosaccharomyces pombe",
|
"Schizosaccharomyces pombe",
|
||||||
"Ustilago maydis",
|
"Ustilago maydis",
|
||||||
"Wallemia mellicola"
|
"Wallemia mellicola")
|
||||||
)
|
|
||||||
|
|
||||||
|
# = 4 FUNCTIONS TO CUSTOMIZE ASSIGNMENTS ==================================
|
||||||
|
|
||||||
|
|
||||||
|
# == 4.1 getMYSPE() ========================================================
|
||||||
|
getMYSPE <- function(x) {
|
||||||
|
dat <- readRDS("./data/sDat.rds")
|
||||||
|
map <- readRDS("./data/MYSPEmap.rds")
|
||||||
|
key <- gsub(".+(....).$", "\\1", x)
|
||||||
|
return(dat$species[map[key, "iMYSPE"]])
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# == 4.2 selectPDBrep() ====================================================
|
||||||
|
selectPDBrep <- function(n, seed) {
|
||||||
|
# Select n PDB IDs from a list of high-resolution, non-homologous, single
|
||||||
|
# domain, single chain structure files that represent a CATH topology
|
||||||
|
# group.
|
||||||
|
# Parameters:
|
||||||
|
# n num number of IDs to return
|
||||||
|
# seed num a seed for the RNG
|
||||||
|
#
|
||||||
|
# Value: char PDB IDs
|
||||||
|
#
|
||||||
|
# Note: the list is loaded from an .rds file in the "./data" directory.
|
||||||
|
# If you use this function for a course submission, it MUST be invoked as:
|
||||||
|
#
|
||||||
|
# selectPDBrep(n, seed = myStudentNumber)
|
||||||
|
#
|
||||||
|
# ... and myStudentNumber MUST be correctly initialized
|
||||||
|
|
||||||
|
pdbRep <- readRDS("./data/pdbRep.rds") # loads pdbRep
|
||||||
|
|
||||||
|
if (n > length(pdbRep)) {
|
||||||
|
stop(sprintf("There are only %d PDB IDs in the table to choose from.",
|
||||||
|
length(pdbRep)))
|
||||||
|
}
|
||||||
|
oldSeed <- .Random.seed
|
||||||
|
set.seed(seed)
|
||||||
|
PDBset <- sample(pdbRep, n)
|
||||||
|
.Random.seed <- oldSeed
|
||||||
|
return(PDBset)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
90
BIN-MYSPE.R
90
BIN-MYSPE.R
@ -9,11 +9,12 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-MYSPE unit
|
# R code accompanying the BIN-MYSPE unit
|
||||||
#
|
#
|
||||||
# Version: 1.0.1
|
# Version: 1.1
|
||||||
#
|
#
|
||||||
# Date: 2017 09 21
|
# Date: 2020-09-18
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
|
# V 1.1 2020 Workflow changes
|
||||||
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
|
# V 1.0.1 Move ABC-makeMYSPElist.R to ./scripts directory
|
||||||
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
|
# V 1.0 Final code, after rewriting BLAST parser and updating MYSPElist
|
||||||
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
|
# V 0.1 First code copied from BCH441_A03_makeMYSPElist.R
|
||||||
@ -36,9 +37,9 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> -----------------------------------------------
|
#TOC> -----------------------------------------------
|
||||||
#TOC> 1 Preparations 39
|
#TOC> 1 Preparations 47
|
||||||
#TOC> 2 Suitable MYSPE Species 51
|
#TOC> 2 Suitable MYSPE Species 59
|
||||||
#TOC> 3 Adopt "MYSPE" 65
|
#TOC> 3 Adopt "MYSPE" 83
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -47,7 +48,7 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
# Execute the two conditionals below:
|
# Execute the two conditionals below:
|
||||||
if (! file.exists(".myProfile.R")) {
|
if (! file.exists("scripts/.myProfile.R")) {
|
||||||
stop("PANIC: profile file does not exist. Fix problem or ask for help.")
|
stop("PANIC: profile file does not exist. Fix problem or ask for help.")
|
||||||
}
|
}
|
||||||
if (! exists("myStudentNumber")) {
|
if (! exists("myStudentNumber")) {
|
||||||
@ -64,31 +65,80 @@ if (! exists("myStudentNumber")) {
|
|||||||
|
|
||||||
# A detailed description of the process of compiling the list of genome
|
# A detailed description of the process of compiling the list of genome
|
||||||
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
# sequenced fungi with protein annotations and Mbp1 homologues is in the file
|
||||||
# ./scripts/ABC-makeMYSPElist.R
|
# ./scripts/ABC-makeMYSPElist.R In brief, data for genome-sequenced fungi
|
||||||
|
# was retrieved from https://fungi.ensembl.org; a search for homologues to
|
||||||
|
# yeast Mbp1 was performed with BLAST at the NCBI, and the data was merged.
|
||||||
|
# A representative organism at each genus-level was chosen from those hits
|
||||||
|
# that actual;ly have a homologue. Finally, a mapping table was constructed to
|
||||||
|
# asymmetrically retrieve unique species: a student number will retrieve
|
||||||
|
# a species, but (public) knowledge of the species cannot reconstruct the
|
||||||
|
# student number.
|
||||||
|
|
||||||
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
|
# Task: Study ./scripts/ABC-makeMYSPElist.R, it implements a typical workflow
|
||||||
# of selecting and combining data from public-domain data resources.
|
# of selecting and combining data from various data resources. Studying
|
||||||
|
# it will give you a better sense of how such workflows can be
|
||||||
|
# implemented in practice.
|
||||||
|
|
||||||
|
|
||||||
# = 3 Adopt "MYSPE" =======================================================
|
# = 3 Adopt "MYSPE" =======================================================
|
||||||
|
|
||||||
|
# Execute:
|
||||||
|
( MYSPE <- getMYSPE(myStudentNumber) )
|
||||||
|
|
||||||
# In the code below, we load the resulting vector of species name, then pick one
|
# If this produced an error, this session has not been properly set up. You
|
||||||
# of them in a random but reproducible way, determined by your student number.
|
# may not yet have run init() and edited .myProfile.R , or that file is not
|
||||||
|
# in your myScripts/ folder. Fix this, and execute source(".Rprofile") .
|
||||||
|
|
||||||
load("data/MYSPEspecies.RData") # load the species names
|
# If this produced NA, your Student Number may not be correct, or you are not
|
||||||
set.seed(myStudentNumber) # seed the random number generator
|
# in my class-list. Contact me.
|
||||||
MYSPE <- sample(MYSPEspecies, 1) # pick a species at random
|
# Otherwise, this should have printed a species name. Your unique species
|
||||||
set.seed(NULL) # reset the random number generator
|
# for this course.
|
||||||
# write the result to your personalized profile data so we can use the result in
|
|
||||||
# other functions
|
|
||||||
cat(sprintf("MYSPE <- \"%s\"\n", MYSPE), file = ".myProfile.R", append = TRUE)
|
|
||||||
|
|
||||||
MYSPE # so, which species is it ... ?
|
|
||||||
biCode(MYSPE) # and what is it's "BiCode" ... ?
|
biCode(MYSPE) # and what is it's "BiCode" ... ?
|
||||||
|
|
||||||
# Task: Note down the species name and its five letter label on your Student
|
# Task: Note down the species name and its five letter BiCode on your Student
|
||||||
# Wiki user page. Use this species whenever this or future assignments refer
|
# Wiki user page. Use this species whenever this or future assignments refer
|
||||||
# to MYSPE. In code, we will automatically load it from your .myProfile.R file.
|
# to MYSPE. Whenever you start a session, it will automatically be loaded
|
||||||
|
# from myScripts/.myProfile.R and is available as MYSPE .
|
||||||
|
|
||||||
|
# Here is some more information:
|
||||||
|
fungiDat <- read.csv("data/Species.csv")
|
||||||
|
|
||||||
|
# number of sequenced fungal genomes:
|
||||||
|
nrow(fungiDat)
|
||||||
|
|
||||||
|
# sequenced genomes of species:
|
||||||
|
sel <- MYSPE == gsub("^(\\S+\\s\\S+).*$", "\\1", fungiDat$Name)
|
||||||
|
( x <- fungiDat[sel, "Name"] )
|
||||||
|
Nspc <- length(x) # save this for later ...
|
||||||
|
|
||||||
|
# sequenced genomes of genus:
|
||||||
|
sel <- gsub("\\s.*", "", MYSPE) == gsub("\\s.*", "", fungiDat$Name)
|
||||||
|
( x <- fungiDat[sel, "Name"] )
|
||||||
|
Ngen <- length(x) - Nspc
|
||||||
|
|
||||||
|
# order:
|
||||||
|
( x <- unique(fungiDat[sel, "Classification"]) )
|
||||||
|
Nord <- sum(fungiDat$Classification == x) - Ngen - Nspc
|
||||||
|
Nfng <- nrow(fungiDat) - Nord - Ngen - Nspc
|
||||||
|
|
||||||
|
# proportions
|
||||||
|
pCol <- c("#ed394e", "#ff9582", "#ffd5c4", "#f2f2f0")
|
||||||
|
pie(c(Nspc, Ngen, Nord, Nfng),
|
||||||
|
labels = "",
|
||||||
|
radius = 1,
|
||||||
|
main = "MYSPE in genome-sequenced fungi",
|
||||||
|
sub = MYSPE,
|
||||||
|
col = pCol,
|
||||||
|
clockwise = TRUE,
|
||||||
|
init.angle = 90)
|
||||||
|
legend(x = 1.3, y = 0.8, # position
|
||||||
|
legend = c("Species", "Genus", "Order", "Fungi"),
|
||||||
|
y.intersp = 1.5, # line spacing for labels
|
||||||
|
cex = 0.9, # character size for labels
|
||||||
|
bty = "n", # "no" box around the legend
|
||||||
|
pt.cex = 2, # size of colour boxes
|
||||||
|
pch = 15,
|
||||||
|
col = pCol)
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -110,7 +110,7 @@ ape::nodelabels(text = fungiTree$node.label,
|
|||||||
|
|
||||||
# We load the APSES sequence tree that you produced in the
|
# We load the APSES sequence tree that you produced in the
|
||||||
# BIN-PHYLO-Tree_building unit:
|
# BIN-PHYLO-Tree_building unit:
|
||||||
load(file = "APSEStreeRproml.RData")
|
apsTree <- readRDS(file = "APSEStreeRproml.rds")
|
||||||
|
|
||||||
plot(apsTree) # default type is "phylogram"
|
plot(apsTree) # default type is "phylogram"
|
||||||
plot(apsTree, type = "unrooted")
|
plot(apsTree, type = "unrooted")
|
||||||
|
@ -9,12 +9,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PHYLO-Tree_building unit.
|
# R code accompanying the BIN-PHYLO-Tree_building unit.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2017 10. 31
|
# Date: 2017-10 2020-09
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.2 deprecate save()/load() for saveRDS()/readRDS()
|
||||||
# 1.1 Change from require() to requireNamespace(),
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
# use <package>::<function>() idiom throughout,
|
# use <package>::<function>() idiom throughout,
|
||||||
# 1.0 First 2017 version
|
# 1.0 First 2017 version
|
||||||
@ -139,7 +140,7 @@ apsTree <- Rphylip::Rproml(apsIn, path=PROMLPATH)
|
|||||||
plot(apsTree)
|
plot(apsTree)
|
||||||
|
|
||||||
# save your tree:
|
# save your tree:
|
||||||
save(apsTree, file = "APSEStreeRproml.RData")
|
saveRDS(apsTree, file = "APSEStreeRproml.rds")
|
||||||
|
|
||||||
# If this did not work, ask for advice.
|
# If this did not work, ask for advice.
|
||||||
|
|
||||||
|
@ -9,12 +9,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the BIN-PPI-Analysis unit.
|
# R code accompanying the BIN-PPI-Analysis unit.
|
||||||
#
|
#
|
||||||
# Version: 1.1
|
# Version: 1.2
|
||||||
#
|
#
|
||||||
# Date: 2017 08 - 2019 01
|
# Date: 2017-08 - 2020-09
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.2 Deprecate save()/load() for saveRDS()/readRDS()
|
||||||
# 1.1 Change from require() to requireNamespace(),
|
# 1.1 Change from require() to requireNamespace(),
|
||||||
# use <package>::<function>() idiom throughout,
|
# use <package>::<function>() idiom throughout,
|
||||||
# use Biocmanager:: not biocLite()
|
# use Biocmanager:: not biocLite()
|
||||||
@ -72,7 +73,7 @@ if (! requireNamespace("igraph", quietly = TRUE)) {
|
|||||||
# a fungal proteome. You can load the saved dataframe here (To read more about
|
# a fungal proteome. You can load the saved dataframe here (To read more about
|
||||||
# what the numbers mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
|
# what the numbers mean, see http://www.ncbi.nlm.nih.gov/pubmed/15608232 ).
|
||||||
|
|
||||||
load("./data/STRINGedges.RData")
|
STRINGedges <- readRDS("./data/STRINGedges.rds")
|
||||||
|
|
||||||
head(STRINGedges)
|
head(STRINGedges)
|
||||||
|
|
||||||
|
@ -198,7 +198,7 @@ Biostrings::translate(biosDNAseq[4:15])
|
|||||||
|
|
||||||
Biostrings::toString(biosDNAseq[4:15])
|
Biostrings::toString(biosDNAseq[4:15])
|
||||||
|
|
||||||
# save() and load() works like on all other R objects.
|
# saveRDS() and readRDS() works like on all other R objects.
|
||||||
|
|
||||||
|
|
||||||
# = 5 More ================================================================
|
# = 5 More ================================================================
|
||||||
|
@ -9,12 +9,13 @@
|
|||||||
# Purpose: A Bioinformatics Course:
|
# Purpose: A Bioinformatics Course:
|
||||||
# R code accompanying the RPR_GEO2R unit.
|
# R code accompanying the RPR_GEO2R unit.
|
||||||
#
|
#
|
||||||
# Version: 1.2
|
# Version: 1.3
|
||||||
#
|
#
|
||||||
# Date: 2017 09 - 2019 01
|
# Date: 2017-09 - 2020-09
|
||||||
# Author: Boris Steipe <boris.steipe@utoronto.ca>
|
# Author: Boris Steipe <boris.steipe@utoronto.ca>
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 1.3 use saveRDS()/readRDS() rather than save()/load()
|
||||||
# 1.2 Change from require() to requireNamespace(),
|
# 1.2 Change from require() to requireNamespace(),
|
||||||
# use <package>::<function>() idiom throughout,
|
# use <package>::<function>() idiom throughout,
|
||||||
# use Biocmanager:: not biocLite()
|
# use Biocmanager:: not biocLite()
|
||||||
@ -122,7 +123,9 @@ GSE3635 <- GSE3635[[idx]]
|
|||||||
# FALLBACK
|
# FALLBACK
|
||||||
# ... in case the GEO server is not working, load the "GSE3635" object from
|
# ... in case the GEO server is not working, load the "GSE3635" object from
|
||||||
# the data directory:
|
# the data directory:
|
||||||
# load(file="./data/GSE3635.RData")
|
#
|
||||||
|
# GSE3635 <- readRDS(file="./data/GSE3635.rds")
|
||||||
|
|
||||||
|
|
||||||
# Checkpoint ...
|
# Checkpoint ...
|
||||||
if (! exists("GSE3635")) {
|
if (! exists("GSE3635")) {
|
||||||
|
Binary file not shown.
Binary file not shown.
BIN
data/BLASThits.rds
Normal file
BIN
data/BLASThits.rds
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
data/GSE3635.rds
Normal file
BIN
data/GSE3635.rds
Normal file
Binary file not shown.
BIN
data/MYSPEmap.rds
Normal file
BIN
data/MYSPEmap.rds
Normal file
Binary file not shown.
Binary file not shown.
BIN
data/REFdat.rds
Normal file
BIN
data/REFdat.rds
Normal file
Binary file not shown.
Binary file not shown.
BIN
data/STRINGedges.rds
Normal file
BIN
data/STRINGedges.rds
Normal file
Binary file not shown.
1015
data/Species.csv
Normal file
1015
data/Species.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
data/myCodons.rds
Normal file
BIN
data/myCodons.rds
Normal file
Binary file not shown.
Binary file not shown.
BIN
data/pdbRep.rds
Normal file
BIN
data/pdbRep.rds
Normal file
Binary file not shown.
BIN
data/refDB.rds
Normal file
BIN
data/refDB.rds
Normal file
Binary file not shown.
BIN
data/sDat.rds
Normal file
BIN
data/sDat.rds
Normal file
Binary file not shown.
Binary file not shown.
BIN
data/scCCnet.rds
Normal file
BIN
data/scCCnet.rds
Normal file
Binary file not shown.
@ -1,14 +1,16 @@
|
|||||||
# ABC_makeMYSPElist.R
|
# tocID <- "scripts/ABC-makeMYSPElist.R"
|
||||||
#
|
#
|
||||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||||
# Mbp1 homologues.
|
# Mbp1 homologues.
|
||||||
#
|
#
|
||||||
# Version: 1.2
|
# Version: 1.3
|
||||||
#
|
#
|
||||||
# Date: 2016 09 - 2019 01
|
# Date: 2016 09 - 2020 09
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions
|
# Versions
|
||||||
|
# 1.3 Rewrite to change datasource. NCBI has not been updated
|
||||||
|
# since 2012. Use ensembl fungi as initial source.
|
||||||
# 1.2 Change from require() to requireNamespace()
|
# 1.2 Change from require() to requireNamespace()
|
||||||
# 1.1.2 Moved BLAST.R to ./scripts directory
|
# 1.1.2 Moved BLAST.R to ./scripts directory
|
||||||
# 1.1 Update 2017
|
# 1.1 Update 2017
|
||||||
@ -37,17 +39,16 @@
|
|||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> ---------------------------------------------------------
|
#TOC> --------------------------------------------------------
|
||||||
#TOC> 1 The strategy 55
|
#TOC> 1 The strategy 56
|
||||||
#TOC> 2 GOLD species 67
|
#TOC> 2 PACKAGES AND INITIALIZATIONS 68
|
||||||
#TOC> 2.1 Initialize 72
|
#TOC> 3 ENSEMBL FUNGI 76
|
||||||
#TOC> 2.2 Import 79
|
#TOC> 3.1 Import 79
|
||||||
#TOC> 2.3 Unique species 131
|
#TOC> 4 BLAST SEARCH 156
|
||||||
#TOC> 3 BLAST species 173
|
#TOC> 4.1 find homologous proteins 162
|
||||||
#TOC> 3.1 find homologous proteins 180
|
#TOC> 4.2 Identify species in "hits" 193
|
||||||
#TOC> 3.2 Identify species in "hits" 204
|
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 283
|
||||||
#TOC> 4 Intersect GOLD and BLAST species 249
|
#TOC> 6 STUDENT NUMBERS 366
|
||||||
#TOC> 5 Cleanup and finish 267
|
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -55,129 +56,110 @@
|
|||||||
# = 1 The strategy ========================================================
|
# = 1 The strategy ========================================================
|
||||||
|
|
||||||
# This script will create a list of "MYSPE" species and save it in an R object
|
# This script will create a list of "MYSPE" species and save it in an R object
|
||||||
# MYSPEspecies that is stored in the data subdirectory of this project from where
|
# MYSPEspecies that is stored in the data subdirectory of this project from
|
||||||
# it can be loaded. The strategy is as follows: we download a list of all
|
# where it can be loaded. The strategy is as follows: we download a list of
|
||||||
# genome projects and then select species for which protein annotations are
|
# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
|
||||||
# available - i.e. these are all genome-sequenced species that have been
|
# species that have been annotated.
|
||||||
# annotated. Then we search for fungal species that have homologues to MBP1.
|
# Next we perform a BLAST search, to identify fungal species that have
|
||||||
# Then we intersect the two lists to give us genome-sequenced species that
|
# genes that are homologous to yeast MBP1.
|
||||||
# also have Mbp1 homologues ...
|
#
|
||||||
|
# ...
|
||||||
|
|
||||||
|
# = 2 PACKAGES AND INITIALIZATIONS ========================================
|
||||||
# = 2 GOLD species ========================================================
|
|
||||||
|
|
||||||
# Fetch and parse the Genomes OnLine Database of the Joint Genome Institute
|
|
||||||
# (https://gold.jgi.doe.gov/). Use the data that is hosted at the NCBI.
|
|
||||||
|
|
||||||
# == 2.1 Initialize ========================================================
|
|
||||||
|
|
||||||
# httr provides interfaces to Webservers on the Internet
|
# httr provides interfaces to Webservers on the Internet
|
||||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||||
install.packages("httr")
|
install.packages("httr")
|
||||||
}
|
}
|
||||||
|
|
||||||
# == 2.2 Import ============================================================
|
|
||||||
|
|
||||||
# The URL of the genome data directory at the NCBI:
|
# = 3 ENSEMBL FUNGI =======================================================
|
||||||
# is https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS
|
|
||||||
# Note the relative size of the prokaryotes and the eukaryotes data.
|
|
||||||
|
|
||||||
# What's in this directory?
|
|
||||||
URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/README"
|
|
||||||
GOLDreadme <- readLines(URL) # read the file into a vector
|
|
||||||
cat(GOLDreadme, sep = "\n") # display the contents
|
|
||||||
|
|
||||||
# Retrieve the file "eukaryotes" via ftp from the NCBI ftp server and put it
|
|
||||||
# into a dataframe. This will take a few moments.
|
|
||||||
# URL <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/eukaryotes.txt"
|
|
||||||
# GOLDdata <- read.csv(URL,
|
|
||||||
# header = TRUE,
|
|
||||||
# sep = "\t",
|
|
||||||
# stringsAsFactors = FALSE)
|
|
||||||
# save(GOLDdata, file="data/GOLDdata.RData")
|
|
||||||
# or ...
|
|
||||||
load(file="data/GOLDdata.RData")
|
|
||||||
|
|
||||||
|
|
||||||
# What columns does the table have, how is it structured?
|
# == 3.1 Import ============================================================
|
||||||
str(GOLDdata)
|
|
||||||
|
|
||||||
# What groups of organisms are in the table? How many of each?
|
# Navigate to https://fungi.ensembl.org and click on the link to the full
|
||||||
table(GOLDdata$Group)
|
# list of all species: https://fungi.ensembl.org/species.html
|
||||||
|
# On the page, click on the spreadsheet symbol top right and choose
|
||||||
|
# "download whole table". The file will be named "Species.csv", in your
|
||||||
|
# usual downloads folder. Move it to the data folder, and read it.
|
||||||
|
|
||||||
# What subgroups of fungi do we have?
|
sDat <- read.csv("./data/Species.csv")
|
||||||
table(GOLDdata$SubGroup[GOLDdata$Group == "Fungi"])
|
str(sDat)
|
||||||
|
|
||||||
# How many of the fungi have protein annotations? The README file told us that
|
# The most obvious way to partition these is according to Classification ...
|
||||||
# the column "Proteins" contains "Number of Proteins annotated in the assembly".
|
# (poking around a bit in the UniProt taxonomy database shows that the
|
||||||
# Looking at a few ...
|
# classification used here is the taxonomic rank of "order").
|
||||||
head(GOLDdata$Proteins, 30)
|
# how many classifications do we have?
|
||||||
# ... we see that the number varies, and some have a hyphen, i.e. no
|
length(unique(sDat$Classification)) # 66
|
||||||
# annotations. The hyphens make this a char type column (as per: all elements
|
|
||||||
# of a vector must have the same type). Therefore we can't read this as numbers
|
|
||||||
# and filter by some value > 0. But we can filter for all genomes that don't
|
|
||||||
# have the hyphen:
|
|
||||||
sum(GOLDdata$Proteins[GOLDdata$Group == "Fungi"] != "-")
|
|
||||||
|
|
||||||
# Subset the data, with fungi that have protein annotations
|
# To have a good set for the class, we should have about 100.
|
||||||
GOLDfungi <- GOLDdata[GOLDdata$Group == "Fungi" &
|
# Let's see for which of these we can find Mbp1 homologues.
|
||||||
GOLDdata$Proteins != "-" , ]
|
# First, we'll keep only the colums for name, classification, and taxID, and
|
||||||
|
# drop the rest ...
|
||||||
|
sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
|
||||||
|
colnames(sDat) <- c("name", "order", "taxID")
|
||||||
|
|
||||||
# check what we have in the table
|
# Next, we make an extra column: genus - the first part of the binomial name.
|
||||||
nrow(GOLDfungi)
|
# We'll use the gsub() function, and for that we need a "regular expression"
|
||||||
head(GOLDfungi)
|
# that matches to all characters from the first blank to the end of the string:
|
||||||
|
myPatt <- "\\s.*$" # one whitespace (\\s) ...
|
||||||
|
# followed by any character (.) 0..n times (*) ...
|
||||||
|
# until the end of the string
|
||||||
|
|
||||||
|
# using gsub() we substitue all matching characters with the empty string "" -
|
||||||
|
# this deletes the matching characters
|
||||||
|
# Test this:
|
||||||
|
gsub(myPatt, "", "Genus") # one word: unchanged
|
||||||
|
gsub(myPatt, "", "gEnus species") # two words: return only first
|
||||||
|
gsub(myPatt, "", "geNus species strain 123") # many words: return only first
|
||||||
|
|
||||||
# == 2.3 Unique species ====================================================
|
# apply this to the "name" column and add the result as a separate column
|
||||||
|
# called "genus"
|
||||||
|
sDat$genus <- gsub(myPatt, "", sDat$name)
|
||||||
|
|
||||||
|
# what do we get?
|
||||||
|
c(head(unique(sDat$genus)),
|
||||||
|
tail(unique(sDat$genus))) # inspect the first and last few. Note that there
|
||||||
|
# is a problem that we have to keep in mind.
|
||||||
|
# (Always inspect your results!)
|
||||||
|
# Drop all rows for which the genus contains special chracters -
|
||||||
|
# like "[Candida]"
|
||||||
|
sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
|
||||||
|
|
||||||
|
length(table(sDat$genus)) # how many genus?
|
||||||
|
hist(table(sDat$genus), col = "#E9F4FF") # Distribution ...
|
||||||
|
# most genus have very few, but
|
||||||
|
# some have very many species.
|
||||||
|
sort(table(sDat$genus), decreasing = TRUE)[1:10] # Top ten...
|
||||||
|
|
||||||
|
# We should have at least one species from each taxonomic order, but we can
|
||||||
|
# add a few genus until we have about 100 validated species.
|
||||||
|
|
||||||
|
# Let's add a column for species, by changing our regular expression a bit,
|
||||||
|
# using ^ (start of string), \\S (NOT a whitespace),
|
||||||
|
# and + (one or more matches), capturing the match (...), and returning
|
||||||
|
# it as the substitution (\\1) ...
|
||||||
|
|
||||||
|
myPatt <- "^(\\S+\\s\\S+)\\s.*$"
|
||||||
|
sDat$species <- gsub(myPatt, "\\1", sDat$name)
|
||||||
|
|
||||||
|
# And we reorder the columns, just for aesthetics:
|
||||||
|
sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
|
||||||
|
|
||||||
|
# Final check:
|
||||||
|
any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
|
||||||
|
|
||||||
# For our purpose of defining species, we will select only species, not strains
|
|
||||||
# from this list. To do this, we pick the first two words i.e. the systematic
|
|
||||||
# binomial name from the "X.Organism.Name" column, and then we remove redundant
|
|
||||||
# species. Here is a function:
|
|
||||||
#
|
#
|
||||||
|
# Now we check which of these have Mbp1 homologues ...
|
||||||
|
|
||||||
getBinom <- function(s) {
|
# = 4 BLAST SEARCH ========================================================
|
||||||
# Fetch the first two words from a string.
|
|
||||||
# Parameters:
|
|
||||||
# s: char a string which is expected to contain a binomial species name
|
|
||||||
# as the first two words, possibly followed by other text.
|
|
||||||
# Value: char the first two words separated by a single blank
|
|
||||||
#
|
|
||||||
x <- unlist(strsplit(s, "\\s+")) # split s on one or more whitespace
|
|
||||||
return(paste(x[1:2], collapse=" ")) # return first two elements
|
|
||||||
}
|
|
||||||
|
|
||||||
# iterate through GOLDdata and extract species names
|
|
||||||
GOLDspecies <- character()
|
|
||||||
for (i in 1:nrow(GOLDfungi)) {
|
|
||||||
GOLDspecies[i] <- getBinom(GOLDfungi$X.Organism.Name[i])
|
|
||||||
}
|
|
||||||
head(GOLDspecies)
|
|
||||||
length(GOLDspecies)
|
|
||||||
|
|
||||||
# N.b. this would be more efficiently (but perhaps less explicitly) coded with
|
|
||||||
# one of the apply() functions, instead of a for-loop.
|
|
||||||
# GOLDspecies <- unlist(lapply(GOLDfungi$X.Organism.Name, getBinom))
|
|
||||||
|
|
||||||
# Species of great interest may appear more than once, one for each sequenced
|
|
||||||
# strain: e.g. brewer's yeast:
|
|
||||||
sum(GOLDspecies == "Saccharomyces cerevisiae")
|
|
||||||
|
|
||||||
# Therefore we use the function unique() to throw out duplicates. Simple:
|
|
||||||
GOLDspecies <- unique(GOLDspecies)
|
|
||||||
|
|
||||||
length(GOLDspecies)
|
|
||||||
# i.e. we got rid of about 40% of the species by removing duplicates.
|
|
||||||
|
|
||||||
|
|
||||||
# = 3 BLAST species =======================================================
|
# We run a BLAST search to find all proteins related to yeast Mbp1 in any
|
||||||
#
|
# fungus. With the results, we'll annotate our sDat table.
|
||||||
# Next, we filter our list by species that have homologues to the yeast Mbp1
|
|
||||||
# gene. To do this we run a BLAST search to find all related proteins in any
|
# == 4.1 find homologous proteins ==========================================
|
||||||
# fungus. We list the species that appear in that list, and then we select those
|
|
||||||
# that appear in our GOLD table as well.
|
|
||||||
#
|
|
||||||
# == 3.1 find homologous proteins ==========================================
|
|
||||||
#
|
#
|
||||||
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
|
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
|
||||||
# contain them.
|
# contain them.
|
||||||
@ -190,18 +172,25 @@ length(GOLDspecies)
|
|||||||
# standard task of communicating with servers and parsing responses - everyday
|
# standard task of communicating with servers and parsing responses - everyday
|
||||||
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
|
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
|
||||||
# parser in currently available packages.
|
# parser in currently available packages.
|
||||||
|
#
|
||||||
# source("./scripts/BLAST.R") # load the function and its utilities
|
# DON'T use this for BLAST searches unless you have read the NCBI policy
|
||||||
|
# for automated tasks. If you indicriminately pound on the NCBI's BLAST
|
||||||
|
# server, they will blacklist your IP-address. See:
|
||||||
|
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
|
||||||
|
#
|
||||||
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
|
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
|
||||||
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
|
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
|
||||||
# db = "refseq_protein", # database to search in
|
# db = "refseq_protein", # database to search in
|
||||||
# nHits = 3000, # 720 hits in 2017
|
# nHits = 3000, # 945 hits in 2020
|
||||||
# E = 0.01, #
|
# E = 0.01, #
|
||||||
# limits = "txid4751[ORGN]") # = fungi
|
# limits = "txid4751[ORGN]") # = fungi
|
||||||
# save(BLASThits, file="data/BLASThits.RData")
|
# saveRDS(BLASThits, file="data/BLASThits.rds")
|
||||||
load(file="data/BLASThits.RData")
|
#
|
||||||
|
# NO NEED TO ACTUALLY RUN THIS:you can load the results from the data directory
|
||||||
|
#
|
||||||
|
BLASThits <- readRDS(file = "data/BLASThits.rds")
|
||||||
|
|
||||||
# == 3.2 Identify species in "hits" ========================================
|
# == 4.2 Identify species in "hits" ========================================
|
||||||
|
|
||||||
# This is a very big list that can't be usefully analyzed manually. Here
|
# This is a very big list that can't be usefully analyzed manually. Here
|
||||||
# we are only interested in the species names that it contains.
|
# we are only interested in the species names that it contains.
|
||||||
@ -230,55 +219,202 @@ for (i in seq_along(BLASThits$hits)) {
|
|||||||
# You can confirm that BLASTspecies has the expected size.
|
# You can confirm that BLASTspecies has the expected size.
|
||||||
length(BLASTspecies)
|
length(BLASTspecies)
|
||||||
|
|
||||||
|
# if we delete some of these later on, we still want to remember which hit
|
||||||
|
# they came from. Thus we name() the elements with their index, which is the
|
||||||
|
# same as the index of the hit in BLASThits
|
||||||
|
names(BLASTspecies) <- 1:length(BLASTspecies)
|
||||||
|
|
||||||
|
|
||||||
|
# let's plot the distribution of E-values
|
||||||
|
eVals <- numeric()
|
||||||
|
for (i in seq_along(BLASThits$hits)) {
|
||||||
|
eVals[i] <- BLASThits$hits[[i]]$E
|
||||||
|
}
|
||||||
|
range(eVals)
|
||||||
|
sum(eVals == 0)
|
||||||
|
|
||||||
|
# let's plot the log of all values > 0 to see how they are distributed
|
||||||
|
# plotting only one vectyor of numbers plots their index as x, and
|
||||||
|
# their value as y ...
|
||||||
|
plot(log(eVals[eVals > 0]), col = "#CC0000")
|
||||||
|
|
||||||
|
# This is very informative: I would suspect that the first ten or so are
|
||||||
|
# virtually identical to the yeast protein, then we have about 700 hits with
|
||||||
|
# decreasing similarity, and then about 200 more that may actually be false
|
||||||
|
# positives. Also - we plotted them by index, that means the table is SORTED:
|
||||||
|
# Lower E-values strictly come before higher E-values.
|
||||||
|
|
||||||
# Again, some species appear more than once, e.g. ...
|
# Again, some species appear more than once, e.g. ...
|
||||||
sum(BLASTspecies == "Saccharomyces cerevisiae")
|
sum(BLASTspecies == "Saccharomyces cerevisiae")
|
||||||
|
|
||||||
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
|
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
|
||||||
|
|
||||||
# Therefore we use unique() to throw out duplicates:
|
# Therefore we remove duplicates. Removing duplicates will leave the FIRST
|
||||||
BLASTspecies <- unique(BLASTspecies)
|
# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
|
||||||
|
# species, we will retain only the protein that has the highest similarity
|
||||||
|
# to yeast Mbp1, not any of its more distant paralogues.
|
||||||
|
sel <- ! duplicated(BLASTspecies)
|
||||||
|
BLASTspecies <- BLASTspecies[sel]
|
||||||
|
|
||||||
length(BLASTspecies)
|
length(BLASTspecies)
|
||||||
# i.e. we got rid of about two thirds of the hits.
|
# i.e. we got rid of about two thirds of the hits.
|
||||||
|
tail(BLASTspecies) # see how the names are useful!
|
||||||
|
# again - there are some special characters ...
|
||||||
|
# what are they?
|
||||||
|
BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
|
||||||
|
|
||||||
# You should think about this: what is the biological interpretation of the
|
# remove the brackets ...
|
||||||
# finding that on average we have three sequences that are similar to Mbp1 in
|
BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
|
||||||
# other species?
|
# drop any new duplicates ...
|
||||||
|
BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
|
||||||
|
|
||||||
|
# check the number again:
|
||||||
|
length(BLASTspecies)
|
||||||
|
# Think a bit about this: what may be the biological reason to find that
|
||||||
|
# on average, in 300 fungi across the entire phylogenetic tree, we have
|
||||||
|
# three sequences that are homologous to yeast Mbp1?
|
||||||
|
|
||||||
|
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
|
||||||
|
# we plot all values that are TRUE in the vector "sel" that we created above,
|
||||||
|
# AND greater than 0
|
||||||
|
plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
|
||||||
|
|
||||||
|
|
||||||
# = 4 Intersect GOLD and BLAST species ====================================
|
# = 5 MERGE ENSEMBL AND BLAST RESULTS =====================================
|
||||||
|
|
||||||
# Now we can compare the two lists for species that appear in both sources: the
|
# Next we add the blast result to our sDat dataframe. We'll store the index,
|
||||||
# simplest way is to use the set operation functions union(), intersection()
|
# the E-value, and the Query-bounds from which we can estimate which domains
|
||||||
# etc. See here:
|
# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
|
||||||
?union
|
# Mbp1's N-terminal APSES domain.)
|
||||||
|
|
||||||
MYSPEspecies <- intersect(GOLDspecies, BLASTspecies)
|
|
||||||
|
|
||||||
# Again: interpret this:
|
|
||||||
# - what is the number of GOLDspecies?
|
|
||||||
# - what is the number of BLAST species?
|
|
||||||
# - how many species are present in both lists?
|
|
||||||
# - what does it mean if a species is in GOLD but not in the BLAST list?
|
|
||||||
# - what does it mean if a species has been found during BLAST, but it
|
|
||||||
# is not in GOLD?
|
|
||||||
|
|
||||||
|
|
||||||
# = 5 Cleanup and finish ==================================================
|
|
||||||
|
|
||||||
# One final thing: some of the species will be our so-called "reference" species
|
|
||||||
# which we use for model solutions and examples in the course. They are defined
|
|
||||||
# in the .utilities.R file of this project. We remove them from the list so that
|
|
||||||
# we don't inadvertently assign them.
|
|
||||||
#
|
#
|
||||||
|
# First we pull the hits we wanted from the BLASTspecies:
|
||||||
|
iHits <- as.numeric(names(BLASTspecies))
|
||||||
|
length(iHits) # one index for each TRUE in sel
|
||||||
|
|
||||||
REFspecies
|
# add columns to sDat
|
||||||
|
l <- nrow(sDat)
|
||||||
|
sDat$iHit <- numeric(l) # index of the hit in the BLAST results
|
||||||
|
sDat$eVal <- numeric(l) # E-value of the hit
|
||||||
|
sDat$lAli <- numeric(l) # length of the aligned region
|
||||||
|
|
||||||
MYSPEspecies <- sort(setdiff(MYSPEspecies, REFspecies))
|
# extract and merge
|
||||||
|
for (iHit in iHits) {
|
||||||
|
thisSp <- BLASThits$hits[[iHit]]$species
|
||||||
|
sel <- sDat$species == thisSp
|
||||||
|
|
||||||
# save(MYSPEspecies, file = "data/MYSPEspecies.RData")
|
sDat$iHit[sel] <- iHit
|
||||||
|
sDat$eVal[sel] <- BLASThits$hits[[iHit]]$E
|
||||||
|
sDat$lAli[sel] <- BLASThits$hits[[iHit]]$lengthAli
|
||||||
|
}
|
||||||
|
|
||||||
|
# Are all reference species accounted for?
|
||||||
|
selA <- sDat$iHit != 0 # all rows which matched to a BLAST hit
|
||||||
|
REFspecies %in% sDat$species[selA] # yes, all there
|
||||||
|
|
||||||
|
selB <- sDat$species %in% REFspecies # all rows which have one of REF species
|
||||||
|
|
||||||
|
sum(selA & selB) # How many rows?
|
||||||
|
|
||||||
|
# sDat of course includes all duplicates. Some may be multiply sequenced, some
|
||||||
|
# may be different strains. We'll use the same strategy as before and keep
|
||||||
|
# only the best hit: order the rows by E-value, then drop all rows which
|
||||||
|
# are duplicated.
|
||||||
|
|
||||||
|
|
||||||
|
# drop all rows without BLAST hits ...
|
||||||
|
sDat <- sDat[ ! (sDat$iHit == 0) , ]
|
||||||
|
|
||||||
|
# order sDat by E-value ...
|
||||||
|
sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
|
||||||
|
|
||||||
|
# drop all rows with duplicated species ...
|
||||||
|
sDat <- sDat[ ! duplicated(sDat$species) , ]
|
||||||
|
|
||||||
|
# Lets look at the E-values ...
|
||||||
|
plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
|
||||||
|
|
||||||
|
# and alignment lengths ...
|
||||||
|
plot(sDat$lAli, col = "#00DDAA")
|
||||||
|
|
||||||
|
# How many ...
|
||||||
|
length(unique(sDat$name))
|
||||||
|
length(unique(sDat$species))
|
||||||
|
length(unique(sDat$genus))
|
||||||
|
length(unique(sDat$order))
|
||||||
|
|
||||||
|
# To get the final dataset, we remove the reference species with their
|
||||||
|
# entire orders ...
|
||||||
|
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
|
||||||
|
sel <- sDat$order %in% REForders
|
||||||
|
REFdat <- sDat[sel , ]
|
||||||
|
sDat <- sDat[ ! sel , ]
|
||||||
|
|
||||||
|
# REFdat should now contain only the REFspecies ...
|
||||||
|
( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
|
||||||
|
|
||||||
|
# ... but all of them
|
||||||
|
sum(REFspecies %in% REFdat$species)
|
||||||
|
|
||||||
|
# ... and we have enough left in sDat to prune sDat to unique genus ...
|
||||||
|
sDat <- sDat[ ! duplicated(sDat$genus) , ]
|
||||||
|
|
||||||
|
# saveRDS(sDat, file = "data/sDat.rds")
|
||||||
|
# saveRDS(REFdat, file = "data/REFdat.rds")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# = 6 STUDENT NUMBERS =====================================================
|
||||||
|
#
|
||||||
|
# An asymmetric function to retrieve a MYSPE species
|
||||||
|
|
||||||
|
students <- read.csv("../BCH441-2020-students.csv")
|
||||||
|
|
||||||
|
sN <- students$Student.Number
|
||||||
|
range(sN)
|
||||||
|
any(duplicated(gsub(".+(.......)$", "\\1", sN)))
|
||||||
|
|
||||||
|
N <- 7
|
||||||
|
x <- numeric(N)
|
||||||
|
for (i in 1:N) {
|
||||||
|
x[i] <- H(substr(gsub(".+(.......)$", "\\1", sN), i, i))
|
||||||
|
}
|
||||||
|
plot(x, col = "#BB0000", type = "b")
|
||||||
|
|
||||||
|
keys <- as.numeric(gsub(".+(....).$", "\\1", sN))
|
||||||
|
any(duplicated(keys))
|
||||||
|
|
||||||
|
# =====
|
||||||
|
set.seed(112358)
|
||||||
|
names(sN) <- sample(1:nrow(sDat), length(sN))
|
||||||
|
|
||||||
|
MYSPEmap <- data.frame(keys = sprintf("%04d", 0:9999),
|
||||||
|
iMYSPE = sample(1:nrow(sDat), 10000, replace = TRUE))
|
||||||
|
rownames(MYSPEmap) <- MYSPEmap$keys
|
||||||
|
|
||||||
|
for (i in 1:length(sN)) {
|
||||||
|
rMap <- gsub(".+(....).$", "\\1", sN[i])
|
||||||
|
MYSPEmap[rMap, "iMYSPE"] <- as.integer(names(sN)[i])
|
||||||
|
}
|
||||||
|
|
||||||
|
# saveRDS(MYSPEmap, "./data/MYSPEmap.rds")
|
||||||
|
|
||||||
|
getMYSPE <- function(x) {
|
||||||
|
dat <- readRDS("./data/sDat.rds")
|
||||||
|
map <- readRDS("./data/MYSPEmap.rds")
|
||||||
|
key <- gsub(".+(....).$", "\\1", x)
|
||||||
|
return(dat$species[map[key, "iMYSPE"]])
|
||||||
|
}
|
||||||
|
|
||||||
|
# === validate
|
||||||
|
l <- length(sN)
|
||||||
|
sp <- character(l)
|
||||||
|
for(i in 1:l) {
|
||||||
|
sp[i] <- getMYSPE(sN[i])
|
||||||
|
}
|
||||||
|
any(duplicated(sp))
|
||||||
|
length(unique(sp))
|
||||||
|
which(! sDat$species %in% sp) # these can be assigned to late-comers
|
||||||
|
|
||||||
|
# Done.
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -119,10 +119,10 @@ scCCnet <- scCCnet[! duplicated(x), ]
|
|||||||
# length(unique(c(mySubnet$protein1, mySubnet$protein2))) # 261, no change
|
# length(unique(c(mySubnet$protein1, mySubnet$protein2))) # 261, no change
|
||||||
# Network has 261 nodes, 1280 edges
|
# Network has 261 nodes, 1280 edges
|
||||||
|
|
||||||
save(scCCnet, file = "./data/scCCnet.RData")
|
saveRDS(scCCnet, file = "./data/scCCnet.rds")
|
||||||
|
|
||||||
# load("./data/scCCnet.RData") # <<<- use this to load the object when
|
# scCCnet <- readRDS("./data/scCCnet.rds") # <<<- use this to restore the
|
||||||
# needed
|
# object when needed
|
||||||
|
|
||||||
|
|
||||||
# [END]
|
# [END]
|
||||||
|
@ -4,23 +4,23 @@
|
|||||||
# This script uses the BLAST URL-API
|
# This script uses the BLAST URL-API
|
||||||
# (Application Programming Interface) at the NCBI.
|
# (Application Programming Interface) at the NCBI.
|
||||||
# Read about the constraints here:
|
# Read about the constraints here:
|
||||||
# https://ncbi.github.io/blast-cloud/dev/api.html
|
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# Version: 3.1
|
# Version: 3.2
|
||||||
# Date: 2016 09 - 2019 01
|
# Date: 2016 09 - 2020 09
|
||||||
# Author: Boris Steipe
|
# Author: Boris Steipe
|
||||||
#
|
#
|
||||||
# Versions:
|
# Versions:
|
||||||
|
# 3.2 2020 updates
|
||||||
# 3.1 Change from require() to requireNamespace(),
|
# 3.1 Change from require() to requireNamespace(),
|
||||||
# use <package>::<function>() idiom throughout
|
# use <package>::<function>() idiom throughout
|
||||||
# 3 parsing logic had not been fully implemented; Fixed.
|
# 3.0 parsing logic had not been fully implemented; Fixed.
|
||||||
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
||||||
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
||||||
# 2.0 Completely rewritten because the interface completely changed.
|
# 2.0 Completely rewritten because the interface completely changed.
|
||||||
# Code adpated in part from NCBI Perl sample code:
|
# Code adpated in part from NCBI Perl sample code:
|
||||||
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
|
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
|
||||||
#
|
|
||||||
# 1.0 first version posted for BCH441 2016, based on BLAST - API
|
# 1.0 first version posted for BCH441 2016, based on BLAST - API
|
||||||
#
|
#
|
||||||
# ToDo:
|
# ToDo:
|
||||||
@ -31,47 +31,50 @@
|
|||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
|
|
||||||
if (! requireNamespace(httr, quietly = TRUE)) {
|
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||||
install.packages("httr")
|
install.packages("httr")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
BLAST <- function(q,
|
BLAST <- function(Q,
|
||||||
db = "refseq_protein",
|
db = "refseq_protein",
|
||||||
nHits = 30,
|
nHits = 30,
|
||||||
E = 0.1,
|
E = 0.1,
|
||||||
limits = "",
|
limits = "",
|
||||||
rid = "",
|
rid = "",
|
||||||
|
query = "",
|
||||||
quietly = FALSE,
|
quietly = FALSE,
|
||||||
myTimeout = 120) {
|
myTimeout = 120) {
|
||||||
# Purpose:
|
# Purpose:
|
||||||
# Basic BLAST search
|
# Basic BLAST search
|
||||||
#
|
#
|
||||||
# Parameters:
|
# Parameters:
|
||||||
# q: query - either a valid ID or a sequence
|
# Q: query - either a valid ID or a sequence
|
||||||
# db: "refseq_protein" by default,
|
# db: "refseq_protein" by default,
|
||||||
# other legal valuses include: "nr", "pdb", "swissprot" ...
|
# other legal values include: "nr", "pdb", "swissprot" ...
|
||||||
# nHits: number of hits to maximally return
|
# nHits: number of hits to maximally return
|
||||||
# E: E-value cutoff. Do not return hits whose score would be expected
|
# E: E-value cutoff. Do not return hits whose score would be expected
|
||||||
# to occur E or more times in a database of random sequence.
|
# to occur E or more times in a database of random sequence.
|
||||||
# limits: a valid ENTREZ filter
|
# limits: a valid ENTREZ filter
|
||||||
# rid: a request ID - to retrieve earleir search results
|
# rid: a request ID - to retrieve earlier search results
|
||||||
|
# query: the actual query string (needed when retrieving results
|
||||||
|
# with an rid)
|
||||||
# quietly: controls printing of wait-time progress bar
|
# quietly: controls printing of wait-time progress bar
|
||||||
# timeout: how much longer _after_ rtoe to wait for a result
|
# timeout: how much longer _after_ rtoe to wait for a result
|
||||||
# before giving up (seconds)
|
# before giving up (seconds)
|
||||||
# Value:
|
# Value:
|
||||||
# result: list of resulting hits and some metadata
|
# result: list of process status or resulting hits, and some metadata
|
||||||
|
|
||||||
|
|
||||||
EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
|
EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
|
||||||
|
|
||||||
results <- list()
|
results <- list()
|
||||||
|
results$query = query
|
||||||
results$rid <- rid
|
results$rid <- rid
|
||||||
results$rtoe <- 0
|
results$rtoe <- 0
|
||||||
|
|
||||||
if (rid == "") { # if rid is not the empty string we skip the
|
if (rid == "") { # If no rid is available, spawn a search.
|
||||||
# initial search and and proceed directly to retrieval
|
# Else, proceed directly to retrieval.
|
||||||
|
|
||||||
|
|
||||||
# prepare query, GET(), and parse rid and rtoe from BLAST server response
|
# prepare query, GET(), and parse rid and rtoe from BLAST server response
|
||||||
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||||
@ -141,7 +144,8 @@ BLAST <- function(q,
|
|||||||
|
|
||||||
if (myTimeout <= 0) { # abort
|
if (myTimeout <= 0) { # abort
|
||||||
cat("BLAST search not concluded before timeout. Aborting.\n")
|
cat("BLAST search not concluded before timeout. Aborting.\n")
|
||||||
cat(sprintf("You could check back later with rid \"%s\"\n",
|
cat(sprintf("%s BLASThits <- BLAST(rid=\"%s\")\n",
|
||||||
|
"Trying checking back later with >",
|
||||||
results$rid))
|
results$rid))
|
||||||
return(results)
|
return(results)
|
||||||
}
|
}
|
||||||
@ -370,7 +374,7 @@ if (FALSE) {
|
|||||||
nHits = 100,
|
nHits = 100,
|
||||||
E = 0.001,
|
E = 0.001,
|
||||||
rid = "",
|
rid = "",
|
||||||
limits = "txid4751[ORGN]")
|
limits = "txid4751[ORGN]") # Fungi
|
||||||
str(test)
|
str(test)
|
||||||
length(test$hits)
|
length(test$hits)
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user