Line termination change and old code.
This commit is contained in:
@@ -1,30 +1,30 @@
|
||||
# ABC-createRefDB.R
|
||||
#
|
||||
# Create a reference protein database for Mbp1-like proteins
|
||||
#
|
||||
# Boris Steipe for ABC learning units
|
||||
#
|
||||
# For the species, see:
|
||||
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
||||
#
|
||||
# For the data model, see
|
||||
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
|
||||
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
myDB <- dbInit()
|
||||
|
||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
|
||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
|
||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
|
||||
|
||||
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
|
||||
|
||||
myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
|
||||
|
||||
myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
|
||||
|
||||
|
||||
# [END]
|
||||
# ABC-createRefDB.R
|
||||
#
|
||||
# Create a reference protein database for Mbp1-like proteins
|
||||
#
|
||||
# Boris Steipe for ABC learning units
|
||||
#
|
||||
# For the species, see:
|
||||
# http://steipe.biochemistry.utoronto.ca/abc/index.php/Reference_species_for_fungi
|
||||
#
|
||||
# For the data model, see
|
||||
# https://docs.google.com/presentation/d/13vWaVcFpWEOGeSNhwmqugj2qTQuH1eZROgxWdHGEMr0
|
||||
# For the schema, see dbInit() in ./scripts/ABC-dbUtilities.R
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
myDB <- dbInit()
|
||||
|
||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/MBP1_SACCE.json"))
|
||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refMBP1Proteins.json"))
|
||||
myDB <- dbAddProtein(myDB, jsonlite::fromJSON("./data/refAPSES_PSI-BLAST.json"))
|
||||
|
||||
myDB <- dbAddTaxonomy(myDB, jsonlite::fromJSON("./data/refTaxonomy.json"))
|
||||
|
||||
myDB <- dbAddFeature(myDB, jsonlite::fromJSON("./data/refFeatures.json"))
|
||||
|
||||
myDB <- dbAddAnnotation( myDB, jsonlite::fromJSON("./data/refAnnotations.json"))
|
||||
|
||||
|
||||
# [END]
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -1,443 +1,443 @@
|
||||
# tocID <- "scripts/ABC-makeMYSPElist.R"
|
||||
#
|
||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||
# Mbp1 homologues.
|
||||
#
|
||||
# Version: 1.4
|
||||
#
|
||||
# Date: 2016 09 - 2021 09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions
|
||||
# 1.4 New retrieval logic
|
||||
# 1.3 Rewrite to change datasource. NCBI has not been updated
|
||||
# since 2012. Use ensembl fungi as initial source.
|
||||
# 1.2 Change from require() to requireNamespace()
|
||||
# 1.1.2 Moved BLAST.R to ./scripts directory
|
||||
# 1.1 Update 2017
|
||||
# 1.0 First code 2016
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# DO NOT source() THIS FILE!
|
||||
#
|
||||
# This file is code I provide for your deeper understanding of a process and
|
||||
# to provide you with useful sample code. It is not actually necessary for
|
||||
# you to run this code, but I encourage you to read it carefully and discuss
|
||||
# if there are parts you don't understand.
|
||||
#
|
||||
# Run the commands that interact with the NCBI servers only if you want to
|
||||
# experiment specifically with the code and/or parameters. I have commented out
|
||||
# those parts. If you only want to study the general workflow, just load()
|
||||
# the respective intermediate results.
|
||||
#
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------
|
||||
#TOC> 1 The strategy 55
|
||||
#TOC> 2 PACKAGES AND INITIALIZATIONS 67
|
||||
#TOC> 3 ENSEMBL FUNGI 75
|
||||
#TOC> 3.1 Import 78
|
||||
#TOC> 4 BLAST SEARCH 155
|
||||
#TOC> 4.1 find homologous proteins 161
|
||||
#TOC> 4.2 Identify species in "hits" 192
|
||||
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 282
|
||||
#TOC> 6 STUDENT NUMBERS 375
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 The strategy ========================================================
|
||||
|
||||
# This script will create a list of "MYSPE" species and save it in an R object
|
||||
# MYSPEspecies that is stored in the data subdirectory of this project from
|
||||
# where it can be loaded. The strategy is as follows: we download a list of
|
||||
# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
|
||||
# species that have been annotated.
|
||||
# Next we perform a BLAST search, to identify fungal species that have
|
||||
# genes that are homologous to yeast MBP1.
|
||||
#
|
||||
# ...
|
||||
|
||||
# = 2 PACKAGES AND INITIALIZATIONS ========================================
|
||||
|
||||
# httr provides interfaces to Webservers on the Internet
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
|
||||
|
||||
# = 3 ENSEMBL FUNGI =======================================================
|
||||
|
||||
|
||||
# == 3.1 Import ============================================================
|
||||
|
||||
# Navigate to https://fungi.ensembl.org and click on the link to the full
|
||||
# list of all species: https://fungi.ensembl.org/species.html
|
||||
# On the page, click on the spreadsheet symbol top right and choose
|
||||
# "download whole table". The file will be named "Species.csv", in your
|
||||
# usual downloads folder. Move it to the data folder, and read it.
|
||||
|
||||
sDat <- read.csv("./data/Species.csv")
|
||||
str(sDat)
|
||||
|
||||
# The most obvious way to partition these is according to Classification ...
|
||||
# (poking around a bit in the UniProt taxonomy database shows that the
|
||||
# classification used here is the taxonomic rank of "order").
|
||||
# how many classifications do we have?
|
||||
length(unique(sDat$Classification)) # 66
|
||||
|
||||
# To have a good set for the class, we should have about 100.
|
||||
# Let's see for which of these we can find Mbp1 homologues.
|
||||
# First, we'll keep only the colums for name, classification, and taxID, and
|
||||
# drop the rest ...
|
||||
sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
|
||||
colnames(sDat) <- c("name", "order", "taxID")
|
||||
|
||||
# Next, we make an extra column: genus - the first part of the binomial name.
|
||||
# We'll use the gsub() function, and for that we need a "regular expression"
|
||||
# that matches to all characters from the first blank to the end of the string:
|
||||
myPatt <- "\\s.*$" # one whitespace (\\s) ...
|
||||
# followed by any character (.) 0..n times (*) ...
|
||||
# until the end of the string
|
||||
|
||||
# using gsub() we substitue all matching characters with the empty string "" -
|
||||
# this deletes the matching characters
|
||||
# Test this:
|
||||
gsub(myPatt, "", "Genus") # one word: unchanged
|
||||
gsub(myPatt, "", "gEnus species") # two words: return only first
|
||||
gsub(myPatt, "", "geNus species strain 123") # many words: return only first
|
||||
|
||||
# apply this to the "name" column and add the result as a separate column
|
||||
# called "genus"
|
||||
sDat$genus <- gsub(myPatt, "", sDat$name)
|
||||
|
||||
# what do we get?
|
||||
c(head(unique(sDat$genus)),
|
||||
tail(unique(sDat$genus))) # inspect the first and last few. Note that there
|
||||
# is a problem that we have to keep in mind.
|
||||
# (Always inspect your results!)
|
||||
# Drop all rows for which the genus contains special chracters -
|
||||
# like "[Candida]"
|
||||
sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
|
||||
|
||||
length(table(sDat$genus)) # how many genus?
|
||||
hist(table(sDat$genus), col = "#E9F4FF") # Distribution ...
|
||||
# most genus have very few, but
|
||||
# some have very many species.
|
||||
sort(table(sDat$genus), decreasing = TRUE)[1:10] # Top ten...
|
||||
|
||||
# We should have at least one species from each taxonomic order, but we can
|
||||
# add a few genus until we have about 100 validated species.
|
||||
|
||||
# Let's add a column for species, by changing our regular expression a bit,
|
||||
# using ^ (start of string), \\S (NOT a whitespace),
|
||||
# and + (one or more matches), capturing the match (...), and returning
|
||||
# it as the substitution (\\1) ...
|
||||
|
||||
myPatt <- "^(\\S+\\s\\S+)\\s.*$"
|
||||
sDat$species <- gsub(myPatt, "\\1", sDat$name)
|
||||
|
||||
# And we reorder the columns, just for aesthetics:
|
||||
sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
|
||||
|
||||
# Final check:
|
||||
any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
|
||||
|
||||
#
|
||||
# Now we check which of these have Mbp1 homologues ...
|
||||
|
||||
# = 4 BLAST SEARCH ========================================================
|
||||
|
||||
|
||||
# We run a BLAST search to find all proteins related to yeast Mbp1 in any
|
||||
# fungus. With the results, we'll annotate our sDat table.
|
||||
|
||||
# == 4.1 find homologous proteins ==========================================
|
||||
#
|
||||
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
|
||||
# contain them.
|
||||
|
||||
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
|
||||
# amount of error handling involved that is not supported by the API in a
|
||||
# principled way but requires rather ad hoc solutions. The code I threw together
|
||||
# to make a BLAST interface (demo-quality, not research-quality) is in the file
|
||||
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
|
||||
# standard task of communicating with servers and parsing responses - everyday
|
||||
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
|
||||
# parser in currently available packages.
|
||||
#
|
||||
# DON'T use this for BLAST searches unless you have read the NCBI policy
|
||||
# for automated tasks. If you indicriminately pound on the NCBI's BLAST
|
||||
# server, they will blacklist your IP-address. See:
|
||||
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
|
||||
#
|
||||
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
|
||||
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
|
||||
# db = "refseq_protein", # database to search in
|
||||
# nHits = 3000, # 945 hits in 2020
|
||||
# E = 0.01, #
|
||||
# limits = "txid4751[ORGN]") # = fungi
|
||||
# saveRDS(BLASThits, file="data/BLASThits.rds")
|
||||
#
|
||||
# NO NEED TO ACTUALLY RUN THIS:you can load the results from the data directory
|
||||
#
|
||||
BLASThits <- readRDS(file = "data/BLASThits.rds")
|
||||
|
||||
# == 4.2 Identify species in "hits" ========================================
|
||||
|
||||
# This is a very big list that can't be usefully analyzed manually. Here
|
||||
# we are only interested in the species names that it contains.
|
||||
|
||||
# How many hits in the list?
|
||||
length(BLASThits$hits) # 1,134
|
||||
|
||||
# Let's look at a hit somewhere down the list
|
||||
str(BLASThits$hit[[277]])
|
||||
|
||||
# A fair amount of parsing has gone into the BLAST.R code to prepare the results
|
||||
# in a useful way. The species information is in the $species element of every
|
||||
# hit.
|
||||
|
||||
# Run a loop to extract all the species names into a vector. We subset ...
|
||||
# Blasthits$hits ... the list of hits, from which we choose ...
|
||||
# Blasthits$hits[[i]] ... the i-th hit, and get ...
|
||||
# Blasthits$hits[[i]]$species ... the species element from that.
|
||||
# Subsetting FTW.
|
||||
|
||||
BLASTspecies <- character()
|
||||
for (i in seq_along(BLASThits$hits)) {
|
||||
BLASTspecies[i] <- BLASThits$hits[[i]]$species
|
||||
}
|
||||
|
||||
# You can confirm that BLASTspecies has the expected size.
|
||||
length(BLASTspecies)
|
||||
|
||||
# if we delete some of these later on, we still want to remember which hit
|
||||
# they came from. Thus we name() the elements with their index, which is the
|
||||
# same as the index of the hit in BLASThits
|
||||
names(BLASTspecies) <- 1:length(BLASTspecies)
|
||||
|
||||
|
||||
# let's plot the distribution of E-values
|
||||
eVals <- numeric()
|
||||
for (i in seq_along(BLASThits$hits)) {
|
||||
eVals[i] <- BLASThits$hits[[i]]$E
|
||||
}
|
||||
range(eVals)
|
||||
sum(eVals == 0)
|
||||
|
||||
# let's plot the log of all values > 0 to see how they are distributed
|
||||
# plotting only one vectyor of numbers plots their index as x, and
|
||||
# their value as y ...
|
||||
plot(log(eVals[eVals > 0]), col = "#CC0000")
|
||||
|
||||
# This is very informative: I would suspect that the first ten or so are
|
||||
# virtually identical to the yeast protein, then we have about 800 hits with
|
||||
# decreasing similarity, and then about 200 more that may actually be false
|
||||
# positives. Also - we plotted them by index, that means the table is SORTED:
|
||||
# Lower E-values strictly come before higher E-values.
|
||||
|
||||
# Again, some species appear more than once, e.g. ...
|
||||
sum(BLASTspecies == "Saccharomyces cerevisiae")
|
||||
|
||||
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
|
||||
|
||||
# Therefore we remove duplicates. Removing duplicates will leave the FIRST
|
||||
# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
|
||||
# species, we will retain only the protein that has the highest similarity
|
||||
# to yeast Mbp1, not any of its more distant paralogues.
|
||||
sel <- ! duplicated(BLASTspecies)
|
||||
BLASTspecies <- BLASTspecies[sel]
|
||||
|
||||
length(BLASTspecies)
|
||||
# i.e. we got rid of about two thirds of the hits.
|
||||
tail(BLASTspecies) # see how the names are useful!
|
||||
# again - there are some special characters ...
|
||||
# what are they?
|
||||
BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
|
||||
|
||||
# remove the brackets ...
|
||||
BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
|
||||
# drop any new duplicates ...
|
||||
BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
|
||||
|
||||
# check the number again:
|
||||
length(BLASTspecies)
|
||||
# Think a bit about this: what may be the biological reason to find that
|
||||
# on average, in 388 fungi across the entire phylogenetic tree, we have
|
||||
# three sequences that are homologous to yeast Mbp1?
|
||||
|
||||
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
|
||||
# we plot all values that are TRUE in the vector "sel" that we created above,
|
||||
# AND greater than 0
|
||||
plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
|
||||
|
||||
|
||||
# = 5 MERGE ENSEMBL AND BLAST RESULTS =====================================
|
||||
|
||||
# Next we add the blast result to our sDat dataframe. We'll store the index,
|
||||
# the E-value, and the Query-bounds from which we can estimate which domains
|
||||
# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
|
||||
# Mbp1's N-terminal APSES domain.)
|
||||
#
|
||||
# First we pull the hits we wanted from the BLASTspecies:
|
||||
iHits <- as.numeric(names(BLASTspecies))
|
||||
length(iHits) # one index for each TRUE in sel
|
||||
|
||||
# add columns to sDat
|
||||
l <- nrow(sDat)
|
||||
sDat$iHit <- numeric(l) # index of the hit in the BLAST results
|
||||
sDat$eVal <- numeric(l) # E-value of the hit
|
||||
sDat$lAli <- numeric(l) # length of the aligned region
|
||||
|
||||
# extract and merge
|
||||
for (iHit in iHits) {
|
||||
thisSp <- BLASThits$hits[[iHit]]$species
|
||||
sel <- sDat$species == thisSp
|
||||
|
||||
sDat$iHit[sel] <- iHit
|
||||
sDat$eVal[sel] <- BLASThits$hits[[iHit]]$E
|
||||
sDat$lAli[sel] <- BLASThits$hits[[iHit]]$lengthAli
|
||||
}
|
||||
|
||||
# Are all reference species accounted for?
|
||||
selA <- sDat$iHit != 0 # all rows which matched to a BLAST hit
|
||||
REFspecies %in% sDat$species[selA] # yes, all there
|
||||
|
||||
selB <- sDat$species %in% REFspecies # all rows which have one of REF species
|
||||
|
||||
sum(selA & selB) # How many rows?
|
||||
|
||||
# sDat of course includes all duplicates. Some may be multiply sequenced, some
|
||||
# may be different strains. We'll use the same strategy as before and keep
|
||||
# only the best hit: order the rows by E-value, then drop all rows which
|
||||
# are duplicated.
|
||||
|
||||
|
||||
# drop all rows without BLAST hits ...
|
||||
sDat <- sDat[ ! (sDat$iHit == 0) , ]
|
||||
|
||||
# order sDat by E-value ...
|
||||
sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
|
||||
|
||||
# drop all rows with duplicated species ...
|
||||
sDat <- sDat[ ! duplicated(sDat$species) , ]
|
||||
|
||||
# Lets look at the E-values ...
|
||||
plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
|
||||
|
||||
# and alignment lengths ...
|
||||
plot(sDat$lAli, col = "#00DDAA")
|
||||
|
||||
# How many ...
|
||||
length(unique(sDat$name))
|
||||
length(unique(sDat$species))
|
||||
length(unique(sDat$genus))
|
||||
length(unique(sDat$order))
|
||||
|
||||
# I need an extra species for admin purposes later on ...
|
||||
sel <- grep("Sporothrix schenckii", sDat$species)
|
||||
SPOSCdat <- sDat[sel, ]
|
||||
sDat <- sDat[-sel, ]
|
||||
|
||||
# To get the final dataset, we remove the reference species with their
|
||||
# entire orders ...
|
||||
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
|
||||
sel <- sDat$order %in% REForders
|
||||
REFdat <- sDat[sel , ]
|
||||
sDat <- sDat[ ! sel , ]
|
||||
|
||||
# REFdat should now contain only the REFspecies ...
|
||||
( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
|
||||
|
||||
# ... but all of them
|
||||
sum(REFspecies %in% REFdat$species)
|
||||
|
||||
# ... and we have enough left in sDat to prune sDat to unique genus
|
||||
sDat <- sDat[ ! duplicated(sDat$genus) , ]
|
||||
nrow(sDat) # 84
|
||||
|
||||
# I add back "Sporothrix schenckii" ...
|
||||
sDat <- rbind(SPOSCdat, sDat)
|
||||
|
||||
# ... and save for future use.
|
||||
# saveRDS(sDat, file = "data/sDat.rds")
|
||||
# saveRDS(REFdat, file = "data/REFdat.rds")
|
||||
|
||||
|
||||
|
||||
# = 6 STUDENT NUMBERS =====================================================
|
||||
#
|
||||
# An asymmetric function to retrieve a MYSPE species
|
||||
#
|
||||
sDat <- readRDS(file = "data/sDat.rds")
|
||||
|
||||
students <- read.csv("../BCH441-2021-students.csv")
|
||||
sN <- students$Integration.ID
|
||||
sN <- sN[! is.na(sN)]
|
||||
sN <- as.character(sN)
|
||||
sN <- c("1003141593", sN) # will map to "Sporothrix schenckii"
|
||||
|
||||
set.seed(112358)
|
||||
theseSpecies <- sDat[sample(1:nrow(sDat)), ]
|
||||
all(sort(theseSpecies$name) == sort(sDat$name))
|
||||
nrow((theseSpecies))
|
||||
(iX <- grep("Sporothrix schenckii", theseSpecies$name))
|
||||
theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
|
||||
rndMin <- 992000000
|
||||
rndMax <- 1020000000
|
||||
N <- 10000
|
||||
keys <- as.character(sample(rndMin:rndMax, N + 1000))
|
||||
keys <- keys[! (keys %in% sN)]
|
||||
keys <- keys[1:N]
|
||||
keys[1:length(sN)] <- sN
|
||||
|
||||
nRep <- floor(N/nrow(theseSpecies))
|
||||
MYSPEdat <- theseSpecies
|
||||
for(i in 1:nRep) {
|
||||
MYSPEdat <- rbind(MYSPEdat, theseSpecies)
|
||||
}
|
||||
MYSPEdat <- MYSPEdat[1:N, ]
|
||||
for (i in 1:N) {
|
||||
rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5")
|
||||
}
|
||||
set.seed(NULL)
|
||||
MYSPEdat <- MYSPEdat[sample(1:N), ]
|
||||
|
||||
# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
|
||||
|
||||
# === validate
|
||||
x <- character()
|
||||
for (n in sN) {
|
||||
sp <- getMYSPE(n)
|
||||
if (length(sp) != 1) {
|
||||
stop(print(as.character(n)))
|
||||
} else {
|
||||
x <- c(x, sp)
|
||||
}
|
||||
}
|
||||
|
||||
# === species for late-comers
|
||||
y <- unique(MYSPEdat$species)
|
||||
print(y[!(y %in% x)])
|
||||
|
||||
|
||||
# === validate
|
||||
l <- length(sN)
|
||||
sp <- character(l)
|
||||
for(i in 1:l) {
|
||||
sp[i] <- getMYSPE(sN[i])
|
||||
}
|
||||
any(duplicated(sp))
|
||||
length(unique(sp))
|
||||
which(! sDat$species %in% sp) # these can be assigned to late-comers
|
||||
|
||||
# Done.
|
||||
|
||||
# [END]
|
||||
# tocID <- "scripts/ABC-makeMYSPElist.R"
|
||||
#
|
||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||
# Mbp1 homologues.
|
||||
#
|
||||
# Version: 1.4
|
||||
#
|
||||
# Date: 2016 09 - 2021 09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions
|
||||
# 1.4 New retrieval logic
|
||||
# 1.3 Rewrite to change datasource. NCBI has not been updated
|
||||
# since 2012. Use ensembl fungi as initial source.
|
||||
# 1.2 Change from require() to requireNamespace()
|
||||
# 1.1.2 Moved BLAST.R to ./scripts directory
|
||||
# 1.1 Update 2017
|
||||
# 1.0 First code 2016
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
#
|
||||
# DO NOT source() THIS FILE!
|
||||
#
|
||||
# This file is code I provide for your deeper understanding of a process and
|
||||
# to provide you with useful sample code. It is not actually necessary for
|
||||
# you to run this code, but I encourage you to read it carefully and discuss
|
||||
# if there are parts you don't understand.
|
||||
#
|
||||
# Run the commands that interact with the NCBI servers only if you want to
|
||||
# experiment specifically with the code and/or parameters. I have commented out
|
||||
# those parts. If you only want to study the general workflow, just load()
|
||||
# the respective intermediate results.
|
||||
#
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> --------------------------------------------------------
|
||||
#TOC> 1 The strategy 55
|
||||
#TOC> 2 PACKAGES AND INITIALIZATIONS 67
|
||||
#TOC> 3 ENSEMBL FUNGI 75
|
||||
#TOC> 3.1 Import 78
|
||||
#TOC> 4 BLAST SEARCH 155
|
||||
#TOC> 4.1 find homologous proteins 161
|
||||
#TOC> 4.2 Identify species in "hits" 192
|
||||
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 282
|
||||
#TOC> 6 STUDENT NUMBERS 375
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 The strategy ========================================================
|
||||
|
||||
# This script will create a list of "MYSPE" species and save it in an R object
|
||||
# MYSPEspecies that is stored in the data subdirectory of this project from
|
||||
# where it can be loaded. The strategy is as follows: we download a list of
|
||||
# annotated fungal genomes from ensembl.fungi. All these are genome-sequenced
|
||||
# species that have been annotated.
|
||||
# Next we perform a BLAST search, to identify fungal species that have
|
||||
# genes that are homologous to yeast MBP1.
|
||||
#
|
||||
# ...
|
||||
|
||||
# = 2 PACKAGES AND INITIALIZATIONS ========================================
|
||||
|
||||
# httr provides interfaces to Webservers on the Internet
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
|
||||
|
||||
# = 3 ENSEMBL FUNGI =======================================================
|
||||
|
||||
|
||||
# == 3.1 Import ============================================================
|
||||
|
||||
# Navigate to https://fungi.ensembl.org and click on the link to the full
|
||||
# list of all species: https://fungi.ensembl.org/species.html
|
||||
# On the page, click on the spreadsheet symbol top right and choose
|
||||
# "download whole table". The file will be named "Species.csv", in your
|
||||
# usual downloads folder. Move it to the data folder, and read it.
|
||||
|
||||
sDat <- read.csv("./data/Species.csv")
|
||||
str(sDat)
|
||||
|
||||
# The most obvious way to partition these is according to Classification ...
|
||||
# (poking around a bit in the UniProt taxonomy database shows that the
|
||||
# classification used here is the taxonomic rank of "order").
|
||||
# how many classifications do we have?
|
||||
length(unique(sDat$Classification)) # 66
|
||||
|
||||
# To have a good set for the class, we should have about 100.
|
||||
# Let's see for which of these we can find Mbp1 homologues.
|
||||
# First, we'll keep only the colums for name, classification, and taxID, and
|
||||
# drop the rest ...
|
||||
sDat <- sDat[ , c("Name", "Classification", "Taxon.ID")]
|
||||
colnames(sDat) <- c("name", "order", "taxID")
|
||||
|
||||
# Next, we make an extra column: genus - the first part of the binomial name.
|
||||
# We'll use the gsub() function, and for that we need a "regular expression"
|
||||
# that matches to all characters from the first blank to the end of the string:
|
||||
myPatt <- "\\s.*$" # one whitespace (\\s) ...
|
||||
# followed by any character (.) 0..n times (*) ...
|
||||
# until the end of the string
|
||||
|
||||
# using gsub() we substitue all matching characters with the empty string "" -
|
||||
# this deletes the matching characters
|
||||
# Test this:
|
||||
gsub(myPatt, "", "Genus") # one word: unchanged
|
||||
gsub(myPatt, "", "gEnus species") # two words: return only first
|
||||
gsub(myPatt, "", "geNus species strain 123") # many words: return only first
|
||||
|
||||
# apply this to the "name" column and add the result as a separate column
|
||||
# called "genus"
|
||||
sDat$genus <- gsub(myPatt, "", sDat$name)
|
||||
|
||||
# what do we get?
|
||||
c(head(unique(sDat$genus)),
|
||||
tail(unique(sDat$genus))) # inspect the first and last few. Note that there
|
||||
# is a problem that we have to keep in mind.
|
||||
# (Always inspect your results!)
|
||||
# Drop all rows for which the genus contains special chracters -
|
||||
# like "[Candida]"
|
||||
sDat <- sDat[ ! grepl("[^a-zA-Z]", sDat$genus) , ]
|
||||
|
||||
length(table(sDat$genus)) # how many genus?
|
||||
hist(table(sDat$genus), col = "#E9F4FF") # Distribution ...
|
||||
# most genus have very few, but
|
||||
# some have very many species.
|
||||
sort(table(sDat$genus), decreasing = TRUE)[1:10] # Top ten...
|
||||
|
||||
# We should have at least one species from each taxonomic order, but we can
|
||||
# add a few genus until we have about 100 validated species.
|
||||
|
||||
# Let's add a column for species, by changing our regular expression a bit,
|
||||
# using ^ (start of string), \\S (NOT a whitespace),
|
||||
# and + (one or more matches), capturing the match (...), and returning
|
||||
# it as the substitution (\\1) ...
|
||||
|
||||
myPatt <- "^(\\S+\\s\\S+)\\s.*$"
|
||||
sDat$species <- gsub(myPatt, "\\1", sDat$name)
|
||||
|
||||
# And we reorder the columns, just for aesthetics:
|
||||
sDat <- sDat[ , c("name", "species", "genus", "order", "taxID")]
|
||||
|
||||
# Final check:
|
||||
any(grepl("[^a-zA-Z -]", sDat$species)) # FALSE means no special characters
|
||||
|
||||
#
|
||||
# Now we check which of these have Mbp1 homologues ...
|
||||
|
||||
# = 4 BLAST SEARCH ========================================================
|
||||
|
||||
|
||||
# We run a BLAST search to find all proteins related to yeast Mbp1 in any
|
||||
# fungus. With the results, we'll annotate our sDat table.
|
||||
|
||||
# == 4.1 find homologous proteins ==========================================
|
||||
#
|
||||
# Use BLAST to fetch proteins related to Mbp1 and identify the species that
|
||||
# contain them.
|
||||
|
||||
# Scripting against NCBI APIs is not exactly enjoyable - there is usually a fair
|
||||
# amount of error handling involved that is not supported by the API in a
|
||||
# principled way but requires rather ad hoc solutions. The code I threw together
|
||||
# to make a BLAST interface (demo-quality, not research-quality) is in the file
|
||||
# ./scripts/BLAST.R Feel encouraged to study how this works. It's a pretty
|
||||
# standard task of communicating with servers and parsing responses - everyday
|
||||
# fare in the bioinformatics lab. Surprisingly, there seems to be no good BLAST
|
||||
# parser in currently available packages.
|
||||
#
|
||||
# DON'T use this for BLAST searches unless you have read the NCBI policy
|
||||
# for automated tasks. If you indicriminately pound on the NCBI's BLAST
|
||||
# server, they will blacklist your IP-address. See:
|
||||
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
|
||||
#
|
||||
# Use BLAST() to find yeast Mbp1 homologues in other fungi in refseq
|
||||
# BLASThits <- BLAST("NP_010227", # Yeast Mbp1 RefSeq ID
|
||||
# db = "refseq_protein", # database to search in
|
||||
# nHits = 3000, # 945 hits in 2020
|
||||
# E = 0.01, #
|
||||
# limits = "txid4751[ORGN]") # = fungi
|
||||
# saveRDS(BLASThits, file="data/BLASThits.rds")
|
||||
#
|
||||
# NO NEED TO ACTUALLY RUN THIS:you can load the results from the data directory
|
||||
#
|
||||
BLASThits <- readRDS(file = "data/BLASThits.rds")
|
||||
|
||||
# == 4.2 Identify species in "hits" ========================================
|
||||
|
||||
# This is a very big list that can't be usefully analyzed manually. Here
|
||||
# we are only interested in the species names that it contains.
|
||||
|
||||
# How many hits in the list?
|
||||
length(BLASThits$hits) # 1,134
|
||||
|
||||
# Let's look at a hit somewhere down the list
|
||||
str(BLASThits$hit[[277]])
|
||||
|
||||
# A fair amount of parsing has gone into the BLAST.R code to prepare the results
|
||||
# in a useful way. The species information is in the $species element of every
|
||||
# hit.
|
||||
|
||||
# Run a loop to extract all the species names into a vector. We subset ...
|
||||
# Blasthits$hits ... the list of hits, from which we choose ...
|
||||
# Blasthits$hits[[i]] ... the i-th hit, and get ...
|
||||
# Blasthits$hits[[i]]$species ... the species element from that.
|
||||
# Subsetting FTW.
|
||||
|
||||
BLASTspecies <- character()
|
||||
for (i in seq_along(BLASThits$hits)) {
|
||||
BLASTspecies[i] <- BLASThits$hits[[i]]$species
|
||||
}
|
||||
|
||||
# You can confirm that BLASTspecies has the expected size.
|
||||
length(BLASTspecies)
|
||||
|
||||
# if we delete some of these later on, we still want to remember which hit
|
||||
# they came from. Thus we name() the elements with their index, which is the
|
||||
# same as the index of the hit in BLASThits
|
||||
names(BLASTspecies) <- 1:length(BLASTspecies)
|
||||
|
||||
|
||||
# let's plot the distribution of E-values
|
||||
eVals <- numeric()
|
||||
for (i in seq_along(BLASThits$hits)) {
|
||||
eVals[i] <- BLASThits$hits[[i]]$E
|
||||
}
|
||||
range(eVals)
|
||||
sum(eVals == 0)
|
||||
|
||||
# let's plot the log of all values > 0 to see how they are distributed
|
||||
# plotting only one vectyor of numbers plots their index as x, and
|
||||
# their value as y ...
|
||||
plot(log(eVals[eVals > 0]), col = "#CC0000")
|
||||
|
||||
# This is very informative: I would suspect that the first ten or so are
|
||||
# virtually identical to the yeast protein, then we have about 800 hits with
|
||||
# decreasing similarity, and then about 200 more that may actually be false
|
||||
# positives. Also - we plotted them by index, that means the table is SORTED:
|
||||
# Lower E-values strictly come before higher E-values.
|
||||
|
||||
# Again, some species appear more than once, e.g. ...
|
||||
sum(BLASTspecies == "Saccharomyces cerevisiae")
|
||||
|
||||
# ... corresponding to the five homologous gene sequences (paralogues) of yeast.
|
||||
|
||||
# Therefore we remove duplicates. Removing duplicates will leave the FIRST
|
||||
# in a list alone, and only remove the SUBSEQUENT ones. Which means, from each
|
||||
# species, we will retain only the protein that has the highest similarity
|
||||
# to yeast Mbp1, not any of its more distant paralogues.
|
||||
sel <- ! duplicated(BLASTspecies)
|
||||
BLASTspecies <- BLASTspecies[sel]
|
||||
|
||||
length(BLASTspecies)
|
||||
# i.e. we got rid of about two thirds of the hits.
|
||||
tail(BLASTspecies) # see how the names are useful!
|
||||
# again - there are some special characters ...
|
||||
# what are they?
|
||||
BLASTspecies[grep("[^a-zA-Z ]", BLASTspecies)]
|
||||
|
||||
# remove the brackets ...
|
||||
BLASTspecies <- gsub("\\[|\\]", "", BLASTspecies)
|
||||
# drop any new duplicates ...
|
||||
BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
|
||||
|
||||
# check the number again:
|
||||
length(BLASTspecies)
|
||||
# Think a bit about this: what may be the biological reason to find that
|
||||
# on average, in 388 fungi across the entire phylogenetic tree, we have
|
||||
# three sequences that are homologous to yeast Mbp1?
|
||||
|
||||
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
|
||||
# we plot all values that are TRUE in the vector "sel" that we created above,
|
||||
# AND greater than 0
|
||||
plot(log(eVals[sel & eVals > 0]), col = "#00CC00")
|
||||
|
||||
|
||||
# = 5 MERGE ENSEMBL AND BLAST RESULTS =====================================
|
||||
|
||||
# Next we add the blast result to our sDat dataframe. We'll store the index,
|
||||
# the E-value, and the Query-bounds from which we can estimate which domains
|
||||
# of Mbp1 are actually covered by the hit. (True orthologues MUST align with
|
||||
# Mbp1's N-terminal APSES domain.)
|
||||
#
|
||||
# First we pull the hits we wanted from the BLASTspecies:
|
||||
iHits <- as.numeric(names(BLASTspecies))
|
||||
length(iHits) # one index for each TRUE in sel
|
||||
|
||||
# add columns to sDat
|
||||
l <- nrow(sDat)
|
||||
sDat$iHit <- numeric(l) # index of the hit in the BLAST results
|
||||
sDat$eVal <- numeric(l) # E-value of the hit
|
||||
sDat$lAli <- numeric(l) # length of the aligned region
|
||||
|
||||
# extract and merge
|
||||
for (iHit in iHits) {
|
||||
thisSp <- BLASThits$hits[[iHit]]$species
|
||||
sel <- sDat$species == thisSp
|
||||
|
||||
sDat$iHit[sel] <- iHit
|
||||
sDat$eVal[sel] <- BLASThits$hits[[iHit]]$E
|
||||
sDat$lAli[sel] <- BLASThits$hits[[iHit]]$lengthAli
|
||||
}
|
||||
|
||||
# Are all reference species accounted for?
|
||||
selA <- sDat$iHit != 0 # all rows which matched to a BLAST hit
|
||||
REFspecies %in% sDat$species[selA] # yes, all there
|
||||
|
||||
selB <- sDat$species %in% REFspecies # all rows which have one of REF species
|
||||
|
||||
sum(selA & selB) # How many rows?
|
||||
|
||||
# sDat of course includes all duplicates. Some may be multiply sequenced, some
|
||||
# may be different strains. We'll use the same strategy as before and keep
|
||||
# only the best hit: order the rows by E-value, then drop all rows which
|
||||
# are duplicated.
|
||||
|
||||
|
||||
# drop all rows without BLAST hits ...
|
||||
sDat <- sDat[ ! (sDat$iHit == 0) , ]
|
||||
|
||||
# order sDat by E-value ...
|
||||
sDat <- sDat[order(sDat$eVal, decreasing = FALSE) , ]
|
||||
|
||||
# drop all rows with duplicated species ...
|
||||
sDat <- sDat[ ! duplicated(sDat$species) , ]
|
||||
|
||||
# Lets look at the E-values ...
|
||||
plot(log(sDat$eVal[sDat$eVal > 0]), col = "#00CC00")
|
||||
|
||||
# and alignment lengths ...
|
||||
plot(sDat$lAli, col = "#00DDAA")
|
||||
|
||||
# How many ...
|
||||
length(unique(sDat$name))
|
||||
length(unique(sDat$species))
|
||||
length(unique(sDat$genus))
|
||||
length(unique(sDat$order))
|
||||
|
||||
# I need an extra species for admin purposes later on ...
|
||||
sel <- grep("Sporothrix schenckii", sDat$species)
|
||||
SPOSCdat <- sDat[sel, ]
|
||||
sDat <- sDat[-sel, ]
|
||||
|
||||
# To get the final dataset, we remove the reference species with their
|
||||
# entire orders ...
|
||||
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
|
||||
sel <- sDat$order %in% REForders
|
||||
REFdat <- sDat[sel , ]
|
||||
sDat <- sDat[ ! sel , ]
|
||||
|
||||
# REFdat should now contain only the REFspecies ...
|
||||
( REFdat <- REFdat[REFdat$species %in% REFspecies , ] )
|
||||
|
||||
# ... but all of them
|
||||
sum(REFspecies %in% REFdat$species)
|
||||
|
||||
# ... and we have enough left in sDat to prune sDat to unique genus
|
||||
sDat <- sDat[ ! duplicated(sDat$genus) , ]
|
||||
nrow(sDat) # 84
|
||||
|
||||
# I add back "Sporothrix schenckii" ...
|
||||
sDat <- rbind(SPOSCdat, sDat)
|
||||
|
||||
# ... and save for future use.
|
||||
# saveRDS(sDat, file = "data/sDat.rds")
|
||||
# saveRDS(REFdat, file = "data/REFdat.rds")
|
||||
|
||||
|
||||
|
||||
# = 6 STUDENT NUMBERS =====================================================
|
||||
#
|
||||
# An asymmetric function to retrieve a MYSPE species
|
||||
#
|
||||
sDat <- readRDS(file = "data/sDat.rds")
|
||||
|
||||
students <- read.csv("../BCH441-2021-students.csv")
|
||||
sN <- students$Integration.ID
|
||||
sN <- sN[! is.na(sN)]
|
||||
sN <- as.character(sN)
|
||||
sN <- c("1003141593", sN) # will map to "Sporothrix schenckii"
|
||||
|
||||
set.seed(112358)
|
||||
theseSpecies <- sDat[sample(1:nrow(sDat)), ]
|
||||
all(sort(theseSpecies$name) == sort(sDat$name))
|
||||
nrow((theseSpecies))
|
||||
(iX <- grep("Sporothrix schenckii", theseSpecies$name))
|
||||
theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
|
||||
rndMin <- 992000000
|
||||
rndMax <- 1020000000
|
||||
N <- 10000
|
||||
keys <- as.character(sample(rndMin:rndMax, N + 1000))
|
||||
keys <- keys[! (keys %in% sN)]
|
||||
keys <- keys[1:N]
|
||||
keys[1:length(sN)] <- sN
|
||||
|
||||
nRep <- floor(N/nrow(theseSpecies))
|
||||
MYSPEdat <- theseSpecies
|
||||
for(i in 1:nRep) {
|
||||
MYSPEdat <- rbind(MYSPEdat, theseSpecies)
|
||||
}
|
||||
MYSPEdat <- MYSPEdat[1:N, ]
|
||||
for (i in 1:N) {
|
||||
rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5")
|
||||
}
|
||||
set.seed(NULL)
|
||||
MYSPEdat <- MYSPEdat[sample(1:N), ]
|
||||
|
||||
# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
|
||||
|
||||
# === validate
|
||||
x <- character()
|
||||
for (n in sN) {
|
||||
sp <- getMYSPE(n)
|
||||
if (length(sp) != 1) {
|
||||
stop(print(as.character(n)))
|
||||
} else {
|
||||
x <- c(x, sp)
|
||||
}
|
||||
}
|
||||
|
||||
# === species for late-comers
|
||||
y <- unique(MYSPEdat$species)
|
||||
print(y[!(y %in% x)])
|
||||
|
||||
|
||||
# === validate
|
||||
l <- length(sN)
|
||||
sp <- character(l)
|
||||
for(i in 1:l) {
|
||||
sp[i] <- getMYSPE(sN[i])
|
||||
}
|
||||
any(duplicated(sp))
|
||||
length(unique(sp))
|
||||
which(! sDat$species %in% sp) # these can be assigned to late-comers
|
||||
|
||||
# Done.
|
||||
|
||||
# [END]
|
||||
|
@@ -1,168 +1,168 @@
|
||||
# tocID <- "scripts/ABC-makeSTRINGedges.R"
|
||||
#
|
||||
# Create a subnetwork of high-confidence human STRING edges.
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# The large source- datafile is NOT posted to github. If you want to
|
||||
# experiment with the original data, download it and place it into your
|
||||
# local ./data directory.
|
||||
#
|
||||
# STRING data source:
|
||||
# Download page:
|
||||
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
|
||||
# Data: (127.6 Mb)
|
||||
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 Rewrite
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------
|
||||
#TOC> 1 Initialize 44
|
||||
#TOC> 2 Read STRING Data 51
|
||||
#TOC> 3 Define cutoff and subset 63
|
||||
#TOC> 4 Drop duplicates 103
|
||||
#TOC> 5 Simple statistics 127
|
||||
#TOC> 6 Write to file 160
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Initialize ==========================================================
|
||||
|
||||
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||
install.packages("readr")
|
||||
}
|
||||
|
||||
|
||||
# = 2 Read STRING Data ====================================================
|
||||
|
||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||
# The .gz compressed version is 127.6MB, the uncompressed version is probably
|
||||
# 848 Mb. Fortunately readr:: can read from compressed
|
||||
# files, and does so automatically, based on the file extension.
|
||||
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
|
||||
STR <- readr::read_delim(fn, delim = " ")
|
||||
nrow(STR) # 11,759,454 rows
|
||||
head(STR)
|
||||
|
||||
|
||||
# = 3 Define cutoff and subset ============================================
|
||||
|
||||
# approximate distribution of combined_score
|
||||
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
|
||||
|
||||
# Let's table the counts >= 850 and plot them for better resolution.
|
||||
|
||||
myTb <- table(STR$combined_score[STR$combined_score >= 850])
|
||||
is.unsorted(as.integer(names(myTb))) # Good - they are all in order
|
||||
|
||||
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
|
||||
myTb[myTb == max(myTb)] # Apparently there is an algorithmic effect that
|
||||
# frequently assigns a combined score of 0.900
|
||||
|
||||
# Let's plot these counts as cumulative sums, in reverse order, scaled
|
||||
# as combined scores.
|
||||
myX <- 1 - (1:length(myTb)) / 1000 # x-values, decreasing
|
||||
plot(myX,
|
||||
cumsum(myTb[length(myTb):1]), # cumulative sum, decreasing
|
||||
xlim = c(1.0, 0.85), # reverse x-axis
|
||||
type = "l",
|
||||
main = "STRING interactions for 9606 (top 600,000)",
|
||||
xlab = "combined_score",
|
||||
ylab = "cumulative counts",
|
||||
col = "#CC0000")
|
||||
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
|
||||
|
||||
# What's the cutoff for 100,000 edges?
|
||||
which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
|
||||
|
||||
# confirm
|
||||
sum(STR$combined_score >= 964) # 101,348
|
||||
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
|
||||
|
||||
# subset the table, and use only the protein IDs and the combined_score
|
||||
STR <- STR[STR$combined_score >= 964,
|
||||
c("protein1", "protein2", "combined_score")]
|
||||
colnames(STR) <- c("a", "b", "score")
|
||||
|
||||
|
||||
# = 4 Drop duplicates ====================================================
|
||||
|
||||
# identify duplicate interactions by creating keys in a defined alphabetical
|
||||
# sort order, then checking for duplicated().
|
||||
# e.g if we have (X:U, U:X), we change U:X to X:U and now find that
|
||||
# (X:U, X:U) has a duplicate.
|
||||
|
||||
AB <- STR$a < STR$b # logical vector: genes we need to swap
|
||||
tmp <- STR$b # copy column b
|
||||
STR$b[AB] <- STR$a[AB] # copy a's into b
|
||||
STR$a[AB] <- tmp[AB] # copy tmp's into a
|
||||
all(STR$a >= STR$b) # confirm: TRUE
|
||||
|
||||
# now, make combined keys, like this:
|
||||
paste0(STR$a[1:10], ":", STR$b[1:10])
|
||||
|
||||
tmp <- paste0(STR$a, ":", STR$b)
|
||||
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
|
||||
# both a:b and b:a !
|
||||
|
||||
# drop all duplicated interactions from tmp
|
||||
STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain
|
||||
|
||||
|
||||
# = 5 Simple statistics ===================================================
|
||||
|
||||
# how many unique genes?
|
||||
length(unique(c(STR$a, STR$b))) # 8,445
|
||||
|
||||
# how many self-edges?
|
||||
sum(STR$a == STR$b) # none
|
||||
|
||||
# log(rank) / log(frequency)
|
||||
myTbl <- table(c(STR$a, STR$b))
|
||||
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
|
||||
|
||||
hist(myTbl, breaks = 40, col = "#FFEEBB")
|
||||
|
||||
# number of singletons
|
||||
sum(myTbl == 1) # almost a quarter
|
||||
|
||||
# maximum?
|
||||
myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465
|
||||
# Google: CDC5L
|
||||
|
||||
# Zipf-plot
|
||||
plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
|
||||
type = "b", cex = 0.7,
|
||||
main = "STRINGedges - degrees",
|
||||
xlab = "log(rank)",
|
||||
ylab = "log(frequency)",
|
||||
col = "#FFBB88")
|
||||
|
||||
sprintf("Average number of interactions: %5.2f",
|
||||
nrow(STR) / length(unique(c(STR$a, STR$b))))
|
||||
|
||||
|
||||
# = 6 Write to file =======================================================
|
||||
|
||||
saveRDS(STR, file = "./data/STRINGedges.rds")
|
||||
|
||||
# STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the
|
||||
# object when needed
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "scripts/ABC-makeSTRINGedges.R"
|
||||
#
|
||||
# Create a subnetwork of high-confidence human STRING edges.
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# The large source- datafile is NOT posted to github. If you want to
|
||||
# experiment with the original data, download it and place it into your
|
||||
# local ./data directory.
|
||||
#
|
||||
# STRING data source:
|
||||
# Download page:
|
||||
# https://string-db.org/cgi/download.pl?species_text=Homo+sapiens
|
||||
# Data: (127.6 Mb)
|
||||
# https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
|
||||
#
|
||||
# Version: 1.0
|
||||
#
|
||||
# Date: 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.0 Rewrite
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> -------------------------------------------------
|
||||
#TOC> 1 Initialize 44
|
||||
#TOC> 2 Read STRING Data 51
|
||||
#TOC> 3 Define cutoff and subset 63
|
||||
#TOC> 4 Drop duplicates 103
|
||||
#TOC> 5 Simple statistics 127
|
||||
#TOC> 6 Write to file 160
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 Initialize ==========================================================
|
||||
|
||||
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||
install.packages("readr")
|
||||
}
|
||||
|
||||
|
||||
# = 2 Read STRING Data ====================================================
|
||||
|
||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||
# The .gz compressed version is 127.6MB, the uncompressed version is probably
|
||||
# 848 Mb. Fortunately readr:: can read from compressed
|
||||
# files, and does so automatically, based on the file extension.
|
||||
( fn <- file.path("~", "9606.protein.links.full.v11.0.txt.gz") )
|
||||
STR <- readr::read_delim(fn, delim = " ")
|
||||
nrow(STR) # 11,759,454 rows
|
||||
head(STR)
|
||||
|
||||
|
||||
# = 3 Define cutoff and subset ============================================
|
||||
|
||||
# approximate distribution of combined_score
|
||||
hist(sample(STR$combined_score, 10000), breaks = 50, col = "#6699FF")
|
||||
|
||||
# Let's table the counts >= 850 and plot them for better resolution.
|
||||
|
||||
myTb <- table(STR$combined_score[STR$combined_score >= 850])
|
||||
is.unsorted(as.integer(names(myTb))) # Good - they are all in order
|
||||
|
||||
plot(myTb, type = "b", cex = 0.5, col = "#BB0000")
|
||||
myTb[myTb == max(myTb)] # Apparently there is an algorithmic effect that
|
||||
# frequently assigns a combined score of 0.900
|
||||
|
||||
# Let's plot these counts as cumulative sums, in reverse order, scaled
|
||||
# as combined scores.
|
||||
myX <- 1 - (1:length(myTb)) / 1000 # x-values, decreasing
|
||||
plot(myX,
|
||||
cumsum(myTb[length(myTb):1]), # cumulative sum, decreasing
|
||||
xlim = c(1.0, 0.85), # reverse x-axis
|
||||
type = "l",
|
||||
main = "STRING interactions for 9606 (top 600,000)",
|
||||
xlab = "combined_score",
|
||||
ylab = "cumulative counts",
|
||||
col = "#CC0000")
|
||||
abline(h = seq(50000, sum(myTb), by = 50000), lwd = 0.5, col = "#DDDDFF")
|
||||
|
||||
# What's the cutoff for 100,000 edges?
|
||||
which(cumsum(myTb[length(myTb):1]) >= 100000)[1] # p = 0.964
|
||||
|
||||
# confirm
|
||||
sum(STR$combined_score >= 964) # 101,348
|
||||
abline(v = 0.964, lwd = 0.5, col = "#DDDDFF")
|
||||
|
||||
# subset the table, and use only the protein IDs and the combined_score
|
||||
STR <- STR[STR$combined_score >= 964,
|
||||
c("protein1", "protein2", "combined_score")]
|
||||
colnames(STR) <- c("a", "b", "score")
|
||||
|
||||
|
||||
# = 4 Drop duplicates ====================================================
|
||||
|
||||
# identify duplicate interactions by creating keys in a defined alphabetical
|
||||
# sort order, then checking for duplicated().
|
||||
# e.g if we have (X:U, U:X), we change U:X to X:U and now find that
|
||||
# (X:U, X:U) has a duplicate.
|
||||
|
||||
AB <- STR$a < STR$b # logical vector: genes we need to swap
|
||||
tmp <- STR$b # copy column b
|
||||
STR$b[AB] <- STR$a[AB] # copy a's into b
|
||||
STR$a[AB] <- tmp[AB] # copy tmp's into a
|
||||
all(STR$a >= STR$b) # confirm: TRUE
|
||||
|
||||
# now, make combined keys, like this:
|
||||
paste0(STR$a[1:10], ":", STR$b[1:10])
|
||||
|
||||
tmp <- paste0(STR$a, ":", STR$b)
|
||||
sum(duplicated(tmp)) # That's half of them ... i.e. STRING reports
|
||||
# both a:b and b:a !
|
||||
|
||||
# drop all duplicated interactions from tmp
|
||||
STR <- STR[ ! duplicated(tmp), ] # 50,674 interactions remain
|
||||
|
||||
|
||||
# = 5 Simple statistics ===================================================
|
||||
|
||||
# how many unique genes?
|
||||
length(unique(c(STR$a, STR$b))) # 8,445
|
||||
|
||||
# how many self-edges?
|
||||
sum(STR$a == STR$b) # none
|
||||
|
||||
# log(rank) / log(frequency)
|
||||
myTbl <- table(c(STR$a, STR$b))
|
||||
myTbl <- myTbl[order(myTbl, decreasing = TRUE)]
|
||||
|
||||
hist(myTbl, breaks = 40, col = "#FFEEBB")
|
||||
|
||||
# number of singletons
|
||||
sum(myTbl == 1) # almost a quarter
|
||||
|
||||
# maximum?
|
||||
myTbl[which(myTbl == max(myTbl))] # 9606.ENSP00000360532: 465
|
||||
# Google: CDC5L
|
||||
|
||||
# Zipf-plot
|
||||
plot(log(1:length(myTbl)), log(as.numeric(myTbl)),
|
||||
type = "b", cex = 0.7,
|
||||
main = "STRINGedges - degrees",
|
||||
xlab = "log(rank)",
|
||||
ylab = "log(frequency)",
|
||||
col = "#FFBB88")
|
||||
|
||||
sprintf("Average number of interactions: %5.2f",
|
||||
nrow(STR) / length(unique(c(STR$a, STR$b))))
|
||||
|
||||
|
||||
# = 6 Write to file =======================================================
|
||||
|
||||
saveRDS(STR, file = "./data/STRINGedges.rds")
|
||||
|
||||
# STRINGedges <- readRDS("./data/STRINGedges.rds") # use this to restore the
|
||||
# object when needed
|
||||
|
||||
|
||||
# [END]
|
||||
|
@@ -1,167 +1,167 @@
|
||||
# tocID <- "scripts/ABC-makeScCCnet.R"
|
||||
#
|
||||
# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
|
||||
# GOSlim annotation.
|
||||
#
|
||||
# Boris Steipe for ABC learning units
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# The large source- datafiles are NOT posted to github. If you want to
|
||||
# experiment with your own code, download them and place them into your
|
||||
# local ./data directory.
|
||||
#
|
||||
# STRING data source:
|
||||
# Download page:
|
||||
# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
|
||||
# Data: (20.1 mb)
|
||||
# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
|
||||
#
|
||||
# GOSlim data source: (Note: this has moved from GO to SGD)
|
||||
# Info page: https://www.yeastgenome.org/downloads
|
||||
# Info page: http://sgd-archive.yeastgenome.org/curation/literature/
|
||||
# Data: (3 mb)
|
||||
# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
|
||||
#
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Update. GO Slim Yeast mow at SGD
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First code copied from 2016 material.
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
# SRCDIR <- "./instructor"
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 INITIALIZE 58
|
||||
#TOC> 2 STRING FUNCTIONAL INTERACTION DATA 66
|
||||
#TOC> 3 GOSlim FUNCTIONAL ANNOTATIONS 96
|
||||
#TOC> 3.1 Intersect interactions and annotations 122
|
||||
#TOC> 4 DEFINE THE CELL-CYCLE NETWORK 128
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 INITIALIZE ==========================================================
|
||||
|
||||
SRCDIR <- "./data"
|
||||
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||
install.packages("readr")
|
||||
}
|
||||
|
||||
|
||||
# = 2 STRING FUNCTIONAL INTERACTION DATA ==================================
|
||||
|
||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||
# The .gz compressed version is 20MB, the uncompressed versioj is 110MB -
|
||||
# really not necessary to uncompress since readr:: can read from compressed
|
||||
# files, and does so automatically, based on the file extension.
|
||||
( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
|
||||
STR <- readr::read_delim(fn, delim = " ")
|
||||
|
||||
# Subset only IDs and combined_score column
|
||||
STR <- STR[ , c("protein1", "protein2", "combined_score")]
|
||||
|
||||
# head(STR)
|
||||
# sum(STR$combined_score > 909) # 100270 edges
|
||||
# subset for 100,000 highest confidence edges
|
||||
STR <- STR[(STR$combined_score > 909), ]
|
||||
head(STR)
|
||||
|
||||
# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
|
||||
STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
|
||||
STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
|
||||
head(STR)
|
||||
|
||||
# get a vector of gene names in this list
|
||||
myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene
|
||||
# names
|
||||
length(myIntxGenes)
|
||||
sample(myIntxGenes, 10) # choose 10 at random (sanity check)
|
||||
|
||||
|
||||
# = 3 GOSlim FUNCTIONAL ANNOTATIONS =======================================
|
||||
#
|
||||
# Read GOSlim data (needs to be downloaded from database, see URL in Notes)
|
||||
( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
|
||||
|
||||
Gsl <- readr::read_tsv(fn,
|
||||
col_names = c("ID",
|
||||
"name",
|
||||
"SGDId",
|
||||
"Ontology",
|
||||
"termName",
|
||||
"termID",
|
||||
"status"))
|
||||
|
||||
head(Gsl)
|
||||
|
||||
# What cell cycle names does it contain?
|
||||
myGslTermNames <- unique(Gsl$termName) # 169 unique terms
|
||||
myGslTermNames[grep("cycle", myGslTermNames)]
|
||||
# [1] "regulation of cell cycle" "mitotic cell cycle" "meiotic cell cycle"
|
||||
|
||||
# Choose "mitotic cell cycle" as the GOslim term to subset with
|
||||
|
||||
scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
|
||||
length(scCCgenes) # 324 genes annotated to that term
|
||||
|
||||
# == 3.1 Intersect interactions and annotations ============================
|
||||
|
||||
sum(scCCgenes %in% myIntxGenes) # 307 of these have high-confidence
|
||||
# # functional interactions
|
||||
|
||||
|
||||
# = 4 DEFINE THE CELL-CYCLE NETWORK =======================================
|
||||
#
|
||||
# Define scCCnet ... the S. Cervisiae Cell Cycle network
|
||||
# Subset all rows for which BOTH genes are in the GOslim cell cycle set
|
||||
#
|
||||
scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
|
||||
(STR$protein2 %in% scCCgenes), ]
|
||||
|
||||
# How many genes are there?
|
||||
length(unique(c(scCCnet$protein1, scCCnet$protein2))) #283
|
||||
|
||||
# Each edge is listed twice - now remove duplicates.
|
||||
|
||||
# Step 1: make a vector: sort two names so the fiRst one is alphabetically
|
||||
# smaller Than the second one. This brings the two names into a defined
|
||||
# order. Then concatenate them with a "." - the resulting string
|
||||
# is always the same, for any order. E.g. c("A", "B") gives "A.B"
|
||||
# and c("B", "A") also gives "A.B". This identifies duplicates.
|
||||
|
||||
x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
|
||||
1,
|
||||
FUN = function(x) { return(paste(sort(x), collapse = ".")) })
|
||||
head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
|
||||
|
||||
sum(duplicated(x)) # 1453
|
||||
|
||||
# Step 2: drop all rows that contain duplicates in x
|
||||
scCCnet <- scCCnet[! duplicated(x), ]
|
||||
|
||||
# Confirm we didn't loose genes
|
||||
length(unique(c(scCCnet$protein1, scCCnet$protein2))) # 283, no change
|
||||
nrow(scCCnet)
|
||||
# Network has 283 nodes, 1453 edges
|
||||
|
||||
saveRDS(scCCnet, file = "./data/scCCnet.rds")
|
||||
|
||||
# scCCnet <- readRDS("./data/scCCnet.rds") # <<<- use this to restore the
|
||||
# object when needed
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "scripts/ABC-makeScCCnet.R"
|
||||
#
|
||||
# Create a subnetwork of high-confidence yeast genes with a "mitotic cell cycle"
|
||||
# GOSlim annotation.
|
||||
#
|
||||
# Boris Steipe for ABC learning units
|
||||
#
|
||||
# Notes:
|
||||
#
|
||||
# The large source- datafiles are NOT posted to github. If you want to
|
||||
# experiment with your own code, download them and place them into your
|
||||
# local ./data directory.
|
||||
#
|
||||
# STRING data source:
|
||||
# Download page:
|
||||
# https://string-db.org/cgi/download.pl?species_text=Saccharomyces+cerevisiae
|
||||
# Data: (20.1 mb)
|
||||
# https://stringdb-static.org/download/protein.links.full.v11.0/4932.protein.links.full.v11.0.txt.gz
|
||||
#
|
||||
# GOSlim data source: (Note: this has moved from GO to SGD)
|
||||
# Info page: https://www.yeastgenome.org/downloads
|
||||
# Info page: http://sgd-archive.yeastgenome.org/curation/literature/
|
||||
# Data: (3 mb)
|
||||
# http://sgd-archive.yeastgenome.org/curation/literature/go_slim_mapping.tab
|
||||
#
|
||||
#
|
||||
# Version: 1.2
|
||||
#
|
||||
# Date: 2017-10 - 2020-09
|
||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||
#
|
||||
# Versions:
|
||||
# 1.2 2020 Update. GO Slim Yeast mow at SGD
|
||||
# 1.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 1.0 First code copied from 2016 material.
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# ==============================================================================
|
||||
# SRCDIR <- "./instructor"
|
||||
|
||||
|
||||
#TOC> ==========================================================================
|
||||
#TOC>
|
||||
#TOC> Section Title Line
|
||||
#TOC> ---------------------------------------------------------------
|
||||
#TOC> 1 INITIALIZE 58
|
||||
#TOC> 2 STRING FUNCTIONAL INTERACTION DATA 66
|
||||
#TOC> 3 GOSlim FUNCTIONAL ANNOTATIONS 96
|
||||
#TOC> 3.1 Intersect interactions and annotations 122
|
||||
#TOC> 4 DEFINE THE CELL-CYCLE NETWORK 128
|
||||
#TOC>
|
||||
#TOC> ==========================================================================
|
||||
|
||||
|
||||
# = 1 INITIALIZE ==========================================================
|
||||
|
||||
SRCDIR <- "./data"
|
||||
if (! requireNamespace("readr", quietly = TRUE)) {
|
||||
install.packages("readr")
|
||||
}
|
||||
|
||||
|
||||
# = 2 STRING FUNCTIONAL INTERACTION DATA ==================================
|
||||
|
||||
# Read STRING Data (needs to be downloaded from database, see URL in Notes)
|
||||
# The .gz compressed version is 20MB, the uncompressed versioj is 110MB -
|
||||
# really not necessary to uncompress since readr:: can read from compressed
|
||||
# files, and does so automatically, based on the file extension.
|
||||
( fn <- file.path(SRCDIR, "4932.protein.links.full.v11.0.txt.gz") )
|
||||
STR <- readr::read_delim(fn, delim = " ")
|
||||
|
||||
# Subset only IDs and combined_score column
|
||||
STR <- STR[ , c("protein1", "protein2", "combined_score")]
|
||||
|
||||
# head(STR)
|
||||
# sum(STR$combined_score > 909) # 100270 edges
|
||||
# subset for 100,000 highest confidence edges
|
||||
STR <- STR[(STR$combined_score > 909), ]
|
||||
head(STR)
|
||||
|
||||
# IDs are formatted like 4932.YAL005C ... drop the "4932." prefix
|
||||
STR$protein1 <- gsub("^4932\\.", "", STR$protein1)
|
||||
STR$protein2 <- gsub("^4932\\.", "", STR$protein2)
|
||||
head(STR)
|
||||
|
||||
# get a vector of gene names in this list
|
||||
myIntxGenes <- unique(c(STR$protein1, STR$protein2)) # yeast systematic gene
|
||||
# names
|
||||
length(myIntxGenes)
|
||||
sample(myIntxGenes, 10) # choose 10 at random (sanity check)
|
||||
|
||||
|
||||
# = 3 GOSlim FUNCTIONAL ANNOTATIONS =======================================
|
||||
#
|
||||
# Read GOSlim data (needs to be downloaded from database, see URL in Notes)
|
||||
( fn <- file.path(SRCDIR, "go_slim_mapping.tab") )
|
||||
|
||||
Gsl <- readr::read_tsv(fn,
|
||||
col_names = c("ID",
|
||||
"name",
|
||||
"SGDId",
|
||||
"Ontology",
|
||||
"termName",
|
||||
"termID",
|
||||
"status"))
|
||||
|
||||
head(Gsl)
|
||||
|
||||
# What cell cycle names does it contain?
|
||||
myGslTermNames <- unique(Gsl$termName) # 169 unique terms
|
||||
myGslTermNames[grep("cycle", myGslTermNames)]
|
||||
# [1] "regulation of cell cycle" "mitotic cell cycle" "meiotic cell cycle"
|
||||
|
||||
# Choose "mitotic cell cycle" as the GOslim term to subset with
|
||||
|
||||
scCCgenes <- unique(Gsl$ID[Gsl$termName == "mitotic cell cycle"])
|
||||
length(scCCgenes) # 324 genes annotated to that term
|
||||
|
||||
# == 3.1 Intersect interactions and annotations ============================
|
||||
|
||||
sum(scCCgenes %in% myIntxGenes) # 307 of these have high-confidence
|
||||
# # functional interactions
|
||||
|
||||
|
||||
# = 4 DEFINE THE CELL-CYCLE NETWORK =======================================
|
||||
#
|
||||
# Define scCCnet ... the S. Cervisiae Cell Cycle network
|
||||
# Subset all rows for which BOTH genes are in the GOslim cell cycle set
|
||||
#
|
||||
scCCnet <- STR[(STR$protein1 %in% scCCgenes) &
|
||||
(STR$protein2 %in% scCCgenes), ]
|
||||
|
||||
# How many genes are there?
|
||||
length(unique(c(scCCnet$protein1, scCCnet$protein2))) #283
|
||||
|
||||
# Each edge is listed twice - now remove duplicates.
|
||||
|
||||
# Step 1: make a vector: sort two names so the fiRst one is alphabetically
|
||||
# smaller Than the second one. This brings the two names into a defined
|
||||
# order. Then concatenate them with a "." - the resulting string
|
||||
# is always the same, for any order. E.g. c("A", "B") gives "A.B"
|
||||
# and c("B", "A") also gives "A.B". This identifies duplicates.
|
||||
|
||||
x <- apply(cbind(scCCnet$protein1, scCCnet$protein2),
|
||||
1,
|
||||
FUN = function(x) { return(paste(sort(x), collapse = ".")) })
|
||||
head(x) # "YAL016W.YGR040W" "YAL016W.YOR014W" "YAL016W.YDL188C" ... etc.
|
||||
|
||||
sum(duplicated(x)) # 1453
|
||||
|
||||
# Step 2: drop all rows that contain duplicates in x
|
||||
scCCnet <- scCCnet[! duplicated(x), ]
|
||||
|
||||
# Confirm we didn't loose genes
|
||||
length(unique(c(scCCnet$protein1, scCCnet$protein2))) # 283, no change
|
||||
nrow(scCCnet)
|
||||
# Network has 283 nodes, 1453 edges
|
||||
|
||||
saveRDS(scCCnet, file = "./data/scCCnet.rds")
|
||||
|
||||
# scCCnet <- readRDS("./data/scCCnet.rds") # <<<- use this to restore the
|
||||
# object when needed
|
||||
|
||||
|
||||
# [END]
|
||||
|
@@ -1,135 +1,135 @@
|
||||
# tocID <- "scripts/ABC-writeALN.R"
|
||||
#
|
||||
# ToDo: calculate consensus line
|
||||
# append sequence numbers
|
||||
# Notes:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
writeALN <- function(ali,
|
||||
range,
|
||||
note = "",
|
||||
myCon = stdout(),
|
||||
blockWidth = 60) {
|
||||
# Purpose:
|
||||
# Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
|
||||
# a file in multi-FASTA format.
|
||||
# Version: 2.0
|
||||
# Date: 2017 10
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
# Parameters:
|
||||
# ali MsaAAMultipleAlignment or AAStringSet or character
|
||||
# vector.
|
||||
# range num a two-integer vector of start and end positions if
|
||||
# only a range of the MSA should be written, e.g.
|
||||
# a domain. Defaults to the full alignment length.
|
||||
# note chr a vector of character that is appended to the name
|
||||
# of a sequence in the FASTA header. Recycling of
|
||||
# shorter vectors applies, thus a vector of length one
|
||||
# is added to all headers.
|
||||
# myCon a connection (cf. the con argument for writeLines).
|
||||
# Defaults to stdout()
|
||||
# blockWidth int width of sequence block. Default 80 characters.
|
||||
# Value:
|
||||
# NA the function is invoked for its side effect of printing an
|
||||
# alignment to stdout() or file.
|
||||
|
||||
blockWidth <- as.integer(blockWidth)
|
||||
if (is.na(blockWidth)) {
|
||||
stop("PANIC: parameter \"blockWidth\" must be numeric.")
|
||||
}
|
||||
if (blockWidth < 1) {
|
||||
stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
|
||||
}
|
||||
if (blockWidth > 60) {
|
||||
warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
|
||||
}
|
||||
|
||||
# Extract the raw data from the objects depending on their respective class
|
||||
# and put it into a named vector of strings.
|
||||
|
||||
# Extract XStringSet from MsaXMultipleAlignment ...
|
||||
if (class(ali) == "MsaAAMultipleAlignment" |
|
||||
class(ali) == "MsaDNAMultipleAlignment" |
|
||||
class(ali) == "MsaRNAMultipleAlignment") {
|
||||
ali <- ali@unmasked
|
||||
}
|
||||
|
||||
# Process XStringSet
|
||||
if (class(ali) == "AAStringSet" |
|
||||
class(ali) == "DNAStringSet" |
|
||||
class(ali) == "RNAStringSet") {
|
||||
sSet <- as.character(ali) # we use as.character(), not toString() thus
|
||||
# we don't _have_ to load Biostrings
|
||||
} else if (class(ali) == "character") {
|
||||
sSet <- ali
|
||||
} else {
|
||||
stop(paste("Input object of class",
|
||||
class(ali),
|
||||
"can't be handled by this function."))
|
||||
}
|
||||
|
||||
if (missing(range)) {
|
||||
range <- 1
|
||||
range[2] <- max(nchar(sSet))
|
||||
} else {
|
||||
range <- as.integer(range)
|
||||
if(length(range) != 2 ||
|
||||
any(is.na(range)) ||
|
||||
range[1] > range[2] ||
|
||||
range[1] < 1) {
|
||||
stop("PANIC: \"range\" parameter must contain valid start and end index.")
|
||||
}
|
||||
}
|
||||
|
||||
# Right-pad any sequence with "-" that is shorter than ranges[2]
|
||||
for (i in seq_along(sSet)) {
|
||||
if (nchar(sSet[i]) < range[2]) {
|
||||
sSet[i] <- paste0(sSet[i],
|
||||
paste0(rep("-", range[2] - nchar(sSet[i])),
|
||||
collapse = ""))
|
||||
}
|
||||
}
|
||||
|
||||
# Right-pad sequence names
|
||||
sNames <- names(sSet)
|
||||
len <- max(nchar(sNames)) + 2 # longest name plus two spaces
|
||||
for (i in seq_along(sNames)) {
|
||||
sNames[i] <- paste0(sNames[i],
|
||||
paste0(rep(" ", len - nchar(sNames[i])),
|
||||
collapse = ""))
|
||||
}
|
||||
|
||||
|
||||
# Process each sequence
|
||||
txt <- paste0("CLUSTAL W format. ", note)
|
||||
txt[2] <- ""
|
||||
|
||||
iStarts <- seq(range[1], range[2], by = blockWidth)
|
||||
iEnds <- c((iStarts[-1] - 1), range[2])
|
||||
|
||||
for (i in seq_along(iStarts)) {
|
||||
for (j in seq_along(sSet)) {
|
||||
txt <- c(txt,
|
||||
paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i])))
|
||||
}
|
||||
txt <- c(txt, "") # append a blank consenus line
|
||||
txt <- c(txt, "") # append a separator line
|
||||
}
|
||||
|
||||
writeLines(txt, con= myCon)
|
||||
|
||||
}
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
if (FALSE) {
|
||||
# test ...
|
||||
}
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# tocID <- "scripts/ABC-writeALN.R"
|
||||
#
|
||||
# ToDo: calculate consensus line
|
||||
# append sequence numbers
|
||||
# Notes:
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
writeALN <- function(ali,
|
||||
range,
|
||||
note = "",
|
||||
myCon = stdout(),
|
||||
blockWidth = 60) {
|
||||
# Purpose:
|
||||
# Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
|
||||
# a file in multi-FASTA format.
|
||||
# Version: 2.0
|
||||
# Date: 2017 10
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
# Parameters:
|
||||
# ali MsaAAMultipleAlignment or AAStringSet or character
|
||||
# vector.
|
||||
# range num a two-integer vector of start and end positions if
|
||||
# only a range of the MSA should be written, e.g.
|
||||
# a domain. Defaults to the full alignment length.
|
||||
# note chr a vector of character that is appended to the name
|
||||
# of a sequence in the FASTA header. Recycling of
|
||||
# shorter vectors applies, thus a vector of length one
|
||||
# is added to all headers.
|
||||
# myCon a connection (cf. the con argument for writeLines).
|
||||
# Defaults to stdout()
|
||||
# blockWidth int width of sequence block. Default 80 characters.
|
||||
# Value:
|
||||
# NA the function is invoked for its side effect of printing an
|
||||
# alignment to stdout() or file.
|
||||
|
||||
blockWidth <- as.integer(blockWidth)
|
||||
if (is.na(blockWidth)) {
|
||||
stop("PANIC: parameter \"blockWidth\" must be numeric.")
|
||||
}
|
||||
if (blockWidth < 1) {
|
||||
stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
|
||||
}
|
||||
if (blockWidth > 60) {
|
||||
warning("Programs that read CLUSTAL format might not expect blockWidth > 60.")
|
||||
}
|
||||
|
||||
# Extract the raw data from the objects depending on their respective class
|
||||
# and put it into a named vector of strings.
|
||||
|
||||
# Extract XStringSet from MsaXMultipleAlignment ...
|
||||
if (class(ali) == "MsaAAMultipleAlignment" |
|
||||
class(ali) == "MsaDNAMultipleAlignment" |
|
||||
class(ali) == "MsaRNAMultipleAlignment") {
|
||||
ali <- ali@unmasked
|
||||
}
|
||||
|
||||
# Process XStringSet
|
||||
if (class(ali) == "AAStringSet" |
|
||||
class(ali) == "DNAStringSet" |
|
||||
class(ali) == "RNAStringSet") {
|
||||
sSet <- as.character(ali) # we use as.character(), not toString() thus
|
||||
# we don't _have_ to load Biostrings
|
||||
} else if (class(ali) == "character") {
|
||||
sSet <- ali
|
||||
} else {
|
||||
stop(paste("Input object of class",
|
||||
class(ali),
|
||||
"can't be handled by this function."))
|
||||
}
|
||||
|
||||
if (missing(range)) {
|
||||
range <- 1
|
||||
range[2] <- max(nchar(sSet))
|
||||
} else {
|
||||
range <- as.integer(range)
|
||||
if(length(range) != 2 ||
|
||||
any(is.na(range)) ||
|
||||
range[1] > range[2] ||
|
||||
range[1] < 1) {
|
||||
stop("PANIC: \"range\" parameter must contain valid start and end index.")
|
||||
}
|
||||
}
|
||||
|
||||
# Right-pad any sequence with "-" that is shorter than ranges[2]
|
||||
for (i in seq_along(sSet)) {
|
||||
if (nchar(sSet[i]) < range[2]) {
|
||||
sSet[i] <- paste0(sSet[i],
|
||||
paste0(rep("-", range[2] - nchar(sSet[i])),
|
||||
collapse = ""))
|
||||
}
|
||||
}
|
||||
|
||||
# Right-pad sequence names
|
||||
sNames <- names(sSet)
|
||||
len <- max(nchar(sNames)) + 2 # longest name plus two spaces
|
||||
for (i in seq_along(sNames)) {
|
||||
sNames[i] <- paste0(sNames[i],
|
||||
paste0(rep(" ", len - nchar(sNames[i])),
|
||||
collapse = ""))
|
||||
}
|
||||
|
||||
|
||||
# Process each sequence
|
||||
txt <- paste0("CLUSTAL W format. ", note)
|
||||
txt[2] <- ""
|
||||
|
||||
iStarts <- seq(range[1], range[2], by = blockWidth)
|
||||
iEnds <- c((iStarts[-1] - 1), range[2])
|
||||
|
||||
for (i in seq_along(iStarts)) {
|
||||
for (j in seq_along(sSet)) {
|
||||
txt <- c(txt,
|
||||
paste0(sNames[j], substring(sSet[j], iStarts[i], iEnds[i])))
|
||||
}
|
||||
txt <- c(txt, "") # append a blank consenus line
|
||||
txt <- c(txt, "") # append a separator line
|
||||
}
|
||||
|
||||
writeLines(txt, con= myCon)
|
||||
|
||||
}
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
if (FALSE) {
|
||||
# test ...
|
||||
}
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
@@ -1,121 +1,121 @@
|
||||
# ABC-writeMFA.R
|
||||
#
|
||||
# ToDo:
|
||||
# Notes: 2.1 bugfix: empty notes caused superfluous blank after header.
|
||||
#
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
writeMFA <- function(ali,
|
||||
range,
|
||||
note = "",
|
||||
myCon = stdout(),
|
||||
blockWidth = 80) {
|
||||
# Purpose:
|
||||
# Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
|
||||
# a file in multi-FASTA format.
|
||||
# Version: 2.1
|
||||
# Date: 2017 10
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
# Parameters:
|
||||
# ali MsaAAMultipleAlignment or AAStringSet or character
|
||||
# vector
|
||||
# range num a two-integer vector of start and end positions if
|
||||
# only a range of the MSA should be written, e.g.
|
||||
# a domain. Defaults to the full sequence length.
|
||||
# note chr a vector of character that is appended to the name
|
||||
# of a sequence in the FASTA header. Recycling of
|
||||
# shorter vectors applies, thus a vector of length one
|
||||
# is added to all headers.
|
||||
# myCon a connection (cf. the con argument for writeLines).
|
||||
# Defaults to stdout()
|
||||
# blockWidth int width of sequence block. Default 80 characters.
|
||||
# Value:
|
||||
# NA the function is invoked for its side effect of printing an
|
||||
# alignment to stdout() or file.
|
||||
|
||||
blockWidth <- as.integer(blockWidth)
|
||||
if (is.na(blockWidth)) {
|
||||
stop("PANIC: parameter \"blockWidth\" must be numeric.")
|
||||
}
|
||||
if (! blockWidth > 0){
|
||||
stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
|
||||
}
|
||||
|
||||
# Extract the raw data from the objects depending on their respective class
|
||||
# and put it into a named vector of strings.
|
||||
|
||||
# Extract XStringSet from MsaXMultipleAlignment ...
|
||||
if (class(ali) == "MsaAAMultipleAlignment" |
|
||||
class(ali) == "MsaDNAMultipleAlignment" |
|
||||
class(ali) == "MsaRNAMultipleAlignment") {
|
||||
ali <- ali@unmasked
|
||||
}
|
||||
|
||||
# Process XStringSet
|
||||
if (class(ali) == "AAStringSet" |
|
||||
class(ali) == "DNAStringSet" |
|
||||
class(ali) == "RNAStringSet") {
|
||||
sSet <- as.character(ali) # we use as.character(), not toString() thus
|
||||
# we don't _have_ to load Biostrings
|
||||
} else if (class(ali) == "character") {
|
||||
sSet <- ali
|
||||
} else {
|
||||
stop(paste("Input object of class",
|
||||
class(ali),
|
||||
"can't be handled by this function."))
|
||||
}
|
||||
|
||||
if (missing(range)) {
|
||||
range <- 1
|
||||
range[2] <- max(nchar(sSet))
|
||||
} else {
|
||||
range <- as.integer(range)
|
||||
if(length(range) != 2 ||
|
||||
any(is.na(range)) ||
|
||||
range[1] > range[2] ||
|
||||
range[1] < 1) {
|
||||
stop("PANIC: \"range\" parameter must contain valid start and end index.")
|
||||
}
|
||||
}
|
||||
|
||||
# Process each sequence
|
||||
txt <- character()
|
||||
if (note != "") { # construct header line
|
||||
headers <- paste(names(sSet), note)
|
||||
} else {
|
||||
headers <- names(sSet)
|
||||
}
|
||||
|
||||
for (i in seq_along(sSet)) {
|
||||
|
||||
# output FASTA header
|
||||
txt <- c(txt, sprintf(">%s", headers[i]))
|
||||
|
||||
# output the sequence in blocks of blockWidth per line ...
|
||||
iStarts <- seq(range[1], range[2], by = blockWidth)
|
||||
iEnds <- c((iStarts[-1] - 1), range[2])
|
||||
|
||||
thisSeq <- substring(sSet[i], iStarts, iEnds) # collect all blocks
|
||||
thisSeq <- thisSeq[! nchar(thisSeq) == 0] # drop empty blocks
|
||||
txt <- c(txt, thisSeq)
|
||||
|
||||
txt <- c(txt, "") # append an empty line for readability
|
||||
}
|
||||
|
||||
writeLines(txt, con = myCon)
|
||||
|
||||
}
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
if (FALSE) {
|
||||
# test ...
|
||||
}
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
# ABC-writeMFA.R
|
||||
#
|
||||
# ToDo:
|
||||
# Notes: 2.1 bugfix: empty notes caused superfluous blank after header.
|
||||
#
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
writeMFA <- function(ali,
|
||||
range,
|
||||
note = "",
|
||||
myCon = stdout(),
|
||||
blockWidth = 80) {
|
||||
# Purpose:
|
||||
# Write an MsaAAMultipleAlignment or AAStringSet object to stdout() or
|
||||
# a file in multi-FASTA format.
|
||||
# Version: 2.1
|
||||
# Date: 2017 10
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
# Parameters:
|
||||
# ali MsaAAMultipleAlignment or AAStringSet or character
|
||||
# vector
|
||||
# range num a two-integer vector of start and end positions if
|
||||
# only a range of the MSA should be written, e.g.
|
||||
# a domain. Defaults to the full sequence length.
|
||||
# note chr a vector of character that is appended to the name
|
||||
# of a sequence in the FASTA header. Recycling of
|
||||
# shorter vectors applies, thus a vector of length one
|
||||
# is added to all headers.
|
||||
# myCon a connection (cf. the con argument for writeLines).
|
||||
# Defaults to stdout()
|
||||
# blockWidth int width of sequence block. Default 80 characters.
|
||||
# Value:
|
||||
# NA the function is invoked for its side effect of printing an
|
||||
# alignment to stdout() or file.
|
||||
|
||||
blockWidth <- as.integer(blockWidth)
|
||||
if (is.na(blockWidth)) {
|
||||
stop("PANIC: parameter \"blockWidth\" must be numeric.")
|
||||
}
|
||||
if (! blockWidth > 0){
|
||||
stop("PANIC: parameter \"blockWidth\" must be greater than zero.")
|
||||
}
|
||||
|
||||
# Extract the raw data from the objects depending on their respective class
|
||||
# and put it into a named vector of strings.
|
||||
|
||||
# Extract XStringSet from MsaXMultipleAlignment ...
|
||||
if (class(ali) == "MsaAAMultipleAlignment" |
|
||||
class(ali) == "MsaDNAMultipleAlignment" |
|
||||
class(ali) == "MsaRNAMultipleAlignment") {
|
||||
ali <- ali@unmasked
|
||||
}
|
||||
|
||||
# Process XStringSet
|
||||
if (class(ali) == "AAStringSet" |
|
||||
class(ali) == "DNAStringSet" |
|
||||
class(ali) == "RNAStringSet") {
|
||||
sSet <- as.character(ali) # we use as.character(), not toString() thus
|
||||
# we don't _have_ to load Biostrings
|
||||
} else if (class(ali) == "character") {
|
||||
sSet <- ali
|
||||
} else {
|
||||
stop(paste("Input object of class",
|
||||
class(ali),
|
||||
"can't be handled by this function."))
|
||||
}
|
||||
|
||||
if (missing(range)) {
|
||||
range <- 1
|
||||
range[2] <- max(nchar(sSet))
|
||||
} else {
|
||||
range <- as.integer(range)
|
||||
if(length(range) != 2 ||
|
||||
any(is.na(range)) ||
|
||||
range[1] > range[2] ||
|
||||
range[1] < 1) {
|
||||
stop("PANIC: \"range\" parameter must contain valid start and end index.")
|
||||
}
|
||||
}
|
||||
|
||||
# Process each sequence
|
||||
txt <- character()
|
||||
if (note != "") { # construct header line
|
||||
headers <- paste(names(sSet), note)
|
||||
} else {
|
||||
headers <- names(sSet)
|
||||
}
|
||||
|
||||
for (i in seq_along(sSet)) {
|
||||
|
||||
# output FASTA header
|
||||
txt <- c(txt, sprintf(">%s", headers[i]))
|
||||
|
||||
# output the sequence in blocks of blockWidth per line ...
|
||||
iStarts <- seq(range[1], range[2], by = blockWidth)
|
||||
iEnds <- c((iStarts[-1] - 1), range[2])
|
||||
|
||||
thisSeq <- substring(sSet[i], iStarts, iEnds) # collect all blocks
|
||||
thisSeq <- thisSeq[! nchar(thisSeq) == 0] # drop empty blocks
|
||||
txt <- c(txt, thisSeq)
|
||||
|
||||
txt <- c(txt, "") # append an empty line for readability
|
||||
}
|
||||
|
||||
writeLines(txt, con = myCon)
|
||||
|
||||
}
|
||||
|
||||
# ==== TESTS =================================================================
|
||||
# Enter your function tests here...
|
||||
|
||||
if (FALSE) {
|
||||
# test ...
|
||||
}
|
||||
|
||||
|
||||
|
||||
# [END]
|
||||
|
768
scripts/BLAST.R
768
scripts/BLAST.R
@@ -1,384 +1,384 @@
|
||||
# BLAST.R
|
||||
#
|
||||
# Purpose: Send off one BLAST search and return parsed list of results
|
||||
# This script uses the BLAST URL-API
|
||||
# (Application Programming Interface) at the NCBI.
|
||||
# Read about the constraints here:
|
||||
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
|
||||
#
|
||||
#
|
||||
# Version: 3.2
|
||||
# Date: 2016 09 - 2020 09
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
# Versions:
|
||||
# 3.2 2020 updates
|
||||
# 3.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 3.0 parsing logic had not been fully implemented; Fixed.
|
||||
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
||||
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
||||
# 2.0 Completely rewritten because the interface completely changed.
|
||||
# Code adpated in part from NCBI Perl sample code:
|
||||
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
|
||||
# 1.0 first version posted for BCH441 2016, based on BLAST - API
|
||||
#
|
||||
# ToDo: Return the organism/strain name in the output, and propagate
|
||||
# into MYSPE selection script.
|
||||
#
|
||||
# Notes: This is somewhat pedestrian, but apparently there are currently
|
||||
# no R packages that contain such code.
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
|
||||
|
||||
BLAST <- function(Q,
|
||||
db = "refseq_protein",
|
||||
nHits = 30,
|
||||
E = 0.1,
|
||||
limits = "",
|
||||
rid = "",
|
||||
query = "",
|
||||
quietly = FALSE,
|
||||
myTimeout = 120) {
|
||||
# Purpose:
|
||||
# Basic BLAST search
|
||||
#
|
||||
# Parameters:
|
||||
# Q: query - either a valid ID or a sequence
|
||||
# db: "refseq_protein" by default,
|
||||
# other legal values include: "nr", "pdb", "swissprot" ...
|
||||
# nHits: number of hits to maximally return
|
||||
# E: E-value cutoff. Do not return hits whose score would be expected
|
||||
# to occur E or more times in a database of random sequence.
|
||||
# limits: a valid ENTREZ filter
|
||||
# rid: a request ID - to retrieve earlier search results
|
||||
# query: the actual query string (needed when retrieving results
|
||||
# with an rid)
|
||||
# quietly: controls printing of wait-time progress bar
|
||||
# timeout: how much longer _after_ rtoe to wait for a result
|
||||
# before giving up (seconds)
|
||||
# Value:
|
||||
# result: list of process status or resulting hits, and some metadata
|
||||
|
||||
|
||||
EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
|
||||
|
||||
results <- list()
|
||||
results$query = query
|
||||
results$rid <- rid
|
||||
results$rtoe <- 0
|
||||
|
||||
if (rid == "") { # If no rid is available, spawn a search.
|
||||
# Else, proceed directly to retrieval.
|
||||
|
||||
# prepare query, GET(), and parse rid and rtoe from BLAST server response
|
||||
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"CMD=Put",
|
||||
"&PROGRAM=", "blastp",
|
||||
"&QUERY=", URLencode(Q),
|
||||
"&DATABASE=", db,
|
||||
"&MATRIX=", "BLOSUM62",
|
||||
"&EXPECT=", as.character(E),
|
||||
"&HITLIST_SIZE=", as.character(nHits),
|
||||
"&ALIGNMENTS=", as.character(nHits),
|
||||
"&FORMAT_TYPE=Text")
|
||||
|
||||
if (limits != "") {
|
||||
results$query <- paste0(
|
||||
results$query,
|
||||
"&ENTREZ_QUERY=", limits)
|
||||
}
|
||||
|
||||
# send it off ...
|
||||
response <- httr::GET(results$query)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
patt <- "RID = (\\w+)" # match the request id
|
||||
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
|
||||
|
||||
patt <- "RTOE = (\\d+)" # match the expected completion time
|
||||
results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
|
||||
|
||||
# Now we wait ...
|
||||
if (quietly) {
|
||||
Sys.sleep(results$rtoe)
|
||||
} else {
|
||||
cat(sprintf("BLAST is processing %s:\n", results$rid))
|
||||
waitTimer(results$rtoe)
|
||||
}
|
||||
|
||||
} # done sending query and retrieving rid, rtoe
|
||||
|
||||
# Enter an infinite loop to check for result availability
|
||||
checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"CMD=Get",
|
||||
"&RID=", results$rid,
|
||||
"&FORMAT_TYPE=Text",
|
||||
"&FORMAT_OBJECT=SearchInfo",
|
||||
sep = "")
|
||||
|
||||
while (TRUE) {
|
||||
# Check whether the result is ready
|
||||
response <- httr::GET(checkStatus)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
if (length(grep("Status=WAITING", txt)) > 0) {
|
||||
myTimeout <- myTimeout - EXTRAWAIT
|
||||
|
||||
if (myTimeout <= 0) { # abort
|
||||
cat("BLAST search not concluded before timeout. Aborting.\n")
|
||||
cat(sprintf("%s BLASThits <- BLAST(rid=\"%s\")\n",
|
||||
"Trying checking back later with >",
|
||||
results$rid))
|
||||
return(results)
|
||||
}
|
||||
|
||||
if (quietly) {
|
||||
Sys.sleep(EXTRAWAIT)
|
||||
} else {
|
||||
cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
|
||||
EXTRAWAIT,
|
||||
myTimeout))
|
||||
waitTimer(EXTRAWAIT)
|
||||
next
|
||||
}
|
||||
|
||||
} else if (length(grep("Status=FAILED", txt)) > 0) {
|
||||
cat("BLAST search returned status \"FAILED\". Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else if (length(grep("Status=UNKNOWN", txt)) > 0) {
|
||||
cat("BLAST search returned status \"UNKNOWN\".\n")
|
||||
cat("This probably means the rid has expired. Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else if (length(grep("Status=READY", txt)) > 0) { # Done
|
||||
|
||||
if (length(grep("ThereAreHits=yes", txt)) == 0) { # No hits
|
||||
cat("BLAST search ready but no hits found. Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else {
|
||||
break # done ... retrieve search result
|
||||
}
|
||||
}
|
||||
} # end result-check loop
|
||||
|
||||
# retrieve results from BLAST server
|
||||
retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"&CMD=Get",
|
||||
"&RID=", results$rid,
|
||||
"&FORMAT_TYPE=Text",
|
||||
sep = "")
|
||||
|
||||
response <- httr::GET(retrieve)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
# txt contains the whole set of results. Process:
|
||||
|
||||
# First, we strsplit() on linebreaks:
|
||||
txt <- unlist(strsplit(txt, "\n"))
|
||||
|
||||
# The alignments range from the first line that begins with ">" ...
|
||||
iFirst <- grep("^>", txt)[1]
|
||||
|
||||
# ... to the last line that begins with "Sbjct"
|
||||
x <- grep("^Sbjct", txt)
|
||||
iLast <- x[length(x)]
|
||||
|
||||
# Get the alignments block
|
||||
txt <- txt[iFirst:iLast]
|
||||
|
||||
# Drop empty lines
|
||||
txt <- txt[!(nchar(txt) == 0)]
|
||||
|
||||
# A line that ends "]" but does not begin ">" seems to be a split
|
||||
# defline ... eg.
|
||||
# [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
|
||||
# [2] "EXF-2481]"
|
||||
# Merge these lines to the preceding lines and delete them.
|
||||
#
|
||||
x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
|
||||
if (length(x) > 0) {
|
||||
txt[x-1] <- paste0(txt[x-1], txt[x])
|
||||
txt <- txt[-x]
|
||||
}
|
||||
|
||||
# Special case: there may be multiple deflines when the BLAST hit is to
|
||||
# redundant, identical sequences. Keep only the first instance.
|
||||
iKeep <- ! grepl("^>", txt)
|
||||
x <- rle(iKeep)
|
||||
x$positions <- cumsum(x$lengths)
|
||||
i <- which(x$lengths > 1 & x$values == FALSE)
|
||||
if (length(i) > 0) {
|
||||
firsts <- x$positions[i] - x$lengths[i] + 1
|
||||
iKeep[firsts] <- TRUE
|
||||
txt <- txt[iKeep]
|
||||
}
|
||||
|
||||
# After this preprocessing the following should be true:
|
||||
# - Every alignment block begins with a defline in which the
|
||||
# first character is ">"
|
||||
# - There is only one defline in each block.
|
||||
# - Lines are not split.
|
||||
|
||||
# Make a dataframe of first and last indices of alignment blocks
|
||||
x <- grep("^>", txt)
|
||||
blocks <- data.frame(iFirst = x,
|
||||
iLast = c((x[-1] - 1), length(txt)))
|
||||
|
||||
# Build the hits list by parsing the blocks
|
||||
results$hits <- list()
|
||||
|
||||
for (i in seq_len(nrow(blocks))) {
|
||||
thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
|
||||
results$hits[[i]] <- parseBLASTalignment(thisBlock)
|
||||
}
|
||||
|
||||
return(results)
|
||||
}
|
||||
|
||||
parseBLASTalignment <- function(hit) {
|
||||
# Parse data from a character vector containing a BLAST hit
|
||||
# Parameters:
|
||||
# hit char one BLAST hit as char vector
|
||||
# Value:
|
||||
# list $def chr defline
|
||||
# $accession chr accession number
|
||||
# $organism chr complete organism definition
|
||||
# $species chr binomial species
|
||||
# $E num E value
|
||||
# $lengthAli num length of the alignment
|
||||
# $nIdentitites num number of identities
|
||||
# $nGaps num number of gaps
|
||||
# $Qbounds num 2-element vector of query start-end
|
||||
# $Sbounds num 2-element vector of subject start-end
|
||||
# $Qseq chr query sequence
|
||||
# $midSeq chr midline string
|
||||
# $Sseq chr subject sequence
|
||||
|
||||
getToken <- function(patt, v) {
|
||||
# get the first token identified by pattern patt in character vector v
|
||||
v <- v[grep(patt, v)]
|
||||
if (length(v) > 1) { v <- v[1] }
|
||||
if (length(v) == 0) { token <- NA
|
||||
} else {
|
||||
token <- regmatches(v, regexec(patt, v))[[1]][2] }
|
||||
return(token)
|
||||
}
|
||||
|
||||
h <- list()
|
||||
|
||||
# FASTA defline
|
||||
h$def <- hit[1]
|
||||
|
||||
# accesion number (ID), use the first if there are several, separated by "|"
|
||||
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
|
||||
h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
|
||||
|
||||
# organism
|
||||
patt <- "\\[(.+)]"
|
||||
h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
|
||||
|
||||
# species
|
||||
x <- unlist(strsplit(h$organism, "\\s+"))
|
||||
if (length(x) >= 2) {
|
||||
h$species <- paste(x[1], x[2])
|
||||
} else if (length(x) == 1) {
|
||||
h$species <- paste(x[1], "sp.")
|
||||
} else {
|
||||
h$species <- NA
|
||||
}
|
||||
|
||||
# E-value
|
||||
h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
|
||||
|
||||
# length of alignment
|
||||
h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
|
||||
|
||||
# number of identities
|
||||
h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
|
||||
|
||||
# number of gaps
|
||||
h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
|
||||
|
||||
# split up alignment section
|
||||
idx <- grep("^Query ", hit)
|
||||
Que <- hit[idx]
|
||||
Mid <- hit[idx + 1]
|
||||
Sbj <- hit[idx + 2]
|
||||
|
||||
# first and last positions
|
||||
h$Qbounds <- c(start = 0, end = 0)
|
||||
h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
|
||||
h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
|
||||
|
||||
h$Sbounds <- c(start = 0, end = 0)
|
||||
h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
|
||||
h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
|
||||
|
||||
# aligned sequences
|
||||
for (i in seq_along(Que)) {
|
||||
patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
|
||||
m <- regexec(patt, Que[i])
|
||||
iFirst <- m[[1]][2]
|
||||
iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
|
||||
Que[i] <- substring(Que[i], iFirst, iLast)
|
||||
Mid[i] <- substring(Mid[i], iFirst, iLast)
|
||||
Sbj[i] <- substring(Sbj[i], iFirst, iLast)
|
||||
}
|
||||
|
||||
h$Qseq <- paste0(Que, collapse = "")
|
||||
h$midSeq <- paste0(Mid, collapse = "")
|
||||
h$Sseq <- paste0(Sbj, collapse = "")
|
||||
|
||||
return(h)
|
||||
}
|
||||
|
||||
|
||||
# ==== TESTS ===================================================================
|
||||
|
||||
if (FALSE) {
|
||||
# define query:
|
||||
q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
|
||||
"LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
|
||||
"GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
|
||||
sep="")
|
||||
# or ...
|
||||
q <- "NP_010227" # refseq ID
|
||||
|
||||
test <- BLAST(q,
|
||||
nHits = 100,
|
||||
E = 0.001,
|
||||
rid = "",
|
||||
limits = "txid4751[ORGN]") # Fungi
|
||||
str(test)
|
||||
length(test$hits)
|
||||
}
|
||||
|
||||
# [END]
|
||||
|
||||
# BLAST.R
|
||||
#
|
||||
# Purpose: Send off one BLAST search and return parsed list of results
|
||||
# This script uses the BLAST URL-API
|
||||
# (Application Programming Interface) at the NCBI.
|
||||
# Read about the constraints here:
|
||||
# https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
|
||||
#
|
||||
#
|
||||
# Version: 3.2
|
||||
# Date: 2016 09 - 2020 09
|
||||
# Author: Boris Steipe
|
||||
#
|
||||
# Versions:
|
||||
# 3.2 2020 updates
|
||||
# 3.1 Change from require() to requireNamespace(),
|
||||
# use <package>::<function>() idiom throughout
|
||||
# 3.0 parsing logic had not been fully implemented; Fixed.
|
||||
# 2.1 bugfix in BLAST(), bug was blanking non-split deflines;
|
||||
# refactored parseBLASTalignment() to handle lists with multiple hits.
|
||||
# 2.0 Completely rewritten because the interface completely changed.
|
||||
# Code adpated in part from NCBI Perl sample code:
|
||||
# $Id: web_blast.pl,v 1.10 2016/07/13 14:32:50 merezhuk Exp $
|
||||
# 1.0 first version posted for BCH441 2016, based on BLAST - API
|
||||
#
|
||||
# ToDo: Return the organism/strain name in the output, and propagate
|
||||
# into MYSPE selection script.
|
||||
#
|
||||
# Notes: This is somewhat pedestrian, but apparently there are currently
|
||||
# no R packages that contain such code.
|
||||
#
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
if (! requireNamespace("httr", quietly = TRUE)) {
|
||||
install.packages("httr")
|
||||
}
|
||||
|
||||
|
||||
BLAST <- function(Q,
|
||||
db = "refseq_protein",
|
||||
nHits = 30,
|
||||
E = 0.1,
|
||||
limits = "",
|
||||
rid = "",
|
||||
query = "",
|
||||
quietly = FALSE,
|
||||
myTimeout = 120) {
|
||||
# Purpose:
|
||||
# Basic BLAST search
|
||||
#
|
||||
# Parameters:
|
||||
# Q: query - either a valid ID or a sequence
|
||||
# db: "refseq_protein" by default,
|
||||
# other legal values include: "nr", "pdb", "swissprot" ...
|
||||
# nHits: number of hits to maximally return
|
||||
# E: E-value cutoff. Do not return hits whose score would be expected
|
||||
# to occur E or more times in a database of random sequence.
|
||||
# limits: a valid ENTREZ filter
|
||||
# rid: a request ID - to retrieve earlier search results
|
||||
# query: the actual query string (needed when retrieving results
|
||||
# with an rid)
|
||||
# quietly: controls printing of wait-time progress bar
|
||||
# timeout: how much longer _after_ rtoe to wait for a result
|
||||
# before giving up (seconds)
|
||||
# Value:
|
||||
# result: list of process status or resulting hits, and some metadata
|
||||
|
||||
|
||||
EXTRAWAIT <- 10 # duration of extra wait cycles if BLAST search is not done
|
||||
|
||||
results <- list()
|
||||
results$query = query
|
||||
results$rid <- rid
|
||||
results$rtoe <- 0
|
||||
|
||||
if (rid == "") { # If no rid is available, spawn a search.
|
||||
# Else, proceed directly to retrieval.
|
||||
|
||||
# prepare query, GET(), and parse rid and rtoe from BLAST server response
|
||||
results$query <- paste0("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"CMD=Put",
|
||||
"&PROGRAM=", "blastp",
|
||||
"&QUERY=", URLencode(Q),
|
||||
"&DATABASE=", db,
|
||||
"&MATRIX=", "BLOSUM62",
|
||||
"&EXPECT=", as.character(E),
|
||||
"&HITLIST_SIZE=", as.character(nHits),
|
||||
"&ALIGNMENTS=", as.character(nHits),
|
||||
"&FORMAT_TYPE=Text")
|
||||
|
||||
if (limits != "") {
|
||||
results$query <- paste0(
|
||||
results$query,
|
||||
"&ENTREZ_QUERY=", limits)
|
||||
}
|
||||
|
||||
# send it off ...
|
||||
response <- httr::GET(results$query)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't send query. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
patt <- "RID = (\\w+)" # match the request id
|
||||
results$rid <- regmatches(txt, regexec(patt, txt))[[1]][2]
|
||||
|
||||
patt <- "RTOE = (\\d+)" # match the expected completion time
|
||||
results$rtoe <- as.numeric(regmatches(txt, regexec(patt, txt))[[1]][2])
|
||||
|
||||
# Now we wait ...
|
||||
if (quietly) {
|
||||
Sys.sleep(results$rtoe)
|
||||
} else {
|
||||
cat(sprintf("BLAST is processing %s:\n", results$rid))
|
||||
waitTimer(results$rtoe)
|
||||
}
|
||||
|
||||
} # done sending query and retrieving rid, rtoe
|
||||
|
||||
# Enter an infinite loop to check for result availability
|
||||
checkStatus <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"CMD=Get",
|
||||
"&RID=", results$rid,
|
||||
"&FORMAT_TYPE=Text",
|
||||
"&FORMAT_OBJECT=SearchInfo",
|
||||
sep = "")
|
||||
|
||||
while (TRUE) {
|
||||
# Check whether the result is ready
|
||||
response <- httr::GET(checkStatus)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't check status. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
if (length(grep("Status=WAITING", txt)) > 0) {
|
||||
myTimeout <- myTimeout - EXTRAWAIT
|
||||
|
||||
if (myTimeout <= 0) { # abort
|
||||
cat("BLAST search not concluded before timeout. Aborting.\n")
|
||||
cat(sprintf("%s BLASThits <- BLAST(rid=\"%s\")\n",
|
||||
"Trying checking back later with >",
|
||||
results$rid))
|
||||
return(results)
|
||||
}
|
||||
|
||||
if (quietly) {
|
||||
Sys.sleep(EXTRAWAIT)
|
||||
} else {
|
||||
cat(sprintf("Status: Waiting. Wait %d more seconds (max. %d more)",
|
||||
EXTRAWAIT,
|
||||
myTimeout))
|
||||
waitTimer(EXTRAWAIT)
|
||||
next
|
||||
}
|
||||
|
||||
} else if (length(grep("Status=FAILED", txt)) > 0) {
|
||||
cat("BLAST search returned status \"FAILED\". Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else if (length(grep("Status=UNKNOWN", txt)) > 0) {
|
||||
cat("BLAST search returned status \"UNKNOWN\".\n")
|
||||
cat("This probably means the rid has expired. Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else if (length(grep("Status=READY", txt)) > 0) { # Done
|
||||
|
||||
if (length(grep("ThereAreHits=yes", txt)) == 0) { # No hits
|
||||
cat("BLAST search ready but no hits found. Aborting.\n")
|
||||
return(results)
|
||||
|
||||
} else {
|
||||
break # done ... retrieve search result
|
||||
}
|
||||
}
|
||||
} # end result-check loop
|
||||
|
||||
# retrieve results from BLAST server
|
||||
retrieve <- paste("https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi",
|
||||
"?",
|
||||
"&CMD=Get",
|
||||
"&RID=", results$rid,
|
||||
"&FORMAT_TYPE=Text",
|
||||
sep = "")
|
||||
|
||||
response <- httr::GET(retrieve)
|
||||
if (httr::http_status(response)$category != "Success" ) {
|
||||
stop(sprintf("PANIC: Can't retrieve. BLAST server status error: %s",
|
||||
httr::http_status(response)$message))
|
||||
}
|
||||
|
||||
txt <- httr::content(response, "text", encoding = "UTF-8")
|
||||
|
||||
# txt contains the whole set of results. Process:
|
||||
|
||||
# First, we strsplit() on linebreaks:
|
||||
txt <- unlist(strsplit(txt, "\n"))
|
||||
|
||||
# The alignments range from the first line that begins with ">" ...
|
||||
iFirst <- grep("^>", txt)[1]
|
||||
|
||||
# ... to the last line that begins with "Sbjct"
|
||||
x <- grep("^Sbjct", txt)
|
||||
iLast <- x[length(x)]
|
||||
|
||||
# Get the alignments block
|
||||
txt <- txt[iFirst:iLast]
|
||||
|
||||
# Drop empty lines
|
||||
txt <- txt[!(nchar(txt) == 0)]
|
||||
|
||||
# A line that ends "]" but does not begin ">" seems to be a split
|
||||
# defline ... eg.
|
||||
# [1] ">XP_013349208.1 AUEXF2481DRAFT_695809 [Aureobasidium subglaciale "
|
||||
# [2] "EXF-2481]"
|
||||
# Merge these lines to the preceding lines and delete them.
|
||||
#
|
||||
x <- which(grepl("]$", txt) & !(grepl("^>", txt)))
|
||||
if (length(x) > 0) {
|
||||
txt[x-1] <- paste0(txt[x-1], txt[x])
|
||||
txt <- txt[-x]
|
||||
}
|
||||
|
||||
# Special case: there may be multiple deflines when the BLAST hit is to
|
||||
# redundant, identical sequences. Keep only the first instance.
|
||||
iKeep <- ! grepl("^>", txt)
|
||||
x <- rle(iKeep)
|
||||
x$positions <- cumsum(x$lengths)
|
||||
i <- which(x$lengths > 1 & x$values == FALSE)
|
||||
if (length(i) > 0) {
|
||||
firsts <- x$positions[i] - x$lengths[i] + 1
|
||||
iKeep[firsts] <- TRUE
|
||||
txt <- txt[iKeep]
|
||||
}
|
||||
|
||||
# After this preprocessing the following should be true:
|
||||
# - Every alignment block begins with a defline in which the
|
||||
# first character is ">"
|
||||
# - There is only one defline in each block.
|
||||
# - Lines are not split.
|
||||
|
||||
# Make a dataframe of first and last indices of alignment blocks
|
||||
x <- grep("^>", txt)
|
||||
blocks <- data.frame(iFirst = x,
|
||||
iLast = c((x[-1] - 1), length(txt)))
|
||||
|
||||
# Build the hits list by parsing the blocks
|
||||
results$hits <- list()
|
||||
|
||||
for (i in seq_len(nrow(blocks))) {
|
||||
thisBlock <- txt[blocks$iFirst[i]:blocks$iLast[i]]
|
||||
results$hits[[i]] <- parseBLASTalignment(thisBlock)
|
||||
}
|
||||
|
||||
return(results)
|
||||
}
|
||||
|
||||
parseBLASTalignment <- function(hit) {
|
||||
# Parse data from a character vector containing a BLAST hit
|
||||
# Parameters:
|
||||
# hit char one BLAST hit as char vector
|
||||
# Value:
|
||||
# list $def chr defline
|
||||
# $accession chr accession number
|
||||
# $organism chr complete organism definition
|
||||
# $species chr binomial species
|
||||
# $E num E value
|
||||
# $lengthAli num length of the alignment
|
||||
# $nIdentitites num number of identities
|
||||
# $nGaps num number of gaps
|
||||
# $Qbounds num 2-element vector of query start-end
|
||||
# $Sbounds num 2-element vector of subject start-end
|
||||
# $Qseq chr query sequence
|
||||
# $midSeq chr midline string
|
||||
# $Sseq chr subject sequence
|
||||
|
||||
getToken <- function(patt, v) {
|
||||
# get the first token identified by pattern patt in character vector v
|
||||
v <- v[grep(patt, v)]
|
||||
if (length(v) > 1) { v <- v[1] }
|
||||
if (length(v) == 0) { token <- NA
|
||||
} else {
|
||||
token <- regmatches(v, regexec(patt, v))[[1]][2] }
|
||||
return(token)
|
||||
}
|
||||
|
||||
h <- list()
|
||||
|
||||
# FASTA defline
|
||||
h$def <- hit[1]
|
||||
|
||||
# accesion number (ID), use the first if there are several, separated by "|"
|
||||
patt <- "^>(.+?)(\\s|\\|)" # from ">" to space or "|"
|
||||
h$accession <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
|
||||
|
||||
# organism
|
||||
patt <- "\\[(.+)]"
|
||||
h$organism <- regmatches(h$def, regexec(patt, h$def))[[1]][2]
|
||||
|
||||
# species
|
||||
x <- unlist(strsplit(h$organism, "\\s+"))
|
||||
if (length(x) >= 2) {
|
||||
h$species <- paste(x[1], x[2])
|
||||
} else if (length(x) == 1) {
|
||||
h$species <- paste(x[1], "sp.")
|
||||
} else {
|
||||
h$species <- NA
|
||||
}
|
||||
|
||||
# E-value
|
||||
h$E <- as.numeric(getToken("Expect\\s*=(.+?), Method", hit))
|
||||
|
||||
# length of alignment
|
||||
h$lengthAli <- as.numeric(getToken("^\\s*Length\\s*=(.+)$", hit))
|
||||
|
||||
# number of identities
|
||||
h$nIdentities <- as.numeric(getToken("^\\s*Identities\\s*=(.+?)/", hit))
|
||||
|
||||
# number of gaps
|
||||
h$nGaps <- as.numeric(getToken("\\s*Gaps\\s*=(.+?)/", hit))
|
||||
|
||||
# split up alignment section
|
||||
idx <- grep("^Query ", hit)
|
||||
Que <- hit[idx]
|
||||
Mid <- hit[idx + 1]
|
||||
Sbj <- hit[idx + 2]
|
||||
|
||||
# first and last positions
|
||||
h$Qbounds <- c(start = 0, end = 0)
|
||||
h$Qbounds[1] <- as.numeric(getToken("^Query\\s*(\\d+)", Que[1]))
|
||||
h$Qbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Que[length(Que)]))
|
||||
|
||||
h$Sbounds <- c(start = 0, end = 0)
|
||||
h$Sbounds[1] <- as.numeric(getToken("^Sbjct\\s*(\\d+)", Sbj[1]))
|
||||
h$Sbounds[2] <- as.numeric(getToken("\\s*(\\d+)\\s*$", Sbj[length(Sbj)]))
|
||||
|
||||
# aligned sequences
|
||||
for (i in seq_along(Que)) {
|
||||
patt <- ("^\\s*Query\\s*\\d+\\s*([A-Za-z-]+)") # capture aligned string
|
||||
m <- regexec(patt, Que[i])
|
||||
iFirst <- m[[1]][2]
|
||||
iLast <- iFirst + attr(m[[1]], which = "match.length")[2] - 1
|
||||
Que[i] <- substring(Que[i], iFirst, iLast)
|
||||
Mid[i] <- substring(Mid[i], iFirst, iLast)
|
||||
Sbj[i] <- substring(Sbj[i], iFirst, iLast)
|
||||
}
|
||||
|
||||
h$Qseq <- paste0(Que, collapse = "")
|
||||
h$midSeq <- paste0(Mid, collapse = "")
|
||||
h$Sseq <- paste0(Sbj, collapse = "")
|
||||
|
||||
return(h)
|
||||
}
|
||||
|
||||
|
||||
# ==== TESTS ===================================================================
|
||||
|
||||
if (FALSE) {
|
||||
# define query:
|
||||
q <- paste("IYSARYSGVDVYEFIHSTGSIMKRKKDDWVNATHI", # Mbp1 APSES domain
|
||||
"LKAANFAKAKRTRILEKEVLKETHEKVQGGFGKYQ",
|
||||
"GTWVPLNIAKQLAEKFSVYDQLKPLFDFTQTDGSASP",
|
||||
sep="")
|
||||
# or ...
|
||||
q <- "NP_010227" # refseq ID
|
||||
|
||||
test <- BLAST(q,
|
||||
nHits = 100,
|
||||
E = 0.001,
|
||||
rid = "",
|
||||
limits = "txid4751[ORGN]") # Fungi
|
||||
str(test)
|
||||
length(test$hits)
|
||||
}
|
||||
|
||||
# [END]
|
||||
|
||||
|
Reference in New Issue
Block a user