New retrieval logic

This commit is contained in:
hyginn 2021-09-18 12:36:46 -04:00
parent c3ca448997
commit 5fa5dd9ff7

View File

@ -3,12 +3,13 @@
# Purpose: Create a list of genome sequenced fungi with protein annotations and
# Mbp1 homologues.
#
# Version: 1.3
# Version: 1.4
#
# Date: 2016 09 - 2020 09
# Date: 2016 09 - 2021 09
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# Versions
# 1.4 New retrieval logic
# 1.3 Rewrite to change datasource. NCBI has not been updated
# since 2012. Use ensembl fungi as initial source.
# 1.2 Change from require() to requireNamespace()
@ -18,8 +19,6 @@
#
# TODO:
#
# type out workflow
#
# ==============================================================================
#
# DO NOT source() THIS FILE!
@ -40,15 +39,15 @@
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------
#TOC> 1 The strategy 56
#TOC> 2 PACKAGES AND INITIALIZATIONS 68
#TOC> 3 ENSEMBL FUNGI 76
#TOC> 3.1 Import 79
#TOC> 4 BLAST SEARCH 156
#TOC> 4.1 find homologous proteins 162
#TOC> 4.2 Identify species in "hits" 193
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 283
#TOC> 6 STUDENT NUMBERS 366
#TOC> 1 The strategy 55
#TOC> 2 PACKAGES AND INITIALIZATIONS 67
#TOC> 3 ENSEMBL FUNGI 75
#TOC> 3.1 Import 78
#TOC> 4 BLAST SEARCH 155
#TOC> 4.1 find homologous proteins 161
#TOC> 4.2 Identify species in "hits" 192
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 282
#TOC> 6 STUDENT NUMBERS 375
#TOC>
#TOC> ==========================================================================
@ -196,7 +195,7 @@ BLASThits <- readRDS(file = "data/BLASThits.rds")
# we are only interested in the species names that it contains.
# How many hits in the list?
length(BLASThits$hits)
length(BLASThits$hits) # 1,134
# Let's look at a hit somewhere down the list
str(BLASThits$hit[[277]])
@ -239,7 +238,7 @@ sum(eVals == 0)
plot(log(eVals[eVals > 0]), col = "#CC0000")
# This is very informative: I would suspect that the first ten or so are
# virtually identical to the yeast protein, then we have about 700 hits with
# virtually identical to the yeast protein, then we have about 800 hits with
# decreasing similarity, and then about 200 more that may actually be false
# positives. Also - we plotted them by index, that means the table is SORTED:
# Lower E-values strictly come before higher E-values.
@ -271,7 +270,7 @@ BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
# check the number again:
length(BLASTspecies)
# Think a bit about this: what may be the biological reason to find that
# on average, in 300 fungi across the entire phylogenetic tree, we have
# on average, in 388 fungi across the entire phylogenetic tree, we have
# three sequences that are homologous to yeast Mbp1?
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
@ -342,6 +341,11 @@ length(unique(sDat$species))
length(unique(sDat$genus))
length(unique(sDat$order))
# I need an extra species for admin purposes later on ...
sel <- grep("Sporothrix schenckii", sDat$species)
SPOSCdat <- sDat[sel, ]
sDat <- sDat[-sel, ]
# To get the final dataset, we remove the reference species with their
# entire orders ...
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
@ -355,9 +359,14 @@ sDat <- sDat[ ! sel , ]
# ... but all of them
sum(REFspecies %in% REFdat$species)
# ... and we have enough left in sDat to prune sDat to unique genus ...
# ... and we have enough left in sDat to prune sDat to unique genus
sDat <- sDat[ ! duplicated(sDat$genus) , ]
nrow(sDat) # 84
# I add back "Sporothrix schenckii" ...
sDat <- rbind(SPOSCdat, sDat)
# ... and save for future use.
# saveRDS(sDat, file = "data/sDat.rds")
# saveRDS(REFdat, file = "data/REFdat.rds")
@ -366,44 +375,58 @@ sDat <- sDat[ ! duplicated(sDat$genus) , ]
# = 6 STUDENT NUMBERS =====================================================
#
# An asymmetric function to retrieve a MYSPE species
#
sDat <- readRDS(file = "data/sDat.rds")
students <- read.csv("../BCH441-2020-students.csv")
students <- read.csv("../BCH441-2021-students.csv")
sN <- students$Integration.ID
sN <- sN[! is.na(sN)]
sN <- as.character(sN)
sN <- c("1003141593", sN) # will map to "Sporothrix schenckii"
sN <- students$Student.Number
range(sN)
any(duplicated(gsub(".+(.......)$", "\\1", sN)))
N <- 7
x <- numeric(N)
for (i in 1:N) {
x[i] <- H(substr(gsub(".+(.......)$", "\\1", sN), i, i))
}
plot(x, col = "#BB0000", type = "b")
keys <- as.numeric(gsub(".+(....).$", "\\1", sN))
any(duplicated(keys))
# =====
set.seed(112358)
names(sN) <- sample(1:nrow(sDat), length(sN))
theseSpecies <- sDat[sample(1:nrow(sDat)), ]
all(sort(theseSpecies$name) == sort(sDat$name))
nrow((theseSpecies))
(iX <- grep("Sporothrix schenckii", theseSpecies$name))
theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
rndMin <- 992000000
rndMax <- 1020000000
N <- 10000
keys <- as.character(sample(rndMin:rndMax, N + 1000))
keys <- keys[! (keys %in% sN)]
keys <- keys[1:N]
keys[1:length(sN)] <- sN
MYSPEmap <- data.frame(keys = sprintf("%04d", 0:9999),
iMYSPE = sample(1:nrow(sDat), 10000, replace = TRUE))
rownames(MYSPEmap) <- MYSPEmap$keys
nRep <- floor(N/nrow(theseSpecies))
MYSPEdat <- theseSpecies
for(i in 1:nRep) {
MYSPEdat <- rbind(MYSPEdat, theseSpecies)
}
MYSPEdat <- MYSPEdat[1:N, ]
for (i in 1:N) {
rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5")
}
set.seed(NULL)
MYSPEdat <- MYSPEdat[sample(1:N), ]
for (i in 1:length(sN)) {
rMap <- gsub(".+(....).$", "\\1", sN[i])
MYSPEmap[rMap, "iMYSPE"] <- as.integer(names(sN)[i])
# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
# === validate
x <- character()
for (n in sN) {
sp <- getMYSPE(n)
if (length(sp) != 1) {
stop(print(as.character(n)))
} else {
x <- c(x, sp)
}
}
# saveRDS(MYSPEmap, "./data/MYSPEmap.rds")
# === species for late-comers
y <- unique(MYSPEdat$species)
print(y[!(y %in% x)])
getMYSPE <- function(x) {
dat <- readRDS("./data/sDat.rds")
map <- readRDS("./data/MYSPEmap.rds")
key <- gsub(".+(....).$", "\\1", x)
return(dat$species[map[key, "iMYSPE"]])
}
# === validate
l <- length(sN)