New retrieval logic
This commit is contained in:
parent
c3ca448997
commit
5fa5dd9ff7
@ -3,12 +3,13 @@
|
|||||||
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
# Purpose: Create a list of genome sequenced fungi with protein annotations and
|
||||||
# Mbp1 homologues.
|
# Mbp1 homologues.
|
||||||
#
|
#
|
||||||
# Version: 1.3
|
# Version: 1.4
|
||||||
#
|
#
|
||||||
# Date: 2016 09 - 2020 09
|
# Date: 2016 09 - 2021 09
|
||||||
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
# Author: Boris Steipe (boris.steipe@utoronto.ca)
|
||||||
#
|
#
|
||||||
# Versions
|
# Versions
|
||||||
|
# 1.4 New retrieval logic
|
||||||
# 1.3 Rewrite to change datasource. NCBI has not been updated
|
# 1.3 Rewrite to change datasource. NCBI has not been updated
|
||||||
# since 2012. Use ensembl fungi as initial source.
|
# since 2012. Use ensembl fungi as initial source.
|
||||||
# 1.2 Change from require() to requireNamespace()
|
# 1.2 Change from require() to requireNamespace()
|
||||||
@ -18,8 +19,6 @@
|
|||||||
#
|
#
|
||||||
# TODO:
|
# TODO:
|
||||||
#
|
#
|
||||||
# type out workflow
|
|
||||||
#
|
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
#
|
#
|
||||||
# DO NOT source() THIS FILE!
|
# DO NOT source() THIS FILE!
|
||||||
@ -40,15 +39,15 @@
|
|||||||
#TOC>
|
#TOC>
|
||||||
#TOC> Section Title Line
|
#TOC> Section Title Line
|
||||||
#TOC> --------------------------------------------------------
|
#TOC> --------------------------------------------------------
|
||||||
#TOC> 1 The strategy 56
|
#TOC> 1 The strategy 55
|
||||||
#TOC> 2 PACKAGES AND INITIALIZATIONS 68
|
#TOC> 2 PACKAGES AND INITIALIZATIONS 67
|
||||||
#TOC> 3 ENSEMBL FUNGI 76
|
#TOC> 3 ENSEMBL FUNGI 75
|
||||||
#TOC> 3.1 Import 79
|
#TOC> 3.1 Import 78
|
||||||
#TOC> 4 BLAST SEARCH 156
|
#TOC> 4 BLAST SEARCH 155
|
||||||
#TOC> 4.1 find homologous proteins 162
|
#TOC> 4.1 find homologous proteins 161
|
||||||
#TOC> 4.2 Identify species in "hits" 193
|
#TOC> 4.2 Identify species in "hits" 192
|
||||||
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 283
|
#TOC> 5 MERGE ENSEMBL AND BLAST RESULTS 282
|
||||||
#TOC> 6 STUDENT NUMBERS 366
|
#TOC> 6 STUDENT NUMBERS 375
|
||||||
#TOC>
|
#TOC>
|
||||||
#TOC> ==========================================================================
|
#TOC> ==========================================================================
|
||||||
|
|
||||||
@ -196,7 +195,7 @@ BLASThits <- readRDS(file = "data/BLASThits.rds")
|
|||||||
# we are only interested in the species names that it contains.
|
# we are only interested in the species names that it contains.
|
||||||
|
|
||||||
# How many hits in the list?
|
# How many hits in the list?
|
||||||
length(BLASThits$hits)
|
length(BLASThits$hits) # 1,134
|
||||||
|
|
||||||
# Let's look at a hit somewhere down the list
|
# Let's look at a hit somewhere down the list
|
||||||
str(BLASThits$hit[[277]])
|
str(BLASThits$hit[[277]])
|
||||||
@ -239,7 +238,7 @@ sum(eVals == 0)
|
|||||||
plot(log(eVals[eVals > 0]), col = "#CC0000")
|
plot(log(eVals[eVals > 0]), col = "#CC0000")
|
||||||
|
|
||||||
# This is very informative: I would suspect that the first ten or so are
|
# This is very informative: I would suspect that the first ten or so are
|
||||||
# virtually identical to the yeast protein, then we have about 700 hits with
|
# virtually identical to the yeast protein, then we have about 800 hits with
|
||||||
# decreasing similarity, and then about 200 more that may actually be false
|
# decreasing similarity, and then about 200 more that may actually be false
|
||||||
# positives. Also - we plotted them by index, that means the table is SORTED:
|
# positives. Also - we plotted them by index, that means the table is SORTED:
|
||||||
# Lower E-values strictly come before higher E-values.
|
# Lower E-values strictly come before higher E-values.
|
||||||
@ -271,7 +270,7 @@ BLASTspecies <- BLASTspecies[ ! duplicated(BLASTspecies)]
|
|||||||
# check the number again:
|
# check the number again:
|
||||||
length(BLASTspecies)
|
length(BLASTspecies)
|
||||||
# Think a bit about this: what may be the biological reason to find that
|
# Think a bit about this: what may be the biological reason to find that
|
||||||
# on average, in 300 fungi across the entire phylogenetic tree, we have
|
# across 388 fungi spanning the entire phylogenetic tree, we have on average
|
||||||
# three sequences that are homologous to yeast Mbp1?
|
# three sequences that are homologous to yeast Mbp1?
|
||||||
|
|
||||||
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
|
# Let's look at the distribution of E-values in this selection (Subsetting FTW):
|
||||||
@ -342,6 +341,11 @@ length(unique(sDat$species))
|
|||||||
length(unique(sDat$genus))
|
length(unique(sDat$genus))
|
||||||
length(unique(sDat$order))
|
length(unique(sDat$order))
|
||||||
|
|
||||||
|
# I need an extra species for admin purposes later on ...
|
||||||
|
sel <- grep("Sporothrix schenckii", sDat$species)
|
||||||
|
SPOSCdat <- sDat[sel, ]
|
||||||
|
sDat <- sDat[-sel, ]
|
||||||
|
|
||||||
# To get the final dataset, we remove the reference species with their
|
# To get the final dataset, we remove the reference species with their
|
||||||
# entire orders ...
|
# entire orders ...
|
||||||
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
|
REForders <- unique(sDat$order[sDat$species %in% REFspecies])
|
||||||
@ -355,9 +359,14 @@ sDat <- sDat[ ! sel , ]
|
|||||||
# ... but all of them
|
# ... but all of them
|
||||||
sum(REFspecies %in% REFdat$species)
|
sum(REFspecies %in% REFdat$species)
|
||||||
|
|
||||||
# ... and we have enough left in sDat to prune sDat to unique genus ...
|
# ... and we have enough left in sDat to prune it to one species per genus
|
||||||
sDat <- sDat[ ! duplicated(sDat$genus) , ]
|
sDat <- sDat[ ! duplicated(sDat$genus) , ]
|
||||||
|
nrow(sDat) # 84
|
||||||
|
|
||||||
|
# I add back "Sporothrix schenckii" ...
|
||||||
|
sDat <- rbind(SPOSCdat, sDat)
|
||||||
|
|
||||||
|
# ... and save for future use.
|
||||||
# saveRDS(sDat, file = "data/sDat.rds")
|
# saveRDS(sDat, file = "data/sDat.rds")
|
||||||
# saveRDS(REFdat, file = "data/REFdat.rds")
|
# saveRDS(REFdat, file = "data/REFdat.rds")
|
||||||
|
|
||||||
@ -366,44 +375,58 @@ sDat <- sDat[ ! duplicated(sDat$genus) , ]
|
|||||||
# = 6 STUDENT NUMBERS =====================================================
|
# = 6 STUDENT NUMBERS =====================================================
|
||||||
#
|
#
|
||||||
# An asymmetric function to retrieve a MYSPE species
|
# An asymmetric function to retrieve a MYSPE species
|
||||||
|
#
|
||||||
|
sDat <- readRDS(file = "data/sDat.rds")
|
||||||
|
|
||||||
students <- read.csv("../BCH441-2020-students.csv")
|
students <- read.csv("../BCH441-2021-students.csv")
|
||||||
|
sN <- students$Integration.ID
|
||||||
|
sN <- sN[! is.na(sN)]
|
||||||
|
sN <- as.character(sN)
|
||||||
|
sN <- c("1003141593", sN) # will map to "Sporothrix schenckii"
|
||||||
|
|
||||||
sN <- students$Student.Number
|
|
||||||
range(sN)
|
|
||||||
any(duplicated(gsub(".+(.......)$", "\\1", sN)))
|
|
||||||
|
|
||||||
N <- 7
|
|
||||||
x <- numeric(N)
|
|
||||||
for (i in 1:N) {
|
|
||||||
x[i] <- H(substr(gsub(".+(.......)$", "\\1", sN), i, i))
|
|
||||||
}
|
|
||||||
plot(x, col = "#BB0000", type = "b")
|
|
||||||
|
|
||||||
keys <- as.numeric(gsub(".+(....).$", "\\1", sN))
|
|
||||||
any(duplicated(keys))
|
|
||||||
|
|
||||||
# =====
|
|
||||||
set.seed(112358)
|
set.seed(112358)
|
||||||
names(sN) <- sample(1:nrow(sDat), length(sN))
|
theseSpecies <- sDat[sample(1:nrow(sDat)), ]
|
||||||
|
all(sort(theseSpecies$name) == sort(sDat$name))
|
||||||
|
nrow((theseSpecies))
|
||||||
|
(iX <- grep("Sporothrix schenckii", theseSpecies$name))
|
||||||
|
theseSpecies <- rbind(theseSpecies[iX, ], theseSpecies[-iX, ])
|
||||||
|
rndMin <- 992000000
|
||||||
|
rndMax <- 1020000000
|
||||||
|
N <- 10000
|
||||||
|
keys <- as.character(sample(rndMin:rndMax, N + 1000))
|
||||||
|
keys <- keys[! (keys %in% sN)]
|
||||||
|
keys <- keys[1:N]
|
||||||
|
keys[1:length(sN)] <- sN
|
||||||
|
|
||||||
MYSPEmap <- data.frame(keys = sprintf("%04d", 0:9999),
|
nRep <- floor(N/nrow(theseSpecies))
|
||||||
iMYSPE = sample(1:nrow(sDat), 10000, replace = TRUE))
|
MYSPEdat <- theseSpecies
|
||||||
rownames(MYSPEmap) <- MYSPEmap$keys
|
for(i in 1:nRep) {
|
||||||
|
MYSPEdat <- rbind(MYSPEdat, theseSpecies)
|
||||||
|
}
|
||||||
|
MYSPEdat <- MYSPEdat[1:N, ]
|
||||||
|
for (i in 1:N) {
|
||||||
|
rownames(MYSPEdat)[i] <- digest::digest(keys[i], algo = "md5")
|
||||||
|
}
|
||||||
|
set.seed(NULL)
|
||||||
|
MYSPEdat <- MYSPEdat[sample(1:N), ]
|
||||||
|
|
||||||
for (i in 1:length(sN)) {
|
# saveRDS(MYSPEdat, file = "data/MYSPEdat.rds")
|
||||||
rMap <- gsub(".+(....).$", "\\1", sN[i])
|
|
||||||
MYSPEmap[rMap, "iMYSPE"] <- as.integer(names(sN)[i])
|
# === validate
|
||||||
|
x <- character()
|
||||||
|
for (n in sN) {
|
||||||
|
sp <- getMYSPE(n)
|
||||||
|
if (length(sp) != 1) {
|
||||||
|
stop(print(as.character(n)))
|
||||||
|
} else {
|
||||||
|
x <- c(x, sp)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# saveRDS(MYSPEmap, "./data/MYSPEmap.rds")
|
# === species for late-comers
|
||||||
|
y <- unique(MYSPEdat$species)
|
||||||
|
print(y[!(y %in% x)])
|
||||||
|
|
||||||
getMYSPE <- function(x) {
|
|
||||||
dat <- readRDS("./data/sDat.rds")
|
|
||||||
map <- readRDS("./data/MYSPEmap.rds")
|
|
||||||
key <- gsub(".+(....).$", "\\1", x)
|
|
||||||
return(dat$species[map[key, "iMYSPE"]])
|
|
||||||
}
|
|
||||||
|
|
||||||
# === validate
|
# === validate
|
||||||
l <- length(sN)
|
l <- length(sN)
|
||||||
|
Loading…
Reference in New Issue
Block a user