Compare commits
28 Commits
0.6.0
...
features/l
Author | SHA1 | Date | |
---|---|---|---|
897f7ee922 | |||
bfc286e6b0 | |||
a88225fcff | |||
c18d817cd9 | |||
f462e6d5e0 | |||
e568e9fb2c | |||
4b9eb8674d | |||
f75707e4fe | |||
b4845fab34 | |||
fe999f1cab | |||
85946eb110 | |||
a27e09da31 | |||
ba2b688e89 | |||
49f31b7943 | |||
1c6e1cfb35 | |||
fb99526162 | |||
ff8a1aff08 | |||
341ca933a3 | |||
3e3898334f | |||
ba1f0aa318 | |||
6d0157581f | |||
4bcbfa0c6a | |||
ca0f9673b0 | |||
5048fa8057 | |||
39125c848e | |||
744a6c2009 | |||
1773bb9dcb | |||
1372141b57 |
4
.vscode/launch.json
vendored
4
.vscode/launch.json
vendored
@@ -6,10 +6,10 @@
|
||||
"configurations": [
|
||||
|
||||
{
|
||||
"name": "autobigsst info -lschema pubmlst_bordetella_seqdef",
|
||||
"name": "autobigs info -lschema pubmlst_bordetella_seqdef",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "${workspaceFolder}/src/autobigsst/cli/program.py",
|
||||
"program": "${workspaceFolder}/src/autobigs/cli/program.py",
|
||||
"console": "integratedTerminal",
|
||||
"args": [
|
||||
"info",
|
||||
|
2
Jenkinsfile
vendored
2
Jenkinsfile
vendored
@@ -36,7 +36,7 @@ pipeline {
|
||||
CREDS = credentials('username-password-rs-git')
|
||||
}
|
||||
steps {
|
||||
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u ${CREDS_USR} -p ${CREDS__PSW} --non-interactive --disable-progress-bar --verbose dist/*'
|
||||
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
|
||||
}
|
||||
}
|
||||
stage ("pypi.org") {
|
||||
|
@@ -1,4 +1,4 @@
|
||||
# autobigsst.Engine
|
||||
# autoBIGS.Engine
|
||||
|
||||
A python library implementing common BIGSdb MLST schemes and databases. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
|
||||
|
||||
@@ -18,8 +18,8 @@ Furthermore, this library is highly asynchronous where any potentially blocking
|
||||
|
||||
This library can be installed through pip. Learn how to [setup and install pip first](https://pip.pypa.io/en/stable/installation/).
|
||||
|
||||
Then, it's as easy as running `pip install autobigsst-engine` in any terminal that has pip in it's path (any terminal where `pip --version` returns a valid version and install path).
|
||||
Then, it's as easy as running `pip install autobigs-engine` in any terminal that has pip in its path (any terminal where `pip --version` returns a valid version and install path).
|
||||
|
||||
### CLI usage
|
||||
|
||||
This is a independent python library and thus does not have any form of direct user interface. One way of using it could be to create your own Python script that makes calls to this libraries functions. Alternatively, you may use `autobigsst-cli`, a `Python` package that implements a CLI for calling this library.
|
||||
This is an independent Python library and thus does not have any form of direct user interface. One way of using it could be to create your own Python script that makes calls to this library's functions. Alternatively, you may use `autobigs-cli`, a `Python` package that implements a CLI for calling this library.
|
@@ -3,20 +3,21 @@ requires = ["setuptools>=64", "setuptools_scm>=8"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "autobigsst.engine"
|
||||
name = "autoBIGS.engine"
|
||||
dynamic = ["version"]
|
||||
readme = "README.md"
|
||||
|
||||
dependencies = [
|
||||
"biopython",
|
||||
"aiohttp[speedups]",
|
||||
"biopython==1.85",
|
||||
"aiohttp[speedups]==3.11.*",
|
||||
]
|
||||
requires-python = ">=3.11"
|
||||
requires-python = ">=3.12"
|
||||
description = "A library to rapidly fetch MLST profiles given sequences for various diseases."
|
||||
|
||||
[project.urls]
|
||||
Repository = "https://github.com/RealYHD/autobigsst.engine.git"
|
||||
Issues = "https://github.com/RealYHD/autobigsst.engine/issues"
|
||||
Homepage = "https://github.com/RealYHD/autoBIGS.engine"
|
||||
Source = "https://github.com/RealYHD/autoBIGS.engine"
|
||||
Issues = "https://github.com/RealYHD/autoBIGS.engine/issues"
|
||||
|
||||
[tool.setuptools_scm]
|
||||
|
||||
|
@@ -1,5 +1,5 @@
|
||||
aiohttp[speedups]
|
||||
biopython
|
||||
aiohttp[speedups]==3.11.*
|
||||
biopython==1.85
|
||||
pytest
|
||||
pytest-asyncio
|
||||
build
|
||||
|
70
src/autobigs/engine/analysis/aligners.py
Normal file
70
src/autobigs/engine/analysis/aligners.py
Normal file
@@ -0,0 +1,70 @@
|
||||
import asyncio
|
||||
from concurrent.futures import Future, ThreadPoolExecutor
|
||||
from contextlib import AbstractContextManager
|
||||
from typing import Any, Set, Union
|
||||
from Bio.Align import PairwiseAligner
|
||||
from queue import Queue
|
||||
|
||||
from autobigs.engine.structures.alignment import AlignmentStats, PairwiseAlignment
|
||||
|
||||
class AsyncBiopythonPairwiseAlignmentEngine(AbstractContextManager):
    """Run Biopython pairwise alignments on a thread pool and consume them from asyncio code.

    Use as a (synchronous) context manager: entering creates the thread pool,
    exiting shuts it down. Jobs are submitted with :meth:`align`; finished jobs
    are retrieved with ``async for`` or :meth:`next_completed`. A single
    consumer is assumed.
    """

    def __enter__(self):
        # The pool is created lazily here, so align() is only usable inside the `with` block.
        self._thread_pool = ThreadPoolExecutor(self._max_threads, thread_name_prefix="async-pairwise-alignment")
        return self

    def __init__(self, aligner: "PairwiseAligner", max_threads: int = 4):
        """Store the aligner and pool size; no threads are started until ``__enter__``.

        Args:
            aligner: Object whose ``align(reference, query)`` method produces the alignments.
            max_threads: Maximum number of worker threads in the pool.
        """
        self._max_threads = max_threads
        self._aligner = aligner
        # Futures that were submitted but whose completion has not been recorded yet.
        self._work_left: Set[Future] = set()
        # Finished futures waiting to be handed to the consumer.
        self._work_complete: Queue[Future] = Queue()

    def align(self, reference: str, query: str, **associated_data):
        """Queue one alignment of ``query`` against ``reference``.

        ``associated_data`` is passed through untouched and returned next to the
        alignment result so callers can tell results apart.
        """
        work = self._thread_pool.submit(
            self.work, reference, query, **associated_data)
        # Track the future BEFORE registering the callback: a job that finishes
        # immediately would otherwise run the callback before it is tracked.
        self._work_left.add(work)
        work.add_done_callback(self._on_complete)

    def _on_complete(self, future: Future):
        # Publish the result first and only then stop tracking it, so a consumer
        # never observes "nothing pending and nothing completed" while a result
        # is still on its way into the queue.
        self._work_complete.put(future)
        self._work_left.discard(future)

    def work(self, reference, query, **associated_data):
        """Align ``query`` to ``reference`` and summarise the top alignment.

        Returns:
            A ``(PairwiseAlignment, associated_data)`` tuple.
        """
        alignments = self._aligner.align(reference, query)
        top_alignment = alignments[0]
        top_alignment_stats = top_alignment.counts()
        top_alignment_gaps = top_alignment_stats.gaps
        top_alignment_identities = top_alignment_stats.identities
        top_alignment_mismatches = top_alignment_stats.mismatches
        top_alignment_score = top_alignment.score  # type: ignore
        return PairwiseAlignment(
            top_alignment.sequences[0],
            top_alignment.sequences[1],
            tuple(top_alignment.indices[0]),
            tuple(top_alignment.indices[1]),
            AlignmentStats(
                # Fraction of aligned columns that are identical.
                percent_identity=top_alignment_identities/top_alignment.length,
                mismatches=top_alignment_mismatches,
                gaps=top_alignment_gaps,
                match_metric=top_alignment_score
            )), associated_data

    async def next_completed(self) -> Union[tuple["PairwiseAlignment", dict[str, Any]], None]:
        """Return the next finished job, or ``None`` once every submitted job was consumed.

        Waits (without blocking the event loop) while jobs are still running.
        An exception raised inside :meth:`work` is re-raised here.
        """
        # Check the pending set first, then the queue: _on_complete enqueues
        # before it un-tracks, so "nothing pending" implies the queue already
        # holds every remaining result.
        if not self._work_left and self._work_complete.empty():
            return None
        # Queue.get() blocks, so wait for it on a helper thread instead of the loop.
        finished = await asyncio.to_thread(self._work_complete.get)
        return await asyncio.wrap_future(finished)

    def __exit__(self, exc_type, exc_value, traceback):
        self.shutdown()

    def __aiter__(self):
        return self

    async def __anext__(self):
        result = await self.next_completed()
        if result is None:
            raise StopAsyncIteration
        return result

    def shutdown(self):
        """Cancel jobs that have not started and wait for running ones to finish."""
        self._thread_pool.shutdown(wait=True, cancel_futures=True)
|
338
src/autobigs/engine/analysis/bigsdb.py
Normal file
338
src/autobigs/engine/analysis/bigsdb.py
Normal file
@@ -0,0 +1,338 @@
|
||||
from abc import abstractmethod
|
||||
import asyncio
|
||||
from collections import defaultdict
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
import csv
|
||||
from os import path
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequence, Set, Union
|
||||
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
|
||||
from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
|
||||
from autobigs.engine.reading import read_fasta
|
||||
from autobigs.engine.structures.alignment import PairwiseAlignment
|
||||
from autobigs.engine.structures.genomics import NamedString
|
||||
from autobigs.engine.structures.mlst import Allele, NamedMLSTProfile, AlignmentStats, MLSTProfile
|
||||
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
|
||||
|
||||
from Bio.Align import PairwiseAligner
|
||||
|
||||
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
    """Abstract interface shared by the BIGSdb MLST profilers in this module.

    Implementations are asynchronous context managers and must provide every
    method below.
    """

    @abstractmethod
    def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
        """Produce the ``Allele`` matches found for the given query sequences."""

    @abstractmethod
    async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
        """Build an ``MLSTProfile`` from a (possibly asynchronous) iterable of alleles."""

    @abstractmethod
    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
        """Produce an ``MLSTProfile`` directly from query sequence strings."""

    @abstractmethod
    def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
        """Produce a ``NamedMLSTProfile`` for named sequences arriving in groups."""

    @abstractmethod
    async def close(self):
        """Release whatever resources the profiler holds."""
|
||||
|
||||
class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
    """Profiler that sends sequences and allele designations to a BIGSdb REST API."""

    def __init__(self, database_api: str, database_name: str, schema_id: int):
        """Open an HTTP session rooted at the scheme URL of the given database."""
        self._database_name = database_name
        self._schema_id = schema_id
        self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
        # Every request below uses a path relative to this scheme URL.
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60))

    async def __aenter__(self):
        return self

    async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[str], str]) -> AsyncGenerator[Allele, Any]:
        """Yield the alleles the server reports for each query sequence.

        Exact matches are yielded without alignment statistics. If the response
        has no exact matches, partial matches are yielded with their statistics.

        Raises:
            NoBIGSdbMatchesException: The response contained neither kind of match.
        """
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        if isinstance(query_sequence_strings, str):
            # A lone string is treated as a single query rather than iterated per character.
            query_sequence_strings = [query_sequence_strings]
        for sequence_string in query_sequence_strings:
            payload = {
                "sequence": sequence_string,
                "partial_matches": True
            }
            async with self._http_client.post("sequence", json=payload) as response:
                sequence_response: dict = await response.json()

                if "exact_matches" in sequence_response:
                    # locus -> list of allele records carrying an "allele_id"
                    for locus, locus_matches in sequence_response["exact_matches"].items():
                        for match in locus_matches:
                            yield Allele(allele_locus=locus, allele_variant=match["allele_id"], partial_match_profile=None)
                    continue

                if "partial_matches" not in sequence_response:
                    raise NoBIGSdbMatchesException(self._database_name, self._schema_id)

                for locus, partial in sequence_response["partial_matches"].items():
                    if len(partial) == 0:
                        # Locus listed without any match details; nothing to report.
                        continue
                    stats = AlignmentStats(
                        percent_identity=float(partial["identity"]),
                        mismatches=int(partial["mismatches"]),
                        gaps=int(partial["gaps"]),
                        match_metric=int(partial["bitscore"])
                    )
                    yield Allele(
                        allele_locus=locus,
                        allele_variant=str(partial["allele"]),
                        partial_match_profile=stats
                    )

    async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
        """Post the allele designations and build the profile from the reply.

        The "ST" and "clonal_complex" fields default to "unknown" when the
        server omits them.

        Raises:
            ValueError: A locus came back with more than one exact match, or no
                exact matches came back at all.
        """
        designations: dict[str, list[dict[str, str]]] = defaultdict(list)
        if isinstance(alleles, AsyncIterable):
            async for allele in alleles:
                designations[allele.allele_locus].append({"allele": str(allele.allele_variant)})
        else:
            for allele in alleles:
                designations[allele.allele_locus].append({"allele": str(allele.allele_variant)})

        async with self._http_client.post("designations", json={"designations": designations}) as response:
            response_json: dict = await response.json()
            fields: dict[str, str] = response_json.setdefault("fields", dict())
            fields.setdefault("ST", "unknown")
            fields.setdefault("clonal_complex", "unknown")

            matched: Set[Allele] = set()
            for locus, locus_matches in response_json["exact_matches"].items():
                if len(locus_matches) > 1:
                    raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(locus_matches)})")
                matched.add(Allele(locus, locus_matches[0]["allele_id"], None))
            if not matched:
                raise ValueError("Passed in no alleles.")
            return MLSTProfile(matched, fields["ST"], fields["clonal_complex"])

    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
        """Determine the alleles for the sequences, then resolve them to a profile."""
        return await self.determine_mlst_st(
            self.determine_mlst_allele_variants(query_sequence_strings)
        )

    async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
        """Profile every named sequence individually, yielding one result per sequence.

        A sequence without any match yields a ``NamedMLSTProfile`` whose profile
        is ``None``, unless ``stop_on_fail`` is set, in which case the
        ``NoBIGSdbMatchesException`` propagates.
        """
        async for group in query_named_string_groups:
            for named_string in group:
                try:
                    profile = await self.profile_string([named_string.sequence])
                    yield NamedMLSTProfile(named_string.name, profile)
                except NoBIGSdbMatchesException as e:
                    if stop_on_fail:
                        raise e
                    yield NamedMLSTProfile(named_string.name, None)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
|
||||
|
||||
class LocalBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
    # Profiler that downloads a scheme's allele FASTA files and profile table
    # into a cache directory and aligns query sequences against them locally.
    #
    # NOTE(review): the page this code was recovered from carries no
    # indentation, so the nesting below is a best-effort reconstruction.
    # Confirm it against the repository before relying on it.

    async def __aenter__(self):
        # Optionally refresh the locus list and download the cache files
        # (alleles and profiles concurrently), then load the profile table.
        if self._prepare:
            await self.update_scheme_locis()
            await asyncio.gather(
                self.download_alleles_cache_data(),
                self.download_scheme_profiles()
            )
        await self.load_scheme_profiles()
        return self

    def __init__(self, database_api: str, database_name: str, schema_id: int, cache_path: Union[str, None] = None, prepare: bool =True):
        # database_api / database_name / schema_id identify the remote scheme.
        # cache_path: directory for downloaded files; a temporary directory is
        #   created (and removed again in close()) when it is None.
        # prepare: whether __aenter__ refreshes the loci and downloads the cache.
        self._database_api = database_api
        self._database_name = database_name
        self._schema_id = schema_id
        self._base_url = f"{self._database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60))
        if cache_path is None:
            # "BIGSdb" is passed positionally, i.e. as mkdtemp's suffix argument.
            self._cache_path = tempfile.mkdtemp("BIGSdb")
            self._cleanup_required = True
        else:
            self._cache_path = cache_path
            self._cleanup_required = False
        # Sorted locus names of the scheme; filled by update_scheme_locis().
        self._loci: list[str] = []
        # Maps a tuple of allele ids (ordered like self._loci) to (ST, clonal_complex).
        self._profiles_st_map = {}
        self._prepare = prepare

    async def update_scheme_locis(self):
        # Re-read the scheme definition and rebuild the sorted list of locus names.
        self._loci.clear()
        # NOTE(review): this absolute path hard-codes an "/api" prefix instead of
        # deriving it from database_api — confirm it works for every supported host.
        async with self._http_client.get(f"/api/db/{self._database_name}/schemes/{self._schema_id}") as schema_response:
            schema_json = await schema_response.json()
            for locus in schema_json["loci"]:
                # Only the last path component of each locus entry is kept as its name.
                locus_name = path.basename(locus)
                self._loci.append(locus_name)
        self._loci.sort()

    async def load_scheme_profiles(self):
        # Rebuild the allele-tuple -> (ST, clonal complex) lookup from the cached
        # profile table, which is parsed as tab-separated despite its ".csv" name.
        self._profiles_st_map.clear()
        with open(self.get_scheme_profile_path()) as profile_cache_handle:
            reader = csv.DictReader(profile_cache_handle, delimiter="\t")
            for line in reader:
                alleles = []
                for locus in self._loci:
                    alleles.append(line[locus])
                self._profiles_st_map[tuple(alleles)] = (line["ST"], line["clonal_complex"])

    def get_locus_cache_path(self, locus) -> str:
        # Path of the cached FASTA file holding all variants of one locus.
        return path.join(self._cache_path, locus + "." + "fasta")

    def get_scheme_profile_path(self):
        # Path of the cached profile table.
        return path.join(self._cache_path, "profiles.csv")

    async def download_alleles_cache_data(self):
        # Stream every locus' allele FASTA into the cache, one locus after another.
        for locus in self._loci:
            with open(self.get_locus_cache_path(locus), "wb") as fasta_handle:
                # NOTE(review): same hard-coded "/api" prefix as in update_scheme_locis().
                async with self._http_client.get(f"/api/db/{self._database_name}/loci/{locus}/alleles_fasta") as fasta_response:
                    async for chunk, eof in fasta_response.content.iter_chunks():
                        fasta_handle.write(chunk)

    async def download_scheme_profiles(self):
        # Stream the scheme's profile table into the cache, then load it.
        with open(self.get_scheme_profile_path(), "wb") as profile_cache_handle:
            async with self._http_client.get("profiles_csv") as profiles_response:
                async for chunk, eof in profiles_response.content.iter_chunks():
                    profile_cache_handle.write(chunk)
        await self.load_scheme_profiles()

    async def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
        # Align each query against the cached allele variants and yield exact
        # matches immediately, followed by one closest partial match per locus.
        aligner = PairwiseAligner("blastn")
        aligner.mode = "local"
        with AsyncBiopythonPairwiseAlignmentEngine(aligner, max_threads=4) as aligner_engine:
            for query_sequence_string in query_sequence_strings:
                for locus in self._loci:
                    async for allele_variant in read_fasta(self.get_locus_cache_path(locus)):
                        aligner_engine.align(allele_variant.sequence, query_sequence_string, variant_name=allele_variant.name, full=True)
                    # NOTE(review): as reconstructed, this break stops after the first locus — confirm intent.
                    break # start a bunch of full alignments for each variant to select segments
                alignment_rankings: dict[str, set[tuple[PairwiseAlignment, str]]] = defaultdict(set)
                async for alignment_result, additional_information in aligner_engine:
                    result_variant_name = additional_information["variant_name"]
                    # Variant names are expected to look like "<locus>_<id>";
                    # a name with any other number of underscores fails to unpack here.
                    result_locus, variant_id = result_variant_name.split("_")
                    full_alignment = additional_information["full"]
                    if full_alignment:
                        if alignment_result.alignment_stats.gaps == 0 and alignment_result.alignment_stats.mismatches == 0:
                            # I.e., 100% exactly the same
                            yield Allele(result_locus, variant_id, None)
                            continue
                        else:
                            alignment_rankings[result_locus].add((alignment_result, variant_id))
                        # NOTE(review): full_alignment is the boolean "full" flag, so slicing it
                        # raises TypeError; presumably the query sequence was meant — confirm.
                        interest_sequence = full_alignment[alignment_result.query_indices[0]:alignment_result.query_indices[-1]]
                        async for allele_variant in read_fasta(self.get_locus_cache_path(result_locus)):
                            if result_variant_name == allele_variant.name:
                                continue # Skip if we just finished aligning this
                            # NOTE(review): result_variant_name is split like a string above, so ".name"
                            # looks wrong; presumably allele_variant.name was intended — confirm.
                            aligner_engine.align(allele_variant.sequence, interest_sequence, variant_name=result_variant_name.name, full=False)
                    else:
                        alignment_rankings[result_locus].add((alignment_result, variant_id))
                for final_locus, alignments in alignment_rankings.items():
                    # NOTE(review): an ascending sort selects the LOWEST match_metric;
                    # if a higher metric means a better match this picks the worst candidate.
                    closest_alignment, closest_variant_id = sorted(alignments, key=lambda index: index[0].alignment_stats.match_metric)[0]
                    yield Allele(final_locus, closest_variant_id, closest_alignment.alignment_stats)

    async def determine_mlst_st(self, alleles):
        # Look up the sequence type for the given alleles in the cached profile table.
        # Only the last allele seen per locus is kept. A KeyError is raised when a
        # scheme locus has no allele or the allele combination is not in the table.
        allele_variants: dict[str, Allele] = {}
        if isinstance(alleles, AsyncIterable):
            async for allele in alleles:
                allele_variants[allele.allele_locus] = allele
        else:
            for allele in alleles:
                allele_variants[allele.allele_locus] = allele
        ordered_profile = []
        for locus in self._loci:
            ordered_profile.append(allele_variants[locus].allele_variant)

        st, clonal_complex = self._profiles_st_map[tuple(ordered_profile)]
        return MLSTProfile(set(allele_variants.values()), st, clonal_complex)

    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
        # Determine the alleles for the sequences, then resolve them to a profile.
        alleles = self.determine_mlst_allele_variants(query_sequence_strings)
        return await self.determine_mlst_st(alleles)

    async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
        # Profile every named sequence on its own. A NoBIGSdbMatchesException yields
        # a result with a None profile unless stop_on_fail is set, in which case it propagates.
        async for named_strings in query_named_string_groups:
            for named_string in named_strings:
                try:
                    yield NamedMLSTProfile(named_string.name, await self.profile_string([named_string.sequence]))
                except NoBIGSdbMatchesException as e:
                    if stop_on_fail:
                        raise e
                    yield NamedMLSTProfile(named_string.name, None)

    async def close(self):
        # Close the HTTP session and delete the cache directory if this object created it.
        await self._http_client.close()
        if self._cleanup_required:
            shutil.rmtree(self._cache_path)

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
|
||||
|
||||
class BIGSdbIndex(AbstractAsyncContextManager):
    """Directory of the known BIGSdb hosts, their seqdef databases and schemes.

    Lookups are cached on the instance; pass ``force=True`` to refresh.
    """

    KNOWN_BIGSDB_APIS = {
        "https://bigsdb.pasteur.fr/api",
        "https://rest.pubmlst.org"
    }

    def __init__(self):
        self._http_client = ClientSession()
        # seqdef database name -> API root that hosts it (None until first fetched).
        self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
        # seqdef database name -> {scheme description: scheme id}.
        self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
        super().__init__()

    async def __aenter__(self):
        return self

    async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
        """Return a mapping of seqdef database name to the API root hosting it."""
        cached = self._known_seqdef_dbs_origin
        if not force and cached is not None:
            return cached
        origins = dict()
        for api_root in BIGSdbIndex.KNOWN_BIGSDB_APIS:
            async with self._http_client.get(f"{api_root}/db") as response:
                database_groups = await response.json()
                for group in database_groups:
                    for database in group["databases"]:
                        database_name = database["name"]
                        # Only sequence-definition databases are indexed.
                        if str(database_name).endswith("seqdef"):
                            origins[database_name] = api_root
        self._known_seqdef_dbs_origin = dict(origins)
        return self._known_seqdef_dbs_origin

    async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
        """Return the API root hosting ``seqdef_db_name``.

        Raises:
            NoSuchBIGSdbDatabaseException: The database is not on any known host.
        """
        known = await self.get_known_seqdef_dbs()
        if seqdef_db_name in known:
            return known[seqdef_db_name]
        raise NoSuchBIGSdbDatabaseException(seqdef_db_name)

    async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
        """Return a mapping of scheme description to scheme id for one seqdef database."""
        if not force and seqdef_db_name in self._seqdefdb_schemas:
            return self._seqdefdb_schemas[seqdef_db_name]  # type: ignore since it's guaranteed to not be none by conditional
        api_root = await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)
        async with self._http_client.get(f"{api_root}/db/{seqdef_db_name}/schemes") as response:
            response_json = await response.json()
            schemas: dict[str, int] = dict()
            for scheme in response_json["schemes"]:
                # The scheme id is the last path segment of the scheme's URL.
                scheme_id: int = int(str(scheme["scheme"]).rsplit("/", 1)[-1])
                scheme_desc: str = scheme["description"]
                schemas[scheme_desc] = scheme_id
            self._seqdefdb_schemas[seqdef_db_name] = schemas
            return self._seqdefdb_schemas[seqdef_db_name]  # type: ignore

    async def build_profiler_from_seqdefdb(self, local: bool, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
        """Create a profiler for the database, resolving its API root first."""
        api_root = await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name)
        return get_BIGSdb_MLST_profiler(local, api_root, dbseqdef_name, schema_id)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
|
||||
|
||||
def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, schema_id: int):
    """Construct the local or the remote profiler for a scheme.

    Args:
        local: Truthy for a ``LocalBIGSdbMLSTProfiler``, falsy for a ``RemoteBIGSdbMLSTProfiler``.
        database_api: Root URL of the BIGSdb API hosting the database.
        database_name: Name of the seqdef database.
        schema_id: Identifier of the scheme to profile against.
    """
    profiler_type = LocalBIGSdbMLSTProfiler if local else RemoteBIGSdbMLSTProfiler
    return profiler_type(
        database_api=database_api,
        database_name=database_name,
        schema_id=schema_id,
    )
|
26
src/autobigs/engine/analysis/genbank.py
Normal file
26
src/autobigs/engine/analysis/genbank.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import asyncio
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
import tempfile
|
||||
from typing import Iterable, Union
|
||||
from Bio import Entrez
|
||||
from Bio import SeqIO
|
||||
|
||||
from autobigs.engine.structures.genomics import AnnotatedString, StringAnnotation
|
||||
|
||||
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    """Fetch a GenBank record from NCBI's nucleotide database as an ``AnnotatedString``.

    The Entrez request runs on a worker thread. Each feature of the record
    becomes one ``StringAnnotation``, with every qualifier's values stored as a set.
    """
    fetch_stream = await asyncio.to_thread(
        Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text"
    )
    with fetch_stream:
        record = SeqIO.read(fetch_stream, "genbank")
        annotations = list()
        for feature in record.features:
            feature_start = int(feature.location.start)
            feature_end = int(feature.location.end)
            properties = feature.qualifiers
            # Replaces the qualifier values on the feature itself, in place.
            for key in properties:
                properties[key] = set(properties[key])
            annotations.append(StringAnnotation(
                type=feature.type,
                start=feature_start,
                # NOTE(review): the original comment says "Position is exclusive"; confirm
                # whether location.end is already exclusive, which would make +1 an off-by-one.
                end=feature_end + 1,
                feature_properties=properties
            ))
        return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=annotations)
|
@@ -1,9 +1,9 @@
|
||||
import asyncio
|
||||
from io import TextIOWrapper
|
||||
from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
|
||||
from typing import Any, AsyncGenerator, Iterable, Union
|
||||
from Bio import SeqIO
|
||||
|
||||
from autobigsst.engine.data.structures.genomics import NamedString
|
||||
from autobigs.engine.structures.genomics import NamedString
|
||||
|
||||
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
|
||||
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
|
18
src/autobigs/engine/structures/alignment.py
Normal file
18
src/autobigs/engine/structures/alignment.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from dataclasses import dataclass
|
||||
from numbers import Number
|
||||
from typing import Sequence
|
||||
|
||||
@dataclass(frozen=True)
class AlignmentStats:
    """Immutable summary statistics describing how well two sequences matched."""
    # Identity of the match as reported by whichever code builds the stats.
    percent_identity: float
    # Number of mismatching positions.
    mismatches: int
    # Number of gaps in the alignment.
    gaps: int
    # Overall match score; callers in this project pass either a server-reported
    # bitscore or a local alignment score.
    # NOTE(review): a local alignment score may be a float despite the int annotation — confirm.
    match_metric: int
|
||||
|
||||
@dataclass(frozen=True)
class PairwiseAlignment:
    """Immutable record of one pairwise alignment between a reference and a query."""
    # The reference sequence that was aligned against.
    reference: str
    # The query sequence that was aligned.
    query: str
    # Per-column indices into the reference sequence.
    reference_indices: Sequence[Number]
    # Per-column indices into the query sequence.
    query_indices: Sequence[Number]
    # Summary statistics for this alignment.
    alignment_stats: AlignmentStats
|
33
src/autobigs/engine/structures/mlst.py
Normal file
33
src/autobigs/engine/structures/mlst.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from typing import Collection, Iterable, Mapping, Sequence, Union
|
||||
|
||||
from autobigs.engine.structures.alignment import AlignmentStats
|
||||
|
||||
@dataclass(frozen=True)
class Allele:
    """Immutable record of one allele call: a locus and the variant matched at it."""
    # Name of the locus the allele belongs to.
    allele_locus: str
    # Identifier of the matched variant at that locus.
    allele_variant: str
    # Alignment statistics for a partial match; None is used for exact matches.
    partial_match_profile: Union[None, AlignmentStats]
|
||||
|
||||
@dataclass(frozen=True)
class MLSTProfile:
    """Immutable MLST result: the alleles found plus the scheme fields derived from them."""
    # The alleles that make up this profile.
    alleles: Collection[Allele]
    # Sequence type (ST) assigned to the allele combination.
    sequence_type: str
    # Clonal complex assigned to the allele combination.
    clonal_complex: str
|
||||
|
||||
@dataclass(frozen=True)
class NamedMLSTProfile:
    """Immutable pairing of a sample name with its MLST profile."""
    # Name of the profiled sequence.
    name: str
    # The resulting profile, or None when no profile could be determined.
    mlst_profile: Union[None, MLSTProfile]
|
||||
|
||||
|
||||
def alleles_to_mapping(alleles: Iterable["Allele"]):
    """Group allele variants by locus.

    Returns:
        A dict keyed by locus in first-seen order. A locus with exactly one
        variant maps to that variant directly; a locus with several maps to
        the list of its variants.
    """
    grouped = defaultdict(list)
    for allele in alleles:
        grouped[allele.allele_locus].append(allele.allele_variant)
    # Collapse single-element lists so the common one-variant case is a plain value.
    return {
        locus: variants[0] if len(variants) == 1 else variants
        for locus, variants in grouped.items()
    }
|
@@ -1,22 +1,19 @@
|
||||
from collections import defaultdict
|
||||
import csv
|
||||
from os import PathLike
|
||||
from typing import AsyncIterable, Mapping, Sequence, Union
|
||||
from typing import AsyncIterable, Collection, Mapping, Sequence, Union
|
||||
|
||||
from autobigsst.engine.data.structures.mlst import Allele, MLSTProfile
|
||||
from autobigs.engine.structures.mlst import Allele, MLSTProfile
|
||||
|
||||
|
||||
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence["Allele"]]):
    """Reduce a locus -> alleles mapping to a locus -> variant id(s) mapping.

    A locus with exactly one allele maps to that allele's variant; any other
    locus (including an empty one) maps to the list of its variants.
    """
    return {
        loci: (
            alleles[0].allele_variant
            if len(alleles) == 1
            else [allele.allele_variant for allele in alleles]
        )
        for loci, alleles in alleles_map.items()
    }
|
||||
|
||||
def alleles_to_map(alleles: Collection["Allele"]) -> Mapping[str, Union[list[str], str]]:
    """Group allele variants by locus for row-style output.

    Returns:
        A plain dict keyed by locus in first-seen order. A locus with a single
        variant maps to that variant; a locus with several maps to their list.
    """
    variants_by_locus: dict[str, list[str]] = {}
    for allele in alleles:
        variants_by_locus.setdefault(allele.allele_locus, []).append(allele.allele_variant)
    return {
        locus: (variants[0] if len(variants) == 1 else variants)  # Take the only one
        for locus, variants in variants_by_locus.items()
    }
|
||||
|
||||
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
|
||||
failed = list()
|
||||
@@ -27,15 +24,16 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
|
||||
if mlst_profile is None:
|
||||
failed.append(name)
|
||||
continue
|
||||
allele_mapping = alleles_to_map(mlst_profile.alleles)
|
||||
if writer is None:
|
||||
header = ["id", "st", "clonal-complex", *mlst_profile.alleles.keys()]
|
||||
header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
|
||||
writer = csv.DictWriter(filehandle, fieldnames=header)
|
||||
writer.writeheader()
|
||||
row_dictionary = {
|
||||
"st": mlst_profile.sequence_type,
|
||||
"clonal-complex": mlst_profile.clonal_complex,
|
||||
"id": name,
|
||||
**dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
|
||||
**allele_mapping
|
||||
}
|
||||
writer.writerow(rowdict=row_dictionary)
|
||||
return failed
|
@@ -1,166 +0,0 @@
|
||||
from collections import defaultdict
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
from numbers import Number
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
|
||||
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
|
||||
from autobigsst.engine.data.structures.genomics import NamedString
|
||||
from autobigsst.engine.data.structures.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
|
||||
from autobigsst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
|
||||
|
||||
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
    """Asynchronous client for a single MLST scheme of a BIGSdb database.

    Every request is issued relative to
    ``{database_api}/db/{database_name}/schemes/{schema_id}/``. The instance
    owns an aiohttp session, so use it as an async context manager or call
    :meth:`close` once finished.
    """

    def __init__(self, database_api: str, database_name: str, schema_id: int):
        """Bind an HTTP session to the scheme's base URL.

        Args:
            database_api: Root URL of the BIGSdb REST API.
            database_name: Name of the database that holds the scheme.
            schema_id: Identifier of the scheme inside that database.
        """
        super().__init__()
        self._database_name = database_name
        self._schema_id = schema_id
        self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))

    async def __aenter__(self):
        """Return the profiler itself; the session already exists."""
        return self

    async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
        """Yield the alleles the ``sequence`` endpoint reports for a sequence.

        Args:
            sequence_string: Sequence submitted to the scheme.
            exact: When true, the request disables ``partial_matches`` and a
                response holding only partial matches is treated as an error.

        Yields:
            One ``Allele`` per reported match. Exact matches carry no partial
            match profile; partial matches carry the identity, mismatch,
            bitscore and gap figures from the response.

        Raises:
            NoBIGSdbExactMatchesException: ``exact`` was requested but only
                partial matches were returned.
            NoBIGSdbMatchesException: The response contained neither exact
                nor partial matches.
        """
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        request_json = {
            "sequence": sequence_string,
            "partial_matches": not exact
        }
        # Use the response as a context manager (as fetch_mlst_st does) so it
        # is released even when decoding the body raises.
        async with self._http_client.post("sequence", json=request_json) as response:
            sequence_response: dict = await response.json()

        if "exact_matches" in sequence_response:
            # locus -> list of alleles, each carrying an "allele_id"
            exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
            for allele_loci, alleles in exact_matches.items():
                for allele in alleles:
                    allele_id = allele["allele_id"]
                    yield Allele(allele_loci=allele_loci, allele_variant=allele_id, partial_match_profile=None)
        elif "partial_matches" in sequence_response:
            if exact:
                raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
            partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
            for allele_loci, partial_match in partial_matches.items():
                if not partial_match:
                    # Nothing was reported for this locus.
                    continue
                partial_match_profile = PartialAllelicMatchProfile(
                    percent_identity=float(partial_match["identity"]),
                    mismatches=int(partial_match["mismatches"]),
                    bitscore=float(partial_match["bitscore"]),
                    gaps=int(partial_match["gaps"])
                )
                yield Allele(
                    allele_loci=allele_loci,
                    allele_variant=str(partial_match["allele"]),
                    partial_match_profile=partial_match_profile
                )
        else:
            raise NoBIGSdbMatchesException(self._database_name, self._schema_id)

    async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
        """Resolve a set of alleles to an MLST profile via ``designations``.

        Args:
            alleles: Alleles to submit, as a synchronous or asynchronous
                iterable.

        Returns:
            An ``MLSTProfile`` built from the exact matches in the response.
            The sequence type and clonal complex fall back to ``"unknown"``
            when the response does not provide them.

        Raises:
            ValueError: The response listed no exact matches.
        """
        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
        if isinstance(alleles, AsyncIterable):
            async for allele in alleles:
                allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
        else:
            for allele in alleles:
                allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})

        request_json = {
            "designations": allele_request_dict
        }
        async with self._http_client.post("designations", json=request_json) as response:
            response_json: dict = await response.json()

        # Missing scheme fields are reported as "unknown" rather than failing.
        schema_fields_returned: dict[str, str] = response_json.setdefault("fields", dict())
        schema_fields_returned.setdefault("ST", "unknown")
        schema_fields_returned.setdefault("clonal_complex", "unknown")

        allele_map: dict[str, list[Allele]] = defaultdict(list)
        schema_exact_matches: dict = response_json["exact_matches"]
        for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
            for exact_match_allele in exact_match_alleles:
                allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"], None))
        if len(allele_map) == 0:
            raise ValueError("Passed in no alleles.")
        return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])

    async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
        """Profile one sequence: find its alleles, then resolve them to a profile."""
        alleles = self.fetch_mlst_allele_variants(string, exact)
        return await self.fetch_mlst_st(alleles)

    async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
        """Profile each named sequence, yielding ``(name, profile)`` pairs.

        Args:
            namedStrings: Asynchronous iterable of named sequences.
            exact: Forwarded to :meth:`profile_string`.
            stop_on_fail: When true, a ``NoBIGSdbMatchesException`` is
                re-raised instead of being reported as a ``None`` profile.

        Yields:
            The sequence name paired with its profile, or with ``None`` when
            no match was found and ``stop_on_fail`` is false.
        """
        async for named_string in namedStrings:
            # Only the profiling call is guarded; the yield stays outside the
            # try so exceptions raised by the consumer are never swallowed.
            try:
                profile = await self.profile_string(named_string.sequence, exact)
            except NoBIGSdbMatchesException:
                if stop_on_fail:
                    raise
                profile = None
            yield (named_string.name, profile)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Close the session when leaving the ``async with`` block."""
        await self.close()
|
||||
|
||||
class BIGSdbIndex(AbstractAsyncContextManager):
    """Lookup of sequence-definition databases across the known BIGSdb APIs.

    Results of both the database listing and the per-database scheme listing
    are cached on the instance; pass ``force=True`` to refresh them.
    """

    KNOWN_BIGSDB_APIS = {
        "https://bigsdb.pasteur.fr/api",
        "https://rest.pubmlst.org"
    }

    def __init__(self):
        """Open an HTTP session and start with empty caches."""
        super().__init__()
        self._http_client = ClientSession()
        self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
        self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()

    async def __aenter__(self):
        """Return the index itself."""
        return self

    async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
        """Map every database whose name ends in ``seqdef`` to the API listing it.

        Args:
            force: Query the APIs again even when a cached mapping exists.
        """
        cached = self._known_seqdef_dbs_origin
        if not force and cached is not None:
            return cached

        origins = dict()
        for api_root in BIGSdbIndex.KNOWN_BIGSDB_APIS:
            async with self._http_client.get(f"{api_root}/db") as response:
                database_groups = await response.json()
            for group in database_groups:
                for info in group["databases"]:
                    db_name = info["name"]
                    if not str(db_name).endswith("seqdef"):
                        continue
                    origins[db_name] = api_root
        self._known_seqdef_dbs_origin = dict(origins)
        return self._known_seqdef_dbs_origin

    async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
        """Return the API root that lists ``seqdef_db_name``.

        Raises:
            NoSuchBIGSdbDatabaseException: The database is not listed by any
                known API.
        """
        origins = await self.get_known_seqdef_dbs()
        if seqdef_db_name in origins:
            return origins[seqdef_db_name]
        raise NoSuchBIGSdbDatabaseException(seqdef_db_name)

    async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
        """Map scheme descriptions to scheme ids for one database.

        The id is taken from the last path segment of each scheme's URL.

        Args:
            seqdef_db_name: Database whose schemes are requested.
            force: Query the API again even when a cached mapping exists.
        """
        if not force and seqdef_db_name in self._seqdefdb_schemas:
            return self._seqdefdb_schemas[seqdef_db_name]  # type: ignore since it's guaranteed to not be none by conditional

        api_root = await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)
        async with self._http_client.get(f"{api_root}/db/{seqdef_db_name}/schemes") as response:
            response_json = await response.json()

        schema_descriptions: dict[str, int] = dict()
        for scheme_definition in response_json["schemes"]:
            scheme_url = str(scheme_definition["scheme"])
            scheme_id = int(scheme_url.split("/")[-1])
            schema_descriptions[scheme_definition["description"]] = scheme_id
        self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
        return self._seqdefdb_schemas[seqdef_db_name]  # type: ignore

    async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
        """Create a profiler for ``schema_id`` on the API hosting the database."""
        api_root = await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name)
        return BIGSdbMLSTProfiler(api_root, dbseqdef_name, schema_id)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Close the session when leaving the ``async with`` block."""
        await self.close()
|
||||
|
@@ -1,21 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Mapping, Sequence, Union
|
||||
|
||||
@dataclass(frozen=True)
class PartialAllelicMatchProfile:
    """Immutable similarity figures describing a non-exact allele match."""

    # Identity of the match, stored as a float.
    percent_identity: float
    # Number of mismatching positions.
    mismatches: int
    # Bitscore reported for the match.
    bitscore: float
    # Number of gaps in the match.
    gaps: int
|
||||
|
||||
@dataclass(frozen=True)
class Allele:
    """Immutable record of one allele call at one locus."""

    # Name of the locus the allele belongs to.
    allele_loci: str
    # Identifier of the allele variant at that locus.
    allele_variant: str
    # Similarity figures for a partial match, or None when there are none.
    partial_match_profile: Union[PartialAllelicMatchProfile, None]
|
||||
|
||||
@dataclass(frozen=True)
class MLSTProfile:
    """Immutable MLST result: allele calls plus the scheme-level designation."""

    # Locus name -> allele calls recorded for that locus.
    alleles: Mapping[str, Sequence[Allele]]
    # Sequence type designation, kept as a string.
    sequence_type: str
    # Clonal complex designation, kept as a string.
    clonal_complex: str
|
42
tests/autobigs/engine/analysis/test_aligners.py
Normal file
42
tests/autobigs/engine/analysis/test_aligners.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from Bio import SeqIO
|
||||
from Bio.Align import PairwiseAligner
|
||||
from pytest import mark
|
||||
from pytest import fixture
|
||||
from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
|
||||
from autobigs.engine.structures.alignment import PairwiseAlignment
|
||||
|
||||
@fixture
def tohamaI_bpertussis_adk():
    """Sequence text of the single record in tohama_I_bpertussis_adk.fasta."""
    record = SeqIO.read("tests/resources/tohama_I_bpertussis_adk.fasta", format="fasta")
    return str(record.seq)
|
||||
|
||||
@fixture
def tohamaI_bpertussis_genome():
    """Sequence text of the single record in tohama_I_bpertussis.fasta."""
    record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", format="fasta")
    return str(record.seq)
|
||||
|
||||
@fixture
def fdaargos_1560_hinfluenza_adk():
    """Sequence text of the single record in fdaargos_1560_hinfluenza_adk.fasta."""
    record = SeqIO.read("tests/resources/fdaargos_1560_hinfluenza_adk.fasta", format="fasta")
    return str(record.seq)
|
||||
|
||||
@fixture
def fdaargos_1560_hinfluenza_genome():
    """Sequence text of the single record in fdaargos_1560_hinfluenza.fasta."""
    record = SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", format="fasta")
    return str(record.seq)
|
||||
|
||||
|
||||
@fixture(params=[1, 2])
def dummy_engine(request):
    """Yield an alignment engine built around a local-mode "blastn" aligner.

    The fixture parameter (1 or 2) is passed as the engine's second
    constructor argument -- presumably a worker count; confirm against the
    engine's signature.
    """
    local_blastn_aligner = PairwiseAligner("blastn")
    local_blastn_aligner.mode = "local"
    engine_context = AsyncBiopythonPairwiseAlignmentEngine(local_blastn_aligner, request.param)
    with engine_context as engine:
        yield engine
|
||||
|
||||
class TestAsyncPairwiseAlignmentEngine:
    """Smoke tests: submitted alignments come back as PairwiseAlignment objects."""

    async def test_single_alignment_no_errors_single_alignment(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk: str, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine):
        """One submitted alignment yields only PairwiseAlignment results."""
        dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk)
        async for result in dummy_engine:
            alignment, _additional_information = result
            assert isinstance(alignment, PairwiseAlignment)

    async def test_single_alignment_no_errors_multiple(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk, fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine):
        """Two submitted alignments yield only PairwiseAlignment results."""
        submissions = (
            (tohamaI_bpertussis_genome, tohamaI_bpertussis_adk),
            (fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk),
        )
        for reference, query in submissions:
            dummy_engine.align(reference, query)
        async for result in dummy_engine:
            alignment, _additional_information = result
            assert isinstance(alignment, PairwiseAlignment)
|
215
tests/autobigs/engine/analysis/test_bigsdb.py
Normal file
215
tests/autobigs/engine/analysis/test_bigsdb.py
Normal file
@@ -0,0 +1,215 @@
|
||||
from os import path
|
||||
import random
|
||||
import re
|
||||
from typing import Callable, Collection, Sequence, Union
|
||||
from Bio import SeqIO
|
||||
import pytest
|
||||
from autobigs.engine.analysis import bigsdb
|
||||
from autobigs.engine.structures import mlst
|
||||
from autobigs.engine.structures.genomics import NamedString
|
||||
from autobigs.engine.structures.mlst import Allele, MLSTProfile
|
||||
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
|
||||
from autobigs.engine.analysis.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler, LocalBIGSdbMLSTProfiler, RemoteBIGSdbMLSTProfiler
|
||||
|
||||
async def generate_async_iterable(normal_iterable):
    """Expose a regular iterable as an async generator, item by item."""
    for item in normal_iterable:
        yield item
|
||||
|
||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ("A", "T", "C", "G")):
    """Return a deterministically mutated copy of ``gene``.

    The random generator is seeded with ``gene`` itself, so the same inputs
    always produce the same output.

    Args:
        gene: Sequence to mutate.
        mutation_site_count: Number of positions to overwrite. A float is
            read as a fraction of ``len(gene)``.
        alphabet: Symbols a mutated position may receive. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        A string as long as ``gene``. Positions are drawn with replacement
        and a drawn symbol may equal the original, so fewer than
        ``mutation_site_count`` characters may actually differ.
    """
    rand = random.Random(gene)
    if isinstance(mutation_site_count, float):
        mutation_site_count = int(mutation_site_count * len(gene))
    random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
    scrambled = list(gene)
    for random_location in random_locations:
        scrambled[random_location] = rand.choice(alphabet)
    return "".join(scrambled)
|
||||
|
||||
def get_first_sequence_from_fasta(resource: str):
    """Read the single record of a FASTA file under tests/resources/ as a string."""
    fasta_path = path.join("tests/resources/", resource)
    record = SeqIO.read(fasta_path, "fasta")
    return str(record.seq)
|
||||
|
||||
def get_multiple_sequences_from_fasta(resource: str):
    """Parse every record of a FASTA file under tests/resources/ into a tuple."""
    fasta_path = path.join("tests/resources/", resource)
    records = SeqIO.parse(fasta_path, "fasta")
    return tuple(records)
|
||||
|
||||
# Profile the tests below expect for the Tohama I B. pertussis sequence.
bpertussis_tohamaI_profile = MLSTProfile(
    tuple(Allele(locus, variant, None) for locus, variant in (
        ("adk", "1"),
        ("fumC", "1"),
        ("glyA", "1"),
        ("tyrB", "1"),
        ("icd", "1"),
        ("pepA", "1"),
        ("pgm", "1"),
    )),
    "1",
    "ST-2 complex",
)

# Allele combination the tests expect to resolve to an "unknown" designation.
bpertussis_tohamaI_bad_profile = MLSTProfile(
    tuple(Allele(locus, variant, None) for locus, variant in (
        ("adk", "1"),
        ("fumC", "2"),
        ("glyA", "36"),
        ("tyrB", "4"),
        ("icd", "4"),
        ("pepA", "1"),
        ("pgm", "5"),
    )),
    "unknown",
    "unknown",
)

hinfluenzae_fdaargos_profile = MLSTProfile(
    tuple(Allele(locus, variant, None) for locus, variant in (
        ("adk", "1"),
        ("atpG", "1"),
        ("frdB", "1"),
        ("fucK", "1"),
        ("mdh", "1"),
        ("pgi", "1"),
        ("recA", "5"),
    )),
    "3",
    "ST-3 complex",
)

# NOTE(review): identical to hinfluenzae_fdaargos_profile -- confirm whether a
# genuinely "bad" allele combination was intended here.
hinfluenzae_fdaargos_bad_profile = MLSTProfile(
    tuple(Allele(locus, variant, None) for locus, variant in (
        ("adk", "1"),
        ("atpG", "1"),
        ("frdB", "1"),
        ("fucK", "1"),
        ("mdh", "1"),
        ("pgi", "1"),
        ("recA", "5"),
    )),
    "3",
    "ST-3 complex",
)

hinfluenzae_fdaargos_sequence = str(
    SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", "fasta").seq
)

# NOTE(review): the name says H. influenzae but the file read is the
# B. pertussis features FASTA -- confirm which resource was intended.
hinfluenzae_fdaargos_fragmented_sequence = tuple(
    SeqIO.parse("tests/resources/tohama_I_bpertussis_features.fasta", "fasta")
)
|
||||
|
||||
@pytest.mark.parametrize(
    "local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile",
    [
        (
            False,
            "https://bigsdb.pasteur.fr/api",
            "pubmlst_bordetella_seqdef",
            3,
            "tohama_I_bpertussis.fasta",
            "tohama_I_bpertussis_features.fasta",
            bpertussis_tohamaI_profile,
            bpertussis_tohamaI_bad_profile,
        ),
        (
            True,
            "https://bigsdb.pasteur.fr/api",
            "pubmlst_bordetella_seqdef",
            3,
            "tohama_I_bpertussis.fasta",
            "tohama_I_bpertussis_features.fasta",
            bpertussis_tohamaI_profile,
            bpertussis_tohamaI_bad_profile,
        ),
    ],
)
class TestBIGSdbMLSTProfiler:
    """Profiler tests, run once per parametrized ``local_db`` setting."""

    async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """Every expected locus is reported exactly once with the expected variant."""
        sequence = get_first_sequence_from_fasta(seq_path)
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            expected_alleles = mlst.alleles_to_mapping(expected_profile.alleles)
            targets_left = set(expected_alleles.keys())
            found_alleles = dummy_profiler.determine_mlst_allele_variants(query_sequence_strings=[sequence])
            async for exact_match in found_alleles:
                assert isinstance(exact_match, Allele)
                locus = exact_match.allele_locus
                assert locus in expected_alleles
                assert exact_match.allele_variant == expected_alleles[locus]
                targets_left.remove(locus)
            assert not targets_left

    async def test_sequence_profiling_non_exact_returns_non_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """Scrambled target genes are reported as partial matches."""
        target_sequences = get_multiple_sequences_from_fasta(feature_seqs_path)
        mlst_targets = {locus.lower() for locus in mlst.alleles_to_mapping(expected_profile.alleles).keys()}
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as profiler:
            for target_sequence in target_sequences:
                gene_tag = re.fullmatch(r".*\[gene=([\w\d]+)\].*", target_sequence.description)
                if gene_tag is None:
                    continue
                gene = gene_tag.group(1).lower()
                if gene not in mlst_targets:
                    continue
                scrambled = gene_scrambler(str(target_sequence.seq), 0.125)
                async for partial_match in profiler.determine_mlst_allele_variants([scrambled]):
                    assert partial_match.partial_match_profile is not None
                    mlst_targets.remove(gene)
            assert not mlst_targets

    async def test_profiling_results_in_correct_mlst_st(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """The expected alleles resolve to the expected ST and clonal complex."""
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            mlst_st_data = await dummy_profiler.determine_mlst_st(expected_profile.alleles)
            assert mlst_st_data is not None
            assert isinstance(mlst_st_data, MLSTProfile)
            assert mlst_st_data.clonal_complex == expected_profile.clonal_complex
            assert mlst_st_data.sequence_type == expected_profile.sequence_type

    async def test_profiling_non_exact_results_in_list_of_mlsts(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """The bad allele combination resolves to an "unknown" designation."""
        dummy_alleles = bad_profile.alleles
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            mlst_profile = await dummy_profiler.determine_mlst_st(dummy_alleles)
            assert mlst_profile.clonal_complex == "unknown"
            assert mlst_profile.sequence_type == "unknown"

    async def test_bigsdb_profile_multiple_strings_same_string_twice(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """Profiling the same sequence under two names gives the expected profile both times."""
        sequence = get_first_sequence_from_fasta(seq_path)
        dummy_sequences = [[NamedString(label, sequence)] for label in ("seq1", "seq2")]
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            named_profiles = dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences))
            async for named_profile in named_profiles:
                name = named_profile.name
                profile = named_profile.mlst_profile
                assert profile is not None
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == expected_profile.clonal_complex
                assert profile.sequence_type == expected_profile.sequence_type

    async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """With the second positional flag True, a scrambled entry still yields an "unknown" profile."""
        valid_seq = get_first_sequence_from_fasta(seq_path)
        scrambled_seq = gene_scrambler(valid_seq, 0.3)
        dummy_sequences = [
            [NamedString("seq1", valid_seq)],
            [NamedString("should_fail", scrambled_seq)],
            [NamedString("seq3", valid_seq)],
        ]
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), True):
                name = name_profile.name
                profile = name_profile.mlst_profile
                assert profile is not None
                if name != "should_fail":
                    assert isinstance(profile, MLSTProfile)
                    assert profile.clonal_complex == expected_profile.clonal_complex
                    assert profile.sequence_type == expected_profile.sequence_type
                else:
                    assert profile.clonal_complex == "unknown"
                    assert profile.sequence_type == "unknown"
                    assert len(profile.alleles) > 0

    async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """With the second positional flag False, a scrambled entry still yields an "unknown" profile."""
        valid_seq = get_first_sequence_from_fasta(seq_path)
        scrambled_seq = gene_scrambler(valid_seq, 0.3)
        dummy_sequences = [
            [NamedString("seq1", valid_seq)],
            [NamedString("should_fail", scrambled_seq)],
            [NamedString("seq3", valid_seq)],
        ]
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), False):
                name = named_profile.name
                profile = named_profile.mlst_profile
                assert profile is not None
                if name != "should_fail":
                    assert isinstance(profile, MLSTProfile)
                    assert profile.clonal_complex == expected_profile.clonal_complex
                    assert profile.sequence_type == expected_profile.sequence_type
                else:
                    assert profile.clonal_complex == "unknown"
                    assert profile.sequence_type == "unknown"
                    assert len(profile.alleles) > 0
|
||||
|
||||
class TestBIGSdbIndex:
    """Tests for database and scheme discovery through BIGSdbIndex."""

    async def test_bigsdb_index_all_databases_is_not_empty(self):
        """At least one sequence-definition database is discovered."""
        async with BIGSdbIndex() as bigsdb_index:
            known_databases = await bigsdb_index.get_known_seqdef_dbs()
            assert len(known_databases) > 0

    async def test_bigsdb_index_references_pubmlst_correctly(self):
        """The H. influenzae database is attributed to the PubMLST API."""
        async with BIGSdbIndex() as bigsdb_index:
            api_root = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")
            assert api_root == "https://rest.pubmlst.org"

    async def test_bigsdb_index_references_institutpasteur_correctly(self):
        """The Bordetella database is attributed to the Institut Pasteur API."""
        async with BIGSdbIndex() as bigsdb_index:
            api_root = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")
            assert api_root == "https://bigsdb.pasteur.fr/api"

    async def test_bigsdb_index_get_schemas_for_bordetella(self):
        """The Bordetella database lists an "MLST" scheme with an integer id."""
        async with BIGSdbIndex() as index:
            schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
            assert len(schemas.keys()) > 0
            assert "MLST" in schemas
            mlst_scheme_id = schemas["MLST"]
            assert isinstance(mlst_scheme_id, int)

    async def test_bigsdb_index_get_databases_has_only_seqdef(self):
        """Every discovered database name ends in "seqdef"."""
        async with BIGSdbIndex() as index:
            databases = await index.get_known_seqdef_dbs()
            assert len(databases.keys()) > 0
            for database_name in databases.keys():
                assert database_name.endswith("seqdef")
            assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"

    @pytest.mark.parametrize("local", [True, False])
    async def test_bigsdb_index_instantiates_correct_profiler(self, local):
        """A profiler built by the index profiles Tohama I as ST 1 / ST-2 complex."""
        genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
        sequence = str(genome_record.seq)
        async with BIGSdbIndex() as bigsdb_index:
            built_profiler = await bigsdb_index.build_profiler_from_seqdefdb(local, "pubmlst_bordetella_seqdef", 3)
            async with built_profiler as profiler:
                assert isinstance(profiler, BIGSdbMLSTProfiler)
                profile = await profiler.profile_string(sequence)
                assert profile.clonal_complex == "ST-2 complex"
                assert profile.sequence_type == "1"
|
@@ -1,4 +1,4 @@
|
||||
from autobigsst.engine.data.local.fasta import read_fasta
|
||||
from autobigs.engine.reading import read_fasta
|
||||
|
||||
|
||||
async def test_fasta_reader_not_none():
|
@@ -1,21 +0,0 @@
|
||||
from autobigsst.engine.data.local.csv import dict_loci_alleles_variants_from_loci
|
||||
from autobigsst.engine.data.structures.mlst import Allele
|
||||
|
||||
|
||||
def test_dict_loci_alleles_variants_from_loci_single_loci_not_list():
    """A locus with one allele is flattened to that allele's variant string."""
    alleles_map = {"adk": [Allele("adk", "1", None)]}
    results = dict_loci_alleles_variants_from_loci(alleles_map)
    for _loci, variant in results.items():
        assert isinstance(variant, str)
        assert variant == "1"
|
||||
|
||||
def test_dict_loci_alleles_variants_from_loci_multi_loci_is_list():
    """A locus with two alleles is reported as a list of length two."""
    adk_alleles = [Allele("adk", variant, None) for variant in ("1", "2")]
    alleles_map = {"adk": adk_alleles}
    results = dict_loci_alleles_variants_from_loci(alleles_map)
    for _loci, variant in results.items():
        assert isinstance(variant, list)
        assert len(variant) == 2
|
@@ -1,244 +0,0 @@
|
||||
import random
|
||||
import re
|
||||
from typing import Collection, Sequence, Union
|
||||
from Bio import SeqIO
|
||||
import pytest
|
||||
from autobigsst.engine.data.structures.genomics import NamedString
|
||||
from autobigsst.engine.data.structures.mlst import Allele, MLSTProfile
|
||||
from autobigsst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
|
||||
from autobigsst.engine.data.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
|
||||
|
||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ("A", "T", "C", "G")):
    """Return a deterministically mutated copy of ``gene``.

    The random generator is seeded with ``gene`` itself, so the same inputs
    always produce the same output.

    Args:
        gene: Sequence to mutate.
        mutation_site_count: Number of positions to overwrite. A float is
            read as a fraction of ``len(gene)``.
        alphabet: Symbols a mutated position may receive. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        A string as long as ``gene``. Positions are drawn with replacement
        and a drawn symbol may equal the original, so fewer than
        ``mutation_site_count`` characters may actually differ.
    """
    rand = random.Random(gene)
    if isinstance(mutation_site_count, float):
        mutation_site_count = int(mutation_site_count * len(gene))
    random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
    scrambled = list(gene)
    for random_location in random_locations:
        scrambled[random_location] = rand.choice(alphabet)
    return "".join(scrambled)
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
    """Exact profiling of Tohama I reports each scheme locus once, all as variant 1."""
    genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    sequence = str(genome_record.seq)
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
        exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True)
        async for exact_match in exact_matches:
            assert isinstance(exact_match, Allele)
            # Every Tohama I allele in this scheme has allele id 1.
            assert exact_match.allele_variant == "1"
            targets_left.remove(exact_match.allele_loci)
        assert not targets_left
|
||||
|
||||
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
    """Scrambled coding sequences of the scheme loci come back as partial matches."""
    sequences = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta"))
    mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
        for sequence in sequences:
            gene_tag = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description)
            if gene_tag is None:
                continue
            gene = gene_tag.group(1).lower()
            if gene not in mlst_targets:
                continue
            scrambled = gene_scrambler(str(sequence.seq), 0.125)
            async for partial_match in profiler.fetch_mlst_allele_variants(scrambled, False):
                assert partial_match.partial_match_profile is not None
                mlst_targets.remove(gene)
        assert not mlst_targets
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
    """Seven variant-1 alleles, fed asynchronously, resolve to ST 1 / ST-2 complex."""
    loci = ("adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm")

    async def dummy_allele_generator():
        for locus in loci:
            yield Allele(locus, "1", None)

    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
        assert mlst_st_data is not None
        assert isinstance(mlst_st_data, MLSTProfile)
        assert mlst_st_data.clonal_complex == "ST-2 complex"
        assert mlst_st_data.sequence_type == "1"
|
||||
|
||||
async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
    """An allele combination without a designation resolves to "unknown"."""
    locus_variants = (
        ("adk", "1"),
        ("fumC", "2"),
        ("glyA", "36"),
        ("tyrB", "4"),
        ("icd", "4"),
        ("pepA", "1"),
        ("pgm", "5"),
    )
    dummy_alleles = [Allele(locus, variant, None) for locus, variant in locus_variants]
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        mlst_profile = await dummy_profiler.fetch_mlst_st(dummy_alleles)
        assert mlst_profile.clonal_complex == "unknown"
        assert mlst_profile.sequence_type == "unknown"
|
||||
|
||||
|
||||
async def test_institutpasteur_sequence_profiling_is_correct():
    """Profiling the Tohama I genome end to end gives ST 1 / ST-2 complex."""
    genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    sequence = str(genome_record.seq)
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        profile = await dummy_profiler.profile_string(sequence)
        assert profile is not None
        assert isinstance(profile, MLSTProfile)
        assert profile.clonal_complex == "ST-2 complex"
        assert profile.sequence_type == "1"
|
||||
|
||||
|
||||
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
    """Exact profiling of FDAARGOS_1560 reports exactly the expected alleles."""
    expected_variants = (
        ("adk", "1"),
        ("atpG", "1"),
        ("frdB", "1"),
        ("fucK", "1"),
        ("mdh", "1"),
        ("pgi", "1"),
        ("recA", "5"),
    )
    dummy_alleles = {Allele(locus, variant, None) for locus, variant in expected_variants}
    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True):
            assert isinstance(exact_match, Allele)
            dummy_alleles.remove(exact_match)
        assert not dummy_alleles
|
||||
|
||||
async def test_pubmlst_profiling_results_in_correct_st():
    """The FDAARGOS_1560 alleles, fed asynchronously, resolve to ST 3 / ST-3 complex."""
    locus_variants = (
        ("adk", "1"),
        ("atpG", "1"),
        ("frdB", "1"),
        ("fucK", "1"),
        ("mdh", "1"),
        ("pgi", "1"),
        ("recA", "5"),
    )

    async def generate_dummy_targets():
        for locus, variant in locus_variants:
            yield Allele(locus, variant, None)

    async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
        assert mlst_st_data is not None
        assert isinstance(mlst_st_data, MLSTProfile)
        assert mlst_st_data.clonal_complex == "ST-3 complex"
        assert mlst_st_data.sequence_type == "3"
|
||||
|
||||
async def test_pubmlst_sequence_profiling_is_correct():
    """Profiling the FDAARGOS_1560 genome end to end gives ST 3 / ST-3 complex."""
    genome_record = SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta")
    sequence = str(genome_record.seq)
    async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        profile = await dummy_profiler.profile_string(sequence)
        assert profile is not None
        assert isinstance(profile, MLSTProfile)
        assert profile.clonal_complex == "ST-3 complex"
        assert profile.sequence_type == "3"
|
||||
|
||||
async def test_bigsdb_index_all_databases_is_not_empty():
    """At least one sequence-definition database is discovered."""
    async with BIGSdbIndex() as bigsdb_index:
        known_databases = await bigsdb_index.get_known_seqdef_dbs()
        assert len(known_databases) > 0
|
||||
|
||||
async def test_bigsdb_index_references_pubmlst_correctly():
    """The H. influenzae database is attributed to the PubMLST API."""
    async with BIGSdbIndex() as bigsdb_index:
        api_root = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")
        assert api_root == "https://rest.pubmlst.org"
|
||||
|
||||
async def test_bigsdb_index_references_institutpasteur_correctly():
    """The Bordetella database is attributed to the Institut Pasteur API."""
    async with BIGSdbIndex() as bigsdb_index:
        api_root = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")
        assert api_root == "https://bigsdb.pasteur.fr/api"
|
||||
|
||||
|
||||
async def test_bigsdb_index_instantiates_correct_profiler():
    """A profiler built by the index profiles Tohama I as ST 1 / ST-2 complex."""
    genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    sequence = str(genome_record.seq)
    async with BIGSdbIndex() as bigsdb_index:
        built_profiler = await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3)
        async with built_profiler as profiler:
            profile = await profiler.profile_string(sequence)
            assert profile.clonal_complex == "ST-2 complex"
            assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_same_string_twice():
    """Profiling the same sequence under two names must type both correctly.

    Two ``NamedString`` entries carrying the Tohama I sequence are fed to
    ``profile_multiple_strings`` through an async generator. Every yielded
    profile is expected to be sequence type "1" in "ST-2 complex".

    The number of yielded results is also checked. Without that, the test
    would pass even if the profiler yielded nothing, because every other
    assertion lives inside the ``async for`` body.
    """
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    dummy_sequences = [NamedString("seq1", sequence), NamedString("seq2", sequence)]

    async def generate_async_iterable_sequences():
        for dummy_sequence in dummy_sequences:
            yield dummy_sequence

    results_seen = 0
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences()):
            results_seen += 1
            assert profile is not None
            assert isinstance(profile, MLSTProfile)
            assert profile.clonal_complex == "ST-2 complex"
            assert profile.sequence_type == "1"
    # Guard against a vacuous pass: one result is expected per input sequence.
    assert results_seen == len(dummy_sequences)
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop():
    """A failing middle sequence must not stop the remaining ones being typed.

    Three sequences are profiled with the second positional argument of
    ``profile_multiple_strings`` set to ``True``. The middle one is the valid
    sequence passed through ``gene_scrambler(valid_seq, 0.3)`` and is expected
    to come back with a ``None`` profile. The other two are expected to be
    sequence type "1" in "ST-2 complex".

    The names seen are recorded and checked afterwards. Otherwise the test
    would pass if the profiler yielded nothing, or if the "should_fail" entry
    were silently dropped and its branch never ran.
    """
    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]

    async def generate_async_iterable_sequences():
        for dummy_sequence in dummy_sequences:
            yield dummy_sequence

    names_seen = []
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), True):
            names_seen.append(name)
            if name == "should_fail":
                assert profile is None
            else:
                assert profile is not None
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == "ST-2 complex"
                assert profile.sequence_type == "1"
    # Guard against a vacuous pass: every input, including the failing one,
    # must have produced a result.
    assert "should_fail" in names_seen
    assert len(names_seen) == len(dummy_sequences)
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop():
    """A scrambled middle sequence must still yield a partial profile.

    Three sequences are profiled with the second positional argument of
    ``profile_multiple_strings`` set to ``False``. The middle one is the valid
    sequence passed through ``gene_scrambler(valid_seq, 0.3)``. It is expected
    to produce a profile whose clonal complex and sequence type are both
    "unknown" but which still carries at least one allele. The other two are
    expected to be sequence type "1" in "ST-2 complex".

    The names seen are recorded and checked afterwards. Otherwise the test
    would pass if the profiler yielded nothing, or if the "should_fail" entry
    were silently dropped and its branch never ran.
    """
    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]

    async def generate_async_iterable_sequences():
        for dummy_sequence in dummy_sequences:
            yield dummy_sequence

    names_seen = []
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), False):
            names_seen.append(name)
            if name == "should_fail":
                assert profile is not None
                assert profile.clonal_complex == "unknown"
                assert profile.sequence_type == "unknown"
                assert len(profile.alleles) > 0
            else:
                assert profile is not None
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == "ST-2 complex"
                assert profile.sequence_type == "1"
    # Guard against a vacuous pass: every input, including the scrambled one,
    # must have produced a result.
    assert "should_fail" in names_seen
    assert len(names_seen) == len(dummy_sequences)
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_fail_second_stop():
    """With ``stop_on_fail=True`` an unmatched sequence must raise.

    The middle input is the FDAARGOS_1560 sequence, profiled against the
    Bordetella database (schema 3) with ``exact=True``. The test expects
    ``NoBIGSdbMatchesException`` to propagate out of the iteration. Any
    result yielded before that must be sequence type "1" in "ST-2 complex".
    """
    bordetella_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    valid_seq = str(bordetella_record.seq)
    mismatched_record = SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta")
    invalid_seq = str(mismatched_record.seq)
    dummy_sequences = [
        NamedString("seq1", valid_seq),
        NamedString("should_fail", invalid_seq),
        NamedString("seq3", valid_seq),
    ]

    async def generate_async_iterable_sequences():
        for queued_sequence in dummy_sequences:
            yield queued_sequence

    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as dummy_profiler:
        with pytest.raises(NoBIGSdbMatchesException):
            results = dummy_profiler.profile_multiple_strings(
                generate_async_iterable_sequences(), exact=True, stop_on_fail=True
            )
            async for name, profile in results:
                if name == "should_fail":
                    # pytest.fail raises, so nothing below runs for this entry.
                    pytest.fail("Exception should have been thrown, no exception was thrown.")
                assert profile is not None
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == "ST-2 complex"
                assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_index_get_schemas_for_bordetella():
    """The Bordetella seqdef database must list an "MLST" schema with an int id."""
    async with BIGSdbIndex() as index:
        schemas = await index.get_schemas_for_seqdefdb(
            seqdef_db_name="pubmlst_bordetella_seqdef"
        )
        schema_names = schemas.keys()
        assert len(schema_names) > 0
        assert "MLST" in schemas
        mlst_schema_id = schemas["MLST"]
        assert isinstance(mlst_schema_id, int)
|
||||
|
||||
async def test_bigsdb_index_get_databases_has_only_seqdef():
    """Every known database name must end in "seqdef".

    Also checks that ``pubmlst_bordetella_seqdef`` maps to the Institut
    Pasteur API URL.
    """
    async with BIGSdbIndex() as index:
        databases = await index.get_known_seqdef_dbs()
        known_names = databases.keys()
        assert len(known_names) > 0
        for database_name in known_names:
            assert database_name.endswith("seqdef")
        bordetella_api = databases["pubmlst_bordetella_seqdef"]
        assert bordetella_api == "https://bigsdb.pasteur.fr/api"
|
11
tests/resources/fdaargos_1560_hinfluenza_adk.fasta
Normal file
11
tests/resources/fdaargos_1560_hinfluenza_adk.fasta
Normal file
@@ -0,0 +1,11 @@
|
||||
>lcl|CP085952.1_gene_371 [gene=adk] [locus_tag=LK401_01855] [location=complement(365128..365772)] [gbkey=Gene]
|
||||
ATGAAAATTATTCTTTTAGGTGCACCGGGTGCAGGTAAAGGCACTCAAGCACAATTTATTATGAACAAAT
|
||||
TTGGTATCCCGCAAATTTCAACTGGTGATATGTTCCGTGCTGCAATCAAAGCGGGGACTGAACTTGGCAA
|
||||
ACAAGCTAAAGCATTAATGGATGAAGGTAAATTAGTGCCAGATGAATTAACCGTTGCCCTTGTAAAAGAT
|
||||
CGTATTGCTCAAGCTGACTGCACAAATGGTTTCTTGTTAGATGGTTTCCCTCGTACTATTCCACAAGCGG
|
||||
ATGCACTGAAAGATTCAGGTGTTAAAATTGACTTTGTTTTAGAATTTGATGTGCCAGACGAAGTGATTGT
|
||||
TGAACGTATGAGTGGCCGTCGCGTACACCAAGCGTCTGGCCGTTCTTACCACATCGTTTATAATCCACCA
|
||||
AAAGTGGAAGGTAAAGATGATGTAACAGGCGAAGATTTAATTATTCGTGCAGACGATAAACCAGAAACTG
|
||||
TATTAGATCGTTTAGCCGTATATCATAAACAAACTAGCCCATTAATTGATTATTACCAAGCAGAAGCGAA
|
||||
AGCGGGGAATACTCAATATTTCCGTTTAGACGGTACACAAAAAGTAGAAGAAGTTAGCCAAGAGTTAGAT
|
||||
AAAATCTTAGGCTAA
|
27246
tests/resources/fdaargos_1560_hinfluenza_features.fasta
Normal file
27246
tests/resources/fdaargos_1560_hinfluenza_features.fasta
Normal file
File diff suppressed because it is too large
Load Diff
11
tests/resources/tohama_I_bpertussis_adk.fasta
Normal file
11
tests/resources/tohama_I_bpertussis_adk.fasta
Normal file
@@ -0,0 +1,11 @@
|
||||
>lcl|BX640419.1_cds_CAE43044.1_2724 [gene=adK] [locus_tag=BP2769] [db_xref=GOA:P0DKX8,InterPro:IPR000850,InterPro:IPR006259,InterPro:IPR007862,InterPro:IPR027417] [protein=adenylate kinase] [protein_id=CAE43044.1] [location=164032..164688] [gbkey=CDS]
|
||||
ATGCGTCTCATTCTGCTCGGACCGCCCGGAGCCGGCAAAGGCACCCAAGCCGCCTTTCTCACCCAACACT
|
||||
ACGGCATCCCGCAGATATCCACCGGTGACATGCTGCGCGCCGCCGTCAAGGCCGGCACGCCGCTGGGCCT
|
||||
GGAAGCCAAGAAGGTCATGGACGCGGGCGGCCTGGTCTCGGACGACCTGATCATCGGCCTGGTGCGCGAT
|
||||
CGCCTGACCCAGCCCGATTGCGCCAACGGCTACCTGTTCGACGGTTTCCCGCGCACCATCCCGCAGGCCG
|
||||
ACGCGCTCAAGAGCGCCGGCATCGCGCTGGATTACGTGGTCGAGATCGAAGTGCCGGAAAGCGACATCAT
|
||||
CGAACGCATGAGCGAACGCCGCGTGCACCCGGCCAGCGGCCGCAGCTACCACGTACGCTTCAATCCGCCC
|
||||
AAGGCCGAAGGCGTGGACGACGTCACGGGCGAACCGCTGGTGCAGCGCGACGACGACCGCGAGGAAACCG
|
||||
TGCGCCATCGTCTCAACGTCTACCAGAACCAGACCCGCCCGCTGGTCGACTACTACTCGTCCTGGGCCCA
|
||||
GTCCGATGCCGCCGCGGCGCCCAAGTACCGCAAGATCTCCGGCGTCGGCTCGGTCGACGAAATCAAGAGC
|
||||
CGCCTGTCGCAGGCTCTGCAGAGCTAA
|
Reference in New Issue
Block a user