Compare commits
39 Commits
Author | SHA1 | Date | |
---|---|---|---|
2e8cdd8da9 | |||
d0318536b2 | |||
765cf9d418 | |||
348c3d00b4 | |||
1c3f7f9ed8 | |||
e4ddaf2e8c | |||
73aade2bde | |||
af8590baa7 | |||
36bca1b70d | |||
09a693b696 | |||
f76bf86ef6 | |||
a60daf3ee2 | |||
fbfd993269 | |||
ba606c35a9 | |||
4183840ba0 | |||
7fb3eab5b6 | |||
175a51f968 | |||
897f7ee922 | |||
bfc286e6b0 | |||
a88225fcff | |||
c18d817cd9 | |||
f462e6d5e0 | |||
e568e9fb2c | |||
4b9eb8674d | |||
f75707e4fe | |||
b4845fab34 | |||
fe999f1cab | |||
85946eb110 | |||
a27e09da31 | |||
ba2b688e89 | |||
49f31b7943 | |||
1c6e1cfb35 | |||
fb99526162 | |||
ff8a1aff08 | |||
341ca933a3 | |||
3e3898334f | |||
ba1f0aa318 | |||
6d0157581f | |||
4bcbfa0c6a |
@@ -1,6 +1,6 @@
|
||||
# autoBIGS.Engine
|
||||
|
||||
A python library implementing common BIGSdb MLST schemes and databases. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
|
||||
A Python library implementing access to common BIGSdb MLST schemes and databases for the purpose of automatically typing sequences. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
|
||||
|
||||
## Features
|
||||
|
||||
|
@@ -15,8 +15,9 @@ requires-python = ">=3.12"
|
||||
description = "A library to rapidly fetch MLST profiles given sequences for various diseases."
|
||||
|
||||
[project.urls]
|
||||
Repository = "https://github.com/RealYHD/autoBIGS.engine"
|
||||
Issues = "https://github.com/RealYHD/autoBIGS.engine/issues"
|
||||
Homepage = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
|
||||
Source = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
|
||||
Issues = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine/issues"
|
||||
|
||||
[tool.setuptools_scm]
|
||||
|
||||
|
202
src/autobigs/engine/analysis/bigsdb.py
Normal file
202
src/autobigs/engine/analysis/bigsdb.py
Normal file
@@ -0,0 +1,202 @@
|
||||
from abc import abstractmethod
|
||||
import asyncio
|
||||
from collections import defaultdict
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
import csv
|
||||
from os import path
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequence, Set, Union
|
||||
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
|
||||
from autobigs.engine.reading import read_fasta
|
||||
from autobigs.engine.structures.alignment import PairwiseAlignment
|
||||
from autobigs.engine.structures.genomics import NamedString
|
||||
from autobigs.engine.structures.mlst import Allele, NamedMLSTProfile, AlignmentStats, MLSTProfile
|
||||
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
|
||||
|
||||
from Bio.Align import PairwiseAligner
|
||||
|
||||
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
    """Abstract interface of an MLST profiler that works against a BIGSdb scheme.

    Subclasses supply allele lookup, sequence-type resolution and clean-up.
    Being an ``AbstractAsyncContextManager``, a profiler is meant to be used
    with ``async with``.
    """

    @abstractmethod
    def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
        """Return an async generator of the alleles found for the query sequences."""

    @abstractmethod
    async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
        """Resolve a (sync or async) iterable of alleles into an MLST profile."""

    @abstractmethod
    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
        """Produce the MLST profile for the given query sequences."""

    @abstractmethod
    def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
        """Return an async generator of named profiles for groups of named sequences."""

    @abstractmethod
    async def close(self):
        """Release any resources held by the profiler."""
|
||||
|
||||
class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
    """MLST profiler backed by the REST API of a remote BIGSdb instance.

    All requests are made relative to
    ``{database_api}/db/{database_name}/schemes/{schema_id}/``.
    """

    def __init__(self, database_api: str, database_name: str, schema_id: int):
        """Open an HTTP session bound to one scheme of one database.

        Args:
            database_api: Root URL of the BIGSdb REST API.
            database_name: Name of the database to query.
            schema_id: Id of the scheme within that database.
        """
        self._database_name = database_name
        self._schema_id = schema_id
        self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60))

    async def __aenter__(self):
        return self

    async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[str], str]) -> AsyncGenerator[Allele, Any]:
        """Yield the alleles the remote database matches to each query sequence.

        A single string is treated as a one-element collection. Exact matches
        are yielded without alignment statistics; when a response has no exact
        matches but has partial ones, each non-empty partial match is yielded
        together with its ``AlignmentStats``.

        Raises:
            NoBIGSdbMatchesException: If a response contains neither
                ``exact_matches`` nor ``partial_matches``.
        """
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        uri_path = "sequence"
        if isinstance(query_sequence_strings, str):
            query_sequence_strings = [query_sequence_strings]
        for sequence_string in query_sequence_strings:
            # Only the body read needs the connection; release it before yielding
            # so a slow consumer does not keep the response open.
            async with self._http_client.post(uri_path, json={
                "sequence": sequence_string,
                "partial_matches": True
            }) as response:
                sequence_response: dict = await response.json()

            if "exact_matches" in sequence_response:
                # loci -> list of alleles with id and loci
                exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
                for allele_loci, alleles in exact_matches.items():
                    for allele in alleles:
                        yield Allele(allele_locus=allele_loci, allele_variant=allele["allele_id"], partial_match_profile=None)
            elif "partial_matches" in sequence_response:
                partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
                for allele_loci, partial_match in partial_matches.items():
                    if len(partial_match) <= 0:
                        continue
                    # NOTE(review): int() rejects a fractional bitscore delivered as a string — confirm the API's type.
                    partial_match_profile = AlignmentStats(
                        percent_identity=float(partial_match["identity"]),
                        mismatches=int(partial_match["mismatches"]),
                        gaps=int(partial_match["gaps"]),
                        match_metric=int(partial_match["bitscore"])
                    )
                    yield Allele(
                        allele_locus=allele_loci,
                        allele_variant=str(partial_match["allele"]),
                        partial_match_profile=partial_match_profile
                    )
            else:
                raise NoBIGSdbMatchesException(self._database_name, self._schema_id)

    async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
        """Resolve alleles into an MLST profile through the ``designations`` endpoint.

        ``ST`` and ``clonal_complex`` default to ``"unknown"`` when the response
        does not provide them.

        Raises:
            ValueError: If a locus in the response does not carry exactly one
                exact match, or if the response contains no exact matches.
        """
        uri_path = "designations"
        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
        if isinstance(alleles, AsyncIterable):
            async for allele in alleles:
                allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
        else:
            for allele in alleles:
                allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
        request_json = {
            "designations": allele_request_dict
        }
        async with self._http_client.post(uri_path, json=request_json) as response:
            response_json: dict = await response.json()

        allele_set: Set[Allele] = set()
        response_json.setdefault("fields", dict())
        schema_fields_returned: dict[str, str] = response_json["fields"]
        schema_fields_returned.setdefault("ST", "unknown")
        schema_fields_returned.setdefault("clonal_complex", "unknown")
        schema_exact_matches: dict = response_json["exact_matches"]
        for exact_match_locus, exact_match_alleles in schema_exact_matches.items():
            # An empty list used to escape the "> 1" guard and crash on [0] below.
            if len(exact_match_alleles) != 1:
                raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(exact_match_alleles)})")
            allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None))
        if len(allele_set) == 0:
            raise ValueError("Passed in no alleles.")
        return MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])

    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
        """Look up the alleles of the query sequences and resolve them into a profile."""
        alleles = self.determine_mlst_allele_variants(query_sequence_strings)
        return await self.determine_mlst_st(alleles)

    async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
        """Yield one ``NamedMLSTProfile`` per named sequence in every group.

        A sequence without any match yields a profile of ``None`` unless
        ``stop_on_fail`` is set, in which case the exception propagates.

        Raises:
            NoBIGSdbMatchesException: When ``stop_on_fail`` is true and a
                sequence has no matches.
        """
        async for named_strings in query_named_string_groups:
            for named_string in named_strings:
                # Keep the yield outside the try block so an exception thrown
                # into the generator by its consumer is never mistaken for a
                # failed lookup.
                try:
                    profile = await self.profile_string([named_string.sequence])
                except NoBIGSdbMatchesException:
                    if stop_on_fail:
                        raise
                    profile = None
                yield NamedMLSTProfile(named_string.name, profile)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
|
||||
|
||||
class BIGSdbIndex(AbstractAsyncContextManager):
    """Cached lookup of seqdef databases and their schemes on the known BIGSdb servers."""

    KNOWN_BIGSDB_APIS = {
        "https://bigsdb.pasteur.fr/api",
        "https://rest.pubmlst.org"
    }

    def __init__(self):
        self._http_client = ClientSession()
        # Database name -> API root it was found on; None until first fetched.
        self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
        # Database name -> (scheme description -> scheme id), filled on demand.
        self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
        super().__init__()

    async def __aenter__(self):
        return self

    async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
        """Map every database whose name ends in ``seqdef`` to the API root serving it.

        The result is cached; pass ``force=True`` to query the servers again.
        """
        cached = self._known_seqdef_dbs_origin
        if cached is not None and not force:
            return cached
        origins = dict()
        for api_root in BIGSdbIndex.KNOWN_BIGSDB_APIS:
            async with self._http_client.get(f"{api_root}/db") as response:
                database_groups = await response.json()
                for group in database_groups:
                    for info in group["databases"]:
                        if not str(info["name"]).endswith("seqdef"):
                            continue
                        # A name offered by several servers keeps the one visited last.
                        origins[info["name"]] = api_root
        self._known_seqdef_dbs_origin = dict(origins)
        return self._known_seqdef_dbs_origin

    async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
        """Return the API root serving ``seqdef_db_name``.

        Raises:
            NoSuchBIGSdbDatabaseException: If the database is not known.
        """
        origins = await self.get_known_seqdef_dbs()
        if seqdef_db_name in origins:
            return origins[seqdef_db_name]
        raise NoSuchBIGSdbDatabaseException(seqdef_db_name)

    async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
        """Map scheme descriptions to scheme ids for one seqdef database.

        The result is cached per database; pass ``force=True`` to refetch.
        """
        if not force and seqdef_db_name in self._seqdefdb_schemas:
            return self._seqdefdb_schemas[seqdef_db_name]  # type: ignore since it's guaranteed to not be none by conditional
        api_root = await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)
        schemes_url = f"{api_root}/db/{seqdef_db_name}/schemes"
        async with self._http_client.get(schemes_url) as response:
            payload = await response.json()
            ids_by_description: Mapping[str, int] = dict()
            for definition in payload["schemes"]:
                # The id is the last path segment of the scheme URL.
                scheme_id: int = int(str(definition["scheme"]).rsplit("/", 1)[-1])
                description: str = definition["description"]
                ids_by_description[description] = scheme_id
            self._seqdefdb_schemas[seqdef_db_name] = ids_by_description
        return self._seqdefdb_schemas[seqdef_db_name]  # type: ignore

    async def build_profiler_from_seqdefdb(self, local: bool, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
        """Create a profiler for a scheme of a known seqdef database."""
        api_root = await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name)
        return get_BIGSdb_MLST_profiler(local, api_root, dbseqdef_name, schema_id)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
|
||||
|
||||
def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, schema_id: int):
    """Build the MLST profiler for one scheme of one database.

    Only remote profiling exists: ``local=True`` raises ``NotImplementedError``,
    anything else returns a ``RemoteBIGSdbMLSTProfiler``.
    """
    if not local:
        return RemoteBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, schema_id=schema_id)
    raise NotImplementedError()
|
@@ -1,16 +0,0 @@
|
||||
import asyncio
|
||||
from io import TextIOWrapper
|
||||
from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
|
||||
from Bio import SeqIO
|
||||
|
||||
from autobigs.engine.data.structures.genomics import NamedString
|
||||
|
||||
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
    """Yield each record of a FASTA source as a ``NamedString`` (record id, sequence text).

    Args:
        handle: Path or open text handle, passed through to ``SeqIO.parse``.
    """
    def _parse_records():
        # SeqIO.parse is lazy: the records must be consumed inside the worker
        # thread, otherwise the parsing itself still runs on the event loop.
        return [NamedString(record.id, str(record.seq)) for record in SeqIO.parse(handle=handle, format="fasta")]

    for named_string in await asyncio.to_thread(_parse_records):
        yield named_string
|
||||
|
||||
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
    """Yield the records of several FASTA sources, one source after another."""
    for fasta_handle in handles:
        records = read_fasta(fasta_handle)
        async for record in records:
            yield record
|
@@ -1,166 +0,0 @@
|
||||
from collections import defaultdict
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
from numbers import Number
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
|
||||
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
|
||||
from autobigs.engine.data.structures.genomics import NamedString
|
||||
from autobigs.engine.data.structures.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
|
||||
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
|
||||
|
||||
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
    """MLST profiler backed by the REST API of a BIGSdb instance.

    All requests are made relative to
    ``{database_api}/db/{database_name}/schemes/{schema_id}/``.
    """

    def __init__(self, database_api: str, database_name: str, schema_id: int):
        """Open an HTTP session bound to one scheme of one database."""
        self._database_name = database_name
        self._schema_id = schema_id
        self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))

    async def __aenter__(self):
        return self

    async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
        """Yield the alleles the database matches to ``sequence_string``.

        Args:
            sequence_string: Query sequence.
            exact: When true, partial matches are neither requested nor accepted.

        Raises:
            NoBIGSdbExactMatchesException: If ``exact`` is true but the response
                only offers partial matches.
            NoBIGSdbMatchesException: If the response contains no matches.
        """
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        uri_path = "sequence"
        # "async with" releases the response; a bare awaited post() never did.
        async with self._http_client.post(uri_path, json={
            "sequence": sequence_string,
            "partial_matches": not exact
        }) as response:
            sequence_response: dict = await response.json()

        if "exact_matches" in sequence_response:
            # loci -> list of alleles with id and loci
            exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
            for allele_loci, alleles in exact_matches.items():
                for allele in alleles:
                    yield Allele(allele_loci=allele_loci, allele_variant=allele["allele_id"], partial_match_profile=None)
        elif "partial_matches" in sequence_response:
            if exact:
                raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
            partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
            for allele_loci, partial_match in partial_matches.items():
                if len(partial_match) <= 0:
                    continue
                partial_match_profile = PartialAllelicMatchProfile(
                    percent_identity=float(partial_match["identity"]),
                    mismatches=int(partial_match["mismatches"]),
                    bitscore=float(partial_match["bitscore"]),
                    gaps=int(partial_match["gaps"])
                )
                yield Allele(
                    allele_loci=allele_loci,
                    allele_variant=str(partial_match["allele"]),
                    partial_match_profile=partial_match_profile
                )
        else:
            raise NoBIGSdbMatchesException(self._database_name, self._schema_id)

    async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
        """Resolve alleles into an MLST profile through the ``designations`` endpoint.

        ``ST`` and ``clonal_complex`` default to ``"unknown"`` when absent.

        Raises:
            ValueError: If the response contains no exact matches.
        """
        uri_path = "designations"
        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
        if isinstance(alleles, AsyncIterable):
            async for allele in alleles:
                allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
        else:
            for allele in alleles:
                allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
        request_json = {
            "designations": allele_request_dict
        }
        async with self._http_client.post(uri_path, json=request_json) as response:
            response_json: dict = await response.json()

        allele_map: dict[str, list[Allele]] = defaultdict(list)
        response_json.setdefault("fields", dict())
        schema_fields_returned: dict[str, str] = response_json["fields"]
        schema_fields_returned.setdefault("ST", "unknown")
        schema_fields_returned.setdefault("clonal_complex", "unknown")
        schema_exact_matches: dict = response_json["exact_matches"]
        for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
            for exact_match_allele in exact_match_alleles:
                allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"], None))
        if len(allele_map) == 0:
            raise ValueError("Passed in no alleles.")
        return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])

    async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
        """Look up the alleles of ``string`` and resolve them into a profile."""
        alleles = self.fetch_mlst_allele_variants(string, exact)
        return await self.fetch_mlst_st(alleles)

    async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
        """Yield ``(name, profile)`` for every named sequence.

        A sequence without matches yields ``(name, None)`` unless
        ``stop_on_fail`` is set, in which case the exception propagates.
        """
        async for named_string in namedStrings:
            # Keep the yield outside the try block so an exception thrown into
            # the generator by its consumer is never mistaken for a failed lookup.
            try:
                profile = await self.profile_string(named_string.sequence, exact)
            except NoBIGSdbMatchesException:
                if stop_on_fail:
                    raise
                profile = None
            yield (named_string.name, profile)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
|
||||
|
||||
class BIGSdbIndex(AbstractAsyncContextManager):
    """Cached lookup of seqdef databases and their schemes on the known BIGSdb servers."""

    KNOWN_BIGSDB_APIS = {
        "https://bigsdb.pasteur.fr/api",
        "https://rest.pubmlst.org"
    }

    def __init__(self):
        self._http_client = ClientSession()
        # Database name -> API root it was found on; None until first fetched.
        self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
        # Database name -> (scheme description -> scheme id), filled on demand.
        self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
        super().__init__()

    async def __aenter__(self):
        return self

    async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
        """Map every database whose name ends in ``seqdef`` to the API root serving it.

        The result is cached; pass ``force=True`` to query the servers again.
        """
        cached = self._known_seqdef_dbs_origin
        if cached is not None and not force:
            return cached
        origins = dict()
        for api_root in BIGSdbIndex.KNOWN_BIGSDB_APIS:
            async with self._http_client.get(f"{api_root}/db") as response:
                database_groups = await response.json()
                for group in database_groups:
                    for info in group["databases"]:
                        if not str(info["name"]).endswith("seqdef"):
                            continue
                        # A name offered by several servers keeps the one visited last.
                        origins[info["name"]] = api_root
        self._known_seqdef_dbs_origin = dict(origins)
        return self._known_seqdef_dbs_origin

    async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
        """Return the API root serving ``seqdef_db_name``.

        Raises:
            NoSuchBIGSdbDatabaseException: If the database is not known.
        """
        origins = await self.get_known_seqdef_dbs()
        if seqdef_db_name in origins:
            return origins[seqdef_db_name]
        raise NoSuchBIGSdbDatabaseException(seqdef_db_name)

    async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
        """Map scheme descriptions to scheme ids for one seqdef database.

        The result is cached per database; pass ``force=True`` to refetch.
        """
        if not force and seqdef_db_name in self._seqdefdb_schemas:
            return self._seqdefdb_schemas[seqdef_db_name]  # type: ignore since it's guaranteed to not be none by conditional
        api_root = await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)
        schemes_url = f"{api_root}/db/{seqdef_db_name}/schemes"
        async with self._http_client.get(schemes_url) as response:
            payload = await response.json()
            ids_by_description: Mapping[str, int] = dict()
            for definition in payload["schemes"]:
                # The id is the last path segment of the scheme URL.
                scheme_id: int = int(str(definition["scheme"]).rsplit("/", 1)[-1])
                description: str = definition["description"]
                ids_by_description[description] = scheme_id
            self._seqdefdb_schemas[seqdef_db_name] = ids_by_description
        return self._seqdefdb_schemas[seqdef_db_name]  # type: ignore

    async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
        """Create a profiler for a scheme of a known seqdef database."""
        api_root = await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name)
        return BIGSdbMLSTProfiler(api_root, dbseqdef_name, schema_id)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
|
||||
|
@@ -1,21 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Mapping, Sequence, Union
|
||||
|
||||
@dataclass(frozen=True)
class PartialAllelicMatchProfile:
    """Immutable statistics describing a partial (non-exact) allele match."""

    # Identity reported for the match.
    percent_identity: float
    # Number of mismatching positions.
    mismatches: int
    # Bitscore reported for the match.
    bitscore: float
    # Number of gaps in the match.
    gaps: int
|
||||
|
||||
@dataclass(frozen=True)
class Allele:
    """Immutable record of one allele variant at one locus."""

    # Name of the locus the allele belongs to.
    allele_loci: str
    # Identifier of the variant at that locus.
    allele_variant: str
    # Match statistics for a partial match; None for an exact match.
    partial_match_profile: Union[PartialAllelicMatchProfile, None]
|
||||
|
||||
@dataclass(frozen=True)
class MLSTProfile:
    """Immutable MLST result: alleles grouped by locus plus the scheme fields."""

    # Locus name -> alleles found at that locus.
    alleles: Mapping[str, Sequence[Allele]]
    # Sequence type (ST) of the profile.
    sequence_type: str
    # Clonal complex of the profile.
    clonal_complex: str
|
17
src/autobigs/engine/reading.py
Normal file
17
src/autobigs/engine/reading.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import asyncio
|
||||
from io import TextIOWrapper
|
||||
from typing import Any, AsyncGenerator, Iterable, Union
|
||||
from Bio import SeqIO
|
||||
|
||||
from autobigs.engine.structures.genomics import NamedString
|
||||
|
||||
async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]:
    """Read every record of a FASTA source into ``NamedString`` objects.

    Args:
        handle: Path or open text handle, passed through to ``SeqIO.parse``.

    Returns:
        A list with one ``NamedString`` (record id, sequence text) per record,
        in file order.
    """
    def _parse_records():
        # SeqIO.parse is lazy: the records must be consumed inside the worker
        # thread, otherwise the parsing itself still runs on the event loop.
        return [NamedString(record.id, str(record.seq)) for record in SeqIO.parse(handle=handle, format="fasta")]

    return await asyncio.to_thread(_parse_records)
|
||||
|
||||
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
    """Yield, for each FASTA source in turn, the collection of records it contains."""
    for fasta_handle in handles:
        named_strings = await read_fasta(fasta_handle)
        yield named_strings
|
18
src/autobigs/engine/structures/alignment.py
Normal file
18
src/autobigs/engine/structures/alignment.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from dataclasses import dataclass
|
||||
from numbers import Number
|
||||
from typing import Sequence
|
||||
|
||||
@dataclass(frozen=True)
class AlignmentStats:
    """Immutable summary statistics of one alignment."""

    # Identity between the aligned sequences.
    percent_identity: float
    # Number of mismatching positions.
    mismatches: int
    # Number of gaps.
    gaps: int
    # Score used to rank the match; its scale depends on the producer.
    match_metric: int
|
||||
|
||||
@dataclass(frozen=True)
class PairwiseAlignment:
    """Immutable record of an alignment between a reference and a query."""

    # Reference side of the alignment.
    reference: str
    # Query side of the alignment.
    query: str
    # Indices associated with the reference side.
    reference_indices: Sequence[Number]
    # Indices associated with the query side.
    query_indices: Sequence[Number]
    # Summary statistics of this alignment.
    alignment_stats: AlignmentStats
|
33
src/autobigs/engine/structures/mlst.py
Normal file
33
src/autobigs/engine/structures/mlst.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from typing import Collection, Iterable, Mapping, Sequence, Union
|
||||
|
||||
from autobigs.engine.structures.alignment import AlignmentStats
|
||||
|
||||
@dataclass(frozen=True)
class Allele:
    """Immutable record of one allele variant at one locus."""

    # Name of the locus the allele belongs to.
    allele_locus: str
    # Identifier of the variant at that locus.
    allele_variant: str
    # Alignment statistics for a partial match; None for an exact match.
    partial_match_profile: Union[AlignmentStats, None]
|
||||
|
||||
@dataclass(frozen=True)
class MLSTProfile:
    """Immutable MLST result: a collection of alleles plus the scheme fields."""

    # Alleles that make up the profile.
    alleles: Collection[Allele]
    # Sequence type (ST) of the profile.
    sequence_type: str
    # Clonal complex of the profile.
    clonal_complex: str
|
||||
|
||||
@dataclass(frozen=True)
class NamedMLSTProfile:
    """Immutable pairing of a name with the MLST profile obtained for it."""

    # Name the profile belongs to.
    name: str
    # The profile, or None when none is available.
    mlst_profile: Union[MLSTProfile, None]
|
||||
|
||||
|
||||
def alleles_to_mapping(alleles: Iterable[Allele]):
    """Group allele variants by locus.

    Returns a plain dict keyed by locus, in first-seen order. A locus with a
    single variant maps directly to that variant; a locus with several maps to
    the list of its variants.
    """
    variants_by_locus = defaultdict(list)
    for allele in alleles:
        variants_by_locus[allele.allele_locus].append(allele.allele_variant)
    return {
        locus: variants[0] if len(variants) == 1 else variants
        for locus, variants in variants_by_locus.items()
    }
|
@@ -1,22 +1,21 @@
|
||||
from collections import defaultdict
|
||||
import csv
|
||||
from os import PathLike
|
||||
from typing import AsyncIterable, Mapping, Sequence, Union
|
||||
from typing import AsyncIterable, Collection, Mapping, Sequence, Union
|
||||
|
||||
from autobigs.engine.data.structures.mlst import Allele, MLSTProfile
|
||||
from autobigs.engine.structures.mlst import Allele, MLSTProfile
|
||||
|
||||
|
||||
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
|
||||
result_dict: dict[str, Union[list[str], str]] = {}
|
||||
for loci, alleles in alleles_map.items():
|
||||
if len(alleles) == 1:
|
||||
result_dict[loci] = alleles[0].allele_variant
|
||||
def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Sequence[str], str]]:
|
||||
result = defaultdict(list)
|
||||
for allele in alleles:
|
||||
result[allele.allele_locus].append(allele.allele_variant + ("*" if allele.partial_match_profile is not None else ""))
|
||||
for locus in result.keys():
|
||||
if len(result[locus]) == 1:
|
||||
result[locus] = result[locus][0] # Take the only one
|
||||
else:
|
||||
result_locis = list()
|
||||
for allele in alleles:
|
||||
result_locis.append(allele.allele_variant)
|
||||
result_dict[loci] = result_locis
|
||||
return result_dict
|
||||
|
||||
result[locus] = tuple(result[locus]) # type: ignore
|
||||
return dict(result)
|
||||
|
||||
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
|
||||
failed = list()
|
||||
@@ -27,15 +26,16 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
|
||||
if mlst_profile is None:
|
||||
failed.append(name)
|
||||
continue
|
||||
allele_mapping = alleles_to_text_map(mlst_profile.alleles)
|
||||
if writer is None:
|
||||
header = ["id", "st", "clonal-complex", *mlst_profile.alleles.keys()]
|
||||
header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
|
||||
writer = csv.DictWriter(filehandle, fieldnames=header)
|
||||
writer.writeheader()
|
||||
row_dictionary = {
|
||||
"st": mlst_profile.sequence_type,
|
||||
"clonal-complex": mlst_profile.clonal_complex,
|
||||
"id": name,
|
||||
**dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
|
||||
**allele_mapping
|
||||
}
|
||||
writer.writerow(rowdict=row_dictionary)
|
||||
return failed
|
211
tests/autobigs/engine/analysis/test_bigsdb.py
Normal file
211
tests/autobigs/engine/analysis/test_bigsdb.py
Normal file
@@ -0,0 +1,211 @@
|
||||
from os import path
|
||||
import random
|
||||
import re
|
||||
from typing import Callable, Collection, Sequence, Union
|
||||
from Bio import SeqIO
|
||||
import pytest
|
||||
from autobigs.engine.analysis import bigsdb
|
||||
from autobigs.engine.structures import mlst
|
||||
from autobigs.engine.structures.genomics import NamedString
|
||||
from autobigs.engine.structures.mlst import Allele, MLSTProfile
|
||||
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
|
||||
from autobigs.engine.analysis.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler, RemoteBIGSdbMLSTProfiler
|
||||
|
||||
async def generate_async_iterable(normal_iterable):
    """Expose a regular iterable through async iteration, keeping its order."""
    items = iter(normal_iterable)
    for item in items:
        yield item
|
||||
|
||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ("A", "T", "C", "G")):
    """Return a deterministically mutated copy of ``gene``.

    The random generator is seeded with ``gene`` itself, so equal arguments
    always give the same result.

    Args:
        gene: Sequence to mutate.
        mutation_site_count: Number of sites to draw. A float is read as a
            fraction of ``len(gene)``.
        alphabet: Symbols a drawn site may be replaced with. The default is an
            immutable tuple rather than a shared mutable list.

    Returns:
        A string as long as ``gene``. Sites are drawn with replacement and a
        replacement may equal the original symbol, so fewer than
        ``mutation_site_count`` positions may actually differ.
    """
    rand = random.Random(gene)
    if isinstance(mutation_site_count, float):
        mutation_site_count = int(mutation_site_count * len(gene))
    scrambled = list(gene)
    for site in rand.choices(range(len(gene)), k=mutation_site_count):
        scrambled[site] = rand.choice(alphabet)
    return "".join(scrambled)
|
||||
|
||||
def get_first_sequence_from_fasta(resource: str):
    """Return the sequence text of the single record in a FASTA file under ``tests/resources/``."""
    resource_path = path.join("tests/resources/", resource)
    record = SeqIO.read(resource_path, "fasta")
    return str(record.seq)
|
||||
|
||||
def get_multiple_sequences_from_fasta(resource: str):
    """Return all records of a FASTA file under ``tests/resources/`` as a tuple."""
    resource_path = path.join("tests/resources/", resource)
    records = SeqIO.parse(resource_path, "fasta")
    return tuple(records)
|
||||
|
||||
# Profile the tests expect for the B. pertussis Tohama I fixture.
bpertussis_tohamaI_profile = MLSTProfile(
    (
        Allele("adk", "1", None),
        Allele("fumC", "1", None),
        Allele("glyA", "1", None),
        Allele("tyrB", "1", None),
        Allele("icd", "1", None),
        Allele("pepA", "1", None),
        Allele("pgm", "1", None),
    ),
    "1",
    "ST-2 complex",
)

# Allele combination whose ST and clonal complex are expected to be "unknown".
bpertussis_tohamaI_bad_profile = MLSTProfile(
    (
        Allele("adk", "1", None),
        Allele("fumC", "2", None),
        Allele("glyA", "36", None),
        Allele("tyrB", "4", None),
        Allele("icd", "4", None),
        Allele("pepA", "1", None),
        Allele("pgm", "5", None),
    ),
    "unknown",
    "unknown",
)

# Profile the tests expect for the H. influenzae 2014-102 fixture.
hinfluenzae_2014_102_profile = MLSTProfile(
    (
        Allele("adk", "28", None),
        Allele("atpG", "33", None),
        Allele("frdB", "7", None),
        Allele("fucK", "18", None),
        Allele("mdh", "11", None),
        Allele("pgi", "125", None),
        Allele("recA", "89", None),
    ),
    "478",
    "unknown",
)

# Allele combination whose ST and clonal complex are expected to be "unknown".
hinfluenzae_2014_102_bad_profile = MLSTProfile(
    (
        Allele("adk", "3", None),
        Allele("atpG", "121", None),
        Allele("frdB", "6", None),
        Allele("fucK", "5", None),
        Allele("mdh", "12", None),
        Allele("pgi", "4", None),
        Allele("recA", "5", None),
    ),
    "unknown",
    "unknown",
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile",
    [
        (
            False,
            "https://bigsdb.pasteur.fr/api",
            "pubmlst_bordetella_seqdef",
            3,
            "tohama_I_bpertussis.fasta",
            "tohama_I_bpertussis_features.fasta",
            bpertussis_tohamaI_profile,
            bpertussis_tohamaI_bad_profile,
        ),
        (
            False,
            "https://rest.pubmlst.org",
            "pubmlst_hinfluenzae_seqdef",
            1,
            "2014-102_hinfluenza.fasta",
            "2014-102_hinfluenza_features.fasta",
            hinfluenzae_2014_102_profile,
            hinfluenzae_2014_102_bad_profile,
        ),
    ],
)
class TestBIGSdbMLSTProfiler:
    """Profiler tests, executed once for each parametrized database/schema/genome row."""

    async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """Each allele yielded for the genome must match the expected variant; no expected locus may be left over."""
        genome = get_first_sequence_from_fasta(seq_path)
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            expected_alleles = mlst.alleles_to_mapping(expected_profile.alleles)
            # Loci are crossed off as they are reported; remove() raises on an unexpected repeat.
            targets_left = set(mlst.alleles_to_mapping(expected_profile.alleles).keys())
            reported = dummy_profiler.determine_mlst_allele_variants(query_sequence_strings=[genome])
            async for allele in reported:
                assert isinstance(allele, Allele)
                locus = allele.allele_locus
                assert locus in expected_alleles
                assert allele.allele_variant == expected_alleles[locus]
                targets_left.remove(locus)

            assert not targets_left

    async def test_sequence_profiling_non_exact_returns_non_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """Scrambled copies of the scheme's genes must come back carrying a partial match profile."""
        gene_tag = re.compile(r".*\[gene=([\w\d]+)\].*")
        features = get_multiple_sequences_from_fasta(feature_seqs_path)
        mlst_targets = {locus.lower() for locus in mlst.alleles_to_mapping(expected_profile.alleles).keys()}
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as profiler:
            for feature in features:
                tagged = gene_tag.fullmatch(feature.description)
                if tagged is None:
                    continue
                gene = tagged.group(1).lower()
                if gene not in mlst_targets:
                    continue
                # Mutate 12.5% of the positions so an exact match is not expected.
                mutated = gene_scrambler(str(feature.seq), 0.125)
                async for partial_match in profiler.determine_mlst_allele_variants([mutated]):
                    assert partial_match.partial_match_profile is not None
                    mlst_targets.remove(gene)

            assert not mlst_targets

    async def test_profiling_results_in_correct_mlst_st(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """The expected alleles must resolve to the expected clonal complex and sequence type."""
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            resolved = await dummy_profiler.determine_mlst_st(expected_profile.alleles)
            assert resolved is not None
            assert isinstance(resolved, MLSTProfile)
            assert resolved.clonal_complex == expected_profile.clonal_complex
            assert resolved.sequence_type == expected_profile.sequence_type

    async def test_profiling_non_exact_results_in_list_of_mlsts(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """The alleles of ``bad_profile`` must resolve to "unknown" for both clonal complex and sequence type."""
        unmatched_alleles = bad_profile.alleles
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            resolved = await dummy_profiler.determine_mlst_st(unmatched_alleles)
            assert resolved.clonal_complex == "unknown"
            assert resolved.sequence_type == "unknown"

    async def test_bigsdb_profile_multiple_strings_same_string_twice(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """Profiling the same genome under two names must give the expected profile for every result."""
        genome = get_first_sequence_from_fasta(seq_path)
        batches = [[NamedString(label, genome)] for label in ("seq1", "seq2")]
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(batches)):
                _name, profile = named_profile.name, named_profile.mlst_profile
                assert profile is not None
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == expected_profile.clonal_complex
                assert profile.sequence_type == expected_profile.sequence_type

    async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """With ``True`` as second argument, a scrambled middle entry must come back "unknown" while the others still match."""
        valid_seq = get_first_sequence_from_fasta(seq_path)
        batches = [
            [NamedString("seq1", valid_seq)],
            [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))],
            [NamedString("seq3", valid_seq)],
        ]
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(batches), True):
                name, profile = name_profile.name, name_profile.mlst_profile

                assert profile is not None
                if name == "should_fail":
                    assert profile.clonal_complex == "unknown"
                    assert profile.sequence_type == "unknown"
                    assert len(profile.alleles) > 0
                    continue
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == expected_profile.clonal_complex
                assert profile.sequence_type == expected_profile.sequence_type

    async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """With ``False`` as second argument, a scrambled middle entry must come back "unknown" while the others still match."""
        valid_seq = get_first_sequence_from_fasta(seq_path)
        batches = [
            [NamedString("seq1", valid_seq)],
            [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))],
            [NamedString("seq3", valid_seq)],
        ]

        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(batches), False):
                name, profile = named_profile.name, named_profile.mlst_profile

                assert profile is not None
                if name == "should_fail":
                    assert profile.clonal_complex == "unknown"
                    assert profile.sequence_type == "unknown"
                    assert len(profile.alleles) > 0
                    continue
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == expected_profile.clonal_complex
                assert profile.sequence_type == expected_profile.sequence_type
|
||||
|
||||
class TestBIGSdbIndex:
    """Tests for looking up databases, schemas and profilers through ``BIGSdbIndex``."""

    async def test_bigsdb_index_all_databases_is_not_empty(self):
        """The index must know at least one seqdef database."""
        async with BIGSdbIndex() as bigsdb_index:
            known = await bigsdb_index.get_known_seqdef_dbs()
            assert len(known) > 0

    async def test_bigsdb_index_references_pubmlst_correctly(self):
        """The H. influenzae seqdef database must map to the PubMLST REST endpoint."""
        async with BIGSdbIndex() as bigsdb_index:
            api = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")
            assert api == "https://rest.pubmlst.org"

    async def test_bigsdb_index_references_institutpasteur_correctly(self):
        """The Bordetella seqdef database must map to the Institut Pasteur endpoint."""
        async with BIGSdbIndex() as bigsdb_index:
            api = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")
            assert api == "https://bigsdb.pasteur.fr/api"

    async def test_bigsdb_index_get_schemas_for_bordetella(self):
        """The Bordetella database must list an "MLST" schema with an integer id."""
        async with BIGSdbIndex() as index:
            schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
            assert len(schemas.keys()) > 0
            assert "MLST" in schemas
            mlst_schema_id = schemas["MLST"]
            assert isinstance(mlst_schema_id, int)

    async def test_bigsdb_index_get_databases_has_only_seqdef(self):
        """Every known database name must end in "seqdef", and Bordetella must point at Pasteur."""
        async with BIGSdbIndex() as index:
            databases = await index.get_known_seqdef_dbs()
            assert len(databases.keys()) > 0
            for db_name in databases.keys():
                assert db_name.endswith("seqdef")
            assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"

    @pytest.mark.parametrize("local", [False])
    async def test_bigsdb_index_instantiates_correct_profiler(self, local):
        """A profiler built from the index must be a ``BIGSdbMLSTProfiler`` and type Tohama I as ST "1"."""
        record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
        sequence = str(record.seq)
        async with BIGSdbIndex() as bigsdb_index:
            async with await bigsdb_index.build_profiler_from_seqdefdb(local, "pubmlst_bordetella_seqdef", 3) as profiler:
                assert isinstance(profiler, BIGSdbMLSTProfiler)
                profile = await profiler.profile_string(sequence)
                assert profile.clonal_complex == "ST-2 complex"
                assert profile.sequence_type == "1"
|
@@ -1,21 +0,0 @@
|
||||
from autobigs.engine.data.local.csv import dict_loci_alleles_variants_from_loci
|
||||
from autobigs.engine.data.structures.mlst import Allele
|
||||
|
||||
|
||||
def test_dict_loci_alleles_variants_from_loci_single_loci_not_list():
    """A locus with exactly one allele must be converted to the plain variant string "1"."""
    single_allele_map = {"adk": [Allele("adk", "1", None)]}
    converted = dict_loci_alleles_variants_from_loci(single_allele_map)
    for _locus, variant in converted.items():
        assert isinstance(variant, str)
        assert variant == "1"
|
||||
|
||||
def test_dict_loci_alleles_variants_from_loci_multi_loci_is_list():
    """A locus with two alleles must be converted to a list of length two."""
    adk_alleles = [Allele("adk", variant, None) for variant in ("1", "2")]
    converted = dict_loci_alleles_variants_from_loci({"adk": adk_alleles})
    for _locus, variant in converted.items():
        assert isinstance(variant, list)
        assert len(variant) == 2
|
@@ -1,7 +0,0 @@
|
||||
from autobigs.engine.data.local.fasta import read_fasta
|
||||
|
||||
|
||||
async def test_fasta_reader_not_none():
    """Read the Tohama I FASTA and check every record name.

    The names are collected first so the test fails when the reader yields
    nothing; asserting only inside the loop would pass vacuously in that case.
    """
    named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
    names = [named_string.name async for named_string in named_strings]
    assert names, "read_fasta yielded no records"
    for name in names:
        assert name == "BX470248.1"
|
@@ -1,244 +0,0 @@
|
||||
import random
|
||||
import re
|
||||
from typing import Collection, Sequence, Union
|
||||
from Bio import SeqIO
|
||||
import pytest
|
||||
from autobigs.engine.data.structures.genomics import NamedString
|
||||
from autobigs.engine.data.structures.mlst import Allele, MLSTProfile
|
||||
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
|
||||
from autobigs.engine.data.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
|
||||
|
||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ("A", "T", "C", "G")):
    """Return a deterministically mutated copy of ``gene``.

    The random generator is seeded with ``gene`` itself, so the same input
    always produces the same output.

    Args:
        gene: Sequence to mutate.
        mutation_site_count: Number of positions to overwrite. A ``float`` is
            treated as a fraction of ``len(gene)`` and truncated to an int.
        alphabet: Symbols to draw replacements from. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        A string of the same length as ``gene``. Positions are drawn with
        replacement and a replacement symbol may equal the original one, so
        fewer than ``mutation_site_count`` characters may actually differ.
    """
    if not gene:
        # Nothing to mutate; also avoids sampling positions from an empty range.
        return gene
    rand = random.Random(gene)
    if isinstance(mutation_site_count, float):
        mutation_site_count = int(mutation_site_count * len(gene))
    scrambled = list(gene)
    # All positions are drawn before any replacement symbol, matching the RNG call order of the original.
    for random_location in rand.choices(range(len(gene)), k=mutation_site_count):
        scrambled[random_location] = rand.choice(alphabet)
    return "".join(scrambled)
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
    """Exact matching of Tohama I on the Pasteur Bordetella scheme must report variant "1" for all seven loci."""
    record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    sequence = str(record.seq)
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
        matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True)
        async for allele in matches:
            assert isinstance(allele, Allele)
            # Every allele of Tohama I is expected to carry allele id 1.
            assert allele.allele_variant == '1'
            targets_left.remove(allele.allele_loci)

        assert not targets_left
|
||||
|
||||
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
    """Scrambled coding sequences of the seven scheme genes must come back with a partial match profile."""
    gene_tag = re.compile(r".*\[gene=([\w\d]+)\].*")
    records = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta"))
    mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
        for record in records:
            tagged = gene_tag.fullmatch(record.description)
            if tagged is None:
                continue
            gene = tagged.group(1)
            if gene.lower() not in mlst_targets:
                continue
            # Mutate 12.5% of the positions so an exact match is not expected.
            mutated = gene_scrambler(str(record.seq), 0.125)
            async for partial_match in profiler.fetch_mlst_allele_variants(mutated, False):
                assert partial_match.partial_match_profile is not None
                mlst_targets.remove(gene.lower())

        assert not mlst_targets
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
    """Seven variant-"1" alleles fed through an async generator must resolve to ST "1" in "ST-2 complex"."""
    loci = ("adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm")

    async def dummy_allele_generator():
        for locus in loci:
            yield Allele(locus, "1", None)

    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        resolved = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
        assert resolved is not None
        assert isinstance(resolved, MLSTProfile)
        assert resolved.clonal_complex == "ST-2 complex"
        assert resolved.sequence_type == "1"
|
||||
|
||||
async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
    """This fixed allele combination must resolve to "unknown" for both clonal complex and sequence type."""
    variants_by_locus = (
        ("adk", "1"),
        ("fumC", "2"),
        ("glyA", "36"),
        ("tyrB", "4"),
        ("icd", "4"),
        ("pepA", "1"),
        ("pgm", "5"),
    )
    dummy_alleles = [Allele(locus, variant, None) for locus, variant in variants_by_locus]
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        resolved = await dummy_profiler.fetch_mlst_st(dummy_alleles)
        assert resolved.clonal_complex == "unknown"
        assert resolved.sequence_type == "unknown"
|
||||
|
||||
|
||||
async def test_institutpasteur_sequence_profiling_is_correct():
    """Profiling the Tohama I genome end to end must give ST "1" in "ST-2 complex"."""
    record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    genome = str(record.seq)
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        result = await dummy_profiler.profile_string(genome)
        assert result is not None
        assert isinstance(result, MLSTProfile)
        assert result.clonal_complex == "ST-2 complex"
        assert result.sequence_type == "1"
|
||||
|
||||
|
||||
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
    """Exact matching of FDAARGOS_1560 on the PubMLST H. influenzae scheme must yield exactly the expected alleles."""
    variants_by_locus = (
        ("adk", "1"),
        ("atpG", "1"),
        ("frdB", "1"),
        ("fucK", "1"),
        ("mdh", "1"),
        ("pgi", "1"),
        ("recA", "5"),
    )
    # A set, so each reported allele can be crossed off; remove() raises on an unexpected one.
    dummy_alleles = {Allele(locus, variant, None) for locus, variant in variants_by_locus}
    record = SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta")
    sequence = str(record.seq)
    async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        async for found in dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True):
            assert isinstance(found, Allele)
            dummy_alleles.remove(found)

        assert not dummy_alleles
|
||||
|
||||
async def test_pubmlst_profiling_results_in_correct_st():
    """The fixed H. influenzae allele set fed through an async generator must resolve to ST "3" in "ST-3 complex"."""
    variants_by_locus = (
        ("adk", "1"),
        ("atpG", "1"),
        ("frdB", "1"),
        ("fucK", "1"),
        ("mdh", "1"),
        ("pgi", "1"),
        ("recA", "5"),
    )

    async def generate_dummy_targets():
        for locus, variant in variants_by_locus:
            yield Allele(locus, variant, None)

    async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        resolved = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
        assert resolved is not None
        assert isinstance(resolved, MLSTProfile)
        assert resolved.clonal_complex == "ST-3 complex"
        assert resolved.sequence_type == "3"
|
||||
|
||||
async def test_pubmlst_sequence_profiling_is_correct():
    """Profiling the FDAARGOS_1560 genome end to end must give ST "3" in "ST-3 complex"."""
    record = SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta")
    genome = str(record.seq)
    async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        result = await dummy_profiler.profile_string(genome)
        assert result is not None
        assert isinstance(result, MLSTProfile)
        assert result.clonal_complex == "ST-3 complex"
        assert result.sequence_type == "3"
|
||||
|
||||
async def test_bigsdb_index_all_databases_is_not_empty():
    """The index must know at least one seqdef database."""
    async with BIGSdbIndex() as bigsdb_index:
        known = await bigsdb_index.get_known_seqdef_dbs()
        assert len(known) > 0
|
||||
|
||||
async def test_bigsdb_index_references_pubmlst_correctly():
    """The H. influenzae seqdef database must map to the PubMLST REST endpoint."""
    async with BIGSdbIndex() as bigsdb_index:
        api = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")
        assert api == "https://rest.pubmlst.org"
|
||||
|
||||
async def test_bigsdb_index_references_institutpasteur_correctly():
    """The Bordetella seqdef database must map to the Institut Pasteur endpoint."""
    async with BIGSdbIndex() as bigsdb_index:
        api = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")
        assert api == "https://bigsdb.pasteur.fr/api"
|
||||
|
||||
|
||||
async def test_bigsdb_index_instantiates_correct_profiler():
    """A profiler built from the index for Bordetella schema 3 must type Tohama I as ST "1" in "ST-2 complex"."""
    record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    genome = str(record.seq)
    async with BIGSdbIndex() as bigsdb_index:
        async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
            result = await profiler.profile_string(genome)
            assert result.clonal_complex == "ST-2 complex"
            assert result.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_same_string_twice():
    """Profiling Tohama I under two names must give ST "1" / "ST-2 complex" for every result."""
    record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    genome = str(record.seq)
    dummy_sequences = [NamedString(label, genome) for label in ("seq1", "seq2")]

    async def generate_async_iterable_sequences():
        for entry in dummy_sequences:
            yield entry

    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        async for _name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences()):
            assert profile is not None
            assert isinstance(profile, MLSTProfile)
            assert profile.clonal_complex == "ST-2 complex"
            assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop():
    """With ``True`` as second argument, the scrambled middle entry must yield ``None`` while the others still type correctly."""
    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    dummy_sequences = [
        NamedString("seq1", valid_seq),
        NamedString("should_fail", gene_scrambler(valid_seq, 0.3)),
        NamedString("seq3", valid_seq),
    ]

    async def generate_async_iterable_sequences():
        for entry in dummy_sequences:
            yield entry

    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), True):
            if name == "should_fail":
                assert profile is None
                continue
            assert profile is not None
            assert isinstance(profile, MLSTProfile)
            assert profile.clonal_complex == "ST-2 complex"
            assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop():
    """With ``False`` as second argument, the scrambled middle entry must come back "unknown" but still carry alleles."""
    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    dummy_sequences = [
        NamedString("seq1", valid_seq),
        NamedString("should_fail", gene_scrambler(valid_seq, 0.3)),
        NamedString("seq3", valid_seq),
    ]

    async def generate_async_iterable_sequences():
        for entry in dummy_sequences:
            yield entry

    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), False):
            # Both branches of the original start with this check.
            assert profile is not None
            if name == "should_fail":
                assert profile.clonal_complex == "unknown"
                assert profile.sequence_type == "unknown"
                assert len(profile.alleles) > 0
                continue
            assert isinstance(profile, MLSTProfile)
            assert profile.clonal_complex == "ST-2 complex"
            assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_fail_second_stop():
    """With ``stop_on_fail=True``, the FDAARGOS_1560 entry must raise ``NoBIGSdbMatchesException`` instead of being yielded."""
    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    invalid_seq = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    dummy_sequences = [
        NamedString("seq1", valid_seq),
        NamedString("should_fail", invalid_seq),
        NamedString("seq3", valid_seq),
    ]

    async def generate_async_iterable_sequences():
        for entry in dummy_sequences:
            yield entry

    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        with pytest.raises(NoBIGSdbMatchesException):
            async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), exact=True, stop_on_fail=True):
                if name == "should_fail":
                    # Reaching this point means the failing entry was yielded rather than raised.
                    pytest.fail("Exception should have been thrown, no exception was thrown.")
                else:
                    assert profile is not None
                    assert isinstance(profile, MLSTProfile)
                    assert profile.clonal_complex == "ST-2 complex"
                    assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_index_get_schemas_for_bordetella():
    """The Bordetella database must list an "MLST" schema with an integer id."""
    async with BIGSdbIndex() as index:
        schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
        assert len(schemas.keys()) > 0
        assert "MLST" in schemas
        mlst_schema_id = schemas["MLST"]
        assert isinstance(mlst_schema_id, int)
|
||||
|
||||
async def test_bigsdb_index_get_databases_has_only_seqdef():
    """Every known database name must end in "seqdef", and Bordetella must point at Pasteur."""
    async with BIGSdbIndex() as index:
        databases = await index.get_known_seqdef_dbs()
        assert len(databases.keys()) > 0
        for db_name in databases.keys():
            assert db_name.endswith("seqdef")
        assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
|
7
tests/autobigs/engine/test_reading.py
Normal file
7
tests/autobigs/engine/test_reading.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from autobigs.engine.reading import read_fasta
|
||||
|
||||
|
||||
async def test_fasta_reader_not_none():
    """Read the Tohama I FASTA and check every record name.

    The names are collected first so the test fails when the reader returns
    nothing; asserting only inside the loop would pass vacuously in that case.
    """
    named_strings = await read_fasta("tests/resources/tohama_I_bpertussis.fasta")
    names = [named_string.name for named_string in named_strings]
    assert names, "read_fasta returned no records"
    for name in names:
        assert name == "BX470248.1"
|
47
tests/autobigs/engine/test_writing.py
Normal file
47
tests/autobigs/engine/test_writing.py
Normal file
@@ -0,0 +1,47 @@
|
||||
from typing import AsyncIterable, Iterable
|
||||
|
||||
import pytest
|
||||
from autobigs.engine.structures.alignment import AlignmentStats
|
||||
from autobigs.engine.writing import alleles_to_text_map, write_mlst_profiles_as_csv
|
||||
from autobigs.engine.structures.mlst import Allele, MLSTProfile
|
||||
import tempfile
|
||||
from csv import reader
|
||||
from os import path
|
||||
|
||||
|
||||
@pytest.fixture
def dummy_alphabet_mlst_profile():
    """Build an ``MLSTProfile`` whose loci A, D, B, C are deliberately out of alphabetical order.

    Locus C appears twice; its second allele is the only one carrying
    ``AlignmentStats``.
    """
    alleles = (
        Allele("A", "1", None),
        Allele("D", "1", None),
        Allele("B", "1", None),
        Allele("C", "1", None),
        Allele("C", "2", AlignmentStats(90, 10, 0, 90)),
    )
    return MLSTProfile(alleles, "mysterious", "very mysterious")
|
||||
|
||||
async def iterable_to_asynciterable(iterable: Iterable):
    """Expose a synchronous iterable as an async generator that yields its items in order."""
    source = iter(iterable)
    while True:
        try:
            element = next(source)
        except StopIteration:
            # Source exhausted: finish the async generator normally.
            return
        yield element
|
||||
|
||||
async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile: MLSTProfile):
    """Write one profile to CSV and check that the trailing header columns are sorted.

    The original sliced ``lines[4:]``, which selects CSV *rows* from the fifth
    onward rather than columns, so the sortedness assertion did not inspect
    the header at all. The slice is now taken from the header row.
    """
    dummy_profiles = [("test_1", dummy_alphabet_mlst_profile)]
    with tempfile.TemporaryDirectory() as temp_dir:
        output_path = path.join(temp_dir, "out.csv")
        await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
        # newline="" is what the csv module documents for files handed to csv.reader.
        with open(output_path, newline="") as csv_handle:
            lines = list(reader(csv_handle))
    assert lines, "CSV output is empty"
    header = lines[0]
    # NOTE(review): the offset 4 is carried over from the original slice and is
    # presumably the number of leading non-allele columns; confirm against the writer.
    target_columns = header[4:]
    assert target_columns, "no columns found after the leading columns"
    assert target_columns == sorted(target_columns)
|
||||
|
||||
async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: MLSTProfile):
    """Check that ``alleles_to_text_map`` returns exactly the expected locus-to-text mapping.

    The original only checked the entries present in the result, so a locus
    missing from it (or an empty result) went unnoticed; the key sets are now
    compared as well.
    """
    mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.alleles)
    expected_mapping = {
        "A": "1",
        "B": "1",
        "C": ("1", "2*"),
        "D": "1",
    }
    for allele_name, allele_ids in mapping.items():
        assert allele_name in expected_mapping
        assert allele_ids == expected_mapping[allele_name]
    # Catches loci dropped from the result, which the loop above cannot see.
    assert set(mapping.keys()) == set(expected_mapping.keys())
|
28244
tests/resources/2014-102_hinfluenza.fasta
Normal file
28244
tests/resources/2014-102_hinfluenza.fasta
Normal file
File diff suppressed because it is too large
Load Diff
27751
tests/resources/2014-102_hinfluenza_features.fasta
Normal file
27751
tests/resources/2014-102_hinfluenza_features.fasta
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
11
tests/resources/tohama_I_bpertussis_adk.fasta
Normal file
11
tests/resources/tohama_I_bpertussis_adk.fasta
Normal file
@@ -0,0 +1,11 @@
|
||||
>lcl|BX640419.1_cds_CAE43044.1_2724 [gene=adK] [locus_tag=BP2769] [db_xref=GOA:P0DKX8,InterPro:IPR000850,InterPro:IPR006259,InterPro:IPR007862,InterPro:IPR027417] [protein=adenylate kinase] [protein_id=CAE43044.1] [location=164032..164688] [gbkey=CDS]
|
||||
ATGCGTCTCATTCTGCTCGGACCGCCCGGAGCCGGCAAAGGCACCCAAGCCGCCTTTCTCACCCAACACT
|
||||
ACGGCATCCCGCAGATATCCACCGGTGACATGCTGCGCGCCGCCGTCAAGGCCGGCACGCCGCTGGGCCT
|
||||
GGAAGCCAAGAAGGTCATGGACGCGGGCGGCCTGGTCTCGGACGACCTGATCATCGGCCTGGTGCGCGAT
|
||||
CGCCTGACCCAGCCCGATTGCGCCAACGGCTACCTGTTCGACGGTTTCCCGCGCACCATCCCGCAGGCCG
|
||||
ACGCGCTCAAGAGCGCCGGCATCGCGCTGGATTACGTGGTCGAGATCGAAGTGCCGGAAAGCGACATCAT
|
||||
CGAACGCATGAGCGAACGCCGCGTGCACCCGGCCAGCGGCCGCAGCTACCACGTACGCTTCAATCCGCCC
|
||||
AAGGCCGAAGGCGTGGACGACGTCACGGGCGAACCGCTGGTGCAGCGCGACGACGACCGCGAGGAAACCG
|
||||
TGCGCCATCGTCTCAACGTCTACCAGAACCAGACCCGCCCGCTGGTCGACTACTACTCGTCCTGGGCCCA
|
||||
GTCCGATGCCGCCGCGGCGCCCAAGTACCGCAAGATCTCCGGCGTCGGCTCGGTCGACGAAATCAAGAGC
|
||||
CGCCTGTCGCAGGCTCTGCAGAGCTAA
|
Reference in New Issue
Block a user