Compare commits
4 Commits
0.11.1
...
b8cebb8ba4
Author | SHA1 | Date | |
---|---|---|---|
b8cebb8ba4 | |||
7384895578 | |||
5a03c7e8d8 | |||
ddf9cde175 |
@@ -13,6 +13,7 @@ dependencies = [
|
|||||||
]
|
]
|
||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
description = "A library to rapidly fetch MLST profiles given sequences for various diseases."
|
description = "A library to rapidly fetch MLST profiles given sequences for various diseases."
|
||||||
|
license = {text = "GPL-3.0-or-later"}
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
Homepage = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
|
Homepage = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
|
||||||
|
@@ -22,15 +22,15 @@ from Bio.Align import PairwiseAligner
|
|||||||
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
|
def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[Union[NamedString, str]], Union[NamedString, str]]) -> AsyncGenerator[Union[Allele, tuple[str, Allele]], Any]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
|
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
|
async def profile_string(self, query_sequence_strings: Iterable[Union[NamedString, str]]) -> Union[NamedMLSTProfile, MLSTProfile]:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@@ -52,14 +52,14 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
|
|||||||
async def __aenter__(self):
|
async def __aenter__(self):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[str], str]) -> AsyncGenerator[Allele, Any]:
|
async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[Union[NamedString, str]], Union[NamedString, str]]) -> AsyncGenerator[Union[Allele, tuple[str, Allele]], Any]:
|
||||||
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
|
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
|
||||||
uri_path = "sequence"
|
uri_path = "sequence"
|
||||||
if isinstance(query_sequence_strings, str):
|
if isinstance(query_sequence_strings, str) or isinstance(query_sequence_strings, NamedString):
|
||||||
query_sequence_strings = [query_sequence_strings]
|
query_sequence_strings = [query_sequence_strings]
|
||||||
for sequence_string in query_sequence_strings:
|
for sequence_string in query_sequence_strings:
|
||||||
async with self._http_client.post(uri_path, json={
|
async with self._http_client.post(uri_path, json={
|
||||||
"sequence": sequence_string,
|
"sequence": sequence_string if isinstance(sequence_string, str) else sequence_string.sequence,
|
||||||
"partial_matches": True
|
"partial_matches": True
|
||||||
}) as response:
|
}) as response:
|
||||||
sequence_response: dict = await response.json()
|
sequence_response: dict = await response.json()
|
||||||
@@ -70,7 +70,8 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
|
|||||||
for allele_loci, alleles in exact_matches.items():
|
for allele_loci, alleles in exact_matches.items():
|
||||||
for allele in alleles:
|
for allele in alleles:
|
||||||
alelle_id = allele["allele_id"]
|
alelle_id = allele["allele_id"]
|
||||||
yield Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
|
result_allele = Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
|
||||||
|
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
|
||||||
elif "partial_matches" in sequence_response:
|
elif "partial_matches" in sequence_response:
|
||||||
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
|
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
|
||||||
for allele_loci, partial_match in partial_matches.items():
|
for allele_loci, partial_match in partial_matches.items():
|
||||||
@@ -82,23 +83,33 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
|
|||||||
gaps=int(partial_match["gaps"]),
|
gaps=int(partial_match["gaps"]),
|
||||||
match_metric=int(partial_match["bitscore"])
|
match_metric=int(partial_match["bitscore"])
|
||||||
)
|
)
|
||||||
yield Allele(
|
result_allele = Allele(
|
||||||
allele_locus=allele_loci,
|
allele_locus=allele_loci,
|
||||||
allele_variant=str(partial_match["allele"]),
|
allele_variant=str(partial_match["allele"]),
|
||||||
partial_match_profile=partial_match_profile
|
partial_match_profile=partial_match_profile
|
||||||
)
|
)
|
||||||
|
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
|
||||||
else:
|
else:
|
||||||
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
|
raise NoBIGSdbMatchesException(self._database_name, self._schema_id, sequence_string.name if isinstance(sequence_string, NamedString) else None)
|
||||||
|
|
||||||
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
|
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]:
|
||||||
uri_path = "designations"
|
uri_path = "designations"
|
||||||
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
|
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
|
||||||
|
names_list = []
|
||||||
|
def insert_allele_to_request_dict(allele: Union[Allele, tuple[str, Allele]]):
|
||||||
|
if isinstance(allele, Allele):
|
||||||
|
allele_val = allele
|
||||||
|
else:
|
||||||
|
allele_val = allele[1]
|
||||||
|
names_list.append(allele[0])
|
||||||
|
allele_request_dict[allele_val.allele_locus].append({"allele": str(allele_val.allele_variant)})
|
||||||
|
|
||||||
if isinstance(alleles, AsyncIterable):
|
if isinstance(alleles, AsyncIterable):
|
||||||
async for allele in alleles:
|
async for allele in alleles:
|
||||||
allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
|
insert_allele_to_request_dict(allele)
|
||||||
else:
|
else:
|
||||||
for allele in alleles:
|
for allele in alleles:
|
||||||
allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
|
insert_allele_to_request_dict(allele)
|
||||||
request_json = {
|
request_json = {
|
||||||
"designations": allele_request_dict
|
"designations": allele_request_dict
|
||||||
}
|
}
|
||||||
@@ -111,26 +122,33 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
|
|||||||
schema_fields_returned.setdefault("clonal_complex", "unknown")
|
schema_fields_returned.setdefault("clonal_complex", "unknown")
|
||||||
schema_exact_matches: dict = response_json["exact_matches"]
|
schema_exact_matches: dict = response_json["exact_matches"]
|
||||||
for exact_match_locus, exact_match_alleles in schema_exact_matches.items():
|
for exact_match_locus, exact_match_alleles in schema_exact_matches.items():
|
||||||
if len(exact_match_alleles) > 1:
|
|
||||||
raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(exact_match_alleles)})")
|
|
||||||
allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None))
|
allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None))
|
||||||
if len(allele_set) == 0:
|
if len(allele_set) == 0:
|
||||||
raise ValueError("Passed in no alleles.")
|
raise ValueError("Passed in no alleles.")
|
||||||
return MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
|
result_mlst_profile = MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
|
||||||
|
if len(names_list) > 0:
|
||||||
|
result_mlst_profile = NamedMLSTProfile(str(tuple(names_list)), result_mlst_profile)
|
||||||
|
return result_mlst_profile
|
||||||
|
|
||||||
async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
|
async def profile_string(self, query_sequence_strings: Iterable[Union[NamedString, str]]) -> Union[NamedMLSTProfile, MLSTProfile]:
|
||||||
alleles = self.determine_mlst_allele_variants(query_sequence_strings)
|
alleles = self.determine_mlst_allele_variants(query_sequence_strings)
|
||||||
return await self.determine_mlst_st(alleles)
|
return await self.determine_mlst_st(alleles)
|
||||||
|
|
||||||
async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
|
async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
|
||||||
|
tasks = []
|
||||||
async for named_strings in query_named_string_groups:
|
async for named_strings in query_named_string_groups:
|
||||||
for named_string in named_strings:
|
tasks.append(self.profile_string(named_strings))
|
||||||
|
for task in asyncio.as_completed(tasks):
|
||||||
try:
|
try:
|
||||||
yield NamedMLSTProfile(named_string.name, (await self.profile_string([named_string.sequence])))
|
yield await task
|
||||||
except NoBIGSdbMatchesException as e:
|
except NoBIGSdbMatchesException as e:
|
||||||
if stop_on_fail:
|
if stop_on_fail:
|
||||||
raise e
|
raise e
|
||||||
yield NamedMLSTProfile(named_string.name, None)
|
causal_name = e.get_causal_query_name()
|
||||||
|
if causal_name is None:
|
||||||
|
raise ValueError("Missing query name despite requiring names.")
|
||||||
|
else:
|
||||||
|
yield NamedMLSTProfile(causal_name, None)
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
await self._http_client.close()
|
await self._http_client.close()
|
||||||
|
@@ -5,8 +5,12 @@ class BIGSDbDatabaseAPIException(Exception):
|
|||||||
|
|
||||||
|
|
||||||
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
|
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
|
||||||
def __init__(self, database_name: str, database_schema_id: int, *args):
|
def __init__(self, database_name: str, database_schema_id: int, query_name: Union[None, str], *args):
|
||||||
|
self._query_name = query_name
|
||||||
super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
|
super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
|
||||||
|
|
||||||
|
def get_causal_query_name(self) -> Union[str, None]:
|
||||||
|
return self._query_name
|
||||||
|
|
||||||
class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException):
|
class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException):
|
||||||
def __init__(self, database_name: str, database_schema_id: int, *args):
|
def __init__(self, database_name: str, database_schema_id: int, *args):
|
||||||
|
@@ -13,5 +13,8 @@ async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
|
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
|
||||||
|
tasks = []
|
||||||
for handle in handles:
|
for handle in handles:
|
||||||
yield await read_fasta(handle)
|
tasks.append(read_fasta(handle))
|
||||||
|
for task in asyncio.as_completed(tasks):
|
||||||
|
yield await task
|
@@ -3,7 +3,7 @@ import csv
|
|||||||
from os import PathLike
|
from os import PathLike
|
||||||
from typing import AsyncIterable, Collection, Mapping, Sequence, Union
|
from typing import AsyncIterable, Collection, Mapping, Sequence, Union
|
||||||
|
|
||||||
from autobigs.engine.structures.mlst import Allele, MLSTProfile
|
from autobigs.engine.structures.mlst import Allele, MLSTProfile, NamedMLSTProfile
|
||||||
|
|
||||||
|
|
||||||
def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Sequence[str], str]]:
|
def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Sequence[str], str]]:
|
||||||
@@ -17,12 +17,14 @@ def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Seque
|
|||||||
result[locus] = tuple(result[locus]) # type: ignore
|
result[locus] = tuple(result[locus]) # type: ignore
|
||||||
return dict(result)
|
return dict(result)
|
||||||
|
|
||||||
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
|
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[NamedMLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
|
||||||
failed = list()
|
failed = list()
|
||||||
with open(handle, "w", newline='') as filehandle:
|
with open(handle, "w", newline='') as filehandle:
|
||||||
header = None
|
header = None
|
||||||
writer: Union[csv.DictWriter, None] = None
|
writer: Union[csv.DictWriter, None] = None
|
||||||
async for name, mlst_profile in mlst_profiles_iterable:
|
async for named_mlst_profile in mlst_profiles_iterable:
|
||||||
|
name = named_mlst_profile.name
|
||||||
|
mlst_profile = named_mlst_profile.mlst_profile
|
||||||
if mlst_profile is None:
|
if mlst_profile is None:
|
||||||
failed.append(name)
|
failed.append(name)
|
||||||
continue
|
continue
|
||||||
|
@@ -3,7 +3,7 @@ from typing import AsyncIterable, Iterable
|
|||||||
import pytest
|
import pytest
|
||||||
from autobigs.engine.structures.alignment import AlignmentStats
|
from autobigs.engine.structures.alignment import AlignmentStats
|
||||||
from autobigs.engine.writing import alleles_to_text_map, write_mlst_profiles_as_csv
|
from autobigs.engine.writing import alleles_to_text_map, write_mlst_profiles_as_csv
|
||||||
from autobigs.engine.structures.mlst import Allele, MLSTProfile
|
from autobigs.engine.structures.mlst import Allele, MLSTProfile, NamedMLSTProfile
|
||||||
import tempfile
|
import tempfile
|
||||||
from csv import reader
|
from csv import reader
|
||||||
from os import path
|
from os import path
|
||||||
@@ -11,20 +11,20 @@ from os import path
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def dummy_alphabet_mlst_profile():
|
def dummy_alphabet_mlst_profile():
|
||||||
return MLSTProfile((
|
return NamedMLSTProfile("name", MLSTProfile((
|
||||||
Allele("A", "1", None),
|
Allele("A", "1", None),
|
||||||
Allele("D", "1", None),
|
Allele("D", "1", None),
|
||||||
Allele("B", "1", None),
|
Allele("B", "1", None),
|
||||||
Allele("C", "1", None),
|
Allele("C", "1", None),
|
||||||
Allele("C", "2", AlignmentStats(90, 10, 0, 90))
|
Allele("C", "2", AlignmentStats(90, 10, 0, 90))
|
||||||
), "mysterious", "very mysterious")
|
), "mysterious", "very mysterious"))
|
||||||
|
|
||||||
async def iterable_to_asynciterable(iterable: Iterable):
|
async def iterable_to_asynciterable(iterable: Iterable):
|
||||||
for iterated in iterable:
|
for iterated in iterable:
|
||||||
yield iterated
|
yield iterated
|
||||||
|
|
||||||
async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile: MLSTProfile):
|
async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile: MLSTProfile):
|
||||||
dummy_profiles = [("test_1", dummy_alphabet_mlst_profile)]
|
dummy_profiles = [dummy_alphabet_mlst_profile]
|
||||||
with tempfile.TemporaryDirectory() as temp_dir:
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
output_path = path.join(temp_dir, "out.csv")
|
output_path = path.join(temp_dir, "out.csv")
|
||||||
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
|
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
|
||||||
@@ -34,8 +34,8 @@ async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile
|
|||||||
target_columns = lines[4:]
|
target_columns = lines[4:]
|
||||||
assert target_columns == sorted(target_columns)
|
assert target_columns == sorted(target_columns)
|
||||||
|
|
||||||
async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: MLSTProfile):
|
async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: NamedMLSTProfile):
|
||||||
mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.alleles)
|
mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.mlst_profile.alleles) # type: ignore
|
||||||
expected_mapping = {
|
expected_mapping = {
|
||||||
"A": "1",
|
"A": "1",
|
||||||
"B": "1",
|
"B": "1",
|
||||||
@@ -44,4 +44,4 @@ async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profil
|
|||||||
}
|
}
|
||||||
for allele_name, allele_ids in mapping.items():
|
for allele_name, allele_ids in mapping.items():
|
||||||
assert allele_name in expected_mapping
|
assert allele_name in expected_mapping
|
||||||
assert allele_ids == expected_mapping[allele_name]
|
assert allele_ids == expected_mapping[allele_name]
|
||||||
|
Reference in New Issue
Block a user