began adding PubMLST support

This commit is contained in:
2025-01-08 16:54:57 +00:00
parent ad28d9bf20
commit d580402523
9 changed files with 23717 additions and 23 deletions

View File

@@ -3,23 +3,23 @@ from numbers import Number
from typing import Mapping, Sequence, Set, Union
@dataclass
@dataclass(frozen=True)
class StringAnnotation:
type: str
start: int
end: int
feature_properties: Mapping[str, Set[str]]
@dataclass
@dataclass(frozen=True)
class NamedString:
name: str
sequence: str
@dataclass
@dataclass(frozen=True)
class AnnotatedString(NamedString):
annotations: Sequence[StringAnnotation]
@dataclass
@dataclass(frozen=True)
class SangerTraceData(NamedString):
seq_param_file_name: str
analysis_proto_settings_name: str

View File

@@ -1,12 +1,12 @@
from dataclasses import dataclass
from typing import Mapping, Sequence
@dataclass
@dataclass(frozen=True)
class Allele:
allele_loci: str
allele_variant: str
@dataclass
@dataclass(frozen=True)
class MLSTProfile:
alleles: Mapping[str, Sequence[Allele]]
sequence_type: int

View File

@@ -110,6 +110,6 @@ def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedStri
alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[0] # take the best alignment
return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq)
async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString]:
async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
for sanger_trace in sanger_traces:
yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace))[1]

View File

@@ -1,12 +1,13 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
import re
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Sequence, Union
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.genomics import NamedString
from automlst.engine.remote.databases.mlst import MLSTProfiler
class InstitutPasteurProfiler(AbstractAsyncContextManager):
class InstitutPasteurProfiler(MLSTProfiler):
async def __aenter__(self):
return self
@@ -16,9 +17,9 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "schemes/3/sequence"
uri_path = f"schemes/{schema_id}/sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
})
@@ -29,8 +30,8 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
uri_path = "schemes/3/designations"
async def fetch_mlst_st(self, schema_id: int, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
uri_path = f"schemes/{schema_id}/designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
if isinstance(alleles, AsyncIterable):
async for allele in alleles:
@@ -50,10 +51,20 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, string: str) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string)
return await self.fetch_mlst_st(alleles)
async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(schema_id, string)
return await self.fetch_mlst_st(schema_id, alleles)
async def get_scheme_ids(self) -> Mapping[str, int]:
    """Fetch the schemes offered by the remote database, keyed by description.

    Issues a GET for ``schemes`` through ``self._http_client`` and reads the
    ``schemes`` list of the JSON reply. Each entry's ``scheme`` value is
    treated as a URI whose final path segment is the integer scheme ID.

    Returns:
        A mapping from each scheme's ``description`` to its integer ID. When
        two entries share a description, the later one wins.

    Raises:
        KeyError: If the reply or one of its entries lacks an expected key.
        ValueError: If the final segment of a scheme URI is not an integer.
    """
    response = await self._http_client.get("schemes")
    response_json = await response.json()
    # Annotated as a concrete dict (not the read-only Mapping protocol)
    # because it is populated by item assignment below.
    scheme_ids: dict[str, int] = {}
    for scheme_definition in response_json["schemes"]:
        # Tolerate a trailing slash so ".../schemes/3/" still resolves to 3.
        scheme_uri = str(scheme_definition["scheme"]).rstrip("/")
        scheme_ids[scheme_definition["description"]] = int(scheme_uri.rsplit("/", 1)[-1])
    return scheme_ids
async def close(self):
await self._http_client.close()

View File

@@ -13,7 +13,7 @@ MLST_DATABASES = [
class MLSTProfiler(AbstractAsyncContextManager):
@abstractmethod
def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele]:
def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
pass
@abstractmethod

View File

@@ -17,7 +17,7 @@ class PubMLSTProfiler(MLSTProfiler):
self._base_url = f"https://rest.pubmlst.org/db/{database_name}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele]:
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
uri_path = f"schemes/{schema_id}/sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string