began adding PubMLST support

This commit is contained in:
Harrison Deng 2025-01-08 16:54:57 +00:00
parent ad28d9bf20
commit d580402523
9 changed files with 23717 additions and 23 deletions

View File

@ -3,23 +3,23 @@ from numbers import Number
from typing import Mapping, Sequence, Set, Union
@dataclass(frozen=True)
class StringAnnotation:
    """A typed annotation covering a span of a string.

    Instances are immutable: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``. Because the class is frozen with
    equality enabled, ``hash()`` is derived from the field values, so hashing
    only succeeds when ``feature_properties`` is itself hashable (a plain
    ``dict`` is not).
    """

    # Annotation type label. The name shadows the builtin ``type`` but is part
    # of the public field interface, so it is kept as-is.
    type: str
    # Span boundaries. NOTE(review): indexing base and end inclusivity are not
    # established by this file -- confirm against the producer of these values.
    start: int
    end: int
    # Property name -> set of values attached to this annotation.
    feature_properties: Mapping[str, Set[str]]
@dataclass(frozen=True)
class NamedString:
    """A string (such as a sequence) paired with a name.

    Instances are immutable and, since both fields are ``str``, hashable and
    comparable by value.
    """

    # Identifier for the string.
    name: str
    # The string content itself.
    sequence: str
@dataclass(frozen=True)
class AnnotatedString(NamedString):
    """A ``NamedString`` that additionally carries annotations.

    Inherits ``name`` and ``sequence`` from ``NamedString``; instances are
    immutable like the base class.
    """

    # Annotations attached to the string, in the order supplied by the caller.
    annotations: Sequence[StringAnnotation]
@dataclass
@dataclass(frozen=True)
class SangerTraceData(NamedString):
seq_param_file_name: str
analysis_proto_settings_name: str

View File

@ -1,12 +1,12 @@
from dataclasses import dataclass
from typing import Mapping, Sequence
@dataclass(frozen=True)
class Allele:
    """One allele designation: a locus together with its variant identifier.

    Frozen with value equality, so instances are hashable and can be stored
    in sets or used as dictionary keys.
    """

    # Locus name, e.g. "adk".
    allele_loci: str
    # Variant (allele ID) at that locus, kept as a string, e.g. "1".
    allele_variant: str
@dataclass
@dataclass(frozen=True)
class MLSTProfile:
alleles: Mapping[str, Sequence[Allele]]
sequence_type: int

View File

@ -110,6 +110,6 @@ def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedStri
alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[0] # take the best alignment
return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq)
async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
    """Align each Sanger trace against a reference and yield the aligned trace.

    Args:
        reference: The reference string every trace is aligned to.
        sanger_traces: The traces to align.

    Yields:
        For each trace, in input order, the second element of the pair
        returned by ``_biopython_local_pairwise_alignment`` (the query side
        of the alignment).
    """
    for sanger_trace in sanger_traces:
        # The alignment is blocking, CPU-side work, so it is pushed to a worker
        # thread. Traces are still processed one at a time: each alignment is
        # awaited before the next one starts.
        aligned_pair = await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace)
        yield aligned_pair[1]

View File

@ -1,12 +1,13 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
import re
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Sequence, Union
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.genomics import NamedString
from automlst.engine.remote.databases.mlst import MLSTProfiler
class InstitutPasteurProfiler(AbstractAsyncContextManager):
class InstitutPasteurProfiler(MLSTProfiler):
async def __aenter__(self):
return self
@ -16,9 +17,9 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "schemes/3/sequence"
uri_path = f"schemes/{schema_id}/sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
})
@ -29,8 +30,8 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
uri_path = "schemes/3/designations"
async def fetch_mlst_st(self, schema_id: int, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
uri_path = f"schemes/{schema_id}/designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
if isinstance(alleles, AsyncIterable):
async for allele in alleles:
@ -50,10 +51,20 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
    """Produce an MLST profile for a sequence string under one scheme.

    Args:
        schema_id: Numeric ID of the scheme to query.
        string: The sequence to profile.

    Returns:
        The profile returned by ``fetch_mlst_st`` for the alleles matched in
        ``string``.
    """
    # This is an async generator; it is handed over un-consumed and
    # fetch_mlst_st iterates it itself.
    alleles = self.fetch_mlst_allele_variants(schema_id, string)
    return await self.fetch_mlst_st(schema_id, alleles)
async def get_scheme_ids(self) -> Mapping[str, int]:
    """Fetch the schemes offered by the database, keyed by description.

    Issues a GET for ``schemes`` relative to the client's base URL and reads
    the ``"schemes"`` list from the JSON response.

    Returns:
        A mapping from each scheme's ``"description"`` to its numeric ID. If
        two schemes share a description, the later one in the response wins.

    Raises:
        KeyError: If the response lacks ``"schemes"`` or an entry lacks
            ``"scheme"`` / ``"description"``.
        ValueError: If the last ``/``-separated segment of a ``"scheme"``
            value is not an integer.
    """
    response = await self._http_client.get("schemes")
    payload = await response.json()
    # Built as a concrete dict: the read-only Mapping type is only the
    # public return annotation and does not permit item assignment.
    scheme_ids: dict[str, int] = {}
    for scheme in payload["schemes"]:
        # The "scheme" value is expected to be a "/"-separated reference
        # whose final segment is the numeric scheme ID.
        scheme_id = int(str(scheme["scheme"]).rsplit("/", 1)[-1])
        scheme_ids[scheme["description"]] = scheme_id
    return scheme_ids
async def close(self):
    """Close the HTTP client session held by this profiler."""
    session = self._http_client
    await session.close()

View File

@ -13,7 +13,7 @@ MLST_DATABASES = [
class MLSTProfiler(AbstractAsyncContextManager):
@abstractmethod
def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
    """Yield the alleles matched in ``sequence_string`` under a scheme.

    Abstract: concrete profilers supply the implementation (typically as an
    ``async def`` generator, which is why this declaration is a plain ``def``
    annotated with an async generator return type).

    Args:
        schema_id: Numeric ID of the scheme to match against.
        sequence_string: The sequence to search for alleles.
    """
    pass
@abstractmethod

View File

@ -17,7 +17,7 @@ class PubMLSTProfiler(MLSTProfiler):
self._base_url = f"https://rest.pubmlst.org/db/{database_name}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele]:
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
uri_path = f"schemes/{schema_id}/sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string

View File

@ -6,7 +6,7 @@ from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteu
async def test_profiling_results_in_exact_matches_when_exact():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
exact_matches = dummy_profiler.fetch_mlst_allele_variants(schema_id=3, sequence_string=sequence)
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
async for exact_match in exact_matches:
assert isinstance(exact_match, Allele)
@ -16,7 +16,6 @@ async def test_profiling_results_in_exact_matches_when_exact():
assert len(targets_left) == 0
async def test_profiling_results_in_correct_st():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
dummy_alleles = [
Allele("adk", "1"),
Allele("fumC", "1"),
@ -27,8 +26,7 @@ async def test_profiling_results_in_correct_st():
Allele("pgm", "1"),
]
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_alleles)
mlst_st_data = await dummy_profiler.fetch_mlst_st(3, dummy_alleles)
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == "ST-2 complex"
@ -46,7 +44,7 @@ async def test_sequence_profiling_is_correct():
Allele("pgm", "1"),
]
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
profile = await dummy_profiler.profile_string(sequence)
profile = await dummy_profiler.profile_string(3, sequence)
assert profile is not None
assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex"

View File

@ -0,0 +1,50 @@
from Bio import SeqIO
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler
from automlst.engine.remote.databases.pubmlst.mlst import PubMLSTProfiler
async def test_profiling_results_in_exact_matches_when_exact():
    """Every allele reported for the FDAARGOS_1560 sequence is an expected one.

    Talks to the live PubMLST service. Each yielded allele is removed from the
    expected set (an unexpected or repeated allele raises ``KeyError``), and
    the set must be empty once the stream is exhausted.
    """
    expected_alleles = {
        Allele("adk", "1"),
        Allele("atpG", "1"),
        Allele("frdB", "1"),
        Allele("fucK", "1"),
        Allele("mdh", "1"),
        Allele("pgi", "1"),
        Allele("recA", "5"),
    }
    record = SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta")
    sequence = str(record.seq)
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as profiler:
        matches = profiler.fetch_mlst_allele_variants(schema_id=1, sequence_string=sequence)
        async for match in matches:
            assert isinstance(match, Allele)
            expected_alleles.remove(match)
        assert len(expected_alleles) == 0
async def test_profiling_results_in_correct_st():
    """The expected H. influenzae allele set resolves to ST 3 via PubMLST.

    Talks to the live PubMLST service.
    """
    dummy_alleles = [
        Allele("adk", "1"),
        Allele("atpG", "1"),
        Allele("frdB", "1"),
        Allele("fucK", "1"),
        Allele("mdh", "1"),
        Allele("pgi", "1"),
        Allele("recA", "5"),
    ]
    # This module exercises the PubMLST client, and the database named here is
    # a PubMLST database, so the PubMLST profiler (not the Institut Pasteur
    # one) must be the object under test.
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
        mlst_st_data = await dummy_profiler.fetch_mlst_st(1, dummy_alleles)
        assert mlst_st_data is not None
        assert isinstance(mlst_st_data, MLSTProfile)
        assert mlst_st_data.clonal_complex == "ST-3 complex"
        # NOTE(review): MLSTProfile.sequence_type is annotated as int while
        # this compares against a string -- confirm which type the profiler
        # actually returns.
        assert mlst_st_data.sequence_type == "3"
async def test_sequence_profiling_is_correct():
    """Profiling the FDAARGOS_1560 sequence end to end yields ST 3 via PubMLST.

    Talks to the live PubMLST service.
    """
    # Use the H. influenzae fixture that the allele-matching test in this
    # module profiles against the same database; the B. pertussis FASTA does
    # not belong to this scheme.
    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    # The database is hosted by PubMLST, so the PubMLST profiler is the object
    # under test here.
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
        profile = await dummy_profiler.profile_string(1, sequence)
        assert profile is not None
        assert isinstance(profile, MLSTProfile)
        assert profile.clonal_complex == "ST-3 complex"
        # NOTE(review): MLSTProfile.sequence_type is annotated as int while
        # this compares against a string -- confirm which type the profiler
        # actually returns.
        assert profile.sequence_type == "3"

File diff suppressed because it is too large Load Diff