began adding PubMLST support

This commit is contained in:
Harrison Deng 2025-01-08 16:54:57 +00:00
parent ad28d9bf20
commit d580402523
9 changed files with 23717 additions and 23 deletions

View File

@ -3,23 +3,23 @@ from numbers import Number
from typing import Mapping, Sequence, Set, Union from typing import Mapping, Sequence, Set, Union
@dataclass @dataclass(frozen=True)
class StringAnnotation: class StringAnnotation:
type: str type: str
start: int start: int
end: int end: int
feature_properties: Mapping[str, Set[str]] feature_properties: Mapping[str, Set[str]]
@dataclass @dataclass(frozen=True)
class NamedString: class NamedString:
name: str name: str
sequence: str sequence: str
@dataclass @dataclass(frozen=True)
class AnnotatedString(NamedString): class AnnotatedString(NamedString):
annotations: Sequence[StringAnnotation] annotations: Sequence[StringAnnotation]
@dataclass @dataclass(frozen=True)
class SangerTraceData(NamedString): class SangerTraceData(NamedString):
seq_param_file_name: str seq_param_file_name: str
analysis_proto_settings_name: str analysis_proto_settings_name: str

View File

@ -1,12 +1,12 @@
from dataclasses import dataclass from dataclasses import dataclass
from typing import Mapping, Sequence from typing import Mapping, Sequence
@dataclass @dataclass(frozen=True)
class Allele: class Allele:
allele_loci: str allele_loci: str
allele_variant: str allele_variant: str
@dataclass @dataclass(frozen=True)
class MLSTProfile: class MLSTProfile:
alleles: Mapping[str, Sequence[Allele]] alleles: Mapping[str, Sequence[Allele]]
sequence_type: int sequence_type: int

View File

@ -110,6 +110,6 @@ def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedStri
alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[0] # take the best alignment alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[0] # take the best alignment
return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq) return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq)
async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString]: async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
for sanger_trace in sanger_traces: for sanger_trace in sanger_traces:
yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace))[1] yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace))[1]

View File

@ -1,12 +1,13 @@
from collections import defaultdict from collections import defaultdict
from contextlib import AbstractAsyncContextManager from contextlib import AbstractAsyncContextManager
import re import re
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Sequence, Union from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.mlst import Allele, MLSTProfile from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.genomics import NamedString from automlst.engine.data.genomics import NamedString
from automlst.engine.remote.databases.mlst import MLSTProfiler
class InstitutPasteurProfiler(AbstractAsyncContextManager): class InstitutPasteurProfiler(MLSTProfiler):
async def __aenter__(self): async def __aenter__(self):
return self return self
@ -16,9 +17,9 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/" self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000)) self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]: async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "schemes/3/sequence" uri_path = f"schemes/{schema_id}/sequence"
response = await self._http_client.post(uri_path, json={ response = await self._http_client.post(uri_path, json={
"sequence": sequence_string "sequence": sequence_string
}) })
@ -29,8 +30,8 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
alelle_id = allele["allele_id"] alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id) yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile: async def fetch_mlst_st(self, schema_id: int, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
uri_path = "schemes/3/designations" uri_path = f"schemes/{schema_id}/designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list) allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
if isinstance(alleles, AsyncIterable): if isinstance(alleles, AsyncIterable):
async for allele in alleles: async for allele in alleles:
@ -50,10 +51,20 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"])) allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, string: str) -> MLSTProfile: async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string) alleles = self.fetch_mlst_allele_variants(schema_id, string)
return await self.fetch_mlst_st(alleles) return await self.fetch_mlst_st(schema_id, alleles)
async def get_scheme_ids(self) -> Mapping[str, int]:
    """Fetch the database's schemes and map each description to its ID.

    Sends a GET request to the ``schemes`` endpoint (relative to the
    client's base URL) and reads the ``schemes`` list from the JSON body.

    Returns:
        A mapping from each scheme's ``description`` to the integer found
        in the last path segment of that scheme's ``scheme`` URI.
    """
    uri_path = "schemes"
    # ``async with`` releases the connection even if JSON decoding raises.
    async with self._http_client.get(uri_path) as response:
        response_json = await response.json()
    # A concrete dict is required here: ``Mapping`` is read-only, so
    # annotating the accumulator as ``Mapping`` and assigning into it is
    # rejected by type checkers.
    scheme_ids: dict[str, int] = {}
    for scheme_definition in response_json["schemes"]:
        # The ID is the trailing segment of the scheme URI, e.g. ".../schemes/3".
        scheme_uri = str(scheme_definition["scheme"])
        scheme_ids[scheme_definition["description"]] = int(scheme_uri.rsplit("/", 1)[-1])
    return scheme_ids
async def close(self): async def close(self):
await self._http_client.close() await self._http_client.close()

View File

@ -13,7 +13,7 @@ MLST_DATABASES = [
class MLSTProfiler(AbstractAsyncContextManager): class MLSTProfiler(AbstractAsyncContextManager):
@abstractmethod @abstractmethod
def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele]: def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
pass pass
@abstractmethod @abstractmethod

View File

@ -17,7 +17,7 @@ class PubMLSTProfiler(MLSTProfiler):
self._base_url = f"https://rest.pubmlst.org/db/{database_name}/" self._base_url = f"https://rest.pubmlst.org/db/{database_name}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000)) self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele]: async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
uri_path = f"schemes/{schema_id}/sequence" uri_path = f"schemes/{schema_id}/sequence"
response = await self._http_client.post(uri_path, json={ response = await self._http_client.post(uri_path, json={
"sequence": sequence_string "sequence": sequence_string

View File

@ -6,7 +6,7 @@ from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteu
async def test_profiling_results_in_exact_matches_when_exact(): async def test_profiling_results_in_exact_matches_when_exact():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler: async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence) exact_matches = dummy_profiler.fetch_mlst_allele_variants(schema_id=3, sequence_string=sequence)
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"} targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
async for exact_match in exact_matches: async for exact_match in exact_matches:
assert isinstance(exact_match, Allele) assert isinstance(exact_match, Allele)
@ -16,7 +16,6 @@ async def test_profiling_results_in_exact_matches_when_exact():
assert len(targets_left) == 0 assert len(targets_left) == 0
async def test_profiling_results_in_correct_st(): async def test_profiling_results_in_correct_st():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
dummy_alleles = [ dummy_alleles = [
Allele("adk", "1"), Allele("adk", "1"),
Allele("fumC", "1"), Allele("fumC", "1"),
@ -27,8 +26,7 @@ async def test_profiling_results_in_correct_st():
Allele("pgm", "1"), Allele("pgm", "1"),
] ]
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler: async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence) mlst_st_data = await dummy_profiler.fetch_mlst_st(3, dummy_alleles)
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_alleles)
assert mlst_st_data is not None assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile) assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == "ST-2 complex" assert mlst_st_data.clonal_complex == "ST-2 complex"
@ -46,7 +44,7 @@ async def test_sequence_profiling_is_correct():
Allele("pgm", "1"), Allele("pgm", "1"),
] ]
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler: async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
profile = await dummy_profiler.profile_string(sequence) profile = await dummy_profiler.profile_string(3, sequence)
assert profile is not None assert profile is not None
assert isinstance(profile, MLSTProfile) assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex" assert profile.clonal_complex == "ST-2 complex"

View File

@ -0,0 +1,50 @@
from Bio import SeqIO
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler
from automlst.engine.remote.databases.pubmlst.mlst import PubMLSTProfiler
async def test_profiling_results_in_exact_matches_when_exact():
    """Check that the FDAARGOS_1560 genome yields exactly the expected alleles.

    Each allele streamed back from scheme 1 must be an ``Allele`` that is
    still in the expected set; once the stream ends the set must be empty.
    """
    genome_record = SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta")
    sequence = str(genome_record.seq)
    remaining_alleles = {
        Allele(locus, variant)
        for locus, variant in (
            ("adk", "1"),
            ("atpG", "1"),
            ("frdB", "1"),
            ("fucK", "1"),
            ("mdh", "1"),
            ("pgi", "1"),
            ("recA", "5"),
        )
    }
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as profiler:
        async for found in profiler.fetch_mlst_allele_variants(schema_id=1, sequence_string=sequence):
            assert isinstance(found, Allele)
            # ``remove`` raises KeyError on an unexpected or repeated allele.
            remaining_alleles.remove(found)
        assert not remaining_alleles
async def test_profiling_results_in_correct_st():
    """Check that a known allele set resolves to ST 3 / "ST-3 complex".

    Uses ``PubMLSTProfiler``: this module tests the PubMLST backend, and
    ``pubmlst_hinfluenzae_seqdef`` is the database name the other PubMLST
    test in this module queries through that profiler.
    """
    dummy_alleles = [
        Allele("adk", "1"),
        Allele("atpG", "1"),
        Allele("frdB", "1"),
        Allele("fucK", "1"),
        Allele("mdh", "1"),
        Allele("pgi", "1"),
        Allele("recA", "5"),
    ]
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
        mlst_st_data = await dummy_profiler.fetch_mlst_st(1, dummy_alleles)
        assert mlst_st_data is not None
        assert isinstance(mlst_st_data, MLSTProfile)
        assert mlst_st_data.clonal_complex == "ST-3 complex"
        # NOTE(review): compared as a string although MLSTProfile annotates
        # sequence_type as int - confirm which type the profiler returns.
        assert mlst_st_data.sequence_type == "3"
async def test_sequence_profiling_is_correct():
    """Check end-to-end profiling of the FDAARGOS_1560 genome via PubMLST.

    The fixture must match the database being queried: the alleles the
    first test in this module expects for ``FDAARGOS_1560.fasta`` are the
    same set the ST test maps to ST 3 / "ST-3 complex", so that genome is
    profiled here rather than the Bordetella pertussis fixture.
    """
    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
        profile = await dummy_profiler.profile_string(1, sequence)
        assert profile is not None
        assert isinstance(profile, MLSTProfile)
        assert profile.clonal_complex == "ST-3 complex"
        # NOTE(review): compared as a string although MLSTProfile annotates
        # sequence_type as int - confirm which type the profiler returns.
        assert profile.sequence_type == "3"

File diff suppressed because it is too large Load Diff