began adding PubMLST support
This commit is contained in:
parent
ad28d9bf20
commit
d580402523
@ -3,23 +3,23 @@ from numbers import Number
|
||||
from typing import Mapping, Sequence, Set, Union
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class StringAnnotation:
    """Immutable record describing one annotation attached to a string.

    The dataclass is frozen, so fields cannot be reassigned after
    construction.
    """

    # Label for the kind of annotation (the vocabulary is not visible here).
    type: str
    # Bounds of the annotated span.
    # NOTE(review): index base and end-inclusivity are not shown - confirm.
    start: int
    end: int
    # Property name -> set of values recorded for that property.
    # NOTE: hash() of an instance only works if this mapping is itself
    # hashable, because the generated __hash__ covers every field.
    feature_properties: Mapping[str, Set[str]]
|
||||
|
||||
@dataclass(frozen=True)
class NamedString:
    """Immutable pairing of a name with its sequence text.

    Frozen, so instances are hashable and cannot be modified in place.
    """

    # Identifier for the sequence.
    name: str
    # The sequence content itself.
    sequence: str
|
||||
|
||||
@dataclass(frozen=True)
class AnnotatedString(NamedString):
    """A ``NamedString`` that also carries a sequence of annotations.

    Frozen like its base class; a non-frozen dataclass cannot inherit from
    a frozen one, so both must agree.
    """

    # Annotations attached to this string.
    annotations: Sequence[StringAnnotation]
|
||||
|
||||
@dataclass(frozen=True)
class SangerTraceData(NamedString):
    """A ``NamedString`` extended with two Sanger trace metadata fields.

    Frozen like its base class; a non-frozen dataclass cannot inherit from
    a frozen one, so both must agree.
    """

    # NOTE(review): presumably the sequencing parameter file recorded in the
    # trace - confirm against the code that builds these objects.
    seq_param_file_name: str
    # NOTE(review): presumably the analysis protocol settings name recorded
    # in the trace - confirm against the code that builds these objects.
    analysis_proto_settings_name: str
|
||||
|
@ -1,12 +1,12 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Mapping, Sequence
|
||||
|
||||
@dataclass(frozen=True)
class Allele:
    """Immutable (locus, variant) pair identifying one allele.

    Frozen so that instances are hashable and can be stored in sets or
    used as dictionary keys.
    """

    # Name of the locus, e.g. "adk".
    allele_loci: str
    # Variant identifier at that locus, kept as a string, e.g. "1".
    allele_variant: str
|
||||
|
||||
@dataclass
|
||||
@dataclass(frozen=True)
|
||||
class MLSTProfile:
|
||||
alleles: Mapping[str, Sequence[Allele]]
|
||||
sequence_type: int
|
||||
|
@ -110,6 +110,6 @@ def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedStri
|
||||
alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[0] # take the best alignment
|
||||
return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq)
|
||||
|
||||
async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
    """Align each Sanger trace to the reference and yield the aligned query.

    Args:
        reference: The string every trace is aligned against.
        sanger_traces: The traces to align; processed one at a time, in
            iteration order.

    Yields:
        For each trace, the second element of the pair returned by
        ``_biopython_local_pairwise_alignment``.
    """
    # The return annotation spells out both type parameters because
    # ``AsyncGenerator[X]`` raises at definition time before Python 3.13.
    for sanger_trace in sanger_traces:
        # The alignment is blocking work, so it runs in a worker thread to
        # keep the event loop responsive.
        aligned_pair = await asyncio.to_thread(
            _biopython_local_pairwise_alignment, reference, sanger_trace
        )
        yield aligned_pair[1]
|
@ -1,12 +1,13 @@
|
||||
from collections import defaultdict
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
import re
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Sequence, Union
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
from automlst.engine.remote.databases.mlst import MLSTProfiler
|
||||
|
||||
class InstitutPasteurProfiler(AbstractAsyncContextManager):
|
||||
class InstitutPasteurProfiler(MLSTProfiler):
|
||||
|
||||
async def __aenter__(self):
    """Enter the ``async with`` block, handing back this same object."""
    entered = self
    return entered
|
||||
@ -16,9 +17,9 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
|
||||
self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
|
||||
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
|
||||
|
||||
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
|
||||
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
|
||||
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
|
||||
uri_path = "schemes/3/sequence"
|
||||
uri_path = f"schemes/{schema_id}/sequence"
|
||||
response = await self._http_client.post(uri_path, json={
|
||||
"sequence": sequence_string
|
||||
})
|
||||
@ -29,8 +30,8 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
|
||||
alelle_id = allele["allele_id"]
|
||||
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
|
||||
|
||||
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
|
||||
uri_path = "schemes/3/designations"
|
||||
async def fetch_mlst_st(self, schema_id: int, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
|
||||
uri_path = f"schemes/{schema_id}/designations"
|
||||
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
|
||||
if isinstance(alleles, AsyncIterable):
|
||||
async for allele in alleles:
|
||||
@ -50,10 +51,20 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
|
||||
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
|
||||
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
|
||||
|
||||
async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
    """Profile a sequence against one scheme in a single call.

    Args:
        schema_id: Numeric ID of the scheme to query; forwarded to both
            lookups so they target the same scheme.
        string: The sequence to profile.

    Returns:
        The ``MLSTProfile`` produced by ``fetch_mlst_st`` for the alleles
        found in ``string``.
    """
    # Deliberately not awaited: this is an async generator, which
    # fetch_mlst_st consumes as it iterates.
    alleles = self.fetch_mlst_allele_variants(schema_id, string)
    return await self.fetch_mlst_st(schema_id, alleles)
|
||||
|
||||
async def get_scheme_ids(self) -> Mapping[str, int]:
    """Fetch the database's schemes and map each description to its ID.

    Returns:
        A mapping of scheme description to numeric scheme ID. The ID is
        parsed from the last path segment of each entry's ``scheme`` value.
        If two schemes share a description, the later entry wins.

    Raises:
        KeyError: If the response lacks ``schemes``, or an entry lacks
            ``scheme`` or ``description``.
        ValueError: If the last path segment of a scheme is not an integer.
    """
    # ``async with`` releases the response even if JSON decoding fails.
    async with self._http_client.get("schemes") as response:
        response_json = await response.json()

    scheme_ids: dict[str, int] = {}
    for scheme_definition in response_json["schemes"]:
        # Tolerate a trailing slash so the final segment is never empty.
        scheme_url = str(scheme_definition["scheme"]).rstrip("/")
        scheme_id = int(scheme_url.rsplit("/", 1)[-1])
        scheme_ids[scheme_definition["description"]] = scheme_id
    return scheme_ids
|
||||
|
||||
async def close(self):
    """Close the HTTP client session held by this profiler."""
    session = self._http_client
    await session.close()
|
||||
|
@ -13,7 +13,7 @@ MLST_DATABASES = [
|
||||
|
||||
class MLSTProfiler(AbstractAsyncContextManager):
|
||||
@abstractmethod
|
||||
def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele]:
|
||||
def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
|
@ -17,7 +17,7 @@ class PubMLSTProfiler(MLSTProfiler):
|
||||
self._base_url = f"https://rest.pubmlst.org/db/{database_name}/"
|
||||
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
|
||||
|
||||
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele]:
|
||||
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
|
||||
uri_path = f"schemes/{schema_id}/sequence"
|
||||
response = await self._http_client.post(uri_path, json={
|
||||
"sequence": sequence_string
|
||||
|
@ -6,7 +6,7 @@ from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteu
|
||||
async def test_profiling_results_in_exact_matches_when_exact():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(schema_id=3, sequence_string=sequence)
|
||||
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
|
||||
async for exact_match in exact_matches:
|
||||
assert isinstance(exact_match, Allele)
|
||||
@ -16,7 +16,6 @@ async def test_profiling_results_in_exact_matches_when_exact():
|
||||
assert len(targets_left) == 0
|
||||
|
||||
async def test_profiling_results_in_correct_st():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1"),
|
||||
Allele("fumC", "1"),
|
||||
@ -27,8 +26,7 @@ async def test_profiling_results_in_correct_st():
|
||||
Allele("pgm", "1"),
|
||||
]
|
||||
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
||||
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_alleles)
|
||||
mlst_st_data = await dummy_profiler.fetch_mlst_st(3, dummy_alleles)
|
||||
assert mlst_st_data is not None
|
||||
assert isinstance(mlst_st_data, MLSTProfile)
|
||||
assert mlst_st_data.clonal_complex == "ST-2 complex"
|
||||
@ -46,7 +44,7 @@ async def test_sequence_profiling_is_correct():
|
||||
Allele("pgm", "1"),
|
||||
]
|
||||
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
|
||||
profile = await dummy_profiler.profile_string(sequence)
|
||||
profile = await dummy_profiler.profile_string(3, sequence)
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
|
@ -0,0 +1,50 @@
|
||||
from Bio import SeqIO
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler
|
||||
from automlst.engine.remote.databases.pubmlst.mlst import PubMLSTProfiler
|
||||
|
||||
|
||||
async def test_profiling_results_in_exact_matches_when_exact():
    """Every allele reported for FDAARGOS_1560 is expected, and none is missing."""
    genome_record = SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta")
    sequence = str(genome_record.seq)
    remaining = {
        Allele("adk", "1"),
        Allele("atpG", "1"),
        Allele("frdB", "1"),
        Allele("fucK", "1"),
        Allele("mdh", "1"),
        Allele("pgi", "1"),
        Allele("recA", "5"),
    }
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as profiler:
        async for found in profiler.fetch_mlst_allele_variants(schema_id=1, sequence_string=sequence):
            assert isinstance(found, Allele)
            # set.remove raises KeyError for an allele that was not expected,
            # which fails the test.
            remaining.remove(found)

    # Anything left over was expected but never reported.
    assert not remaining
|
||||
|
||||
async def test_profiling_results_in_correct_st():
    """A known H. influenzae allele set resolves to ST 3 / "ST-3 complex"."""
    dummy_alleles = [
        Allele("adk", "1"),
        Allele("atpG", "1"),
        Allele("frdB", "1"),
        Allele("fucK", "1"),
        Allele("mdh", "1"),
        Allele("pgi", "1"),
        Allele("recA", "5"),
    ]
    # This module tests the PubMLST backend, and pubmlst_hinfluenzae_seqdef is
    # the database the PubMLST test above queries, so use PubMLSTProfiler
    # rather than the Institut Pasteur client.
    # NOTE(review): assumes PubMLSTProfiler implements fetch_mlst_st - confirm.
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
        mlst_st_data = await dummy_profiler.fetch_mlst_st(1, dummy_alleles)
        assert mlst_st_data is not None
        assert isinstance(mlst_st_data, MLSTProfile)
        assert mlst_st_data.clonal_complex == "ST-3 complex"
        assert mlst_st_data.sequence_type == "3"
|
||||
|
||||
async def test_sequence_profiling_is_correct():
    """Profiling the FDAARGOS_1560 genome end to end yields ST 3 / "ST-3 complex"."""
    # Use the H. influenzae genome: the expected profile belongs to this
    # database, not to the B. pertussis FASTA used by the Institut Pasteur tests.
    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    # This module tests the PubMLST backend, so use its client.
    # NOTE(review): assumes PubMLSTProfiler implements profile_string - confirm.
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
        profile = await dummy_profiler.profile_string(1, sequence)
        assert profile is not None
        assert isinstance(profile, MLSTProfile)
        assert profile.clonal_complex == "ST-3 complex"
        assert profile.sequence_type == "3"
|
23635
tests/resources/FDAARGOS_1560.fasta
Normal file
23635
tests/resources/FDAARGOS_1560.fasta
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user