Added unit tests for pubMLST MLST profiling

This commit is contained in:
Harrison Deng 2025-01-08 17:34:30 +00:00
parent 424beeb559
commit 645357ac58
5 changed files with 110 additions and 105 deletions

View File

@ -1,7 +1,7 @@
import asyncio
from numbers import Number
from os import path
from typing import AsyncGenerator, Collection, Sequence, Union
from typing import Any, AsyncGenerator, Collection, Sequence, Union
from automlst.engine.data.genomics import NamedString, SangerTraceData
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, Align
@ -21,95 +21,98 @@ async def read_abif(seq_path: str) -> SangerTraceData:
biopython_annotations = biopython_seq.annotations
# Lots of type-ignore markers below, since Biopython does not define type annotations.
biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore
biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore
trace_data = SangerTraceData(
path.basename(seq_path),
biopython_seq.seq,
biopython_abif_raw.get("APFN2"), # type: ignore
biopython_abif_raw.get("APrN1"), # type: ignore
biopython_abif_raw.get("APrV1"), # type: ignore
biopython_abif_raw.get("APrX1"), # type: ignore
biopython_abif_raw.get("APXV1"), # type: ignore
biopython_abif_raw.get("CMNT1"), # type: ignore
biopython_abif_raw.get("CpEP1"), # type: ignore
biopython_abif_raw.get("CTID1"), # type: ignore
biopython_abif_raw.get("CTNM1"), # type: ignore
biopython_abif_raw.get("CTTL1"), # type: ignore
biopython_abif_raw.get("DATA1"), # type: ignore
biopython_abif_raw.get("DATA2"), # type: ignore
biopython_abif_raw.get("DATA3"), # type: ignore
biopython_abif_raw.get("DATA4"), # type: ignore
biopython_abif_raw.get("DATA5"), # type: ignore
biopython_abif_raw.get("DATA6"), # type: ignore
biopython_abif_raw.get("DATA7"), # type: ignore
biopython_abif_raw.get("DATA8"), # type: ignore
biopython_abif_raw.get("DSam1"), # type: ignore
biopython_abif_raw.get("DyeN1"), # type: ignore
biopython_abif_raw.get("DyeN2"), # type: ignore
biopython_abif_raw.get("DyeN3"), # type: ignore
biopython_abif_raw.get("DyeN4"), # type: ignore
biopython_abif_raw.get("DyeW1"), # type: ignore
biopython_abif_raw.get("DyeW2"), # type: ignore
biopython_abif_raw.get("DyeW3"), # type: ignore
biopython_abif_raw.get("DyeW4"), # type: ignore
biopython_abif_raw.get("DySN1"), # type: ignore
biopython_abif_raw.get("EPVt1"), # type: ignore
biopython_abif_raw.get("EVNT1"), # type: ignore
biopython_abif_raw.get("EVNT2"), # type: ignore
biopython_abif_raw.get("EVNT3"), # type: ignore
biopython_abif_raw.get("EVNT4"), # type: ignore
biopython_abif_raw.get("FWO_1"), # type: ignore
biopython_abif_raw.get("GTyp1"), # type: ignore
biopython_abif_raw.get("InSc1"), # type: ignore
biopython_abif_raw.get("InVt1"), # type: ignore
biopython_abif_raw.get("LANE1"), # type: ignore
biopython_abif_raw.get("LIMS1"), # type: ignore
biopython_abif_raw.get("LNTD1"), # type: ignore
biopython_abif_raw.get("LsrP1"), # type: ignore
biopython_abif_raw.get("MCHN1"), # type: ignore
biopython_abif_raw.get("MODF1"), # type: ignore
biopython_abif_raw.get("MODL1"), # type: ignore
biopython_abif_raw.get("NAVG1"), # type: ignore
biopython_abif_raw.get("NLNE1"), # type: ignore
biopython_abif_raw.get("OfSc1"), # type: ignore
biopython_abif_raw.get("PDMF1"), # type: ignore
biopython_abif_raw.get("PXLB1"), # type: ignore
biopython_abif_raw.get("RGCm1"), # type: ignore
biopython_abif_raw.get("RGNm1"), # type: ignore
biopython_abif_raw.get("RMdV1"), # type: ignore
biopython_abif_raw.get("RMdX1"), # type: ignore
biopython_abif_raw.get("RMXV1"), # type: ignore
biopython_abif_raw.get("RPrN1"), # type: ignore
biopython_abif_raw.get("RPrV1"), # type: ignore
biopython_abif_raw.get("RUND1"), # type: ignore
biopython_abif_raw.get("RUND2"), # type: ignore
biopython_abif_raw.get("RUND3"), # type: ignore
biopython_abif_raw.get("RUND4"), # type: ignore
biopython_abif_raw.get("RunN1"), # type: ignore
biopython_abif_raw.get("RUNT1"), # type: ignore
biopython_abif_raw.get("RUNT2"), # type: ignore
biopython_abif_raw.get("RUNT3"), # type: ignore
biopython_abif_raw.get("RUNT4"), # type: ignore
biopython_abif_raw.get("Satd"), # type: ignore
biopython_abif_raw.get("Scal1"), # type: ignore
biopython_abif_raw.get("SCAN1"), # type: ignore
biopython_abif_raw.get("SMED1"), # type: ignore
biopython_abif_raw.get("SMLt"), # type: ignore
biopython_abif_raw.get("SMPL1"), # type: ignore
biopython_abif_raw.get("SVER1"), # type: ignore
biopython_abif_raw.get("SVER3"), # type: ignore
biopython_abif_raw.get("Tmpr1"), # type: ignore
biopython_abif_raw.get("TUBE"), # type: ignore
biopython_abif_raw.get("User") # type: ignore
biopython_abif_raw.get("APFN2"), # type: ignore
biopython_abif_raw.get("APrN1"), # type: ignore
biopython_abif_raw.get("APrV1"), # type: ignore
biopython_abif_raw.get("APrX1"), # type: ignore
biopython_abif_raw.get("APXV1"), # type: ignore
biopython_abif_raw.get("CMNT1"), # type: ignore
biopython_abif_raw.get("CpEP1"), # type: ignore
biopython_abif_raw.get("CTID1"), # type: ignore
biopython_abif_raw.get("CTNM1"), # type: ignore
biopython_abif_raw.get("CTTL1"), # type: ignore
biopython_abif_raw.get("DATA1"), # type: ignore
biopython_abif_raw.get("DATA2"), # type: ignore
biopython_abif_raw.get("DATA3"), # type: ignore
biopython_abif_raw.get("DATA4"), # type: ignore
biopython_abif_raw.get("DATA5"), # type: ignore
biopython_abif_raw.get("DATA6"), # type: ignore
biopython_abif_raw.get("DATA7"), # type: ignore
biopython_abif_raw.get("DATA8"), # type: ignore
biopython_abif_raw.get("DSam1"), # type: ignore
biopython_abif_raw.get("DyeN1"), # type: ignore
biopython_abif_raw.get("DyeN2"), # type: ignore
biopython_abif_raw.get("DyeN3"), # type: ignore
biopython_abif_raw.get("DyeN4"), # type: ignore
biopython_abif_raw.get("DyeW1"), # type: ignore
biopython_abif_raw.get("DyeW2"), # type: ignore
biopython_abif_raw.get("DyeW3"), # type: ignore
biopython_abif_raw.get("DyeW4"), # type: ignore
biopython_abif_raw.get("DySN1"), # type: ignore
biopython_abif_raw.get("EPVt1"), # type: ignore
biopython_abif_raw.get("EVNT1"), # type: ignore
biopython_abif_raw.get("EVNT2"), # type: ignore
biopython_abif_raw.get("EVNT3"), # type: ignore
biopython_abif_raw.get("EVNT4"), # type: ignore
biopython_abif_raw.get("FWO_1"), # type: ignore
biopython_abif_raw.get("GTyp1"), # type: ignore
biopython_abif_raw.get("InSc1"), # type: ignore
biopython_abif_raw.get("InVt1"), # type: ignore
biopython_abif_raw.get("LANE1"), # type: ignore
biopython_abif_raw.get("LIMS1"), # type: ignore
biopython_abif_raw.get("LNTD1"), # type: ignore
biopython_abif_raw.get("LsrP1"), # type: ignore
biopython_abif_raw.get("MCHN1"), # type: ignore
biopython_abif_raw.get("MODF1"), # type: ignore
biopython_abif_raw.get("MODL1"), # type: ignore
biopython_abif_raw.get("NAVG1"), # type: ignore
biopython_abif_raw.get("NLNE1"), # type: ignore
biopython_abif_raw.get("OfSc1"), # type: ignore
biopython_abif_raw.get("PDMF1"), # type: ignore
biopython_abif_raw.get("PXLB1"), # type: ignore
biopython_abif_raw.get("RGCm1"), # type: ignore
biopython_abif_raw.get("RGNm1"), # type: ignore
biopython_abif_raw.get("RMdV1"), # type: ignore
biopython_abif_raw.get("RMdX1"), # type: ignore
biopython_abif_raw.get("RMXV1"), # type: ignore
biopython_abif_raw.get("RPrN1"), # type: ignore
biopython_abif_raw.get("RPrV1"), # type: ignore
biopython_abif_raw.get("RUND1"), # type: ignore
biopython_abif_raw.get("RUND2"), # type: ignore
biopython_abif_raw.get("RUND3"), # type: ignore
biopython_abif_raw.get("RUND4"), # type: ignore
biopython_abif_raw.get("RunN1"), # type: ignore
biopython_abif_raw.get("RUNT1"), # type: ignore
biopython_abif_raw.get("RUNT2"), # type: ignore
biopython_abif_raw.get("RUNT3"), # type: ignore
biopython_abif_raw.get("RUNT4"), # type: ignore
biopython_abif_raw.get("Satd"), # type: ignore
biopython_abif_raw.get("Scal1"), # type: ignore
biopython_abif_raw.get("SCAN1"), # type: ignore
biopython_abif_raw.get("SMED1"), # type: ignore
biopython_abif_raw.get("SMLt"), # type: ignore
biopython_abif_raw.get("SMPL1"), # type: ignore
biopython_abif_raw.get("SVER1"), # type: ignore
biopython_abif_raw.get("SVER3"), # type: ignore
biopython_abif_raw.get("Tmpr1"), # type: ignore
biopython_abif_raw.get("TUBE"), # type: ignore
biopython_abif_raw.get("User") # type: ignore
)
return trace_data
# Aligns `query.sequence` against `reference.sequence` with a Biopython
# PairwiseAligner constructed with scoring="blastn" and switched to "local"
# mode, then returns a tuple of two NamedString objects built from the `id`
# and `seq` of `alignment_result.sequences[0]` and `sequences[1]` respectively.
# NOTE(review): this span comes from a rendered diff. The two assignments to
# `alignment_result` below look like the pre- and post-commit forms of a single
# statement (reformatted only), not two separate steps -- confirm against the
# actual file.
def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedString) -> tuple[NamedString, NamedString]:
aligner = Align.PairwiseAligner(scoring="blastn")
aligner.mode = "local"
# sorted() consumes every alignment the aligner yields before index 0 is taken.
alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[0] # take the best alignment
alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[
0] # take the best alignment
# NOTE(review): reads `.id` and `.seq` from each entry of `sequences`; presumably
# the `.sequence` values passed in above expose those attributes -- TODO confirm.
return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq)
# Async generator: for each trace in `sanger_traces`, runs
# `_biopython_local_pairwise_alignment(reference, sanger_trace)` through
# `asyncio.to_thread` and yields element [1] of the tuple it returns.
# NOTE(review): this span comes from a rendered diff. The two identical yield
# lines look like the old and new sides of a whitespace-only change; read
# literally, each trace would be aligned and yielded twice -- confirm against
# the actual file.
async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
for sanger_trace in sanger_traces:
yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace))[1]
yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace))[1]

View File

@ -30,15 +30,11 @@ class InstitutPasteurProfiler(MLSTProfiler):
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
async def fetch_mlst_st(self, schema_id: int, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
uri_path = f"schemes/{schema_id}/designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
if isinstance(alleles, AsyncIterable):
async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
else:
for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
response = await self._http_client.post(uri_path, json={
"designations": allele_request_dict
})

View File

@ -1,6 +1,6 @@
from abc import abstractmethod
from contextlib import AbstractAsyncContextManager
from typing import AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Union
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Union
from aiohttp import ClientSession

View File

@ -16,7 +16,8 @@ async def test_profiling_results_in_exact_matches_when_exact():
assert len(targets_left) == 0
async def test_profiling_results_in_correct_st():
dummy_alleles = [
async def dummy_allele_generator():
dummy_alleles = [
Allele("adk", "1"),
Allele("fumC", "1"),
Allele("glyA", "1"),
@ -24,9 +25,11 @@ async def test_profiling_results_in_correct_st():
Allele("icd", "1"),
Allele("pepA", "1"),
Allele("pgm", "1"),
]
]
for dummy_allele in dummy_alleles:
yield dummy_allele
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(3, dummy_alleles)
mlst_st_data = await dummy_profiler.fetch_mlst_st(3, dummy_allele_generator())
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == "ST-2 complex"

View File

@ -1,6 +1,6 @@
import asyncio
from Bio import SeqIO
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler
from automlst.engine.remote.databases.pubmlst.mlst import PubMLSTProfiler
@ -24,25 +24,28 @@ async def test_profiling_results_in_exact_matches_when_exact():
assert len(dummy_alleles) == 0
# Test: calls `fetch_mlst_st` for scheme 1 of the "pubmlst_hinfluenzae_seqdef"
# database with the seven alleles listed below and asserts that the result is
# an MLSTProfile whose clonal complex is "ST-3 complex" and whose sequence
# type is "3".
# NOTE(review): this span comes from a rendered diff. The plain `dummy_alleles`
# list passed to InstitutPasteurProfiler appears to be the pre-commit version,
# and the `generate_dummy_targets` async generator passed to PubMLSTProfiler
# the post-commit version -- confirm against the actual file.
async def test_profiling_results_in_correct_st():
dummy_alleles = [
Allele("adk", "1"),
Allele("atpG", "1"),
Allele("frdB", "1"),
Allele("fucK", "1"),
Allele("mdh", "1"),
Allele("pgi", "1"),
Allele("recA", "5"),
]
async with InstitutPasteurProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(1, dummy_alleles)
# Wraps the same allele list in an async generator so it can be handed to
# `fetch_mlst_st` as an async iterable.
async def generate_dummy_targets():
dummy_alleles = [
Allele("adk", "1"),
Allele("atpG", "1"),
Allele("frdB", "1"),
Allele("fucK", "1"),
Allele("mdh", "1"),
Allele("pgi", "1"),
Allele("recA", "5"),
]
for dummy_allele in dummy_alleles:
yield dummy_allele
async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(1, generate_dummy_targets())
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == "ST-3 complex"
assert mlst_st_data.sequence_type == "3"
async def test_sequence_profiling_is_correct():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with InstitutPasteurProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
profile = await dummy_profiler.profile_string(1, sequence)
assert profile is not None
assert isinstance(profile, MLSTProfile)