From 645357ac5855af9458419bdbfb67febcf88c1cc1 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Wed, 8 Jan 2025 17:34:30 +0000 Subject: [PATCH] Added unit tests for pubMLST MLST profiling --- src/automlst/engine/local/abif.py | 163 +++++++++--------- .../remote/databases/institutpasteur/mlst.py | 10 +- src/automlst/engine/remote/databases/mlst.py | 2 +- .../databases/institutpasteur/test_mlst.py | 9 +- .../remote/databases/pubmlst/test_mlst.py | 31 ++-- 5 files changed, 110 insertions(+), 105 deletions(-) diff --git a/src/automlst/engine/local/abif.py b/src/automlst/engine/local/abif.py index e4d2bf0..0e6637f 100644 --- a/src/automlst/engine/local/abif.py +++ b/src/automlst/engine/local/abif.py @@ -1,7 +1,7 @@ import asyncio from numbers import Number from os import path -from typing import AsyncGenerator, Collection, Sequence, Union +from typing import Any, AsyncGenerator, Collection, Sequence, Union from automlst.engine.data.genomics import NamedString, SangerTraceData from Bio.SeqRecord import SeqRecord from Bio import SeqIO, Align @@ -21,95 +21,98 @@ async def read_abif(seq_path: str) -> SangerTraceData: biopython_annotations = biopython_seq.annotations # Lot of type ignoring since Biopython did not define their typing. - biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore + biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore trace_data = SangerTraceData( path.basename(seq_path), biopython_seq.seq, - biopython_abif_raw.get("APFN2"), # type: ignore - biopython_abif_raw.get("APrN1"), # type: ignore - biopython_abif_raw.get("APrV1"), # type: ignore - biopython_abif_raw.get("APrX1"), # type: ignore - biopython_abif_raw.get("APXV1"), # type: ignore - biopython_abif_raw.get("CMNT1"), # type: ignore - biopython_abif_raw.get("CpEP1"), # type: ignore - biopython_abif_raw.get("CTID1"), # type: ignore - biopython_abif_raw.get("CTNM1"), # type: ignore - biopython_abif_raw.get("CTTL1"), # type: ignore - biopython_abif_raw.get("DATA1"), # type: ignore - biopython_abif_raw.get("DATA2"), # type: ignore - biopython_abif_raw.get("DATA3"), # type: ignore - biopython_abif_raw.get("DATA4"), # type: ignore - biopython_abif_raw.get("DATA5"), # type: ignore - biopython_abif_raw.get("DATA6"), # type: ignore - biopython_abif_raw.get("DATA7"), # type: ignore - biopython_abif_raw.get("DATA8"), # type: ignore - biopython_abif_raw.get("DSam1"), # type: ignore - biopython_abif_raw.get("DyeN1"), # type: ignore - biopython_abif_raw.get("DyeN2"), # type: ignore - biopython_abif_raw.get("DyeN3"), # type: ignore - biopython_abif_raw.get("DyeN4"), # type: ignore - biopython_abif_raw.get("DyeW1"), # type: ignore - biopython_abif_raw.get("DyeW2"), # type: ignore - biopython_abif_raw.get("DyeW3"), # type: ignore - biopython_abif_raw.get("DyeW4"), # type: ignore - biopython_abif_raw.get("DySN1"), # type: ignore - biopython_abif_raw.get("EPVt1"), # type: ignore - biopython_abif_raw.get("EVNT1"), # type: ignore - biopython_abif_raw.get("EVNT2"), # type: ignore - biopython_abif_raw.get("EVNT3"), # type: ignore - biopython_abif_raw.get("EVNT4"), # type: ignore - biopython_abif_raw.get("FWO_1"), # type: ignore - biopython_abif_raw.get("GTyp1"), # type: ignore - biopython_abif_raw.get("InSc1"), # type: ignore - biopython_abif_raw.get("InVt1"), # type: ignore - biopython_abif_raw.get("LANE1"), # type: ignore - biopython_abif_raw.get("LIMS1"), # type: ignore - biopython_abif_raw.get("LNTD1"), # type: ignore - biopython_abif_raw.get("LsrP1"), # type: ignore - biopython_abif_raw.get("MCHN1"), # type: ignore - biopython_abif_raw.get("MODF1"), # type: ignore - biopython_abif_raw.get("MODL1"), # type: ignore - biopython_abif_raw.get("NAVG1"), # type: ignore - biopython_abif_raw.get("NLNE1"), # type: ignore - biopython_abif_raw.get("OfSc1"), # type: ignore - biopython_abif_raw.get("PDMF1"), # type: ignore - biopython_abif_raw.get("PXLB1"), # type: ignore - biopython_abif_raw.get("RGCm1"), # type: ignore - biopython_abif_raw.get("RGNm1"), # type: ignore - biopython_abif_raw.get("RMdV1"), # type: ignore - biopython_abif_raw.get("RMdX1"), # type: ignore - biopython_abif_raw.get("RMXV1"), # type: ignore - biopython_abif_raw.get("RPrN1"), # type: ignore - biopython_abif_raw.get("RPrV1"), # type: ignore - biopython_abif_raw.get("RUND1"), # type: ignore - biopython_abif_raw.get("RUND2"), # type: ignore - biopython_abif_raw.get("RUND3"), # type: ignore - biopython_abif_raw.get("RUND4"), # type: ignore - biopython_abif_raw.get("RunN1"), # type: ignore - biopython_abif_raw.get("RUNT1"), # type: ignore - biopython_abif_raw.get("RUNT2"), # type: ignore - biopython_abif_raw.get("RUNT3"), # type: ignore - biopython_abif_raw.get("RUNT4"), # type: ignore - biopython_abif_raw.get("Satd"), # type: ignore - biopython_abif_raw.get("Scal1"), # type: ignore - biopython_abif_raw.get("SCAN1"), # type: ignore - biopython_abif_raw.get("SMED1"), # type: ignore - biopython_abif_raw.get("SMLt"), # type: ignore - biopython_abif_raw.get("SMPL1"), # type: ignore - biopython_abif_raw.get("SVER1"), # type: ignore - biopython_abif_raw.get("SVER3"), # type: ignore - biopython_abif_raw.get("Tmpr1"), # type: ignore - biopython_abif_raw.get("TUBE"), # type: ignore - biopython_abif_raw.get("User") # type: ignore + biopython_abif_raw.get("APFN2"), # type: ignore + biopython_abif_raw.get("APrN1"), # type: ignore + biopython_abif_raw.get("APrV1"), # type: ignore + biopython_abif_raw.get("APrX1"), # type: ignore + biopython_abif_raw.get("APXV1"), # type: ignore + biopython_abif_raw.get("CMNT1"), # type: ignore + biopython_abif_raw.get("CpEP1"), # type: ignore + biopython_abif_raw.get("CTID1"), # type: ignore + biopython_abif_raw.get("CTNM1"), # type: ignore + biopython_abif_raw.get("CTTL1"), # type: ignore + biopython_abif_raw.get("DATA1"), # type: ignore + biopython_abif_raw.get("DATA2"), # type: ignore + biopython_abif_raw.get("DATA3"), # type: ignore + biopython_abif_raw.get("DATA4"), # type: ignore + biopython_abif_raw.get("DATA5"), # type: ignore + biopython_abif_raw.get("DATA6"), # type: ignore + biopython_abif_raw.get("DATA7"), # type: ignore + biopython_abif_raw.get("DATA8"), # type: ignore + biopython_abif_raw.get("DSam1"), # type: ignore + biopython_abif_raw.get("DyeN1"), # type: ignore + biopython_abif_raw.get("DyeN2"), # type: ignore + biopython_abif_raw.get("DyeN3"), # type: ignore + biopython_abif_raw.get("DyeN4"), # type: ignore + biopython_abif_raw.get("DyeW1"), # type: ignore + biopython_abif_raw.get("DyeW2"), # type: ignore + biopython_abif_raw.get("DyeW3"), # type: ignore + biopython_abif_raw.get("DyeW4"), # type: ignore + biopython_abif_raw.get("DySN1"), # type: ignore + biopython_abif_raw.get("EPVt1"), # type: ignore + biopython_abif_raw.get("EVNT1"), # type: ignore + biopython_abif_raw.get("EVNT2"), # type: ignore + biopython_abif_raw.get("EVNT3"), # type: ignore + biopython_abif_raw.get("EVNT4"), # type: ignore + biopython_abif_raw.get("FWO_1"), # type: ignore + biopython_abif_raw.get("GTyp1"), # type: ignore + biopython_abif_raw.get("InSc1"), # type: ignore + biopython_abif_raw.get("InVt1"), # type: ignore + biopython_abif_raw.get("LANE1"), # type: ignore + biopython_abif_raw.get("LIMS1"), # type: ignore + biopython_abif_raw.get("LNTD1"), # type: ignore + biopython_abif_raw.get("LsrP1"), # type: ignore + biopython_abif_raw.get("MCHN1"), # type: ignore + biopython_abif_raw.get("MODF1"), # type: ignore + biopython_abif_raw.get("MODL1"), # type: ignore + biopython_abif_raw.get("NAVG1"), # type: ignore + biopython_abif_raw.get("NLNE1"), # type: ignore + biopython_abif_raw.get("OfSc1"), # type: ignore + biopython_abif_raw.get("PDMF1"), # type: ignore + biopython_abif_raw.get("PXLB1"), # type: ignore + biopython_abif_raw.get("RGCm1"), # type: ignore + biopython_abif_raw.get("RGNm1"), # type: ignore + biopython_abif_raw.get("RMdV1"), # type: ignore + biopython_abif_raw.get("RMdX1"), # type: ignore + biopython_abif_raw.get("RMXV1"), # type: ignore + biopython_abif_raw.get("RPrN1"), # type: ignore + biopython_abif_raw.get("RPrV1"), # type: ignore + biopython_abif_raw.get("RUND1"), # type: ignore + biopython_abif_raw.get("RUND2"), # type: ignore + biopython_abif_raw.get("RUND3"), # type: ignore + biopython_abif_raw.get("RUND4"), # type: ignore + biopython_abif_raw.get("RunN1"), # type: ignore + biopython_abif_raw.get("RUNT1"), # type: ignore + biopython_abif_raw.get("RUNT2"), # type: ignore + biopython_abif_raw.get("RUNT3"), # type: ignore + biopython_abif_raw.get("RUNT4"), # type: ignore + biopython_abif_raw.get("Satd"), # type: ignore + biopython_abif_raw.get("Scal1"), # type: ignore + biopython_abif_raw.get("SCAN1"), # type: ignore + biopython_abif_raw.get("SMED1"), # type: ignore + biopython_abif_raw.get("SMLt"), # type: ignore + biopython_abif_raw.get("SMPL1"), # type: ignore + biopython_abif_raw.get("SVER1"), # type: ignore + biopython_abif_raw.get("SVER3"), # type: ignore + biopython_abif_raw.get("Tmpr1"), # type: ignore + biopython_abif_raw.get("TUBE"), # type: ignore + biopython_abif_raw.get("User") # type: ignore ) return trace_data + def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedString) -> tuple[NamedString, NamedString]: aligner = Align.PairwiseAligner(scoring="blastn") aligner.mode = "local" - alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[0] # take the best alignment + alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[ + 0] # take the best alignment return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq) + async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString, Any]: for sanger_trace in sanger_traces: - yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace))[1] \ No newline at end of file + yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace))[1] diff --git a/src/automlst/engine/remote/databases/institutpasteur/mlst.py b/src/automlst/engine/remote/databases/institutpasteur/mlst.py index ae32f5c..e254343 100644 --- a/src/automlst/engine/remote/databases/institutpasteur/mlst.py +++ b/src/automlst/engine/remote/databases/institutpasteur/mlst.py @@ -30,15 +30,11 @@ class InstitutPasteurProfiler(MLSTProfiler): alelle_id = allele["allele_id"] yield Allele(allele_loci=allele_loci, allele_variant=alelle_id) - async def fetch_mlst_st(self, schema_id: int, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile: + async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile: uri_path = f"schemes/{schema_id}/designations" allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list) - if isinstance(alleles, AsyncIterable): - async for allele in alleles: - allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)}) - else: - for allele in alleles: - allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)}) + async for allele in alleles: + allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)}) response = await self._http_client.post(uri_path, json={ "designations": allele_request_dict }) diff --git a/src/automlst/engine/remote/databases/mlst.py b/src/automlst/engine/remote/databases/mlst.py index 2b7a063..881ce58 100644 --- a/src/automlst/engine/remote/databases/mlst.py +++ b/src/automlst/engine/remote/databases/mlst.py @@ -1,6 +1,6 @@ from abc import abstractmethod from contextlib import AbstractAsyncContextManager -from typing import AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Union +from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Union from aiohttp import ClientSession diff --git a/tests/nsbdiagnosistoolkit/engine/remote/databases/institutpasteur/test_mlst.py b/tests/nsbdiagnosistoolkit/engine/remote/databases/institutpasteur/test_mlst.py index f99466c..ba37f10 100644 --- a/tests/nsbdiagnosistoolkit/engine/remote/databases/institutpasteur/test_mlst.py +++ b/tests/nsbdiagnosistoolkit/engine/remote/databases/institutpasteur/test_mlst.py @@ -16,7 +16,8 @@ async def test_profiling_results_in_exact_matches_when_exact(): assert len(targets_left) == 0 async def test_profiling_results_in_correct_st(): - dummy_alleles = [ + async def dummy_allele_generator(): + dummy_alleles = [ Allele("adk", "1"), Allele("fumC", "1"), Allele("glyA", "1"), @@ -24,9 +25,11 @@ async def test_profiling_results_in_correct_st(): Allele("icd", "1"), Allele("pepA", "1"), Allele("pgm", "1"), - ] + ] + for dummy_allele in dummy_alleles: + yield dummy_allele async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler: - mlst_st_data = await dummy_profiler.fetch_mlst_st(3, dummy_alleles) + mlst_st_data = await dummy_profiler.fetch_mlst_st(3, dummy_allele_generator()) assert mlst_st_data is not None assert isinstance(mlst_st_data, MLSTProfile) assert mlst_st_data.clonal_complex == "ST-2 complex" diff --git a/tests/nsbdiagnosistoolkit/engine/remote/databases/pubmlst/test_mlst.py b/tests/nsbdiagnosistoolkit/engine/remote/databases/pubmlst/test_mlst.py index 4577f89..4f3f755 100644 --- a/tests/nsbdiagnosistoolkit/engine/remote/databases/pubmlst/test_mlst.py +++ b/tests/nsbdiagnosistoolkit/engine/remote/databases/pubmlst/test_mlst.py @@ -1,6 +1,6 @@ +import asyncio from Bio import SeqIO from automlst.engine.data.mlst import Allele, MLSTProfile -from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler from automlst.engine.remote.databases.pubmlst.mlst import PubMLSTProfiler @@ -24,25 +24,28 @@ async def test_profiling_results_in_exact_matches_when_exact(): assert len(dummy_alleles) == 0 async def test_profiling_results_in_correct_st(): - dummy_alleles = [ - Allele("adk", "1"), - Allele("atpG", "1"), - Allele("frdB", "1"), - Allele("fucK", "1"), - Allele("mdh", "1"), - Allele("pgi", "1"), - Allele("recA", "5"), - ] - async with InstitutPasteurProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler: - mlst_st_data = await dummy_profiler.fetch_mlst_st(1, dummy_alleles) + async def generate_dummy_targets(): + dummy_alleles = [ + Allele("adk", "1"), + Allele("atpG", "1"), + Allele("frdB", "1"), + Allele("fucK", "1"), + Allele("mdh", "1"), + Allele("pgi", "1"), + Allele("recA", "5"), + ] + for dummy_allele in dummy_alleles: + yield dummy_allele + async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler: + mlst_st_data = await dummy_profiler.fetch_mlst_st(1, generate_dummy_targets()) assert mlst_st_data is not None assert isinstance(mlst_st_data, MLSTProfile) assert mlst_st_data.clonal_complex == "ST-3 complex" assert mlst_st_data.sequence_type == "3" async def test_sequence_profiling_is_correct(): - sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) - async with InstitutPasteurProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler: + sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq) + async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler: profile = await dummy_profiler.profile_string(1, sequence) assert profile is not None assert isinstance(profile, MLSTProfile)