From c3a492fa8feb0ef5789a4bab32d72c25a2abcb69 Mon Sep 17 00:00:00 2001
From: Harrison Deng
Date: Wed, 8 Jan 2025 16:24:42 +0000
Subject: [PATCH] restructured MLST profiling classes

---
Review notes (ignored by git am; not part of the commit message):
- Imports of InstitutPasteurProfiler now point at institutpasteur.mlstprofiler,
  the module this patch renames profiling.py to (no institutpasteur/mlst.py is
  created here).
- AsyncGenerator annotations spell out both type arguments, matching
  AsyncGenerator[str, Any] in aggregated.py.
- _biopython_local_pairwise_alignment indexes the aligner result directly
  instead of sorting every optimal alignment first.
- Fixed the alelle_id typo; get_scheme_ids builds a dict[str, int] rather than
  assigning into a value annotated as Mapping.
- NOTE(review): alignment_result.sequences[...] looks like it holds the objects
  passed to align(); confirm they expose .id / .seq before relying on the
  return statement of _biopython_local_pairwise_alignment.

 .../cli/{aggregator.py => aggregated.py}      |  2 +-
 src/automlst/cli/root.py                      | 24 +++++--
 src/automlst/engine/data/genomics.py          |  3 +-
 src/automlst/engine/local/abif.py             | 17 ++++-
 .../{profiling.py => mlstprofiler.py}         |  0
 src/automlst/engine/remote/databases/mlst.py  | 33 +++++++++
 .../remote/databases/pubmlst/__init__.py      |  0
 .../remote/databases/pubmlst/mlstprofiler.py  | 68 +++++++++++++++++++
 .../institutpasteur/test_profiling.py         |  2 +-
 9 files changed, 138 insertions(+), 11 deletions(-)
 rename src/automlst/cli/{aggregator.py => aggregated.py} (91%)
 rename src/automlst/engine/remote/databases/institutpasteur/{profiling.py => mlstprofiler.py} (100%)
 create mode 100644 src/automlst/engine/remote/databases/mlst.py
 create mode 100644 src/automlst/engine/remote/databases/pubmlst/__init__.py
 create mode 100644 src/automlst/engine/remote/databases/pubmlst/mlstprofiler.py

diff --git a/src/automlst/cli/aggregator.py b/src/automlst/cli/aggregated.py
similarity index 91%
rename from src/automlst/cli/aggregator.py
rename to src/automlst/cli/aggregated.py
index b799d4c..e674956 100644
--- a/src/automlst/cli/aggregator.py
+++ b/src/automlst/cli/aggregated.py
@@ -4,7 +4,7 @@ from automlst.engine.data.MLST import MLSTProfile
 from automlst.engine.data.genomics import NamedString
 from automlst.engine.local.abif import read_abif
 from automlst.engine.local.fasta import read_fasta
-from automlst.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
+from automlst.engine.remote.databases.institutpasteur.mlstprofiler import InstitutPasteurProfiler
 
 
 async def aggregate_sequences(fastas: Iterable[str], abifs: Iterable[str]) -> AsyncGenerator[str, Any]:
diff --git a/src/automlst/cli/root.py b/src/automlst/cli/root.py
index 85d8289..a1f9e69 100644
--- a/src/automlst/cli/root.py
+++ b/src/automlst/cli/root.py
@@ -4,7 +4,7 @@ import datetime
 from os import path
 import os
 
-from automlst.cli import aggregator
+from automlst.cli import aggregated
 from automlst.engine.data.genomics import NamedString
 from automlst.engine.local.abif import read_abif
 from automlst.engine.local.csv import write_mlst_profiles_as_csv
@@ -39,6 +39,22 @@ parser.add_argument(
     type=str,
     help="The ABIF files to process. Multiple can be listed."
 )
+parser.add_argument(
+    "--ncbi-assembly-reference", "-refncbi",
+    dest="ncbi_assembly_reference",
+    required=False,
+    default=None,
+    type=str,
+    help="The NCBI GenBank accession ID for the consensus assembly. Either this argument, or the path equivalent must be given if ABIF files are used."
+)
+parser.add_argument(
+    "--assembly-reference", "-ref",
+    dest="assembly_reference",
+    required=False,
+    default=None,
+    type=str,
+    help="The path to the FASTA sequence to be used as a reference for consensus building."
+)
 parser.add_argument(
     "--institut-pasteur-mlst",
     "-ipdbmlst",
@@ -51,16 +67,16 @@ parser.add_argument(
 parser.add_argument(
     "out",
     default="./.",
-    help="The output folder. Files will be named by the provided (or default) run name."
+    help="The output folder. Files will be named by the provided (or default) run name. Outputs will be automatically generated depending on which arguments are used."
 )
 
 
 def cli():
     args = parser.parse_args()
-    gen_strings = aggregator.aggregate_sequences(args.fastas, args.abifs)
+    gen_strings = aggregated.aggregate_sequences(args.fastas, args.abifs)
     os.makedirs(args.out, exist_ok=True)
     if args.institut_pasteur_db is not None:
-        mlst_profiles = aggregator.profile_all_genetic_strings(
+        mlst_profiles = aggregated.profile_all_genetic_strings(
             gen_strings, args.institut_pasteur_db)
         asyncio.run(write_mlst_profiles_as_csv(
             asyncio.run(mlst_profiles), str(path.join(args.out, "MLST_" + args.run_name + ".csv"))))
diff --git a/src/automlst/engine/data/genomics.py b/src/automlst/engine/data/genomics.py
index a6e0ff2..76e515f 100644
--- a/src/automlst/engine/data/genomics.py
+++ b/src/automlst/engine/data/genomics.py
@@ -20,8 +20,7 @@ class AnnotatedString(NamedString):
     annotations: Sequence[StringAnnotation]
 
 @dataclass
-class SangerTraceData:
-    sequence: Sequence[str]
+class SangerTraceData(NamedString):
     seq_param_file_name: str
     analysis_proto_settings_name: str
     analysis_rpto_settings_ver: str
diff --git a/src/automlst/engine/local/abif.py b/src/automlst/engine/local/abif.py
index 77aa4f4..ee8995f 100644
--- a/src/automlst/engine/local/abif.py
+++ b/src/automlst/engine/local/abif.py
@@ -1,10 +1,10 @@
 import asyncio
 from numbers import Number
 from os import path
-from typing import Sequence, Union
-from automlst.engine.data.genomics import SangerTraceData
+from typing import AsyncGenerator, Collection, Sequence, Union
+from automlst.engine.data.genomics import NamedString, SangerTraceData
 from Bio.SeqRecord import SeqRecord
-from Bio import SeqIO
+from Bio import SeqIO, Align
 
 
 def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
@@ -23,6 +23,7 @@ async def read_abif(seq_path: str) -> SangerTraceData:
     # Lot of type ignoring since Biopython did not define their typing.
     biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore
     trace_data = SangerTraceData(
+        path.basename(seq_path),
         biopython_seq.seq,
         biopython_abif_raw.get("APFN2"), # type: ignore
         biopython_abif_raw.get("APrN1"), # type: ignore
@@ -102,3 +103,13 @@ async def read_abif(seq_path: str) -> SangerTraceData:
         biopython_abif_raw.get("User") # type: ignore
     )
     return trace_data
+
+def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedString) -> tuple[NamedString, NamedString]:
+    aligner = Align.PairwiseAligner(scoring="blastn")
+    aligner.mode = "local"
+    alignment_result = aligner.align(reference.sequence, query.sequence)[0] # take the first optimal alignment
+    return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq)
+
+async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString, None]:
+    for sanger_trace in sanger_traces:
+        yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace))[1]
\ No newline at end of file
diff --git a/src/automlst/engine/remote/databases/institutpasteur/profiling.py b/src/automlst/engine/remote/databases/institutpasteur/mlstprofiler.py
similarity index 100%
rename from src/automlst/engine/remote/databases/institutpasteur/profiling.py
rename to src/automlst/engine/remote/databases/institutpasteur/mlstprofiler.py
diff --git a/src/automlst/engine/remote/databases/mlst.py b/src/automlst/engine/remote/databases/mlst.py
new file mode 100644
index 0000000..b7b8b1e
--- /dev/null
+++ b/src/automlst/engine/remote/databases/mlst.py
@@ -0,0 +1,33 @@
+from abc import abstractmethod
+from contextlib import AbstractAsyncContextManager
+from typing import AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Union
+
+from aiohttp import ClientSession
+
+from automlst.engine.data.MLST import Allele, MLSTProfile
+
+MLST_DATABASES = [
+    "https://bigsdb.pasteur.fr/api/db",
+    "https://rest.pubmlst.org/db"
+]
+
+class MLSTProfiler(AbstractAsyncContextManager):
+    @abstractmethod
+    def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, None]:
+        pass
+
+    @abstractmethod
+    async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
+        pass
+
+    @abstractmethod
+    async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
+        pass
+
+    @abstractmethod
+    async def close(self):
+        pass
+
+    @abstractmethod
+    async def get_scheme_ids(self) -> Mapping[str, int]:
+        pass
\ No newline at end of file
diff --git a/src/automlst/engine/remote/databases/pubmlst/__init__.py b/src/automlst/engine/remote/databases/pubmlst/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/automlst/engine/remote/databases/pubmlst/mlstprofiler.py b/src/automlst/engine/remote/databases/pubmlst/mlstprofiler.py
new file mode 100644
index 0000000..2b586f2
--- /dev/null
+++ b/src/automlst/engine/remote/databases/pubmlst/mlstprofiler.py
@@ -0,0 +1,68 @@
+from collections import defaultdict
+from contextlib import AbstractAsyncContextManager
+import re
+from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union
+from aiohttp import ClientSession, ClientTimeout
+from automlst.engine.data.MLST import Allele, MLSTProfile
+from automlst.engine.data.genomics import NamedString
+from automlst.engine.remote.databases.mlst import MLSTProfiler
+
+class PubMLSTProfiler(MLSTProfiler):
+
+    async def __aenter__(self):
+        return self
+
+
+    def __init__(self, database_name: str):
+        self._base_url = f"https://rest.pubmlst.org/db/{database_name}/"
+        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
+
+    async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, None]:
+        uri_path = f"schemes/{schema_id}/sequence"
+        response = await self._http_client.post(uri_path, json={
+            "sequence": sequence_string
+        })
+        sequence_response: dict = await response.json()
+        exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
+        for allele_loci, alleles in exact_matches.items():
+            for allele in alleles:
+                allele_id = allele["allele_id"]
+                yield Allele(allele_loci=allele_loci, allele_variant=allele_id)
+
+    async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
+        uri_path = f"schemes/{schema_id}/designations"
+        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
+        async for allele in alleles:
+            allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
+        response = await self._http_client.post(uri_path, json={
+            "designations": allele_request_dict
+        })
+        response_json = await response.json()
+        schema_fields_returned = response_json["fields"]
+        schema_exact_matches = response_json["exact_matches"]
+        allele_map: dict[str, list[Allele]] = defaultdict(list)
+        for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
+            for exact_match_allele in exact_match_alleles:
+                allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
+        return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
+
+    async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
+        alleles = self.fetch_mlst_allele_variants(schema_id, string)
+        return await self.fetch_mlst_st(schema_id, alleles)
+
+    async def get_scheme_ids(self) -> Mapping[str, int]:
+        uri_path = "schemes"
+        response = await self._http_client.get(uri_path)
+        response_json = await response.json()
+        schema_descriptions: dict[str, int] = {}
+        for scheme_definition in response_json["schemes"]:
+            scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1])
+            scheme_desc: str = scheme_definition["description"]
+            schema_descriptions[scheme_desc] = scheme_id
+        return schema_descriptions
+
+    async def close(self):
+        await self._http_client.close()
+
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        await self.close()
\ No newline at end of file
diff --git a/tests/nsbdiagnosistoolkit/engine/remote/databases/institutpasteur/test_profiling.py b/tests/nsbdiagnosistoolkit/engine/remote/databases/institutpasteur/test_profiling.py
index 950e6c5..3bac78a 100644
--- a/tests/nsbdiagnosistoolkit/engine/remote/databases/institutpasteur/test_profiling.py
+++ b/tests/nsbdiagnosistoolkit/engine/remote/databases/institutpasteur/test_profiling.py
@@ -1,6 +1,6 @@
 from Bio import SeqIO
 from automlst.engine.data.MLST import Allele, MLSTProfile
-from automlst.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
+from automlst.engine.remote.databases.institutpasteur.mlstprofiler import InstitutPasteurProfiler
 
 
 async def test_profiling_results_in_exact_matches_when_exact():