Completed CLI for B. pertussis MLST typing from fasta files

2025-01-03 19:20:19 +00:00
parent 362e0867e5
commit e9977d2da7
17 changed files with 230 additions and 51 deletions
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -12,7 +12,8 @@
 			"extensions": [
 				"ms-python.isort",
 				"njpwerner.autodocstring",
-				"ms-python.black-formatter"
+				"ms-python.black-formatter",
+				"mechatroner.rainbow-csv"
 			]
 		}
 	},
--- a/.gitignore
+++ b/.gitignore
@@ -356,3 +356,4 @@ package

 # Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)

+src/output.csv
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -0,0 +1,28 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        
+        {
+            "name": "Python Debugger: Current File with Arguments",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/src/mlstmyfasta/cli/program.py",
+            "console": "integratedTerminal",
+            "args": [
+                "-fa",
+                "${workspaceFolder}/tests/resources/tohama_I_bpertussis.fasta",
+                "-ipdbmlst",
+                "pubmlst_bordetella_seqdef",
+                "-csv",
+                "${workspaceFolder}/output.csv"
+            ],
+            "cwd": "${workspaceFolder}/src",
+            "env": {
+                "PYTHONPATH": "${workspaceFolder}/src"
+            }
+        }
+    ]
+}
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,11 +7,14 @@ name = "mlstmyfasta"
 dynamic = ["version"]
 dependencies = [
    "biopython",
-    "requests",
+    "aiohttp[speedups]",
 ]
 requires-python = ">=3.11"
 description = "A tool to rapidly fetch fetch MLST profiles given sequences for various diseases."

+[project.scripts]
+mlstmyfasta = "mlstmyfasta.cli.program:cli"
+
 [tool.pyright]
 extraPaths = ["src"]
 exclude = [
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 aiohttp[speedups]
 biopython
 pytest
-pytest-asyncio
-dataclasses-json
+pytest-asyncio
--- a/src/mlstmyfasta/cli/aggregator.py
+++ b/src/mlstmyfasta/cli/aggregator.py
@@ -0,0 +1,23 @@
+from os import path
+from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence
+from mlstmyfasta.engine.data.MLST import MLSTProfile
+from mlstmyfasta.engine.data.genomics import NamedString
+from mlstmyfasta.engine.local.abif import read_abif
+from mlstmyfasta.engine.local.fasta import read_fasta
+from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
+
+
+async def aggregate_sequences(fastas: Iterable[str], abifs: Iterable[str]) -> AsyncGenerator[str, Any]:
+    for fasta_path in fastas:
+        async for fasta in read_fasta(fasta_path):
+            yield fasta.sequence
+    for abif_path in abifs:
+        abif_data = await read_abif(abif_path)
+        yield "".join(abif_data.sequence)
+
+async def profile_all_genetic_strings(strings: AsyncIterable[str], database_name: str) -> Sequence[MLSTProfile]:
+    profiles = list()
+    async with InstitutPasteurProfiler(database_name=database_name) as profiler:
+        async for string in strings:
+            profiles.append(await profiler.profile_string(string))
+    return profiles
--- a/src/mlstmyfasta/cli/program.py
+++ b/src/mlstmyfasta/cli/program.py
@@ -0,0 +1,58 @@
+import argparse
+import asyncio
+from os import path
+
+from mlstmyfasta.cli import aggregator
+from mlstmyfasta.engine.data.genomics import NamedString
+from mlstmyfasta.engine.local.abif import read_abif
+from mlstmyfasta.engine.local.csv import write_mlst_profiles_as_csv
+from mlstmyfasta.engine.local.fasta import read_fasta
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--fasta", "-fa", "-fst",
+    nargs="+",
+    action='extend',
+    dest="fastas",
+    required=False,
+    default=[],
+    type=str,
+    help="The FASTA files to process. Multiple can be listed."
+)
+parser.add_argument(
+    "--abif", "-abi", "-ab1",
+    action='extend',
+    dest="abifs",
+    required=False,
+    default=[],
+    type=str,
+    help="The ABIF files to process. Multiple can be listed."
+)
+parser.add_argument(
+    "--institut-pasteur-mlst",
+    "-ipdbmlst",
+    dest="institut_pasteur_db",
+    type=str,
+    help="The Institut Pasteur MLST database to use."
+)
+parser.add_argument(
+    "-csv",
+    dest="csv_path",
+    required=False,
+    default=None,
+    help="The destination to place the CSV output."
+)
+
+
+def cli():
+    args = parser.parse_args()
+    gen_strings = aggregator.aggregate_sequences(args.fastas, args.abifs)
+    mlst_profiles = aggregator.profile_all_genetic_strings(
+        gen_strings, args.institut_pasteur_db)
+    asyncio.run(write_mlst_profiles_as_csv(
+        asyncio.run(mlst_profiles), str(args.csv_path)))
+
+
+if __name__ == "__main__":
+    cli()
--- a/src/mlstmyfasta/engine/data/MLST.py
+++ b/src/mlstmyfasta/engine/data/MLST.py
@@ -1,6 +1,13 @@
 from dataclasses import dataclass
+from typing import Mapping, Sequence

@dataclass
 class Allele:
    allele_loci: str
-    allele_variant: str
+    allele_variant: str
+
+@dataclass
+class MLSTProfile:
+    alleles: Mapping[str, Sequence[Allele]]
+    sequence_type: int
+    clonal_complex: str
--- a/src/mlstmyfasta/engine/local/abif.py
+++ b/src/mlstmyfasta/engine/local/abif.py
@@ -12,7 +12,7 @@ def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
        return SeqIO.read(seq_handle, "abi")


-async def load_sanger_sequence(seq_path: str) -> SangerTraceData:
+async def read_abif(seq_path: str) -> SangerTraceData:
    ext = path.splitext(seq_path)[1]
    if ext.lower() != ".ab1" and ext.lower() != "abi":
        raise ValueError(
--- a/src/mlstmyfasta/engine/local/csv.py
+++ b/src/mlstmyfasta/engine/local/csv.py
@@ -0,0 +1,31 @@
+import csv
+from io import TextIOWrapper
+from os import PathLike
+from typing import AsyncIterable, Iterable, Mapping, Sequence, Union
+
+from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile
+
+
+def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
+    result_dict: dict[str, list[str]] = {}
+    for loci, alleles in alleles_map.items():
+        result_dict[loci] = list()
+        for allele in alleles:
+            result_dict[loci].append(allele.allele_variant)
+    return result_dict
+
+
+async def write_mlst_profiles_as_csv(mlst_profiles_iterable: Iterable[MLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
+    mlst_profiles = list(mlst_profiles_iterable)
+    header = ["st", "clonal-complex", *mlst_profiles[0].alleles.keys()]
+    with open(handle, "w", newline='') as filehandle:
+        writer = csv.DictWriter(filehandle, fieldnames=header)
+        writer.writeheader()
+        for mlst_profile in mlst_profiles:
+            row_dictionary = {
+                "st": mlst_profile.sequence_type,
+                "clonal-complex": mlst_profile.clonal_complex,
+                **loci_alleles_variants_from_loci(mlst_profile.alleles)
+            }
+
+            writer.writerow(rowdict=row_dictionary)
--- a/src/mlstmyfasta/engine/remote/databases/institutpasteur/profiling.py
+++ b/src/mlstmyfasta/engine/remote/databases/institutpasteur/profiling.py
@@ -1,10 +1,10 @@
+from collections import defaultdict
 from contextlib import AbstractAsyncContextManager
 import re
-from typing import Sequence
+from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Sequence, Union
 from aiohttp import ClientSession, ClientTimeout
-from mlstmyfasta.engine.data.MLST import Allele
+from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile
 from mlstmyfasta.engine.data.genomics import NamedString
-from mlstmyfasta.engine.remote.databases.institutpasteur.structures import InstitutPasteurSequenceRequest, InstitutPasteurSequenceResponse

 class InstitutPasteurProfiler(AbstractAsyncContextManager):

@@ -16,9 +16,9 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
        self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))

-    async def fetch_mlst_profile(self, sequence_string: str):
+    async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
-        uri_path = f"schemes/3/sequence"
+        uri_path = "schemes/3/sequence"
        response = await self._http_client.post(uri_path, json={
            "sequence": sequence_string
        })
@@ -29,6 +29,32 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
                alelle_id = allele["allele_id"]
                yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)

+    async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
+        uri_path = "schemes/3/designations"
+        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
+        if isinstance(alleles, AsyncIterable):
+            async for allele in alleles:
+                allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
+        else:
+            for allele in alleles:
+                allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
+        response = await self._http_client.post(uri_path, json={
+            "designations": allele_request_dict
+        })
+        response_json = await response.json()
+        schema_fields_returned = response_json["fields"]
+        schema_exact_matches = response_json["exact_matches"]
+        allele_map: dict[str, list[Allele]] = defaultdict(list)
+        for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
+            for exact_match_allele in exact_match_alleles:
+                allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
+        return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
+
+    async def profile_string(self, string: str) -> MLSTProfile:
+        alleles = self.fetch_mlst_allele_variants(string)
+        return await self.fetch_mlst_st(alleles)
+
+
    async def close(self):
        await self._http_client.close()

--- a/src/mlstmyfasta/engine/remote/databases/institutpasteur/structures.py
+++ b/src/mlstmyfasta/engine/remote/databases/institutpasteur/structures.py
@@ -1,22 +0,0 @@
-from dataclasses import dataclass
-from typing import Sequence, Union
-
-from dataclasses_json import dataclass_json
-
-@dataclass_json
-@dataclass
-class InstitutPasteurSequenceRequest:
-    sequence: str
-    details: Union[bool, None]
-    partial_matches: Union[bool, None]
-    base64: Union[bool, None]
-
-@dataclass_json
-@dataclass
-class InstitutPasteurSequenceResponse:
-    exact_matches: Sequence[tuple[int, str]]
-    start: Union[int, None]
-    end: Union[int, None]
-    orientation: Union[str, None]
-    length: Union[int, None]
-    contig: Union[str, None]
--- a/src/mlstmyfasta/http/server.py
+++ b/src/mlstmyfasta/http/server.py
@@ -0,0 +1,5 @@
+import logging
+from aiohttp import web
+
+webapp = web.Application(logger=logging.getLogger(__name__))
+routes = web.RouteTableDef
--- a/tests/mlstmyfasta/engine/local/test_abif.py
+++ b/tests/mlstmyfasta/engine/local/test_abif.py
@@ -1,8 +1,8 @@
 import os

-from mlstmyfasta.engine.local.abif import load_sanger_sequence
+from mlstmyfasta.engine.local.abif import read_abif

 async def test_load_sanger_sequence_has_data():
    assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1")
-    result_data = await load_sanger_sequence("tests/resources/1I1_F_P1815443_047.ab1")
+    result_data = await read_abif("tests/resources/1I1_F_P1815443_047.ab1")
    assert result_data is not None
--- a/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_access.py
+++ b/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_access.py
--- a/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiles.py
+++ b/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiles.py
@@ -1,16 +0,0 @@
-from Bio import SeqIO
-from mlstmyfasta.engine.data.MLST import Allele
-from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
-
-
-async def test_profiling_results_in_exact_matches_when_exact():
-    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
-        exact_matches = dummy_profiler.fetch_mlst_profile(sequence_string=sequence)
-        targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
-        async for exact_match in exact_matches:
-            assert isinstance(exact_match, Allele)
-            assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
-            targets_left.remove(exact_match.allele_loci)
-
-        assert len(targets_left) == 0
--- a/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiling.py
+++ b/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiling.py
@@ -0,0 +1,35 @@
+from Bio import SeqIO
+from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile
+from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
+
+
+async def test_profiling_results_in_exact_matches_when_exact():
+    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
+    async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
+        exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
+        targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
+        async for exact_match in exact_matches:
+            assert isinstance(exact_match, Allele)
+            assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
+            targets_left.remove(exact_match.allele_loci)
+
+        assert len(targets_left) == 0
+
+async def test_profiling_results_in_correct_st():
+    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
+    dummy_alleles = [
+        Allele("adk", "1"),
+        Allele("fumC", "1"),
+        Allele("glyA", "1"),
+        Allele("tyrB", "1"),
+        Allele("icd", "1"),
+        Allele("pepA", "1"),
+        Allele("pgm", "1"),
+    ]
+    async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
+        exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
+        mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_alleles)
+        assert mlst_st_data is not None
+        assert isinstance(mlst_st_data, MLSTProfile)
+        assert mlst_st_data.clonal_complex == "ST-2 complex"
+        assert mlst_st_data.sequence_type == "1"