diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 251ee70..b8fde32 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -12,7 +12,8 @@ "extensions": [ "ms-python.isort", "njpwerner.autodocstring", - "ms-python.black-formatter" + "ms-python.black-formatter", + "mechatroner.rainbow-csv" ] } }, diff --git a/.gitignore b/.gitignore index 9126208..9d9a271 100644 --- a/.gitignore +++ b/.gitignore @@ -356,3 +356,4 @@ package # Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) +src/output.csv diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..6ab0726 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,28 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + + { + "name": "Python Debugger: Current File with Arguments", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/src/mlstmyfasta/cli/program.py", + "console": "integratedTerminal", + "args": [ + "-fa", + "${workspaceFolder}/tests/resources/tohama_I_bpertussis.fasta", + "-ipdbmlst", + "pubmlst_bordetella_seqdef", + "-csv", + "${workspaceFolder}/output.csv" + ], + "cwd": "${workspaceFolder}/src", + "env": { + "PYTHONPATH": "${workspaceFolder}/src" + } + } + ] +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index b5c9f69..3215489 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,11 +7,14 @@ name = "mlstmyfasta" dynamic = ["version"] dependencies = [ "biopython", - "requests", + "aiohttp[speedups]", ] requires-python = ">=3.11" description = "A tool to rapidly fetch fetch MLST profiles given sequences for various diseases." +[project.scripts] +mlstmyfasta = "mlstmyfasta.cli.program:cli" + [tool.pyright] extraPaths = ["src"] exclude = [ diff --git a/requirements.txt b/requirements.txt index ec8e53e..e114e01 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ aiohttp[speedups] biopython pytest -pytest-asyncio -dataclasses-json \ No newline at end of file +pytest-asyncio \ No newline at end of file diff --git a/src/mlstmyfasta/cli/aggregator.py b/src/mlstmyfasta/cli/aggregator.py new file mode 100644 index 0000000..f708290 --- /dev/null +++ b/src/mlstmyfasta/cli/aggregator.py @@ -0,0 +1,23 @@ +from os import path +from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence +from mlstmyfasta.engine.data.MLST import MLSTProfile +from mlstmyfasta.engine.data.genomics import NamedString +from mlstmyfasta.engine.local.abif import read_abif +from mlstmyfasta.engine.local.fasta import read_fasta +from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler + + +async def aggregate_sequences(fastas: Iterable[str], abifs: Iterable[str]) -> AsyncGenerator[str, Any]: + for fasta_path in fastas: + async for fasta in read_fasta(fasta_path): + yield fasta.sequence + for abif_path in abifs: + abif_data = await read_abif(abif_path) + yield "".join(abif_data.sequence) + +async def profile_all_genetic_strings(strings: AsyncIterable[str], database_name: str) -> Sequence[MLSTProfile]: + profiles = list() + async with InstitutPasteurProfiler(database_name=database_name) as profiler: + async for string in strings: + profiles.append(await profiler.profile_string(string)) + return profiles \ No newline at end of file diff --git a/src/mlstmyfasta/cli/program.py b/src/mlstmyfasta/cli/program.py new file mode 100644 index 0000000..0564a2c --- /dev/null +++ b/src/mlstmyfasta/cli/program.py @@ -0,0 +1,58 @@ +import argparse +import asyncio +from os import path + +from mlstmyfasta.cli import aggregator +from mlstmyfasta.engine.data.genomics import NamedString +from mlstmyfasta.engine.local.abif import read_abif +from mlstmyfasta.engine.local.csv import write_mlst_profiles_as_csv +from mlstmyfasta.engine.local.fasta import read_fasta + + +parser = argparse.ArgumentParser() +parser.add_argument( + "--fasta", "-fa", "-fst", + nargs="+", + action='extend', + dest="fastas", + required=False, + default=[], + type=str, + help="The FASTA files to process. Multiple can be listed." +) +parser.add_argument( + "--abif", "-abi", "-ab1", + action='extend', + dest="abifs", + required=False, + default=[], + type=str, + help="The ABIF files to process. Multiple can be listed." +) +parser.add_argument( + "--institut-pasteur-mlst", + "-ipdbmlst", + dest="institut_pasteur_db", + type=str, + help="The Institut Pasteur MLST database to use." +) +parser.add_argument( + "-csv", + dest="csv_path", + required=False, + default=None, + help="The destination to place the CSV output." +) + + +def cli(): + args = parser.parse_args() + gen_strings = aggregator.aggregate_sequences(args.fastas, args.abifs) + mlst_profiles = aggregator.profile_all_genetic_strings( + gen_strings, args.institut_pasteur_db) + asyncio.run(write_mlst_profiles_as_csv( + asyncio.run(mlst_profiles), str(args.csv_path))) + + +if __name__ == "__main__": + cli() \ No newline at end of file diff --git a/src/mlstmyfasta/engine/data/MLST.py b/src/mlstmyfasta/engine/data/MLST.py index 5ce6194..b42284f 100644 --- a/src/mlstmyfasta/engine/data/MLST.py +++ b/src/mlstmyfasta/engine/data/MLST.py @@ -1,6 +1,13 @@ from dataclasses import dataclass +from typing import Mapping, Sequence @dataclass class Allele: allele_loci: str - allele_variant: str \ No newline at end of file + allele_variant: str + +@dataclass +class MLSTProfile: + alleles: Mapping[str, Sequence[Allele]] + sequence_type: int + clonal_complex: str \ No newline at end of file diff --git a/src/mlstmyfasta/engine/local/abif.py b/src/mlstmyfasta/engine/local/abif.py index f9e4ece..1296793 100644 --- a/src/mlstmyfasta/engine/local/abif.py +++ b/src/mlstmyfasta/engine/local/abif.py @@ -12,7 +12,7 @@ def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord: return SeqIO.read(seq_handle, "abi") -async def load_sanger_sequence(seq_path: str) -> SangerTraceData: +async def read_abif(seq_path: str) -> SangerTraceData: ext = path.splitext(seq_path)[1] if ext.lower() != ".ab1" and ext.lower() != "abi": raise ValueError( diff --git a/src/mlstmyfasta/engine/local/csv.py b/src/mlstmyfasta/engine/local/csv.py new file mode 100644 index 0000000..87beb5c --- /dev/null +++ b/src/mlstmyfasta/engine/local/csv.py @@ -0,0 +1,31 @@ +import csv +from io import TextIOWrapper +from os import PathLike +from typing import AsyncIterable, Iterable, Mapping, Sequence, Union + +from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile + + +def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]): + result_dict: dict[str, list[str]] = {} + for loci, alleles in alleles_map.items(): + result_dict[loci] = list() + for allele in alleles: + result_dict[loci].append(allele.allele_variant) + return result_dict + + +async def write_mlst_profiles_as_csv(mlst_profiles_iterable: Iterable[MLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]): + mlst_profiles = list(mlst_profiles_iterable) + header = ["st", "clonal-complex", *mlst_profiles[0].alleles.keys()] + with open(handle, "w", newline='') as filehandle: + writer = csv.DictWriter(filehandle, fieldnames=header) + writer.writeheader() + for mlst_profile in mlst_profiles: + row_dictionary = { + "st": mlst_profile.sequence_type, + "clonal-complex": mlst_profile.clonal_complex, + **loci_alleles_variants_from_loci(mlst_profile.alleles) + } + + writer.writerow(rowdict=row_dictionary) diff --git a/src/mlstmyfasta/engine/remote/databases/institutpasteur/profiling.py b/src/mlstmyfasta/engine/remote/databases/institutpasteur/profiling.py index 46d2053..1b95add 100644 --- a/src/mlstmyfasta/engine/remote/databases/institutpasteur/profiling.py +++ b/src/mlstmyfasta/engine/remote/databases/institutpasteur/profiling.py @@ -1,10 +1,10 @@ +from collections import defaultdict from contextlib import AbstractAsyncContextManager import re -from typing import Sequence +from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Sequence, Union from aiohttp import ClientSession, ClientTimeout -from mlstmyfasta.engine.data.MLST import Allele +from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile from mlstmyfasta.engine.data.genomics import NamedString -from mlstmyfasta.engine.remote.databases.institutpasteur.structures import InstitutPasteurSequenceRequest, InstitutPasteurSequenceResponse class InstitutPasteurProfiler(AbstractAsyncContextManager): @@ -16,9 +16,9 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager): self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/" self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000)) - async def fetch_mlst_profile(self, sequence_string: str): + async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]: # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes - uri_path = f"schemes/3/sequence" + uri_path = "schemes/3/sequence" response = await self._http_client.post(uri_path, json={ "sequence": sequence_string }) @@ -29,6 +29,32 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager): alelle_id = allele["allele_id"] yield Allele(allele_loci=allele_loci, allele_variant=alelle_id) + async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile: + uri_path = "schemes/3/designations" + allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list) + if isinstance(alleles, AsyncIterable): + async for allele in alleles: + allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)}) + else: + for allele in alleles: + allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)}) + response = await self._http_client.post(uri_path, json={ + "designations": allele_request_dict + }) + response_json = await response.json() + schema_fields_returned = response_json["fields"] + schema_exact_matches = response_json["exact_matches"] + allele_map: dict[str, list[Allele]] = defaultdict(list) + for exact_match_loci, exact_match_alleles in schema_exact_matches.items(): + for exact_match_allele in exact_match_alleles: + allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"])) + return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) + + async def profile_string(self, string: str) -> MLSTProfile: + alleles = self.fetch_mlst_allele_variants(string) + return await self.fetch_mlst_st(alleles) + + async def close(self): await self._http_client.close() diff --git a/src/mlstmyfasta/engine/remote/databases/institutpasteur/structures.py b/src/mlstmyfasta/engine/remote/databases/institutpasteur/structures.py deleted file mode 100644 index daa200b..0000000 --- a/src/mlstmyfasta/engine/remote/databases/institutpasteur/structures.py +++ /dev/null @@ -1,22 +0,0 @@ -from dataclasses import dataclass -from typing import Sequence, Union - -from dataclasses_json import dataclass_json - -@dataclass_json -@dataclass -class InstitutPasteurSequenceRequest: - sequence: str - details: Union[bool, None] - partial_matches: Union[bool, None] - base64: Union[bool, None] - -@dataclass_json -@dataclass -class InstitutPasteurSequenceResponse: - exact_matches: Sequence[tuple[int, str]] - start: Union[int, None] - end: Union[int, None] - orientation: Union[str, None] - length: Union[int, None] - contig: Union[str, None] \ No newline at end of file diff --git a/src/mlstmyfasta/http/server.py b/src/mlstmyfasta/http/server.py new file mode 100644 index 0000000..65f12f6 --- /dev/null +++ b/src/mlstmyfasta/http/server.py @@ -0,0 +1,5 @@ +import logging +from aiohttp import web + +webapp = web.Application(logger=logging.getLogger(__name__)) +routes = web.RouteTableDef \ No newline at end of file diff --git a/tests/mlstmyfasta/engine/local/test_abif.py b/tests/mlstmyfasta/engine/local/test_abif.py index 611cec0..7f70024 100644 --- a/tests/mlstmyfasta/engine/local/test_abif.py +++ b/tests/mlstmyfasta/engine/local/test_abif.py @@ -1,8 +1,8 @@ import os -from mlstmyfasta.engine.local.abif import load_sanger_sequence +from mlstmyfasta.engine.local.abif import read_abif async def test_load_sanger_sequence_has_data(): assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1") - result_data = await load_sanger_sequence("tests/resources/1I1_F_P1815443_047.ab1") + result_data = await read_abif("tests/resources/1I1_F_P1815443_047.ab1") assert result_data is not None \ No newline at end of file diff --git a/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_access.py b/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_access.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiles.py b/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiles.py deleted file mode 100644 index 4f2a0bd..0000000 --- a/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiles.py +++ /dev/null @@ -1,16 +0,0 @@ -from Bio import SeqIO -from mlstmyfasta.engine.data.MLST import Allele -from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler - - -async def test_profiling_results_in_exact_matches_when_exact(): - sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) - async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler: - exact_matches = dummy_profiler.fetch_mlst_profile(sequence_string=sequence) - targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"} - async for exact_match in exact_matches: - assert isinstance(exact_match, Allele) - assert exact_match.allele_variant == '1' # All of Tohama I has allele id I - targets_left.remove(exact_match.allele_loci) - - assert len(targets_left) == 0 \ No newline at end of file diff --git a/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiling.py b/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiling.py new file mode 100644 index 0000000..dde0d74 --- /dev/null +++ b/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiling.py @@ -0,0 +1,35 @@ +from Bio import SeqIO +from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile +from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler + + +async def test_profiling_results_in_exact_matches_when_exact(): + sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) + async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler: + exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence) + targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"} + async for exact_match in exact_matches: + assert isinstance(exact_match, Allele) + assert exact_match.allele_variant == '1' # All of Tohama I has allele id I + targets_left.remove(exact_match.allele_loci) + + assert len(targets_left) == 0 + +async def test_profiling_results_in_correct_st(): + sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) + dummy_alleles = [ + Allele("adk", "1"), + Allele("fumC", "1"), + Allele("glyA", "1"), + Allele("tyrB", "1"), + Allele("icd", "1"), + Allele("pepA", "1"), + Allele("pgm", "1"), + ] + async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler: + exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence) + mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_alleles) + assert mlst_st_data is not None + assert isinstance(mlst_st_data, MLSTProfile) + assert mlst_st_data.clonal_complex == "ST-2 complex" + assert mlst_st_data.sequence_type == "1" \ No newline at end of file