Completed CLI for B. pertussis MLST typing from fasta files

This commit is contained in:
Harrison Deng 2025-01-03 19:20:19 +00:00
parent 362e0867e5
commit e9977d2da7
17 changed files with 230 additions and 51 deletions

View File

@ -12,7 +12,8 @@
"extensions": [
"ms-python.isort",
"njpwerner.autodocstring",
"ms-python.black-formatter"
"ms-python.black-formatter",
"mechatroner.rainbow-csv"
]
}
},

1
.gitignore vendored
View File

@ -356,3 +356,4 @@ package
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)
src/output.csv

28
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,28 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File with Arguments",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/src/mlstmyfasta/cli/program.py",
"console": "integratedTerminal",
"args": [
"-fa",
"${workspaceFolder}/tests/resources/tohama_I_bpertussis.fasta",
"-ipdbmlst",
"pubmlst_bordetella_seqdef",
"-csv",
"${workspaceFolder}/output.csv"
],
"cwd": "${workspaceFolder}/src",
"env": {
"PYTHONPATH": "${workspaceFolder}/src"
}
}
]
}

View File

@ -7,11 +7,14 @@ name = "mlstmyfasta"
dynamic = ["version"]
dependencies = [
"biopython",
"requests",
"aiohttp[speedups]",
]
requires-python = ">=3.11"
description = "A tool to rapidly fetch MLST profiles given sequences for various diseases."
[project.scripts]
mlstmyfasta = "mlstmyfasta.cli.program:cli"
[tool.pyright]
extraPaths = ["src"]
exclude = [

View File

@ -1,5 +1,4 @@
aiohttp[speedups]
biopython
pytest
pytest-asyncio
dataclasses-json
pytest-asyncio

View File

@ -0,0 +1,23 @@
from os import path
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence
from mlstmyfasta.engine.data.MLST import MLSTProfile
from mlstmyfasta.engine.data.genomics import NamedString
from mlstmyfasta.engine.local.abif import read_abif
from mlstmyfasta.engine.local.fasta import read_fasta
from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
async def aggregate_sequences(fastas: Iterable[str], abifs: Iterable[str]) -> AsyncGenerator[str, Any]:
    """Yield sequence strings from every FASTA path, then from every ABIF path.

    Args:
        fastas: Paths passed one at a time to ``read_fasta``; the ``sequence``
            attribute of each record it produces is yielded.
        abifs: Paths passed one at a time to ``read_abif``; the ``sequence``
            of each result is joined into a single string and yielded.

    Yields:
        The FASTA-derived sequences first, followed by the ABIF-derived ones.
    """
    for fasta_file in fastas:
        async for record in read_fasta(fasta_file):
            yield record.sequence
    for abif_file in abifs:
        trace = await read_abif(abif_file)
        yield "".join(trace.sequence)
async def profile_all_genetic_strings(strings: AsyncIterable[str], database_name: str) -> Sequence[MLSTProfile]:
    """Profile each incoming genetic string against one Institut Pasteur database.

    A single ``InstitutPasteurProfiler`` is opened for the whole run and each
    string is profiled in arrival order.

    Args:
        strings: Asynchronous stream of sequence strings to profile.
        database_name: Database name handed to ``InstitutPasteurProfiler``.

    Returns:
        The profiles, in the same order the strings were received.
    """
    collected = []
    async with InstitutPasteurProfiler(database_name=database_name) as profiler:
        async for genetic_string in strings:
            profile = await profiler.profile_string(genetic_string)
            collected.append(profile)
    return collected

View File

@ -0,0 +1,58 @@
import argparse
import asyncio
from os import path
from mlstmyfasta.cli import aggregator
from mlstmyfasta.engine.data.genomics import NamedString
from mlstmyfasta.engine.local.abif import read_abif
from mlstmyfasta.engine.local.csv import write_mlst_profiles_as_csv
from mlstmyfasta.engine.local.fasta import read_fasta
# Command-line interface definition for the MLST typing tool.
# Options populate: ``fastas`` and ``abifs`` (lists of paths),
# ``institut_pasteur_db`` (database name) and ``csv_path`` (output file).
parser = argparse.ArgumentParser()
parser.add_argument(
    "--fasta", "-fa", "-fst",
    nargs="+",
    action='extend',
    dest="fastas",
    required=False,
    default=[],
    type=str,
    help="The FASTA files to process. Multiple can be listed."
)
parser.add_argument(
    "--abif", "-abi", "-ab1",
    # action='extend' needs nargs: without it argparse passes a bare string
    # and the list is extended one character at a time, and a second path
    # after the flag is rejected as an unrecognized argument.
    nargs="+",
    action='extend',
    dest="abifs",
    required=False,
    default=[],
    type=str,
    help="The ABIF files to process. Multiple can be listed."
)
parser.add_argument(
    "--institut-pasteur-mlst",
    "-ipdbmlst",
    dest="institut_pasteur_db",
    type=str,
    help="The Institut Pasteur MLST database to use."
)
parser.add_argument(
    "-csv",
    dest="csv_path",
    required=False,
    default=None,
    help="The destination to place the CSV output."
)
def cli():
    """Entry point: parse arguments, profile all inputs, write the CSV.

    Exits through ``parser.error`` (usage message, status 2) when the database
    name or the CSV destination is missing. Previously a missing value was
    stringified, so the tool queried a database called "None" or wrote its
    output to a file literally named "None".
    """
    args = parser.parse_args()
    if args.institut_pasteur_db is None:
        parser.error(
            "an Institut Pasteur MLST database is required (-ipdbmlst)")
    if args.csv_path is None:
        parser.error("a CSV output destination is required (-csv)")

    async def _profile_and_write():
        # Run the whole pipeline on one event loop so the HTTP session and
        # the async generators are created and consumed on the same loop.
        gen_strings = aggregator.aggregate_sequences(args.fastas, args.abifs)
        mlst_profiles = await aggregator.profile_all_genetic_strings(
            gen_strings, args.institut_pasteur_db)
        await write_mlst_profiles_as_csv(mlst_profiles, str(args.csv_path))

    asyncio.run(_profile_and_write())


if __name__ == "__main__":
    cli()

View File

@ -1,6 +1,13 @@
from dataclasses import dataclass
from typing import Mapping, Sequence
@dataclass
class Allele:
    """One allele designation: a locus name paired with a variant identifier."""

    # Name of the locus the allele belongs to (e.g. "adk").
    allele_loci: str
    # Identifier of the variant observed at that locus, kept as a string.
    allele_variant: str
@dataclass
class MLSTProfile:
    """An MLST typing result: matched alleles plus the scheme-level fields."""

    # Matched alleles grouped by locus name.
    alleles: Mapping[str, Sequence[Allele]]
    # Sequence type reported for the allele combination.
    # NOTE(review): annotated as int, yet the repository's test compares this
    # field against the string "1" - confirm which type is actually stored.
    sequence_type: int
    # Clonal complex label, e.g. "ST-2 complex".
    clonal_complex: str

View File

@ -12,7 +12,7 @@ def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
return SeqIO.read(seq_handle, "abi")
async def load_sanger_sequence(seq_path: str) -> SangerTraceData:
async def read_abif(seq_path: str) -> SangerTraceData:
ext = path.splitext(seq_path)[1]
if ext.lower() != ".ab1" and ext.lower() != "abi":
raise ValueError(

View File

@ -0,0 +1,31 @@
import csv
from io import TextIOWrapper
from os import PathLike
from typing import AsyncIterable, Iterable, Mapping, Sequence, Union
from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile
def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
    """Reduce a locus-to-alleles mapping to a locus-to-variant-ids mapping.

    Args:
        alleles_map: Alleles grouped by locus name.

    Returns:
        A new dict with the same keys, in the same order, where each value is
        the list of ``allele_variant`` values of that locus's alleles.
    """
    return {
        locus: [allele.allele_variant for allele in locus_alleles]
        for locus, locus_alleles in alleles_map.items()
    }
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: Iterable[MLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
    """Write MLST profiles to a CSV file, one row per profile.

    The columns are ``st``, ``clonal-complex`` and then one column per locus.
    Locus columns are the union of the loci of all profiles, in first-seen
    order, so a profile with a locus the first profile lacks no longer makes
    ``csv.DictWriter`` raise. A profile lacking a locus gets an empty cell.
    An empty input produces a file containing only the two fixed columns
    instead of raising ``IndexError``.

    Args:
        mlst_profiles_iterable: Profiles to write; consumed exactly once.
        handle: Path of the file to create or overwrite.
    """
    mlst_profiles = list(mlst_profiles_iterable)
    # A dict serves as an insertion-ordered set of locus names.
    loci_columns: dict[str, None] = {}
    for mlst_profile in mlst_profiles:
        loci_columns.update(dict.fromkeys(mlst_profile.alleles))
    header = ["st", "clonal-complex", *loci_columns]
    # newline='' lets the csv module control line endings itself.
    with open(handle, "w", newline='') as filehandle:
        writer = csv.DictWriter(filehandle, fieldnames=header)
        writer.writeheader()
        for mlst_profile in mlst_profiles:
            # NOTE(review): each locus cell receives a list of variant ids,
            # which the csv module renders with str(); confirm that the list
            # representation is the intended cell format.
            row_dictionary = {
                "st": mlst_profile.sequence_type,
                "clonal-complex": mlst_profile.clonal_complex,
                **loci_alleles_variants_from_loci(mlst_profile.alleles)
            }
            writer.writerow(rowdict=row_dictionary)

View File

@ -1,10 +1,10 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
import re
from typing import Sequence
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from mlstmyfasta.engine.data.MLST import Allele
from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile
from mlstmyfasta.engine.data.genomics import NamedString
from mlstmyfasta.engine.remote.databases.institutpasteur.structures import InstitutPasteurSequenceRequest, InstitutPasteurSequenceResponse
class InstitutPasteurProfiler(AbstractAsyncContextManager):
@ -16,9 +16,9 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
async def fetch_mlst_profile(self, sequence_string: str):
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = f"schemes/3/sequence"
uri_path = "schemes/3/sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
})
@ -29,6 +29,32 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
    """Resolve a set of allele designations to an MLST profile.

    The alleles are grouped by locus and posted to the scheme's
    ``designations`` endpoint. The profile is built from the ``fields`` and
    ``exact_matches`` entries of the JSON response.

    Args:
        alleles: Allele designations, as a regular or asynchronous iterable.

    Returns:
        An ``MLSTProfile`` holding the exact-match alleles grouped by locus,
        the ``ST`` field and the ``clonal_complex`` field of the response.

    Raises:
        KeyError: If the response lacks ``fields``, ``exact_matches``, ``ST``
            or ``clonal_complex``.
    """
    # Drain the input up front so one grouping loop serves both input kinds.
    if isinstance(alleles, AsyncIterable):
        submitted = [allele async for allele in alleles]
    else:
        submitted = list(alleles)

    designations: dict[str, list[dict[str, str]]] = defaultdict(list)
    for allele in submitted:
        designations[allele.allele_loci].append({"allele": str(allele.allele_variant)})

    response = await self._http_client.post(
        "schemes/3/designations",
        json={"designations": designations},
    )
    payload = await response.json()
    scheme_fields = payload["fields"]
    matches_by_locus = payload["exact_matches"]

    matched_alleles: dict[str, list[Allele]] = defaultdict(list)
    for locus, locus_matches in matches_by_locus.items():
        for match in locus_matches:
            matched_alleles[locus].append(Allele(locus, match["allele_id"]))
    return MLSTProfile(matched_alleles, scheme_fields["ST"], scheme_fields["clonal_complex"])
async def profile_string(self, string: str) -> MLSTProfile:
    """Profile one sequence string: find its alleles, then resolve them.

    Args:
        string: The sequence to type.

    Returns:
        The result of ``fetch_mlst_st`` applied to the alleles produced by
        ``fetch_mlst_allele_variants`` for this sequence.
    """
    return await self.fetch_mlst_st(self.fetch_mlst_allele_variants(string))
async def close(self):
    """Close the underlying HTTP client session."""
    http_client = self._http_client
    await http_client.close()

View File

@ -1,22 +0,0 @@
from dataclasses import dataclass
from typing import Sequence, Union
from dataclasses_json import dataclass_json
@dataclass_json
@dataclass
class InstitutPasteurSequenceRequest:
sequence: str
details: Union[bool, None]
partial_matches: Union[bool, None]
base64: Union[bool, None]
@dataclass_json
@dataclass
class InstitutPasteurSequenceResponse:
exact_matches: Sequence[tuple[int, str]]
start: Union[int, None]
end: Union[int, None]
orientation: Union[str, None]
length: Union[int, None]
contig: Union[str, None]

View File

@ -0,0 +1,5 @@
import logging
from aiohttp import web
# Application object for the web front end, logging under this module's name.
webapp = web.Application(logger=logging.getLogger(__name__))
# Route table used to register handlers. It must be an instance: the bare
# class cannot be used as a decorator source or passed to add_routes().
routes = web.RouteTableDef()

View File

@ -1,8 +1,8 @@
import os
from mlstmyfasta.engine.local.abif import load_sanger_sequence
from mlstmyfasta.engine.local.abif import read_abif
async def test_load_sanger_sequence_has_data():
    """``read_abif`` returns data for the bundled sample trace file."""
    sample_path = "tests/resources/1I1_F_P1815443_047.ab1"
    assert os.path.exists(sample_path)
    # Only the renamed reader is called; the stale pre-rename call is dropped.
    result_data = await read_abif(sample_path)
    assert result_data is not None

View File

@ -1,16 +0,0 @@
from Bio import SeqIO
from mlstmyfasta.engine.data.MLST import Allele
from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
async def test_profiling_results_in_exact_matches_when_exact():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_profile(sequence_string=sequence)
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
async for exact_match in exact_matches:
assert isinstance(exact_match, Allele)
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
targets_left.remove(exact_match.allele_loci)
assert len(targets_left) == 0

View File

@ -0,0 +1,35 @@
from Bio import SeqIO
from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile
from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
async def test_profiling_results_in_exact_matches_when_exact():
    """Every scheme locus of the Tohama I genome is matched once, as allele 1."""
    genome = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    sequence = str(genome.seq)
    async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
        matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
        # Each locus is removed when first seen; a repeat or an unknown locus
        # raises KeyError and fails the test.
        unseen_loci = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
        async for found in matches:
            assert isinstance(found, Allele)
            assert found.allele_variant == '1'  # All of Tohama I has allele id 1
            unseen_loci.remove(found.allele_loci)
        assert len(unseen_loci) == 0
async def test_profiling_results_in_correct_st():
    """Allele 1 at every scheme locus resolves to ST 1 in the ST-2 complex.

    The lookup is driven purely by the hand-built allele list, so the FASTA
    read and the never-iterated allele generator have been removed.
    """
    dummy_alleles = [
        Allele("adk", "1"),
        Allele("fumC", "1"),
        Allele("glyA", "1"),
        Allele("tyrB", "1"),
        Allele("icd", "1"),
        Allele("pepA", "1"),
        Allele("pgm", "1"),
    ]
    async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
        mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_alleles)
        assert mlst_st_data is not None
        assert isinstance(mlst_st_data, MLSTProfile)
        assert mlst_st_data.clonal_complex == "ST-2 complex"
        # NOTE(review): compared as a string although MLSTProfile annotates
        # sequence_type as int - confirm which type the service returns.
        assert mlst_st_data.sequence_type == "1"