Completed CLI for B. pertussis MLST typing from fasta files
This commit is contained in:
parent
362e0867e5
commit
e9977d2da7
@ -12,7 +12,8 @@
|
||||
"extensions": [
|
||||
"ms-python.isort",
|
||||
"njpwerner.autodocstring",
|
||||
"ms-python.black-formatter"
|
||||
"ms-python.black-formatter",
|
||||
"mechatroner.rainbow-csv"
|
||||
]
|
||||
}
|
||||
},
|
||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -356,3 +356,4 @@ package
|
||||
|
||||
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)
|
||||
|
||||
src/output.csv
|
||||
|
28
.vscode/launch.json
vendored
Normal file
28
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,28 @@
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
|
||||
{
|
||||
"name": "Python Debugger: Current File with Arguments",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "${workspaceFolder}/src/mlstmyfasta/cli/program.py",
|
||||
"console": "integratedTerminal",
|
||||
"args": [
|
||||
"-fa",
|
||||
"${workspaceFolder}/tests/resources/tohama_I_bpertussis.fasta",
|
||||
"-ipdbmlst",
|
||||
"pubmlst_bordetella_seqdef",
|
||||
"-csv",
|
||||
"${workspaceFolder}/output.csv"
|
||||
],
|
||||
"cwd": "${workspaceFolder}/src",
|
||||
"env": {
|
||||
"PYTHONPATH": "${workspaceFolder}/src"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
@ -7,11 +7,14 @@ name = "mlstmyfasta"
|
||||
dynamic = ["version"]
|
||||
dependencies = [
|
||||
"biopython",
|
||||
"requests",
|
||||
"aiohttp[speedups]",
|
||||
]
|
||||
requires-python = ">=3.11"
|
||||
description = "A tool to rapidly fetch fetch MLST profiles given sequences for various diseases."
|
||||
|
||||
[project.scripts]
|
||||
mlstmyfasta = "mlstmyfasta.cli.program:cli"
|
||||
|
||||
[tool.pyright]
|
||||
extraPaths = ["src"]
|
||||
exclude = [
|
||||
|
@ -2,4 +2,3 @@ aiohttp[speedups]
|
||||
biopython
|
||||
pytest
|
||||
pytest-asyncio
|
||||
dataclasses-json
|
23
src/mlstmyfasta/cli/aggregator.py
Normal file
23
src/mlstmyfasta/cli/aggregator.py
Normal file
@ -0,0 +1,23 @@
|
||||
from os import path
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence
|
||||
from mlstmyfasta.engine.data.MLST import MLSTProfile
|
||||
from mlstmyfasta.engine.data.genomics import NamedString
|
||||
from mlstmyfasta.engine.local.abif import read_abif
|
||||
from mlstmyfasta.engine.local.fasta import read_fasta
|
||||
from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
|
||||
|
||||
|
||||
async def aggregate_sequences(fastas: Iterable[str], abifs: Iterable[str]) -> AsyncGenerator[str, Any]:
|
||||
for fasta_path in fastas:
|
||||
async for fasta in read_fasta(fasta_path):
|
||||
yield fasta.sequence
|
||||
for abif_path in abifs:
|
||||
abif_data = await read_abif(abif_path)
|
||||
yield "".join(abif_data.sequence)
|
||||
|
||||
async def profile_all_genetic_strings(strings: AsyncIterable[str], database_name: str) -> Sequence[MLSTProfile]:
|
||||
profiles = list()
|
||||
async with InstitutPasteurProfiler(database_name=database_name) as profiler:
|
||||
async for string in strings:
|
||||
profiles.append(await profiler.profile_string(string))
|
||||
return profiles
|
58
src/mlstmyfasta/cli/program.py
Normal file
58
src/mlstmyfasta/cli/program.py
Normal file
@ -0,0 +1,58 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
from os import path
|
||||
|
||||
from mlstmyfasta.cli import aggregator
|
||||
from mlstmyfasta.engine.data.genomics import NamedString
|
||||
from mlstmyfasta.engine.local.abif import read_abif
|
||||
from mlstmyfasta.engine.local.csv import write_mlst_profiles_as_csv
|
||||
from mlstmyfasta.engine.local.fasta import read_fasta
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--fasta", "-fa", "-fst",
|
||||
nargs="+",
|
||||
action='extend',
|
||||
dest="fastas",
|
||||
required=False,
|
||||
default=[],
|
||||
type=str,
|
||||
help="The FASTA files to process. Multiple can be listed."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--abif", "-abi", "-ab1",
|
||||
action='extend',
|
||||
dest="abifs",
|
||||
required=False,
|
||||
default=[],
|
||||
type=str,
|
||||
help="The ABIF files to process. Multiple can be listed."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--institut-pasteur-mlst",
|
||||
"-ipdbmlst",
|
||||
dest="institut_pasteur_db",
|
||||
type=str,
|
||||
help="The Institut Pasteur MLST database to use."
|
||||
)
|
||||
parser.add_argument(
|
||||
"-csv",
|
||||
dest="csv_path",
|
||||
required=False,
|
||||
default=None,
|
||||
help="The destination to place the CSV output."
|
||||
)
|
||||
|
||||
|
||||
def cli():
|
||||
args = parser.parse_args()
|
||||
gen_strings = aggregator.aggregate_sequences(args.fastas, args.abifs)
|
||||
mlst_profiles = aggregator.profile_all_genetic_strings(
|
||||
gen_strings, args.institut_pasteur_db)
|
||||
asyncio.run(write_mlst_profiles_as_csv(
|
||||
asyncio.run(mlst_profiles), str(args.csv_path)))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
@ -1,6 +1,13 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Mapping, Sequence
|
||||
|
||||
@dataclass
|
||||
class Allele:
|
||||
allele_loci: str
|
||||
allele_variant: str
|
||||
|
||||
@dataclass
|
||||
class MLSTProfile:
|
||||
alleles: Mapping[str, Sequence[Allele]]
|
||||
sequence_type: int
|
||||
clonal_complex: str
|
@ -12,7 +12,7 @@ def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
|
||||
return SeqIO.read(seq_handle, "abi")
|
||||
|
||||
|
||||
async def load_sanger_sequence(seq_path: str) -> SangerTraceData:
|
||||
async def read_abif(seq_path: str) -> SangerTraceData:
|
||||
ext = path.splitext(seq_path)[1]
|
||||
if ext.lower() != ".ab1" and ext.lower() != "abi":
|
||||
raise ValueError(
|
||||
|
31
src/mlstmyfasta/engine/local/csv.py
Normal file
31
src/mlstmyfasta/engine/local/csv.py
Normal file
@ -0,0 +1,31 @@
|
||||
import csv
|
||||
from io import TextIOWrapper
|
||||
from os import PathLike
|
||||
from typing import AsyncIterable, Iterable, Mapping, Sequence, Union
|
||||
|
||||
from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile
|
||||
|
||||
|
||||
def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
|
||||
result_dict: dict[str, list[str]] = {}
|
||||
for loci, alleles in alleles_map.items():
|
||||
result_dict[loci] = list()
|
||||
for allele in alleles:
|
||||
result_dict[loci].append(allele.allele_variant)
|
||||
return result_dict
|
||||
|
||||
|
||||
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: Iterable[MLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
|
||||
mlst_profiles = list(mlst_profiles_iterable)
|
||||
header = ["st", "clonal-complex", *mlst_profiles[0].alleles.keys()]
|
||||
with open(handle, "w", newline='') as filehandle:
|
||||
writer = csv.DictWriter(filehandle, fieldnames=header)
|
||||
writer.writeheader()
|
||||
for mlst_profile in mlst_profiles:
|
||||
row_dictionary = {
|
||||
"st": mlst_profile.sequence_type,
|
||||
"clonal-complex": mlst_profile.clonal_complex,
|
||||
**loci_alleles_variants_from_loci(mlst_profile.alleles)
|
||||
}
|
||||
|
||||
writer.writerow(rowdict=row_dictionary)
|
@ -1,10 +1,10 @@
|
||||
from collections import defaultdict
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
import re
|
||||
from typing import Sequence
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Sequence, Union
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
from mlstmyfasta.engine.data.MLST import Allele
|
||||
from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile
|
||||
from mlstmyfasta.engine.data.genomics import NamedString
|
||||
from mlstmyfasta.engine.remote.databases.institutpasteur.structures import InstitutPasteurSequenceRequest, InstitutPasteurSequenceResponse
|
||||
|
||||
class InstitutPasteurProfiler(AbstractAsyncContextManager):
|
||||
|
||||
@ -16,9 +16,9 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
|
||||
self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
|
||||
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
|
||||
|
||||
async def fetch_mlst_profile(self, sequence_string: str):
|
||||
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
|
||||
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
|
||||
uri_path = f"schemes/3/sequence"
|
||||
uri_path = "schemes/3/sequence"
|
||||
response = await self._http_client.post(uri_path, json={
|
||||
"sequence": sequence_string
|
||||
})
|
||||
@ -29,6 +29,32 @@ class InstitutPasteurProfiler(AbstractAsyncContextManager):
|
||||
alelle_id = allele["allele_id"]
|
||||
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
|
||||
|
||||
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
|
||||
uri_path = "schemes/3/designations"
|
||||
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
|
||||
if isinstance(alleles, AsyncIterable):
|
||||
async for allele in alleles:
|
||||
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
|
||||
else:
|
||||
for allele in alleles:
|
||||
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
|
||||
response = await self._http_client.post(uri_path, json={
|
||||
"designations": allele_request_dict
|
||||
})
|
||||
response_json = await response.json()
|
||||
schema_fields_returned = response_json["fields"]
|
||||
schema_exact_matches = response_json["exact_matches"]
|
||||
allele_map: dict[str, list[Allele]] = defaultdict(list)
|
||||
for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
|
||||
for exact_match_allele in exact_match_alleles:
|
||||
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
|
||||
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
|
||||
|
||||
async def profile_string(self, string: str) -> MLSTProfile:
|
||||
alleles = self.fetch_mlst_allele_variants(string)
|
||||
return await self.fetch_mlst_st(alleles)
|
||||
|
||||
|
||||
async def close(self):
|
||||
await self._http_client.close()
|
||||
|
||||
|
@ -1,22 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Sequence, Union
|
||||
|
||||
from dataclasses_json import dataclass_json
|
||||
|
||||
@dataclass_json
|
||||
@dataclass
|
||||
class InstitutPasteurSequenceRequest:
|
||||
sequence: str
|
||||
details: Union[bool, None]
|
||||
partial_matches: Union[bool, None]
|
||||
base64: Union[bool, None]
|
||||
|
||||
@dataclass_json
|
||||
@dataclass
|
||||
class InstitutPasteurSequenceResponse:
|
||||
exact_matches: Sequence[tuple[int, str]]
|
||||
start: Union[int, None]
|
||||
end: Union[int, None]
|
||||
orientation: Union[str, None]
|
||||
length: Union[int, None]
|
||||
contig: Union[str, None]
|
5
src/mlstmyfasta/http/server.py
Normal file
5
src/mlstmyfasta/http/server.py
Normal file
@ -0,0 +1,5 @@
|
||||
import logging
|
||||
from aiohttp import web
|
||||
|
||||
webapp = web.Application(logger=logging.getLogger(__name__))
|
||||
routes = web.RouteTableDef
|
@ -1,8 +1,8 @@
|
||||
import os
|
||||
|
||||
from mlstmyfasta.engine.local.abif import load_sanger_sequence
|
||||
from mlstmyfasta.engine.local.abif import read_abif
|
||||
|
||||
async def test_load_sanger_sequence_has_data():
|
||||
assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1")
|
||||
result_data = await load_sanger_sequence("tests/resources/1I1_F_P1815443_047.ab1")
|
||||
result_data = await read_abif("tests/resources/1I1_F_P1815443_047.ab1")
|
||||
assert result_data is not None
|
@ -1,16 +0,0 @@
|
||||
from Bio import SeqIO
|
||||
from mlstmyfasta.engine.data.MLST import Allele
|
||||
from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
|
||||
|
||||
|
||||
async def test_profiling_results_in_exact_matches_when_exact():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_profile(sequence_string=sequence)
|
||||
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
|
||||
async for exact_match in exact_matches:
|
||||
assert isinstance(exact_match, Allele)
|
||||
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
|
||||
targets_left.remove(exact_match.allele_loci)
|
||||
|
||||
assert len(targets_left) == 0
|
@ -0,0 +1,35 @@
|
||||
from Bio import SeqIO
|
||||
from mlstmyfasta.engine.data.MLST import Allele, MLSTProfile
|
||||
from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
|
||||
|
||||
|
||||
async def test_profiling_results_in_exact_matches_when_exact():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
||||
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
|
||||
async for exact_match in exact_matches:
|
||||
assert isinstance(exact_match, Allele)
|
||||
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
|
||||
targets_left.remove(exact_match.allele_loci)
|
||||
|
||||
assert len(targets_left) == 0
|
||||
|
||||
async def test_profiling_results_in_correct_st():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1"),
|
||||
Allele("fumC", "1"),
|
||||
Allele("glyA", "1"),
|
||||
Allele("tyrB", "1"),
|
||||
Allele("icd", "1"),
|
||||
Allele("pepA", "1"),
|
||||
Allele("pgm", "1"),
|
||||
]
|
||||
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
||||
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_alleles)
|
||||
assert mlst_st_data is not None
|
||||
assert isinstance(mlst_st_data, MLSTProfile)
|
||||
assert mlst_st_data.clonal_complex == "ST-2 complex"
|
||||
assert mlst_st_data.sequence_type == "1"
|
Loading…
x
Reference in New Issue
Block a user