diff --git a/.vscode/launch.json b/.vscode/launch.json index edfb26e..15a17f7 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -6,18 +6,12 @@ "configurations": [ { - "name": "CLI ipdbmlst", + "name": "CLI blank", "type": "debugpy", "request": "launch", - "program": "${workspaceFolder}/src/automlst/cli/root.py", + "program": "${workspaceFolder}/src/automlst/cli/program.py", "console": "integratedTerminal", - "args": [ - "-fa", - "${workspaceFolder}/tests/resources/tohama_I_bpertussis.fasta", - "-ipdbmlst", - "pubmlst_bordetella_seqdef", - "${workspaceFolder}/output" - ], + "args": [], "cwd": "${workspaceFolder}/src", "env": { "PYTHONPATH": "${workspaceFolder}/src" diff --git a/pyproject.toml b/pyproject.toml index 0a09d92..3fde94b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ requires-python = ">=3.11" description = "A tool to rapidly fetch fetch MLST profiles given sequences for various diseases." [project.scripts] -automlst = "automlst.cli.root:run" +automlst = "automlst.cli.program:run" [tool.pyright] extraPaths = ["src"] diff --git a/src/automlst/engine/remote/databases/institutpasteur/__init__.py b/src/automlst/cli/__init__.py similarity index 100% rename from src/automlst/engine/remote/databases/institutpasteur/__init__.py rename to src/automlst/cli/__init__.py diff --git a/src/automlst/cli/aggregated.py b/src/automlst/cli/aggregated.py deleted file mode 100644 index 4cb8207..0000000 --- a/src/automlst/cli/aggregated.py +++ /dev/null @@ -1,23 +0,0 @@ -from os import path -from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence -from automlst.engine.data.mlst import MLSTProfile -from automlst.engine.data.genomics import NamedString -from automlst.engine.local.abif import read_abif -from automlst.engine.local.fasta import read_fasta -from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler - - -async def aggregate_sequences(fastas: Iterable[str], abifs: Iterable[str]) -> 
AsyncGenerator[str, Any]: - for fasta_path in fastas: - async for fasta in read_fasta(fasta_path): - yield fasta.sequence - for abif_path in abifs: - abif_data = await read_abif(abif_path) - yield "".join(abif_data.sequence) - -async def profile_all_genetic_strings(strings: AsyncIterable[str], database_name: str) -> Sequence[MLSTProfile]: - profiles = list() - async with InstitutPasteurProfiler(database_name=database_name) as profiler: - async for string in strings: - profiles.append(await profiler.profile_string(string)) - return profiles \ No newline at end of file diff --git a/src/automlst/cli/info.py b/src/automlst/cli/info.py new file mode 100644 index 0000000..26cd65f --- /dev/null +++ b/src/automlst/cli/info.py @@ -0,0 +1,43 @@ +import asyncio +from automlst.cli import program +from automlst.engine.remote.databases.bigsdb import BIGSdbIndex + + +parser = program.subparsers.add_parser(__name__) + +parser.add_argument( + "--retrieve-bigsdbs", "-l", + action="store_true", + dest="list_dbs", + required=False, + default=False, + # no type= here: argparse's store_true action rejects it with TypeError + help="Lists all known BIGSdb MLST databases (fetched from known APIs and cached)." 
+) + +parser.add_argument( + "--retrieve-bigsdb-schemas", "-lschemas", + nargs="+", + action="extend", + dest="list_bigsdb_schemas", + required=False, + default=[], + type=str, + help="Lists the known schema IDs for a given BIGSdb sequence definition database name" +) + +async def run(args): + async with BIGSdbIndex() as bigsdb_index: + if args.list_dbs: + known_seqdef_dbs = await bigsdb_index.get_known_seqdef_dbs(force=False) + print(", ".join(known_seqdef_dbs.keys())) + + for bigsdb_schema_name in args.list_bigsdb_schemas: + schemas = await bigsdb_index.get_schemas_for_seqdefdb(bigsdb_schema_name) + for schema_desc, schema_id in schemas.items(): + print(f"{schema_desc}: {schema_id}") + +def run_asynchronously(args): + asyncio.run(run(args)) + +parser.set_defaults(func=run_asynchronously) diff --git a/src/automlst/cli/profile.py b/src/automlst/cli/profile.py new file mode 100644 index 0000000..1abc966 --- /dev/null +++ b/src/automlst/cli/profile.py @@ -0,0 +1,55 @@ + +import asyncio +import datetime +from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence, Union +from automlst.cli import program +from automlst.engine.data.genomics import NamedString +from automlst.engine.data.mlst import MLSTProfile +from automlst.engine.local.abif import read_abif, reference_consensus_assembly +from automlst.engine.local.csv import write_mlst_profiles_as_csv +from automlst.engine.local.fasta import read_fasta, read_multiple_fastas +from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler + + +parser = program.subparsers.add_parser(__name__) + +parser.add_argument( + "--fasta", "-fa", "-fst", + nargs="+", + action='extend', + dest="fastas", + required=False, + default=[], + type=str, + help="The FASTA files to process. Multiple can be listed." +) + +parser.add_argument( + "seqdefdb", + help="The BIGSdb seqdef database to use for typing." 
+) + +parser.add_argument( + "schema", + type=int, + help="The BIGSdb seqdef database schema ID (integer) to use for typing." +) + +parser.add_argument( + "out", + default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}', + help="The output CSV name (.csv will be appended)." +) + + +async def run(args): + async with BIGSdbIndex() as bigsdb_index: + gen_strings = read_multiple_fastas(args.fastas) + async with await bigsdb_index.build_profiler_from_seqdefdb(args.seqdefdb, args.schema) as mlst_profiler: + mlst_profiles = mlst_profiler.profile_multiple_strings(gen_strings) + await write_mlst_profiles_as_csv(mlst_profiles, args.out) + +def run_asynchronously(args): + asyncio.run(run(args)) + +parser.set_defaults(func=run_asynchronously) diff --git a/src/automlst/cli/program.py b/src/automlst/cli/program.py new file mode 100644 index 0000000..e3e7a5e --- /dev/null +++ b/src/automlst/cli/program.py @@ -0,0 +1,22 @@ +import argparse +import asyncio +import datetime +from os import path +import os + +from automlst.engine.data.genomics import NamedString +from automlst.engine.local.abif import read_abif +from automlst.engine.local.csv import write_mlst_profiles_as_csv +from automlst.engine.local.fasta import read_fasta +from automlst.engine.remote.databases.bigsdb import BIGSdbIndex + +root_parser = argparse.ArgumentParser() +subparsers = root_parser.add_subparsers(required=True) + +def run(): + args = root_parser.parse_args() + args.func(args) + + +if __name__ == "__main__": + run() \ No newline at end of file diff --git a/src/automlst/cli/root.py b/src/automlst/cli/root.py deleted file mode 100644 index 0de7827..0000000 --- a/src/automlst/cli/root.py +++ /dev/null @@ -1,86 +0,0 @@ -import argparse -import asyncio -import datetime -from os import path -import os - -from automlst.cli import aggregated -from automlst.engine.data.genomics import NamedString -from automlst.engine.local.abif import read_abif -from automlst.engine.local.csv import 
write_mlst_profiles_as_csv -from automlst.engine.local.fasta import read_fasta - - -parser = argparse.ArgumentParser() -parser.add_argument( - "--run-name", "-name", - dest="run_name", - required=False, - default=datetime.datetime.now().strftime(r"%Y%m%d%H%M%S"), - type=str, - help="The name of the run. Will use a date and time string if not provided." -) -parser.add_argument( - "--fasta", "-fa", "-fst", - nargs="+", - action='extend', - dest="fastas", - required=False, - default=[], - type=str, - help="The FASTA files to process. Multiple can be listed." -) -parser.add_argument( - "--abif", "-abi", "-ab1", - action='extend', - dest="abifs", - required=False, - default=[], - type=str, - help="The ABIF files to process. Multiple can be listed." -) -parser.add_argument( - "--ncbi-assembly-reference", "-refncbi", - dest="ncbi_assembly_reference", - required=False, - default=None, - type=str, - help="The NCBI GenBank accession ID for the consensus assembly. Either this argument, or the path equivalent must be given if ABIF files are used." -) -parser.add_argument( - "--assembly-reference", "-ref", - dest="assembly_reference", - required=False, - default=None, - type=str, - help="The path to the FASTA sequence to be used as a reference for consensus building." -) -parser.add_argument( - "--institut-pasteur-mlst", - "-ipdbmlst", - dest="institut_pasteur_db", - required=False, - default=None, - type=str, - help="The Institut Pasteur MLST database to use." -) -parser.add_argument( - "out", - default="./.", - help="The output folder. Files will be named by the provided (or default) run name. Outputs will be automatically generated depending on which arguments are used." 
-) - - -def run(): - args = parser.parse_args() - gen_strings = aggregated.aggregate_sequences(args.fastas, args.abifs) - os.makedirs(args.out, exist_ok=True) - if args.institut_pasteur_db is not None: - mlst_profiles = aggregated.profile_all_genetic_strings( - gen_strings, args.institut_pasteur_db) - asyncio.run(write_mlst_profiles_as_csv( - asyncio.run(mlst_profiles), str(path.join(args.out, "MLST_" + args.run_name + ".csv")))) - - -if __name__ == "__main__": - run() \ No newline at end of file diff --git a/src/automlst/engine/local/abif.py b/src/automlst/engine/local/abif.py index 0e6637f..654705f 100644 --- a/src/automlst/engine/local/abif.py +++ b/src/automlst/engine/local/abif.py @@ -1,11 +1,13 @@ import asyncio from numbers import Number from os import path -from typing import Any, AsyncGenerator, Collection, Sequence, Union +from typing import Any, AsyncGenerator, Collection, Iterable, Sequence, Union from automlst.engine.data.genomics import NamedString, SangerTraceData from Bio.SeqRecord import SeqRecord from Bio import SeqIO, Align +from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank + def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord: with open(seq_path, "rb") as seq_handle: @@ -110,9 +112,15 @@ def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedStri aligner.mode = "local" alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[ 0] # take the best alignment - return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq) + # TODO actually assemble the consensus sequence here + raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.") -async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString, Any]: +async def 
reference_consensus_assembly(reference: Union[NamedString, str], sanger_traces: Iterable[SangerTraceData]) -> AsyncGenerator[NamedString, Any]: + if isinstance(reference, str): + reference_seq = NamedString(name=reference, sequence=(await fetch_ncbi_genbank(reference)).sequence) + else: + reference_seq: NamedString = reference for sanger_trace in sanger_traces: - yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace))[1] + yield NamedString("NA", "NA") + raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.") \ No newline at end of file diff --git a/src/automlst/engine/local/csv.py b/src/automlst/engine/local/csv.py index 4d47e25..2d88304 100644 --- a/src/automlst/engine/local/csv.py +++ b/src/automlst/engine/local/csv.py @@ -6,7 +6,7 @@ from typing import AsyncIterable, Iterable, Mapping, Sequence, Union from automlst.engine.data.mlst import Allele, MLSTProfile -def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]): +def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]): result_dict: dict[str, list[str]] = {} for loci, alleles in alleles_map.items(): result_dict[loci] = list() @@ -15,17 +15,19 @@ def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]) return result_dict -async def write_mlst_profiles_as_csv(mlst_profiles_iterable: Iterable[MLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]): - mlst_profiles = list(mlst_profiles_iterable) - header = ["st", "clonal-complex", *mlst_profiles[0].alleles.keys()] +async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, MLSTProfile]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]): with open(handle, "w", newline='') as filehandle: - writer = csv.DictWriter(filehandle, fieldnames=header) - writer.writeheader() - for mlst_profile in mlst_profiles: + header = None + writer: Union[csv.DictWriter, 
None] = None + async for name, mlst_profile in mlst_profiles_iterable: + if writer is None: + header = ["st", "clonal-complex", "id", *mlst_profile.alleles.keys()] + writer = csv.DictWriter(filehandle, fieldnames=header) + writer.writeheader() row_dictionary = { "st": mlst_profile.sequence_type, "clonal-complex": mlst_profile.clonal_complex, - **loci_alleles_variants_from_loci(mlst_profile.alleles) + "id": name, + **dict_loci_alleles_variants_from_loci(mlst_profile.alleles) } - - writer.writerow(rowdict=row_dictionary) + writer.writerow(rowdict=row_dictionary) \ No newline at end of file diff --git a/src/automlst/engine/local/fasta.py b/src/automlst/engine/local/fasta.py index 6582433..4fb9cb8 100644 --- a/src/automlst/engine/local/fasta.py +++ b/src/automlst/engine/local/fasta.py @@ -1,6 +1,6 @@ import asyncio from io import TextIOWrapper -from typing import Any, AsyncGenerator, Generator, Sequence, Union +from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union from Bio import SeqIO from automlst.engine.data.genomics import NamedString @@ -8,4 +8,9 @@ from automlst.engine.data.genomics import NamedString async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]: fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta") for fasta_sequence in await fasta_sequences: - yield NamedString(fasta_sequence.id, str(fasta_sequence.seq)) \ No newline at end of file + yield NamedString(fasta_sequence.id, str(fasta_sequence.seq)) + +async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]: + for handle in handles: + async for named_seq in read_fasta(handle): + yield named_seq \ No newline at end of file diff --git a/src/automlst/engine/remote/databases/bigsdb.py b/src/automlst/engine/remote/databases/bigsdb.py new file mode 100644 index 0000000..c2db02a --- /dev/null +++ b/src/automlst/engine/remote/databases/bigsdb.py @@ -0,0 +1,127 @@ +from 
collections import defaultdict +from contextlib import AbstractAsyncContextManager +from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union + +from aiohttp import ClientSession, ClientTimeout + +from automlst.engine.data.genomics import NamedString +from automlst.engine.data.mlst import Allele, MLSTProfile + +class BigSDBMLSTProfiler(AbstractAsyncContextManager): + + def __init__(self, database_api: str, database_name: str, schema_id: int): + self._base_url = f"{database_api}/db/{database_name}/schemes/{schema_id}/" + self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000)) + + async def __aenter__(self): + return self + + async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]: + # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes + uri_path = "sequence" + response = await self._http_client.post(uri_path, json={ + "sequence": sequence_string + }) + sequence_response: dict = await response.json() + if "exact_matches" not in sequence_response: + # TODO throw exception for not finding matches. 
+ pass + exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"] + for allele_loci, alleles in exact_matches.items(): + for allele in alleles: + alelle_id = allele["allele_id"] + yield Allele(allele_loci=allele_loci, allele_variant=alelle_id) + + async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile: + uri_path = "designations" + allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list) + async for allele in alleles: + allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)}) + + request_json = { + "designations": allele_request_dict + } + async with self._http_client.post(uri_path, json=request_json) as response: + response_json = await response.json() + if "fields" not in response_json: + # TODO raise exception about invalid parameters or no exact parameterization found + pass + schema_fields_returned = response_json["fields"] + schema_exact_matches: dict = response_json["exact_matches"] + allele_map: dict[str, list[Allele]] = defaultdict(list) + for exact_match_loci, exact_match_alleles in schema_exact_matches.items(): + for exact_match_allele in exact_match_alleles: + allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"])) + return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) + + async def profile_string(self, string: str) -> MLSTProfile: + alleles = self.fetch_mlst_allele_variants(string) + return await self.fetch_mlst_st(alleles) + + + async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString]) -> AsyncGenerator[tuple[str, MLSTProfile], Any]: + async for named_string in namedStrings: + yield (named_string.name, await self.profile_string(named_string.sequence)) + + + async def close(self): + await self._http_client.close() + + async def __aexit__(self, exc_type, exc_value, traceback): + await self.close() + +class 
BIGSdbIndex(AbstractAsyncContextManager): + KNOWN_BIGSDB_APIS = { + "https://bigsdb.pasteur.fr/api", + "https://rest.pubmlst.org" + } + + def __init__(self): + self._http_client = ClientSession() + self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None + self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict() + super().__init__() + + async def __aenter__(self): + return self + + async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]: + if self._known_seqdef_dbs_origin is not None and not force: + return self._known_seqdef_dbs_origin + known_seqdef_dbs = dict() + for known_bigsdb in BIGSdbIndex.KNOWN_BIGSDB_APIS: + async with self._http_client.get(f"{known_bigsdb}/db") as response: + response_json_databases = await response.json() + for database_group in response_json_databases: + for database_info in database_group["databases"]: + if str(database_info["name"]).endswith("seqdef"): + known_seqdef_dbs[database_info["name"]] = known_bigsdb + self._known_seqdef_dbs_origin = dict(known_seqdef_dbs) + return self._known_seqdef_dbs_origin + + async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str: + return (await self.get_known_seqdef_dbs())[seqdef_db_name] + + async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]: + if self._seqdefdb_schemas.get(seqdef_db_name) is not None and not force: # .get(): the cache starts empty, so plain indexing raised KeyError on the first lookup + return self._seqdefdb_schemas[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional + uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)}/db/{seqdef_db_name}/schemes" # database routes sit under <api>/db/<name>, matching the /db listing call above + async with self._http_client.get(uri_path) as response: + response_json = await response.json() + schema_descriptions: Mapping[str, int] = dict() + for scheme_definition in response_json["schemes"]: + scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1]) + scheme_desc: str = scheme_definition["description"] + schema_descriptions[scheme_desc] = 
scheme_id + self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions + return self._seqdefdb_schemas[seqdef_db_name] # type: ignore + + async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BigSDBMLSTProfiler: + return BigSDBMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id) + + async def close(self): + await self._http_client.close() + + async def __aexit__(self, exc_type, exc_value, traceback): + await self.close() + diff --git a/src/automlst/engine/remote/databases/institutpasteur/mlst.py b/src/automlst/engine/remote/databases/institutpasteur/mlst.py deleted file mode 100644 index e254343..0000000 --- a/src/automlst/engine/remote/databases/institutpasteur/mlst.py +++ /dev/null @@ -1,69 +0,0 @@ -from collections import defaultdict -from contextlib import AbstractAsyncContextManager -import re -from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union -from aiohttp import ClientSession, ClientTimeout -from automlst.engine.data.mlst import Allele, MLSTProfile -from automlst.engine.data.genomics import NamedString -from automlst.engine.remote.databases.mlst import MLSTProfiler - -class InstitutPasteurProfiler(MLSTProfiler): - - async def __aenter__(self): - return self - - - def __init__(self, database_name: str): - self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/" - self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000)) - - async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]: - # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes - uri_path = f"schemes/{schema_id}/sequence" - response = await self._http_client.post(uri_path, json={ - "sequence": sequence_string - }) - sequence_response: dict = await response.json() - exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"] - for allele_loci, 
alleles in exact_matches.items(): - for allele in alleles: - alelle_id = allele["allele_id"] - yield Allele(allele_loci=allele_loci, allele_variant=alelle_id) - - async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile: - uri_path = f"schemes/{schema_id}/designations" - allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list) - async for allele in alleles: - allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)}) - response = await self._http_client.post(uri_path, json={ - "designations": allele_request_dict - }) - response_json = await response.json() - schema_fields_returned = response_json["fields"] - schema_exact_matches = response_json["exact_matches"] - allele_map: dict[str, list[Allele]] = defaultdict(list) - for exact_match_loci, exact_match_alleles in schema_exact_matches.items(): - for exact_match_allele in exact_match_alleles: - allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"])) - return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) - - async def profile_string(self, schema_id: int, string: str) -> MLSTProfile: - alleles = self.fetch_mlst_allele_variants(schema_id, string) - return await self.fetch_mlst_st(schema_id, alleles) - - async def get_scheme_ids(self) -> Mapping[str, int]: - uri_path = "schemes" - response = await self._http_client.get(uri_path) - response_json = await response.json() - schema_descriptions: Mapping[str, int] = dict() - for scheme_definition in response_json["schemes"]: - scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1]) - scheme_desc: str = scheme_definition["description"] - schema_descriptions[scheme_desc] = scheme_id - return schema_descriptions - - async def close(self): - await self._http_client.close() - - async def __aexit__(self, exc_type, exc_value, traceback): - await self.close() \ No newline at end of file diff --git 
a/src/automlst/engine/remote/databases/mlst.py b/src/automlst/engine/remote/databases/mlst.py deleted file mode 100644 index 881ce58..0000000 --- a/src/automlst/engine/remote/databases/mlst.py +++ /dev/null @@ -1,33 +0,0 @@ -from abc import abstractmethod -from contextlib import AbstractAsyncContextManager -from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Union - -from aiohttp import ClientSession - -from automlst.engine.data.mlst import Allele, MLSTProfile - -MLST_DATABASES = [ - "https://bigsdb.pasteur.fr/api/db", - "https://rest.pubmlst.org/db" -] - -class MLSTProfiler(AbstractAsyncContextManager): - @abstractmethod - def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]: - pass - - @abstractmethod - async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile: - pass - - @abstractmethod - async def profile_string(self, schema_id: int, string: str) -> MLSTProfile: - pass - - @abstractmethod - async def close(self): - pass - - @abstractmethod - async def get_scheme_ids(self) -> Mapping[str, int]: - pass \ No newline at end of file diff --git a/src/automlst/engine/remote/databases/pubmlst/__init__.py b/src/automlst/engine/remote/databases/pubmlst/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/automlst/engine/remote/databases/pubmlst/mlst.py b/src/automlst/engine/remote/databases/pubmlst/mlst.py deleted file mode 100644 index 54e3af1..0000000 --- a/src/automlst/engine/remote/databases/pubmlst/mlst.py +++ /dev/null @@ -1,68 +0,0 @@ -from collections import defaultdict -from contextlib import AbstractAsyncContextManager -import re -from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union -from aiohttp import ClientSession, ClientTimeout -from automlst.engine.data.mlst import Allele, MLSTProfile -from automlst.engine.data.genomics import NamedString -from 
automlst.engine.remote.databases.mlst import MLSTProfiler - -class PubMLSTProfiler(MLSTProfiler): - - async def __aenter__(self): - return self - - - def __init__(self, database_name: str): - self._base_url = f"https://rest.pubmlst.org/db/{database_name}/" - self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000)) - - async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]: - uri_path = f"schemes/{schema_id}/sequence" - response = await self._http_client.post(uri_path, json={ - "sequence": sequence_string - }) - sequence_response: dict = await response.json() - exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"] - for allele_loci, alleles in exact_matches.items(): - for allele in alleles: - alelle_id = allele["allele_id"] - yield Allele(allele_loci=allele_loci, allele_variant=alelle_id) - - async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile: - uri_path = f"schemes/{schema_id}/designations" - allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list) - async for allele in alleles: - allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)}) - response = await self._http_client.post(uri_path, json={ - "designations": allele_request_dict - }) - response_json = await response.json() - schema_fields_returned = response_json["fields"] - schema_exact_matches = response_json["exact_matches"] - allele_map: dict[str, list[Allele]] = defaultdict(list) - for exact_match_loci, exact_match_alleles in schema_exact_matches.items(): - for exact_match_allele in exact_match_alleles: - allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"])) - return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) - - async def profile_string(self, schema_id: int, string: str) -> MLSTProfile: - alleles = 
self.fetch_mlst_allele_variants(schema_id, string) - return await self.fetch_mlst_st(schema_id, alleles) - - async def get_scheme_ids(self) -> Mapping[str, int]: - uri_path = "schemes" - response = await self._http_client.get(uri_path) - response_json = await response.json() - schema_descriptions: Mapping[str, int] = dict() - for scheme_definition in response_json["schemes"]: - scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1]) - scheme_desc: str = scheme_definition["description"] - schema_descriptions[scheme_desc] = scheme_id - return schema_descriptions - - async def close(self): - await self._http_client.close() - - async def __aexit__(self, exc_type, exc_value, traceback): - await self.close() \ No newline at end of file diff --git a/tests/nsbdiagnosistoolkit/engine/local/test_abif.py b/tests/nsbdiagnosistoolkit/engine/local/test_abif.py index a25bbe5..cc514e1 100644 --- a/tests/nsbdiagnosistoolkit/engine/local/test_abif.py +++ b/tests/nsbdiagnosistoolkit/engine/local/test_abif.py @@ -1,8 +1,12 @@ import os -from automlst.engine.local.abif import read_abif +from automlst.engine.local.abif import read_abif, reference_consensus_assembly async def test_load_sanger_sequence_has_data(): assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1") result_data = await read_abif("tests/resources/1I1_F_P1815443_047.ab1") - assert result_data is not None \ No newline at end of file + assert result_data is not None + +async def test_consensus_assembly_with_ncbi(): + consensus = reference_consensus_assembly("ON685494.1", [await read_abif("tests/resources/1I1_F_P1815443_047.ab1"), await read_abif("tests/resources/1I1_R_P1815443_094.ab1")]) + # TODO complete implementing this \ No newline at end of file diff --git a/tests/nsbdiagnosistoolkit/engine/remote/databases/institutpasteur/test_mlst.py b/tests/nsbdiagnosistoolkit/engine/remote/databases/institutpasteur/test_mlst.py deleted file mode 100644 index ba37f10..0000000 --- 
a/tests/nsbdiagnosistoolkit/engine/remote/databases/institutpasteur/test_mlst.py +++ /dev/null @@ -1,54 +0,0 @@ -from Bio import SeqIO -from automlst.engine.data.mlst import Allele, MLSTProfile -from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler - - -async def test_profiling_results_in_exact_matches_when_exact(): - sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) - async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler: - exact_matches = dummy_profiler.fetch_mlst_allele_variants(schema_id=3, sequence_string=sequence) - targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"} - async for exact_match in exact_matches: - assert isinstance(exact_match, Allele) - assert exact_match.allele_variant == '1' # All of Tohama I has allele id I - targets_left.remove(exact_match.allele_loci) - - assert len(targets_left) == 0 - -async def test_profiling_results_in_correct_st(): - async def dummy_allele_generator(): - dummy_alleles = [ - Allele("adk", "1"), - Allele("fumC", "1"), - Allele("glyA", "1"), - Allele("tyrB", "1"), - Allele("icd", "1"), - Allele("pepA", "1"), - Allele("pgm", "1"), - ] - for dummy_allele in dummy_alleles: - yield dummy_allele - async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler: - mlst_st_data = await dummy_profiler.fetch_mlst_st(3, dummy_allele_generator()) - assert mlst_st_data is not None - assert isinstance(mlst_st_data, MLSTProfile) - assert mlst_st_data.clonal_complex == "ST-2 complex" - assert mlst_st_data.sequence_type == "1" - -async def test_sequence_profiling_is_correct(): - sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) - dummy_alleles = [ - Allele("adk", "1"), - Allele("fumC", "1"), - Allele("glyA", "1"), - Allele("tyrB", "1"), - Allele("icd", "1"), - Allele("pepA", "1"), - Allele("pgm", "1"), - ] - async with 
InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler: - profile = await dummy_profiler.profile_string(3, sequence) - assert profile is not None - assert isinstance(profile, MLSTProfile) - assert profile.clonal_complex == "ST-2 complex" - assert profile.sequence_type == "1" \ No newline at end of file diff --git a/tests/nsbdiagnosistoolkit/engine/remote/databases/pubmlst/test_mlst.py b/tests/nsbdiagnosistoolkit/engine/remote/databases/pubmlst/test_mlst.py deleted file mode 100644 index 4f3f755..0000000 --- a/tests/nsbdiagnosistoolkit/engine/remote/databases/pubmlst/test_mlst.py +++ /dev/null @@ -1,53 +0,0 @@ -import asyncio -from Bio import SeqIO -from automlst.engine.data.mlst import Allele, MLSTProfile -from automlst.engine.remote.databases.pubmlst.mlst import PubMLSTProfiler - - -async def test_profiling_results_in_exact_matches_when_exact(): - dummy_alleles = { - Allele("adk", "1"), - Allele("atpG", "1"), - Allele("frdB", "1"), - Allele("fucK", "1"), - Allele("mdh", "1"), - Allele("pgi", "1"), - Allele("recA", "5"), - } - sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq) - async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler: - exact_matches = dummy_profiler.fetch_mlst_allele_variants(schema_id=1, sequence_string=sequence) - async for exact_match in exact_matches: - assert isinstance(exact_match, Allele) - dummy_alleles.remove(exact_match) - - assert len(dummy_alleles) == 0 - -async def test_profiling_results_in_correct_st(): - async def generate_dummy_targets(): - dummy_alleles = [ - Allele("adk", "1"), - Allele("atpG", "1"), - Allele("frdB", "1"), - Allele("fucK", "1"), - Allele("mdh", "1"), - Allele("pgi", "1"), - Allele("recA", "5"), - ] - for dummy_allele in dummy_alleles: - yield dummy_allele - async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler: - mlst_st_data = await dummy_profiler.fetch_mlst_st(1, 
generate_dummy_targets()) - assert mlst_st_data is not None - assert isinstance(mlst_st_data, MLSTProfile) - assert mlst_st_data.clonal_complex == "ST-3 complex" - assert mlst_st_data.sequence_type == "3" - -async def test_sequence_profiling_is_correct(): - sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq) - async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler: - profile = await dummy_profiler.profile_string(1, sequence) - assert profile is not None - assert isinstance(profile, MLSTProfile) - assert profile.clonal_complex == "ST-3 complex" - assert profile.sequence_type == "3" \ No newline at end of file diff --git a/tests/nsbdiagnosistoolkit/engine/remote/databases/test_bigsdb.py b/tests/nsbdiagnosistoolkit/engine/remote/databases/test_bigsdb.py new file mode 100644 index 0000000..eb852e3 --- /dev/null +++ b/tests/nsbdiagnosistoolkit/engine/remote/databases/test_bigsdb.py @@ -0,0 +1,115 @@ +from Bio import SeqIO +from automlst.engine.data.mlst import Allele, MLSTProfile +from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler + + +async def test_institutpasteur_profiling_results_in_exact_matches_when_exact(): + sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) + async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: + exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence) + targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"} + async for exact_match in exact_matches: + assert isinstance(exact_match, Allele) + assert exact_match.allele_variant == '1' # All of Tohama I has allele id I + targets_left.remove(exact_match.allele_loci) + + assert len(targets_left) == 0 + +async def test_institutpasteur_profiling_results_in_correct_mlst_st(): + async def dummy_allele_generator(): + dummy_alleles = [ + 
Allele("adk", "1"), + Allele("fumC", "1"), + Allele("glyA", "1"), + Allele("tyrB", "1"), + Allele("icd", "1"), + Allele("pepA", "1"), + Allele("pgm", "1"), + ] + for dummy_allele in dummy_alleles: + yield dummy_allele + async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: + mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator()) + assert mlst_st_data is not None + assert isinstance(mlst_st_data, MLSTProfile) + assert mlst_st_data.clonal_complex == "ST-2 complex" + assert mlst_st_data.sequence_type == "1" + +async def test_institutpasteur_sequence_profiling_is_correct(): + sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) + async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: + profile = await dummy_profiler.profile_string(sequence) + assert profile is not None + assert isinstance(profile, MLSTProfile) + assert profile.clonal_complex == "ST-2 complex" + assert profile.sequence_type == "1" + + +async def test_pubmlst_profiling_results_in_exact_matches_when_exact(): + dummy_alleles = { + Allele("adk", "1"), + Allele("atpG", "1"), + Allele("frdB", "1"), + Allele("fucK", "1"), + Allele("mdh", "1"), + Allele("pgi", "1"), + Allele("recA", "5"), + } + sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq) + async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler: + exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence) + async for exact_match in exact_matches: + assert isinstance(exact_match, Allele) + dummy_alleles.remove(exact_match) + + assert len(dummy_alleles) == 0 + +async def test_pubmlst_profiling_results_in_correct_st(): + async def generate_dummy_targets(): + dummy_alleles = 
[ + Allele("adk", "1"), + Allele("atpG", "1"), + Allele("frdB", "1"), + Allele("fucK", "1"), + Allele("mdh", "1"), + Allele("pgi", "1"), + Allele("recA", "5"), + ] + for dummy_allele in dummy_alleles: + yield dummy_allele + async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler: + mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets()) + assert mlst_st_data is not None + assert isinstance(mlst_st_data, MLSTProfile) + assert mlst_st_data.clonal_complex == "ST-3 complex" + assert mlst_st_data.sequence_type == "3" + +async def test_pubmlst_sequence_profiling_is_correct(): + sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq) + async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler: + profile = await dummy_profiler.profile_string(sequence) + assert profile is not None + assert isinstance(profile, MLSTProfile) + assert profile.clonal_complex == "ST-3 complex" + assert profile.sequence_type == "3" + +async def test_bigsdb_index_all_databases_is_not_empty(): + async with BIGSdbIndex() as bigsdb_index: + assert len(await bigsdb_index.get_known_seqdef_dbs()) > 0 + +async def test_bigsdb_index_references_pubmlst_correctly(): + async with BIGSdbIndex() as bigsdb_index: + assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")) == "https://rest.pubmlst.org" + +async def test_bigsdb_index_references_institutpasteur_correctly(): + async with BIGSdbIndex() as bigsdb_index: + assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api" + + +async def test_bigsdb_index_instantiates_correct_profiler(): + sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) + async with BIGSdbIndex() as bigsdb_index: + async with await 
bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler: + profile = await profiler.profile_string(sequence) + assert profile.clonal_complex == "ST-2 complex" + assert profile.sequence_type == "1" \ No newline at end of file