Moved to a general BIGSdb implementation

Updated tests. Removed the ABIF UI for the time being. Began updating the CLI.
2025-01-08 21:32:10 +00:00
parent 645357ac58
commit 42d0f56b18
20 changed files with 403 additions and 414 deletions
--- a/src/automlst/cli/init.py
+++ b/src/automlst/cli/init.py
--- a/src/automlst/cli/aggregated.py
+++ b/src/automlst/cli/aggregated.py
@@ -1,23 +0,0 @@
-from os import path
-from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence
-from automlst.engine.data.mlst import MLSTProfile
-from automlst.engine.data.genomics import NamedString
-from automlst.engine.local.abif import read_abif
-from automlst.engine.local.fasta import read_fasta
-from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler
-
-
-async def aggregate_sequences(fastas: Iterable[str], abifs: Iterable[str]) -> AsyncGenerator[str, Any]:
-    for fasta_path in fastas:
-        async for fasta in read_fasta(fasta_path):
-            yield fasta.sequence
-    for abif_path in abifs:
-        abif_data = await read_abif(abif_path)
-        yield "".join(abif_data.sequence)
-
-async def profile_all_genetic_strings(strings: AsyncIterable[str], database_name: str) -> Sequence[MLSTProfile]:
-    profiles = list()
-    async with InstitutPasteurProfiler(database_name=database_name) as profiler:
-        async for string in strings:
-            profiles.append(await profiler.profile_string(string))
-    return profiles
--- a/src/automlst/cli/info.py
+++ b/src/automlst/cli/info.py
@@ -0,0 +1,43 @@
+import asyncio
+from automlst.cli import program
+from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
+
+
+parser = program.subparsers.add_parser(__name__)
+
+parser.add_argument(
+    "--retrieve-bigsdbs", "-l",
+    action="store_true",
+    dest="list_dbs",
+    required=False,
+    default=False,
+    type=bool,
+    help="Lists all known BIGSdb MLST databases (fetched from known APIs and cached)."
+)
+
+parser.add_argument(
+    "--retrieve-bigsdb-schemas", "-lschemas",
+    nargs="+",
+    action="extend",
+    dest="list_bigsdb_schemas",
+    required=False,
+    default=[],
+    type=str,
+    help="Lists the known schema IDs for a given BIGSdb sequence definition database name"
+)
+
+async def run(args):
+    async with BIGSdbIndex() as bigsdb_index:
+        if args.list_dbs:
+            known_seqdef_dbs = await bigsdb_index.get_known_seqdef_dbs(force=False)
+            print(", ".join(known_seqdef_dbs.keys()))
+
+        for bigsdb_schema_name in args.list_bigsdb_schemas:
+            schemas = await bigsdb_index.get_schemas_for_seqdefdb(bigsdb_schema_name)
+            for schema_desc, schema_id in schemas.items():
+                print(f"{schema_desc}: {schema_id}")
+
+def run_asynchronously(args):
+    asyncio.run(run(args))
+
+parser.set_defaults(func=run_asynchronously)
--- a/src/automlst/cli/profile.py
+++ b/src/automlst/cli/profile.py
@@ -0,0 +1,55 @@
+
+import asyncio
+import datetime
+from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence, Union
+from automlst.cli import program
+from automlst.engine.data.genomics import NamedString
+from automlst.engine.data.mlst import MLSTProfile
+from automlst.engine.local.abif import read_abif, reference_consensus_assembly
+from automlst.engine.local.csv import write_mlst_profiles_as_csv
+from automlst.engine.local.fasta import read_fasta, read_multiple_fastas
+from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler
+
+
+parser = program.subparsers.add_parser(__name__)
+
+parser.add_argument(
+    "--fasta", "-fa", "-fst",
+    nargs="+",
+    action='extend',
+    dest="fastas",
+    required=False,
+    default=[],
+    type=str,
+    help="The FASTA files to process. Multiple can be listed."
+)
+
+parser.add_argument(
+    "seqdefdb",
+    help="The BIGSdb seqdef database to use for typing."
+)
+
+parser.add_argument(
+    "schema",
+    type=int,
+    help="The BIGSdb seqdef database schema ID (integer) to use for typing."
+)
+
+parser.add_argument(
+    "out",
+    default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
+    help="The output CSV name (.csv will be appended)."
+)
+
+
+async def run(args):
+    async with BIGSdbIndex() as bigsdb_index:
+        gen_strings = read_multiple_fastas(args.fastas)
+        async with await bigsdb_index.build_profiler_from_seqdefdb(args.seqdefdb, args.schema) as mlst_profiler:
+            mlst_profiles = mlst_profiler.profile_multiple_strings(gen_strings)
+            await write_mlst_profiles_as_csv(mlst_profiles, args.out)
+
+def run_asynchronously(args):
+    asyncio.run(run(args))
+
+parser.set_defaults(func=run_asynchronously)
--- a/src/automlst/cli/program.py
+++ b/src/automlst/cli/program.py
@@ -0,0 +1,22 @@
+import argparse
+import asyncio
+import datetime
+from os import path
+import os
+
+from automlst.engine.data.genomics import NamedString
+from automlst.engine.local.abif import read_abif
+from automlst.engine.local.csv import write_mlst_profiles_as_csv
+from automlst.engine.local.fasta import read_fasta
+from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
+
+root_parser = argparse.ArgumentParser()
+subparsers = root_parser.add_subparsers(required=True)
+
+def run():
+    args = root_parser.parse_args()
+    args.func(args)
+
+
+if __name__ == "__main__":
+    run()
--- a/src/automlst/cli/root.py
+++ b/src/automlst/cli/root.py
@@ -1,86 +0,0 @@
-import argparse
-import asyncio
-import datetime
-from os import path
-import os
-
-from automlst.cli import aggregated
-from automlst.engine.data.genomics import NamedString
-from automlst.engine.local.abif import read_abif
-from automlst.engine.local.csv import write_mlst_profiles_as_csv
-from automlst.engine.local.fasta import read_fasta
-
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "--run-name", "-name",
-    dest="run_name",
-    required=False,
-    default=datetime.datetime.now().strftime(r"%Y%m%d%H%M%S"),
-    type=str,
-    help="The name of the run. Will use a date and time string if not provided."
-)
-parser.add_argument(
-    "--fasta", "-fa", "-fst",
-    nargs="+",
-    action='extend',
-    dest="fastas",
-    required=False,
-    default=[],
-    type=str,
-    help="The FASTA files to process. Multiple can be listed."
-)
-parser.add_argument(
-    "--abif", "-abi", "-ab1",
-    action='extend',
-    dest="abifs",
-    required=False,
-    default=[],
-    type=str,
-    help="The ABIF files to process. Multiple can be listed."
-)
-parser.add_argument(
-    "--ncbi-assembly-reference", "-refncbi",
-    dest="ncbi_assembly_reference",
-    required=False,
-    default=None,
-    type=str,
-    help="The NCBI GenBank accession ID for the consensus assembly. Either this argument, or the path equivalent must be given if ABIF files are used."
-)
-parser.add_argument(
-    "--assembly-reference", "-ref",
-    dest="assembly_reference",
-    required=False,
-    default=None,
-    type=str,
-    help="The path to the FASTA sequence to be used as a reference for consensus building."
-)
-parser.add_argument(
-    "--institut-pasteur-mlst",
-    "-ipdbmlst",
-    dest="institut_pasteur_db",
-    required=False,
-    default=None,
-    type=str,
-    help="The Institut Pasteur MLST database to use."
-)
-parser.add_argument(
-    "out",
-    default="./.",
-    help="The output folder. Files will be named by the provided (or default) run name. Outputs will be automatically generated depending on which arguments are used."
-)
-
-
-def run():
-    args = parser.parse_args()
-    gen_strings = aggregated.aggregate_sequences(args.fastas, args.abifs)
-    os.makedirs(args.out, exist_ok=True)
-    if args.institut_pasteur_db is not None:
-        mlst_profiles = aggregated.profile_all_genetic_strings(
-            gen_strings, args.institut_pasteur_db)
-        asyncio.run(write_mlst_profiles_as_csv(
-            asyncio.run(mlst_profiles), str(path.join(args.out, "MLST_" + args.run_name + ".csv"))))
-
-
-if __name__ == "__main__":
-    run()