From 9b8e448512a499d102176719a48451d585adc50e Mon Sep 17 00:00:00 2001
From: Harrison Deng <yunyangdeng@outlook.com>
Date: Thu, 9 Jan 2025 07:39:18 +0000
Subject: [PATCH] Completed updating CLI to be more organized with better
 error messages

---
 .devcontainer/devcontainer.json               |  9 ++-
 .vscode/launch.json                           |  8 ++-
 src/automlst/cli/exactmatch.py                | 48 ++++++++++++++++
 src/automlst/cli/info.py                      | 47 ++++++++--------
 src/automlst/cli/meta.py                      |  2 +
 src/automlst/cli/profile.py                   | 55 -------------------
 src/automlst/cli/program.py                   |  7 ++-
 src/automlst/engine/local/csv.py              |  2 +-
 .../engine/remote/databases/bigsdb.py         | 16 ++++--
 9 files changed, 107 insertions(+), 87 deletions(-)
 create mode 100644 src/automlst/cli/exactmatch.py
 create mode 100644 src/automlst/cli/meta.py
 delete mode 100644 src/automlst/cli/profile.py

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index ce3acb4..c58d0e4 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -12,7 +12,14 @@
 	// "forwardPorts": [],
 
 	// Use 'postCreateCommand' to run commands after the container is created.
-	"postCreateCommand": "pip3 install --user -r requirements.txt"
+	"postCreateCommand": "pip3 install --user -r requirements.txt",
+	"customizations": {
+		"vscode": {
+			"extensions": [
+				"mechatroner.rainbow-csv"
+			]
+		}
+	}
 
 	// Configure tool-specific properties.
 	// "customizations": {},
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 15a17f7..308d5b0 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -6,12 +6,16 @@
     "configurations": [
         
         {
-            "name": "CLI blank",
+            "name": "automlst info -lschemas pubmlst_bordetella_seqdef",
             "type": "debugpy",
             "request": "launch",
             "program": "${workspaceFolder}/src/automlst/cli/program.py",
             "console": "integratedTerminal",
-            "args": [],
+            "args": [
+                "info",
+                "-lschemas",
+                "pubmlst_bordetella_seqdef"
+            ],
             "cwd": "${workspaceFolder}/src",
             "env": {
                 "PYTHONPATH": "${workspaceFolder}/src"
diff --git a/src/automlst/cli/exactmatch.py b/src/automlst/cli/exactmatch.py
new file mode 100644
index 0000000..304c75f
--- /dev/null
+++ b/src/automlst/cli/exactmatch.py
@@ -0,0 +1,48 @@
+
+from argparse import ArgumentParser
+import asyncio
+import datetime
+from automlst.engine.local.csv import write_mlst_profiles_as_csv
+from automlst.engine.local.fasta import read_multiple_fastas
+from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
+
+
+def setup_parser(parser: ArgumentParser):
+    parser.description = "Returns MLST exact profile matches."
+    parser.add_argument(
+        "fastas",
+        nargs="+",
+        action='extend',
+        default=[],
+        type=str,
+        help="The FASTA files to process. Multiple can be listed."
+    )
+
+    parser.add_argument(
+        "seqdefdb",
+        help="The BIGSdb seqdef database to use for typing."
+    )
+
+    parser.add_argument(
+        "schema",
+        type=int,
+        help="The BIGSdb seqdef database schema ID (integer) to use for typing."
+    )
+
+    parser.add_argument(
+        "out",
+        default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
+        help="The output CSV name (.csv will be appended)."
+    )
+    parser.set_defaults(func=run_asynchronously)
+
+async def run(args):
+    async with BIGSdbIndex() as bigsdb_index:
+        gen_strings = read_multiple_fastas(args.fastas)
+        async with await bigsdb_index.build_profiler_from_seqdefdb(args.seqdefdb, args.schema) as mlst_profiler:
+            mlst_profiles = mlst_profiler.profile_multiple_strings(gen_strings)
+            await write_mlst_profiles_as_csv(mlst_profiles, args.out)
+
+def run_asynchronously(args):
+    asyncio.run(run(args))
+
diff --git a/src/automlst/cli/info.py b/src/automlst/cli/info.py
index 26cd65f..20a4e9b 100644
--- a/src/automlst/cli/info.py
+++ b/src/automlst/cli/info.py
@@ -1,36 +1,38 @@
+from argparse import ArgumentParser
 import asyncio
-from automlst.cli import program
 from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
 
 
-parser = program.subparsers.add_parser(__name__)
+def setup_parser(parser: ArgumentParser):
+    parser.description = "Fetches the latest BIGSdb MLST database definitions."
+    parser.usage = "test"
+    parser.add_argument(
+        "--retrieve-bigsdbs", "-l",
+        action="store_true",
+        dest="list_dbs",
+        required=False,
+        default=False,
+        help="Lists all known BIGSdb MLST databases (fetched from known APIs and cached)."
+    )
 
-parser.add_argument(
-    "--retrieve-bigsdbs", "-l",
-    action="store_true",
-    dest="list_dbs",
-    required=False,
-    default=False,
-    type=bool,
-    help="Lists all known BIGSdb MLST databases (fetched from known APIs and cached)."
-)
+    parser.add_argument(
+        "--retrieve-bigsdb-schemas", "-lschemas",
+        nargs="+",
+        action="extend",
+        dest="list_bigsdb_schemas",
+        required=False,
+        default=[],
+        type=str,
+        help="Lists the known schema IDs for a given BIGSdb sequence definition database name. The name, and then the ID of the schema is given."
+    )
 
-parser.add_argument(
-    "--retrieve-bigsdb-schemas", "-lschemas",
-    nargs="+",
-    action="extend",
-    dest="list_bigsdb_schemas",
-    required=False,
-    default=[],
-    type=str,
-    help="Lists the known schema IDs for a given BIGSdb sequence definition database name"
-)
+    parser.set_defaults(func=run_asynchronously)
 
 async def run(args):
     async with BIGSdbIndex() as bigsdb_index:
         if args.list_dbs:
             known_seqdef_dbs = await bigsdb_index.get_known_seqdef_dbs(force=False)
-            print(", ".join(known_seqdef_dbs.keys()))
+            print("\n".join(known_seqdef_dbs.keys()))
 
         for bigsdb_schema_name in args.list_bigsdb_schemas:
             schemas = await bigsdb_index.get_schemas_for_seqdefdb(bigsdb_schema_name)
@@ -40,4 +42,3 @@ async def run(args):
 def run_asynchronously(args):
     asyncio.run(run(args))
 
-parser.set_defaults(func=run_asynchronously)
diff --git a/src/automlst/cli/meta.py b/src/automlst/cli/meta.py
new file mode 100644
index 0000000..d45096d
--- /dev/null
+++ b/src/automlst/cli/meta.py
@@ -0,0 +1,2 @@
+def get_module_base_name(name):
+    return name.split(".")[-1]
diff --git a/src/automlst/cli/profile.py b/src/automlst/cli/profile.py
deleted file mode 100644
index 1abc966..0000000
--- a/src/automlst/cli/profile.py
+++ /dev/null
@@ -1,55 +0,0 @@
-
-import asyncio
-import datetime
-from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence, Union
-from automlst.cli import program
-from automlst.engine.data.genomics import NamedString
-from automlst.engine.data.mlst import MLSTProfile
-from automlst.engine.local.abif import read_abif, reference_consensus_assembly
-from automlst.engine.local.csv import write_mlst_profiles_as_csv
-from automlst.engine.local.fasta import read_fasta, read_multiple_fastas
-from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler
-
-
-parser = program.subparsers.add_parser(__name__)
-
-parser.add_argument(
-    "--fasta", "-fa", "-fst",
-    nargs="+",
-    action='extend',
-    dest="fastas",
-    required=False,
-    default=[],
-    type=str,
-    help="The FASTA files to process. Multiple can be listed."
-)
-
-parser.add_argument(
-    "seqdefdb",
-    help="The BIGSdb seqdef database to use for typing."
-)
-
-parser.add_argument(
-    "schema",
-    type=int,
-    help="The BIGSdb seqdef database schema ID (integer) to use for typing."
-)
-
-parser.add_argument(
-    "out",
-    default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
-    help="The output CSV name (.csv will be appended)."
-)
-
-
-async def run(args):
-    async with BIGSdbIndex() as bigsdb_index:
-        gen_strings = read_multiple_fastas(args.fastas)
-        async with await bigsdb_index.build_profiler_from_seqdefdb(args.seqdefdb, args.schema) as mlst_profiler:
-            mlst_profiles = mlst_profiler.profile_multiple_strings(gen_strings)
-            await write_mlst_profiles_as_csv(mlst_profiles, args.out)
-
-def run_asynchronously(args):
-    asyncio.run(run(args))
-
-parser.set_defaults(func=run_asynchronously)
diff --git a/src/automlst/cli/program.py b/src/automlst/cli/program.py
index e3e7a5e..0357144 100644
--- a/src/automlst/cli/program.py
+++ b/src/automlst/cli/program.py
@@ -4,6 +4,8 @@ import datetime
 from os import path
 import os
 
+from automlst.cli import exactmatch, info
+from automlst.cli.meta import get_module_base_name
 from automlst.engine.data.genomics import NamedString
 from automlst.engine.local.abif import read_abif
 from automlst.engine.local.csv import write_mlst_profiles_as_csv
@@ -13,10 +15,13 @@ from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
 root_parser = argparse.ArgumentParser()
 subparsers = root_parser.add_subparsers(required=True)
 
+info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
+exactmatch.setup_parser(subparsers.add_parser(get_module_base_name(exactmatch.__name__)))
+
+
 def run():
     args = root_parser.parse_args()
     args.func(args)
 
-
 if __name__ == "__main__":
     run()
\ No newline at end of file
diff --git a/src/automlst/engine/local/csv.py b/src/automlst/engine/local/csv.py
index 2d88304..2340b6a 100644
--- a/src/automlst/engine/local/csv.py
+++ b/src/automlst/engine/local/csv.py
@@ -21,7 +21,7 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
         writer: Union[csv.DictWriter, None] = None
         async for name, mlst_profile in mlst_profiles_iterable:
             if writer is None:
-                header = ["st", "clonal-complex", "id", *mlst_profile.alleles.keys()]
+                header = ["id", "st", "clonal-complex", *mlst_profile.alleles.keys()]
                 writer = csv.DictWriter(filehandle, fieldnames=header)
                 writer.writeheader()
             row_dictionary = {
diff --git a/src/automlst/engine/remote/databases/bigsdb.py b/src/automlst/engine/remote/databases/bigsdb.py
index c2db02a..d523b22 100644
--- a/src/automlst/engine/remote/databases/bigsdb.py
+++ b/src/automlst/engine/remote/databases/bigsdb.py
@@ -10,7 +10,9 @@ from automlst.engine.data.mlst import Allele, MLSTProfile
 class BigSDBMLSTProfiler(AbstractAsyncContextManager):
 
     def __init__(self, database_api: str, database_name: str, schema_id: int):
-        self._base_url = f"{database_api}/db/{database_name}/schemes/{schema_id}/"
+        self._database_name = database_name
+        self._schema_id = schema_id
+        self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
         self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
 
     async def __aenter__(self):
@@ -26,6 +28,9 @@ class BigSDBMLSTProfiler(AbstractAsyncContextManager):
         if "exact_matches" not in sequence_response:
             # TODO throw exception for not finding matches.
             pass
+
+        if "exact_matches" not in sequence_response:
+            raise ValueError(f"Unable to find exact matches in \"{self._database_name}\" under schema ID \"{self._schema_id}\".")
         exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]  
         for allele_loci, alleles in exact_matches.items():
             for allele in alleles:
@@ -100,12 +105,15 @@ class BIGSdbIndex(AbstractAsyncContextManager):
         return self._known_seqdef_dbs_origin
 
     async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
-        return (await self.get_known_seqdef_dbs())[seqdef_db_name]     
+        known_databases = await self.get_known_seqdef_dbs()
+        if seqdef_db_name not in known_databases:
+            raise ValueError(f"The database \"{seqdef_db_name}\" could not be found.")
+        return known_databases[seqdef_db_name]     
 
     async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
-        if self._seqdefdb_schemas[seqdef_db_name] is not None and not force:
+        if seqdef_db_name in self._seqdefdb_schemas and not force:
             return self._seqdefdb_schemas[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional
-        uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)}/{seqdef_db_name}/schemes"
+        uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)}/db/{seqdef_db_name}/schemes"
         async with self._http_client.get(uri_path) as response: 
             response_json = await response.json()
             schema_descriptions: Mapping[str, int] = dict()