Moved to a general BIGSdb implementation

Updated tests
Removed ABIF UI for the time being
Began updating CLI
parent 645357ac58
commit 42d0f56b18
.vscode/launch.json (vendored, 12 changes)
@@ -6,18 +6,12 @@
     "configurations": [
         {
-            "name": "CLI ipdbmlst",
+            "name": "CLI blank",
             "type": "debugpy",
             "request": "launch",
-            "program": "${workspaceFolder}/src/automlst/cli/root.py",
+            "program": "${workspaceFolder}/src/automlst/cli/program.py",
             "console": "integratedTerminal",
-            "args": [
-                "-fa",
-                "${workspaceFolder}/tests/resources/tohama_I_bpertussis.fasta",
-                "-ipdbmlst",
-                "pubmlst_bordetella_seqdef",
-                "${workspaceFolder}/output"
-            ],
+            "args": [],
             "cwd": "${workspaceFolder}/src",
             "env": {
                 "PYTHONPATH": "${workspaceFolder}/src"
pyproject.toml
@@ -13,7 +13,7 @@ requires-python = ">=3.11"
 description = "A tool to rapidly fetch fetch MLST profiles given sequences for various diseases."
 
 [project.scripts]
-automlst = "automlst.cli.root:run"
+automlst = "automlst.cli.program:run"
 
 [tool.pyright]
 extraPaths = ["src"]
src/automlst/cli/aggregated.py (deleted)
@@ -1,23 +0,0 @@
from os import path
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence
from automlst.engine.data.mlst import MLSTProfile
from automlst.engine.data.genomics import NamedString
from automlst.engine.local.abif import read_abif
from automlst.engine.local.fasta import read_fasta
from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler


async def aggregate_sequences(fastas: Iterable[str], abifs: Iterable[str]) -> AsyncGenerator[str, Any]:
    for fasta_path in fastas:
        async for fasta in read_fasta(fasta_path):
            yield fasta.sequence
    for abif_path in abifs:
        abif_data = await read_abif(abif_path)
        yield "".join(abif_data.sequence)


async def profile_all_genetic_strings(strings: AsyncIterable[str], database_name: str) -> Sequence[MLSTProfile]:
    profiles = list()
    async with InstitutPasteurProfiler(database_name=database_name) as profiler:
        async for string in strings:
            profiles.append(await profiler.profile_string(string))
    return profiles
src/automlst/cli/info.py (new file, 43 lines)
@@ -0,0 +1,43 @@
import asyncio
from automlst.cli import program
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex


parser = program.subparsers.add_parser(__name__)

parser.add_argument(
    "--retrieve-bigsdbs", "-l",
    action="store_true",
    dest="list_dbs",
    required=False,
    default=False,
    type=bool,
    help="Lists all known BIGSdb MLST databases (fetched from known APIs and cached)."
)

parser.add_argument(
    "--retrieve-bigsdb-schemas", "-lschemas",
    nargs="+",
    action="extend",
    dest="list_bigsdb_schemas",
    required=False,
    default=[],
    type=str,
    help="Lists the known schema IDs for a given BIGSdb sequence definition database name"
)

async def run(args):
    async with BIGSdbIndex() as bigsdb_index:
        if args.list_dbs:
            known_seqdef_dbs = await bigsdb_index.get_known_seqdef_dbs(force=False)
            print(", ".join(known_seqdef_dbs.keys()))

        for bigsdb_schema_name in args.list_bigsdb_schemas:
            schemas = await bigsdb_index.get_schemas_for_seqdefdb(bigsdb_schema_name)
            for schema_desc, schema_id in schemas.items():
                print(f"{schema_desc}: {schema_id}")

def run_asynchronously(args):
    asyncio.run(run(args))

parser.set_defaults(func=run_asynchronously)
src/automlst/cli/profile.py (new file, 55 lines)
@@ -0,0 +1,55 @@

import asyncio
import datetime
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence, Union
from automlst.cli import program
from automlst.engine.data.genomics import NamedString
from automlst.engine.data.mlst import MLSTProfile
from automlst.engine.local.abif import read_abif, reference_consensus_assembly
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_fasta, read_multiple_fastas
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler


parser = program.subparsers.add_parser(__name__)

parser.add_argument(
    "--fasta", "-fa", "-fst",
    nargs="+",
    action='extend',
    dest="fastas",
    required=False,
    default=[],
    type=str,
    help="The FASTA files to process. Multiple can be listed."
)

parser.add_argument(
    "seqdefdb",
    help="The BIGSdb seqdef database to use for typing."
)

parser.add_argument(
    "schema",
    type=int,
    help="The BIGSdb seqdef database schema ID (integer) to use for typing."
)

parser.add_argument(
    "out",
    default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
    help="The output CSV name (.csv will be appended)."
)


async def run(args):
    async with BIGSdbIndex() as bigsdb_index:
        gen_strings = read_multiple_fastas(args.fastas)
        async with await bigsdb_index.build_profiler_from_seqdefdb(args.seqdefdb, args.schema) as mlst_profiler:
            mlst_profiles = mlst_profiler.profile_multiple_strings(gen_strings)
            await write_mlst_profiles_as_csv(mlst_profiles, args.out)

def run_asynchronously(args):
    asyncio.run(run(args))

parser.set_defaults(func=run_asynchronously)
src/automlst/cli/program.py (new file, 22 lines)
@@ -0,0 +1,22 @@
import argparse
import asyncio
import datetime
from os import path
import os

from automlst.engine.data.genomics import NamedString
from automlst.engine.local.abif import read_abif
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_fasta
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex

root_parser = argparse.ArgumentParser()
subparsers = root_parser.add_subparsers(required=True)

def run():
    args = root_parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    run()
src/automlst/cli/root.py (deleted)
@@ -1,86 +0,0 @@
import argparse
import asyncio
import datetime
from os import path
import os

from automlst.cli import aggregated
from automlst.engine.data.genomics import NamedString
from automlst.engine.local.abif import read_abif
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_fasta


parser = argparse.ArgumentParser()
parser.add_argument(
    "--run-name", "-name",
    dest="run_name",
    required=False,
    default=datetime.datetime.now().strftime(r"%Y%m%d%H%M%S"),
    type=str,
    help="The name of the run. Will use a date and time string if not provided."
)
parser.add_argument(
    "--fasta", "-fa", "-fst",
    nargs="+",
    action='extend',
    dest="fastas",
    required=False,
    default=[],
    type=str,
    help="The FASTA files to process. Multiple can be listed."
)
parser.add_argument(
    "--abif", "-abi", "-ab1",
    action='extend',
    dest="abifs",
    required=False,
    default=[],
    type=str,
    help="The ABIF files to process. Multiple can be listed."
)
parser.add_argument(
    "--ncbi-assembly-reference", "-refncbi",
    dest="ncbi_assembly_reference",
    required=False,
    default=None,
    type=str,
    help="The NCBI GenBank accession ID for the consensus assembly. Either this argument, or the path equivalent must be given if ABIF files are used."
)
parser.add_argument(
    "--assembly-reference", "-ref",
    dest="assembly_reference",
    required=False,
    default=None,
    type=str,
    help="The path to the FASTA sequence to be used as a reference for consensus building."
)
parser.add_argument(
    "--institut-pasteur-mlst",
    "-ipdbmlst",
    dest="institut_pasteur_db",
    required=False,
    default=None,
    type=str,
    help="The Institut Pasteur MLST database to use."
)
parser.add_argument(
    "out",
    default="./.",
    help="The output folder. Files will be named by the provided (or default) run name. Outputs will be automatically generated depending on which arguments are used."
)


def run():
    args = parser.parse_args()
    gen_strings = aggregated.aggregate_sequences(args.fastas, args.abifs)
    os.makedirs(args.out, exist_ok=True)
    if args.institut_pasteur_db is not None:
        mlst_profiles = aggregated.profile_all_genetic_strings(
            gen_strings, args.institut_pasteur_db)
        asyncio.run(write_mlst_profiles_as_csv(
            asyncio.run(mlst_profiles), str(path.join(args.out, "MLST_" + args.run_name + ".csv"))))


if __name__ == "__main__":
    run()
src/automlst/engine/local/abif.py
@@ -1,11 +1,13 @@
 import asyncio
 from numbers import Number
 from os import path
-from typing import Any, AsyncGenerator, Collection, Sequence, Union
+from typing import Any, AsyncGenerator, Collection, Iterable, Sequence, Union
 from automlst.engine.data.genomics import NamedString, SangerTraceData
 from Bio.SeqRecord import SeqRecord
 from Bio import SeqIO, Align
 
+from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
+
 
 def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
     with open(seq_path, "rb") as seq_handle:
@@ -110,9 +112,15 @@ def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedStri
     aligner.mode = "local"
     alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[
         0]  # take the best alignment
-    return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq)
+    # TODO actually assemble the consensus sequence here
+    raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
 
 
-async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
+async def reference_consensus_assembly(reference: Union[NamedString, str], sanger_traces: Iterable[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
+    if isinstance(reference, str):
+        reference_seq = NamedString(name=reference, sequence=(await fetch_ncbi_genbank(reference)).sequence)
+    else:
+        reference_seq: NamedString = reference
     for sanger_trace in sanger_traces:
-        yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace))[1]
+        yield NamedString("NA", "NA")
+    raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
src/automlst/engine/local/csv.py
@@ -6,7 +6,7 @@ from typing import AsyncIterable, Iterable, Mapping, Sequence, Union
 from automlst.engine.data.mlst import Allele, MLSTProfile
 
 
-def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
+def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
     result_dict: dict[str, list[str]] = {}
     for loci, alleles in alleles_map.items():
         result_dict[loci] = list()
@@ -15,17 +15,19 @@ def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]])
     return result_dict
 
 
-async def write_mlst_profiles_as_csv(mlst_profiles_iterable: Iterable[MLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
-    mlst_profiles = list(mlst_profiles_iterable)
-    header = ["st", "clonal-complex", *mlst_profiles[0].alleles.keys()]
+async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, MLSTProfile]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
     with open(handle, "w", newline='') as filehandle:
-        writer = csv.DictWriter(filehandle, fieldnames=header)
-        writer.writeheader()
-        for mlst_profile in mlst_profiles:
+        header = None
+        writer: Union[csv.DictWriter, None] = None
+        async for name, mlst_profile in mlst_profiles_iterable:
+            if writer is None:
+                header = ["st", "clonal-complex", "id", *mlst_profile.alleles.keys()]
+                writer = csv.DictWriter(filehandle, fieldnames=header)
+                writer.writeheader()
             row_dictionary = {
                 "st": mlst_profile.sequence_type,
                 "clonal-complex": mlst_profile.clonal_complex,
-                **loci_alleles_variants_from_loci(mlst_profile.alleles)
+                "id": name,
+                **dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
             }
-
-            writer.writerow(rowdict=row_dictionary)
+            writer.writerow(rowdict=row_dictionary)
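For context, a minimal sketch (not part of this commit) of the new streaming contract for write_mlst_profiles_as_csv: it now consumes an async stream of (sample name, MLSTProfile) pairs. The sample name, allele values, and output path below are placeholders, and the MLSTProfile/Allele field order is assumed from how bigsdb.py constructs them.

import asyncio

from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.local.csv import write_mlst_profiles_as_csv


async def example_profiles():
    # hypothetical sample: one profile keyed by its sample name
    alleles = {"adk": [Allele("adk", "1")], "fumC": [Allele("fumC", "1")]}
    yield ("sample_1", MLSTProfile(alleles, "1", "ST-2 complex"))


asyncio.run(write_mlst_profiles_as_csv(example_profiles(), "profiles.csv"))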
src/automlst/engine/local/fasta.py
@@ -1,6 +1,6 @@
 import asyncio
 from io import TextIOWrapper
-from typing import Any, AsyncGenerator, Generator, Sequence, Union
+from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
 from Bio import SeqIO
 
 from automlst.engine.data.genomics import NamedString
@@ -8,4 +8,9 @@ from automlst.engine.data.genomics import NamedString
 async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
     fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
     for fasta_sequence in await fasta_sequences:
-        yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
+        yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
+
+async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
+    for handle in handles:
+        async for named_seq in read_fasta(handle):
+            yield named_seq
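A small usage sketch (not part of this commit) of the new read_multiple_fastas helper; the file paths are placeholders taken from the test resources.

import asyncio

from automlst.engine.local.fasta import read_multiple_fastas


async def main():
    # stream NamedString records from several FASTA files in one pass
    async for named in read_multiple_fastas([
        "tests/resources/tohama_I_bpertussis.fasta",
        "tests/resources/FDAARGOS_1560.fasta",
    ]):
        print(named.name, len(named.sequence))


asyncio.run(main())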
src/automlst/engine/remote/databases/bigsdb.py (new file, 127 lines)
@@ -0,0 +1,127 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union

from aiohttp import ClientSession, ClientTimeout

from automlst.engine.data.genomics import NamedString
from automlst.engine.data.mlst import Allele, MLSTProfile

class BigSDBMLSTProfiler(AbstractAsyncContextManager):

    def __init__(self, database_api: str, database_name: str, schema_id: int):
        self._base_url = f"{database_api}/db/{database_name}/schemes/{schema_id}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))

    async def __aenter__(self):
        return self

    async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        uri_path = "sequence"
        response = await self._http_client.post(uri_path, json={
            "sequence": sequence_string
        })
        sequence_response: dict = await response.json()
        if "exact_matches" not in sequence_response:
            # TODO throw exception for not finding matches.
            pass
        exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
        for allele_loci, alleles in exact_matches.items():
            for allele in alleles:
                alelle_id = allele["allele_id"]
                yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)

    async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
        uri_path = "designations"
        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
        async for allele in alleles:
            allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})

        request_json = {
            "designations": allele_request_dict
        }
        async with self._http_client.post(uri_path, json=request_json) as response:
            response_json = await response.json()
            if "fields" not in response_json:
                # TODO raise exception about invalid parameters or no exact parameterization found
                pass
            schema_fields_returned = response_json["fields"]
            schema_exact_matches: dict = response_json["exact_matches"]
            allele_map: dict[str, list[Allele]] = defaultdict(list)
            for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
                for exact_match_allele in exact_match_alleles:
                    allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
            return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])

    async def profile_string(self, string: str) -> MLSTProfile:
        alleles = self.fetch_mlst_allele_variants(string)
        return await self.fetch_mlst_st(alleles)


    async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString]) -> AsyncGenerator[tuple[str, MLSTProfile], Any]:
        async for named_string in namedStrings:
            yield (named_string.name, await self.profile_string(named_string.sequence))


    async def close(self):
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()

class BIGSdbIndex(AbstractAsyncContextManager):
    KNOWN_BIGSDB_APIS = {
        "https://bigsdb.pasteur.fr/api",
        "https://rest.pubmlst.org"
    }

    def __init__(self):
        self._http_client = ClientSession()
        self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
        self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
        super().__init__()

    async def __aenter__(self):
        return self

    async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
        if self._known_seqdef_dbs_origin is not None and not force:
            return self._known_seqdef_dbs_origin
        known_seqdef_dbs = dict()
        for known_bigsdb in BIGSdbIndex.KNOWN_BIGSDB_APIS:
            async with self._http_client.get(f"{known_bigsdb}/db") as response:
                response_json_databases = await response.json()
                for database_group in response_json_databases:
                    for database_info in database_group["databases"]:
                        if str(database_info["name"]).endswith("seqdef"):
                            known_seqdef_dbs[database_info["name"]] = known_bigsdb
        self._known_seqdef_dbs_origin = dict(known_seqdef_dbs)
        return self._known_seqdef_dbs_origin

    async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
        return (await self.get_known_seqdef_dbs())[seqdef_db_name]

    async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
        if self._seqdefdb_schemas[seqdef_db_name] is not None and not force:
            return self._seqdefdb_schemas[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional
        uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)}/{seqdef_db_name}/schemes"
        async with self._http_client.get(uri_path) as response:
            response_json = await response.json()
            schema_descriptions: Mapping[str, int] = dict()
            for scheme_definition in response_json["schemes"]:
                scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1])
                scheme_desc: str = scheme_definition["description"]
                schema_descriptions[scheme_desc] = scheme_id
            self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
            return self._seqdefdb_schemas[seqdef_db_name] # type: ignore

    async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BigSDBMLSTProfiler:
        return BigSDBMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)

    async def close(self):
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
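For orientation, a minimal usage sketch (not part of this commit) of the two new classes, mirroring what profile.py and the new tests do; the database name, schema ID, file path, and output name are example values taken from the tests.

import asyncio

from automlst.engine.local.fasta import read_multiple_fastas
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex


async def main():
    async with BIGSdbIndex() as bigsdb_index:
        # discover seqdef databases across the known BIGSdb APIs (Institut Pasteur and PubMLST)
        print(", ".join((await bigsdb_index.get_known_seqdef_dbs()).keys()))
        # build a profiler bound to one seqdef database and schema, then type FASTA sequences
        async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
            named_strings = read_multiple_fastas(["tests/resources/tohama_I_bpertussis.fasta"])
            profiles = profiler.profile_multiple_strings(named_strings)
            await write_mlst_profiles_as_csv(profiles, "bpertussis_profiles.csv")


asyncio.run(main())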
src/automlst/engine/remote/databases/institutpasteur/mlst.py (deleted)
@@ -1,69 +0,0 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
import re
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.genomics import NamedString
from automlst.engine.remote.databases.mlst import MLSTProfiler

class InstitutPasteurProfiler(MLSTProfiler):

    async def __aenter__(self):
        return self


    def __init__(self, database_name: str):
        self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))

    async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        uri_path = f"schemes/{schema_id}/sequence"
        response = await self._http_client.post(uri_path, json={
            "sequence": sequence_string
        })
        sequence_response: dict = await response.json()
        exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
        for allele_loci, alleles in exact_matches.items():
            for allele in alleles:
                alelle_id = allele["allele_id"]
                yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)

    async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
        uri_path = f"schemes/{schema_id}/designations"
        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
        async for allele in alleles:
            allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
        response = await self._http_client.post(uri_path, json={
            "designations": allele_request_dict
        })
        response_json = await response.json()
        schema_fields_returned = response_json["fields"]
        schema_exact_matches = response_json["exact_matches"]
        allele_map: dict[str, list[Allele]] = defaultdict(list)
        for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
            for exact_match_allele in exact_match_alleles:
                allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
        return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])

    async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
        alleles = self.fetch_mlst_allele_variants(schema_id, string)
        return await self.fetch_mlst_st(schema_id, alleles)

    async def get_scheme_ids(self) -> Mapping[str, int]:
        uri_path = "schemes"
        response = await self._http_client.get(uri_path)
        response_json = await response.json()
        schema_descriptions: Mapping[str, int] = dict()
        for scheme_definition in response_json["schemes"]:
            scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1])
            scheme_desc: str = scheme_definition["description"]
            schema_descriptions[scheme_desc] = scheme_id
        return schema_descriptions

    async def close(self):
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
src/automlst/engine/remote/databases/mlst.py (deleted)
@@ -1,33 +0,0 @@
from abc import abstractmethod
from contextlib import AbstractAsyncContextManager
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Union

from aiohttp import ClientSession

from automlst.engine.data.mlst import Allele, MLSTProfile

MLST_DATABASES = [
    "https://bigsdb.pasteur.fr/api/db",
    "https://rest.pubmlst.org/db"
]

class MLSTProfiler(AbstractAsyncContextManager):
    @abstractmethod
    def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
        pass

    @abstractmethod
    async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
        pass

    @abstractmethod
    async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
        pass

    @abstractmethod
    async def close(self):
        pass

    @abstractmethod
    async def get_scheme_ids(self) -> Mapping[str, int]:
        pass
src/automlst/engine/remote/databases/pubmlst/mlst.py (deleted)
@@ -1,68 +0,0 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
import re
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.genomics import NamedString
from automlst.engine.remote.databases.mlst import MLSTProfiler

class PubMLSTProfiler(MLSTProfiler):

    async def __aenter__(self):
        return self


    def __init__(self, database_name: str):
        self._base_url = f"https://rest.pubmlst.org/db/{database_name}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))

    async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
        uri_path = f"schemes/{schema_id}/sequence"
        response = await self._http_client.post(uri_path, json={
            "sequence": sequence_string
        })
        sequence_response: dict = await response.json()
        exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
        for allele_loci, alleles in exact_matches.items():
            for allele in alleles:
                alelle_id = allele["allele_id"]
                yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)

    async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
        uri_path = f"schemes/{schema_id}/designations"
        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
        async for allele in alleles:
            allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
        response = await self._http_client.post(uri_path, json={
            "designations": allele_request_dict
        })
        response_json = await response.json()
        schema_fields_returned = response_json["fields"]
        schema_exact_matches = response_json["exact_matches"]
        allele_map: dict[str, list[Allele]] = defaultdict(list)
        for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
            for exact_match_allele in exact_match_alleles:
                allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
        return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])

    async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
        alleles = self.fetch_mlst_allele_variants(schema_id, string)
        return await self.fetch_mlst_st(schema_id, alleles)

    async def get_scheme_ids(self) -> Mapping[str, int]:
        uri_path = "schemes"
        response = await self._http_client.get(uri_path)
        response_json = await response.json()
        schema_descriptions: Mapping[str, int] = dict()
        for scheme_definition in response_json["schemes"]:
            scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1])
            scheme_desc: str = scheme_definition["description"]
            schema_descriptions[scheme_desc] = scheme_id
        return schema_descriptions

    async def close(self):
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
@@ -1,8 +1,12 @@
 import os
 
-from automlst.engine.local.abif import read_abif
+from automlst.engine.local.abif import read_abif, reference_consensus_assembly
 
 async def test_load_sanger_sequence_has_data():
     assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1")
     result_data = await read_abif("tests/resources/1I1_F_P1815443_047.ab1")
-    assert result_data is not None
+    assert result_data is not None
+
+async def test_consensus_assembly_with_ncbi():
+    consensus = reference_consensus_assembly("ON685494.1", [await read_abif("tests/resources/1I1_F_P1815443_047.ab1"), await read_abif("tests/resources/1I1_R_P1815443_094.ab1")])
+    # TODO complete implementing this
@@ -1,54 +0,0 @@
from Bio import SeqIO
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler


async def test_profiling_results_in_exact_matches_when_exact():
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
        exact_matches = dummy_profiler.fetch_mlst_allele_variants(schema_id=3, sequence_string=sequence)
        targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
        async for exact_match in exact_matches:
            assert isinstance(exact_match, Allele)
            assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
            targets_left.remove(exact_match.allele_loci)

        assert len(targets_left) == 0

async def test_profiling_results_in_correct_st():
    async def dummy_allele_generator():
        dummy_alleles = [
            Allele("adk", "1"),
            Allele("fumC", "1"),
            Allele("glyA", "1"),
            Allele("tyrB", "1"),
            Allele("icd", "1"),
            Allele("pepA", "1"),
            Allele("pgm", "1"),
        ]
        for dummy_allele in dummy_alleles:
            yield dummy_allele
    async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
        mlst_st_data = await dummy_profiler.fetch_mlst_st(3, dummy_allele_generator())
        assert mlst_st_data is not None
        assert isinstance(mlst_st_data, MLSTProfile)
        assert mlst_st_data.clonal_complex == "ST-2 complex"
        assert mlst_st_data.sequence_type == "1"

async def test_sequence_profiling_is_correct():
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    dummy_alleles = [
        Allele("adk", "1"),
        Allele("fumC", "1"),
        Allele("glyA", "1"),
        Allele("tyrB", "1"),
        Allele("icd", "1"),
        Allele("pepA", "1"),
        Allele("pgm", "1"),
    ]
    async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
        profile = await dummy_profiler.profile_string(3, sequence)
        assert profile is not None
        assert isinstance(profile, MLSTProfile)
        assert profile.clonal_complex == "ST-2 complex"
        assert profile.sequence_type == "1"
@@ -1,53 +0,0 @@
import asyncio
from Bio import SeqIO
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.remote.databases.pubmlst.mlst import PubMLSTProfiler


async def test_profiling_results_in_exact_matches_when_exact():
    dummy_alleles = {
        Allele("adk", "1"),
        Allele("atpG", "1"),
        Allele("frdB", "1"),
        Allele("fucK", "1"),
        Allele("mdh", "1"),
        Allele("pgi", "1"),
        Allele("recA", "5"),
    }
    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
        exact_matches = dummy_profiler.fetch_mlst_allele_variants(schema_id=1, sequence_string=sequence)
        async for exact_match in exact_matches:
            assert isinstance(exact_match, Allele)
            dummy_alleles.remove(exact_match)

        assert len(dummy_alleles) == 0

async def test_profiling_results_in_correct_st():
    async def generate_dummy_targets():
        dummy_alleles = [
            Allele("adk", "1"),
            Allele("atpG", "1"),
            Allele("frdB", "1"),
            Allele("fucK", "1"),
            Allele("mdh", "1"),
            Allele("pgi", "1"),
            Allele("recA", "5"),
        ]
        for dummy_allele in dummy_alleles:
            yield dummy_allele
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
        mlst_st_data = await dummy_profiler.fetch_mlst_st(1, generate_dummy_targets())
        assert mlst_st_data is not None
        assert isinstance(mlst_st_data, MLSTProfile)
        assert mlst_st_data.clonal_complex == "ST-3 complex"
        assert mlst_st_data.sequence_type == "3"

async def test_sequence_profiling_is_correct():
    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
        profile = await dummy_profiler.profile_string(1, sequence)
        assert profile is not None
        assert isinstance(profile, MLSTProfile)
        assert profile.clonal_complex == "ST-3 complex"
        assert profile.sequence_type == "3"
tests/nsbdiagnosistoolkit/engine/remote/databases/test_bigsdb.py (new file, 115 lines)
@@ -0,0 +1,115 @@
from Bio import SeqIO
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler


async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
        targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
        async for exact_match in exact_matches:
            assert isinstance(exact_match, Allele)
            assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
            targets_left.remove(exact_match.allele_loci)

        assert len(targets_left) == 0

async def test_institutpasteur_profiling_results_in_correct_mlst_st():
    async def dummy_allele_generator():
        dummy_alleles = [
            Allele("adk", "1"),
            Allele("fumC", "1"),
            Allele("glyA", "1"),
            Allele("tyrB", "1"),
            Allele("icd", "1"),
            Allele("pepA", "1"),
            Allele("pgm", "1"),
        ]
        for dummy_allele in dummy_alleles:
            yield dummy_allele
    async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
        assert mlst_st_data is not None
        assert isinstance(mlst_st_data, MLSTProfile)
        assert mlst_st_data.clonal_complex == "ST-2 complex"
        assert mlst_st_data.sequence_type == "1"

async def test_institutpasteur_sequence_profiling_is_correct():
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        profile = await dummy_profiler.profile_string(sequence)
        assert profile is not None
        assert isinstance(profile, MLSTProfile)
        assert profile.clonal_complex == "ST-2 complex"
        assert profile.sequence_type == "1"


async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
    dummy_alleles = {
        Allele("adk", "1"),
        Allele("atpG", "1"),
        Allele("frdB", "1"),
        Allele("fucK", "1"),
        Allele("mdh", "1"),
        Allele("pgi", "1"),
        Allele("recA", "5"),
    }
    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
        async for exact_match in exact_matches:
            assert isinstance(exact_match, Allele)
            dummy_alleles.remove(exact_match)

        assert len(dummy_alleles) == 0

async def test_pubmlst_profiling_results_in_correct_st():
    async def generate_dummy_targets():
        dummy_alleles = [
            Allele("adk", "1"),
            Allele("atpG", "1"),
            Allele("frdB", "1"),
            Allele("fucK", "1"),
            Allele("mdh", "1"),
            Allele("pgi", "1"),
            Allele("recA", "5"),
        ]
        for dummy_allele in dummy_alleles:
            yield dummy_allele
    async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
        assert mlst_st_data is not None
        assert isinstance(mlst_st_data, MLSTProfile)
        assert mlst_st_data.clonal_complex == "ST-3 complex"
        assert mlst_st_data.sequence_type == "3"

async def test_pubmlst_sequence_profiling_is_correct():
    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        profile = await dummy_profiler.profile_string(sequence)
        assert profile is not None
        assert isinstance(profile, MLSTProfile)
        assert profile.clonal_complex == "ST-3 complex"
        assert profile.sequence_type == "3"

async def test_bigsdb_index_all_databases_is_not_empty():
    async with BIGSdbIndex() as bigsdb_index:
        assert len(await bigsdb_index.get_known_seqdef_dbs()) > 0

async def test_bigsdb_index_references_pubmlst_correctly():
    async with BIGSdbIndex() as bigsdb_index:
        assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")) == "https://rest.pubmlst.org"

async def test_bigsdb_index_references_institutpasteur_correctly():
    async with BIGSdbIndex() as bigsdb_index:
        assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api"


async def test_bigsdb_index_instantiates_correct_profiler():
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with BIGSdbIndex() as bigsdb_index:
        async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
            profile = await profiler.profile_string(sequence)
            assert profile.clonal_complex == "ST-2 complex"
            assert profile.sequence_type == "1"