Compare commits
27 Commits
| Author | SHA1 | Date |
|---|---|---|
| | cb22dfac9b | |
| | 7ea7ead46a | |
| | a3c864b565 | |
| | bad7dfc3a8 | |
| | 4fe0f0f287 | |
| | 3fd3ef9f20 | |
| | 206a105bf9 | |
| | ad082b8af6 | |
| | 5118a25b6a | |
| | cc6a7563ca | |
| | 484e31879e | |
| | 5449ae0c68 | |
| | e634647774 | |
| | f20a656f45 | |
| | 0c0a2c9d4c | |
| | 03fbbe542e | |
| | e60dba936c | |
| | 9589761ddd | |
| | 2843d0d592 | |
| | 7bd28db6d4 | |
| | 463e320386 | |
| | d4f890a150 | |
| | 022200f197 | |
| | e66525d341 | |
| | 1d531aff42 | |
| | 8febfad282 | |
| | 42bcfcf61d | |
.gitignore (vendored): 1 change

@@ -357,3 +357,4 @@ package
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)

output
*.private.*
Jenkinsfile (vendored): 37 changes

@@ -14,8 +14,9 @@ pipeline {
}
stage("unit tests") {
steps {
sh returnStatus: true, script: "python -m pytest --junitxml=test_results.xml"
sh returnStatus: true, script: "python -m pytest --junitxml=test_results.xml --cov=src --cov-report xml:coverage.xml"
xunit checksName: '', tools: [JUnit(excludesPattern: '', pattern: 'test_results.xml', stopProcessingIfError: true)]
recordCoverage(tools: [[parser: 'COBERTURA', pattern: 'coverage.xml']])
}
}
stage("build") {
@@ -23,26 +24,32 @@ pipeline {
sh "python -m build"
}
}
stage("test installation") {
steps {
sh "python -m pip install dist/*.whl --force-reinstall"
sh "automlst -h"
}
}
stage("archive") {
steps {
archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true
}
}
stage("publish") {
environment {
CREDS = credentials('4d6f64be-d26d-4f95-8de3-b6a9b0beb311')
}
when {
branch '**/main'
}
steps {
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/${CREDS_USR}/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
parallel {
stage ("git.reslate.systems") {
environment {
TOKEN = credentials('git.reslate.systems')
}
steps {
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*'
}
}
stage ("test.pypi.org") {
when {
tag '*.*'
}
environment {
TOKEN = credentials('test.pypi.org')
}
steps {
sh returnStatus: true, script: 'python -m twine upload -r testpypi -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*'
}
}
}
}
}
README.md: 13 changes

@@ -1,3 +1,12 @@
# FASTA-MLST
# autoMLST

A CLI tool for rapidly performing MLST typing via accessing pubMLST and InstitutPasteur MSLT databases.
A CLI/library for rapidly performing MLST typing via accessing pubMLST and InstitutPasteur MSLT databases.

# Components

## automlst.cli

The command line interface, sets up very minimal and mostly makes calls to the library. Uses argparse and is split into two parts:

- `automlst info`: Provides user information on available databases to pull from, and the schemas available.
- `automlst exactmatch`: Provides users the ability to request exact match results from a given database and schema
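For orientation, here are example invocations of the two subcommands, assembled from the argparse definitions that appear later in this compare view; `sample.fasta` and `results` are placeholder arguments, while the database name and schema ID are the values used by the new tests:

```
# List known BIGSdb seqdef databases, then the schema IDs available for one of them
automlst info --retrieve-bigsdbs
automlst info --retrieve-bigsdb-schemas pubmlst_bordetella_seqdef

# Exact-match MLST typing: FASTA file(s), seqdef database, schema ID, output CSV name
automlst exactmatch sample.fasta pubmlst_bordetella_seqdef 3 results
```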
@@ -3,7 +3,7 @@ requires = ["setuptools>=64", "setuptools_scm>=8"]
build-backend = "setuptools.build_meta"

[project]
name = "automlst"
name = "automlst.engine"
dynamic = ["version"]

dependencies = [
@@ -11,10 +11,7 @@ dependencies = [
"aiohttp[speedups]",
]
requires-python = ">=3.11"
description = "A tool to rapidly fetch fetch MLST profiles given sequences for various diseases."

[project.scripts]
automlst = "automlst.cli.program:run"
description = "A library to rapidly fetch fetch MLST profiles given sequences for various diseases."

[tool.setuptools_scm]
@@ -4,4 +4,5 @@ pytest
pytest-asyncio
build
twine
setuptools_scm
setuptools_scm
pytest-cov
@@ -1,48 +0,0 @@

from argparse import ArgumentParser
import asyncio
import datetime
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_multiple_fastas
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex


def setup_parser(parser: ArgumentParser):
    parser.description = "Returns MLST exact profile matches."
    parser.add_argument(
        "fastas",
        nargs="+",
        action='extend',
        default=[],
        type=str,
        help="The FASTA files to process. Multiple can be listed."
    )

    parser.add_argument(
        "seqdefdb",
        help="The BIGSdb seqdef database to use for typing."
    )

    parser.add_argument(
        "schema",
        type=int,
        help="The BIGSdb seqdef database schema ID (integer) to use for typing."
    )

    parser.add_argument(
        "out",
        default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
        help="The output CSV name (.csv will be appended)."
    )
    parser.set_defaults(func=run_asynchronously)

async def run(args):
    async with BIGSdbIndex() as bigsdb_index:
        gen_strings = read_multiple_fastas(args.fastas)
        async with await bigsdb_index.build_profiler_from_seqdefdb(args.seqdefdb, args.schema) as mlst_profiler:
            mlst_profiles = mlst_profiler.profile_multiple_strings(gen_strings)
            await write_mlst_profiles_as_csv(mlst_profiles, args.out)

def run_asynchronously(args):
    asyncio.run(run(args))
@@ -1,44 +0,0 @@
from argparse import ArgumentParser
import asyncio
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex


def setup_parser(parser: ArgumentParser):
    parser.description = "Fetches the latest BIGSdb MLST database definitions."
    parser.usage = "test"
    parser.add_argument(
        "--retrieve-bigsdbs", "-l",
        action="store_true",
        dest="list_dbs",
        required=False,
        default=False,
        help="Lists all known BIGSdb MLST databases (fetched from known APIs and cached)."
    )

    parser.add_argument(
        "--retrieve-bigsdb-schemas", "-lschemas",
        nargs="+",
        action="extend",
        dest="list_bigsdb_schemas",
        required=False,
        default=[],
        type=str,
        help="Lists the known schema IDs for a given BIGSdb sequence definition database name. The name, and then the ID of the schema is given."
    )

    parser.set_defaults(func=run_asynchronously)

async def run(args):
    async with BIGSdbIndex() as bigsdb_index:
        if args.list_dbs:
            known_seqdef_dbs = await bigsdb_index.get_known_seqdef_dbs(force=False)
            print("\n".join(known_seqdef_dbs.keys()))

        for bigsdb_schema_name in args.list_bigsdb_schemas:
            schemas = await bigsdb_index.get_schemas_for_seqdefdb(bigsdb_schema_name)
            for schema_desc, schema_id in schemas.items():
                print(f"{schema_desc}: {schema_id}")

def run_asynchronously(args):
    asyncio.run(run(args))
@@ -1,2 +0,0 @@
def get_module_base_name(name):
    return name.split(".")[-1]
@@ -1,27 +0,0 @@
import argparse
import asyncio
import datetime
from os import path
import os

from automlst.cli import exactmatch, info
from automlst.cli.meta import get_module_base_name
from automlst.engine.data.genomics import NamedString
from automlst.engine.local.abif import read_abif
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_fasta
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex

root_parser = argparse.ArgumentParser()
subparsers = root_parser.add_subparsers(required=True)

info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
exactmatch.setup_parser(subparsers.add_parser(get_module_base_name(exactmatch.__name__)))


def run():
    args = root_parser.parse_args()
    args.func(args)

if __name__ == "__main__":
    run()
@@ -3,23 +3,30 @@ from io import TextIOWrapper
from os import PathLike
from typing import AsyncIterable, Iterable, Mapping, Sequence, Union

from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.structures.mlst import Allele, MLSTProfile


def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
result_dict: dict[str, list[str]] = {}
result_dict: dict[str, Union[list[str], str]] = {}
for loci, alleles in alleles_map.items():
result_dict[loci] = list()
if len(alleles) == 1:
result_dict[loci] = alleles[0].allele_variant
for allele in alleles:
result_dict[loci].append(allele.allele_variant)
result_locis = list()
result_locis.append(allele.allele_variant)
result_dict[loci] = result_locis
return result_dict


async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, MLSTProfile]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
failed = list()
with open(handle, "w", newline='') as filehandle:
header = None
writer: Union[csv.DictWriter, None] = None
async for name, mlst_profile in mlst_profiles_iterable:
if mlst_profile is None:
failed.append(name)
continue
if writer is None:
header = ["id", "st", "clonal-complex", *mlst_profile.alleles.keys()]
writer = csv.DictWriter(filehandle, fieldnames=header)
@@ -30,4 +37,5 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
"id": name,
**dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
}
writer.writerow(rowdict=row_dictionary)
writer.writerow(rowdict=row_dictionary)
return failed
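A minimal sketch of how the reworked CSV writer could be driven, based on the new signature above. The module paths mirror the imports shown in this diff and are assumptions about the final package layout; `out.csv` and the sample data are placeholders:

```python
import asyncio

# Assumed module paths, following the imports shown in this diff; adjust to the real layout.
from automlst.engine.data.structures.mlst import Allele, MLSTProfile
from automlst.engine.local.csv import write_mlst_profiles_as_csv


async def demo() -> None:
    profile = MLSTProfile(
        alleles={"adk": [Allele("adk", "1", None)]},
        sequence_type="1",
        clonal_complex="ST-2 complex",
    )

    async def profiles():
        # (name, profile) pairs; a None profile marks a sample that failed to type.
        yield ("seq1", profile)
        yield ("bad_seq", None)

    failed = await write_mlst_profiles_as_csv(profiles(), "out.csv")
    print(failed)


asyncio.run(demo())
```

The returned sequence lists the names that could not be profiled, matching the `failed` list accumulated in the function above (here it would contain only "bad_seq").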
@@ -3,7 +3,7 @@ from io import TextIOWrapper
from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
from Bio import SeqIO

from automlst.engine.data.genomics import NamedString
from automlst.engine.data.structures.genomics import NamedString

async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
@@ -1,13 +0,0 @@
from dataclasses import dataclass
from typing import Mapping, Sequence

@dataclass(frozen=True)
class Allele:
    allele_loci: str
    allele_variant: str

@dataclass(frozen=True)
class MLSTProfile:
    alleles: Mapping[str, Sequence[Allele]]
    sequence_type: int
    clonal_complex: str
@@ -1,13 +1,15 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
from numbers import Number
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union

from aiohttp import ClientSession, ClientTimeout

from automlst.engine.data.genomics import NamedString
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.structures.genomics import NamedString
from automlst.engine.data.structures.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException

class BigSDBMLSTProfiler(AbstractAsyncContextManager):
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):

def __init__(self, database_api: str, database_name: str, schema_id: int):
self._database_name = database_name
@@ -18,56 +20,85 @@ class BigSDBMLSTProfiler(AbstractAsyncContextManager):
async def __aenter__(self):
return self

async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
"sequence": sequence_string,
"partial_matches": not exact
})
sequence_response: dict = await response.json()
if "exact_matches" not in sequence_response:
# TODO throw exception for not finding matches.
pass

if "exact_matches" not in sequence_response:
raise ValueError(f"Unable to find exact matches in \"{self._database_name}\" under schema ID \"{self._schema_id}\".")
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
if "exact_matches" in sequence_response:
# loci -> list of alleles with id and loci
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
elif "partial_matches" in sequence_response:
if exact:
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
for allele_loci, partial_match in partial_matches.items():
if len(partial_match) <= 0:
continue
partial_match_profile = PartialAllelicMatchProfile(
percent_identity=float(partial_match["identity"]),
mismatches=int(partial_match["mismatches"]),
bitscore=float(partial_match["bitscore"]),
gaps=int(partial_match["gaps"])
)
yield Allele(
allele_loci=allele_loci,
allele_variant=str(partial_match["allele"]),
partial_match_profile=partial_match_profile
)
else:
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)

async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:


async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
uri_path = "designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})

if isinstance(alleles, AsyncIterable):
async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
else:
for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
request_json = {
"designations": allele_request_dict
}
async with self._http_client.post(uri_path, json=request_json) as response:
response_json = await response.json()
if "fields" not in response_json:
# TODO raise exception about invalid parameters or no exact parameterization found
pass
schema_fields_returned = response_json["fields"]
schema_exact_matches: dict = response_json["exact_matches"]
response_json: dict = await response.json()
allele_map: dict[str, list[Allele]] = defaultdict(list)
response_json.setdefault("fields", dict())
schema_fields_returned: dict[str, str] = response_json["fields"]
schema_fields_returned.setdefault("ST", "unknown")
schema_fields_returned.setdefault("clonal_complex", "unknown")
schema_exact_matches: dict = response_json["exact_matches"]
for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
for exact_match_allele in exact_match_alleles:
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"], None))
if len(allele_map) == 0:
raise ValueError("Passed in no alleles.")
return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])

async def profile_string(self, string: str) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string)
async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string, exact)
return await self.fetch_mlst_st(alleles)


async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString]) -> AsyncGenerator[tuple[str, MLSTProfile], Any]:
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
async for named_string in namedStrings:
yield (named_string.name, await self.profile_string(named_string.sequence))

try:
yield (named_string.name, await self.profile_string(named_string.sequence, exact))
except NoBIGSdbMatchesException as e:
if stop_on_fail:
raise e
yield (named_string.name, None)

async def close(self):
await self._http_client.close()
@@ -107,7 +138,7 @@ class BIGSdbIndex(AbstractAsyncContextManager):
async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
known_databases = await self.get_known_seqdef_dbs()
if seqdef_db_name not in known_databases:
raise ValueError(f"The database \"{seqdef_db_name}\" could not be found.")
raise NoSuchBIGSdbDatabaseException(seqdef_db_name)
return known_databases[seqdef_db_name]

async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
@@ -124,8 +155,8 @@ class BIGSdbIndex(AbstractAsyncContextManager):
self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
return self._seqdefdb_schemas[seqdef_db_name] # type: ignore

async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BigSDBMLSTProfiler:
return BigSDBMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
return BIGSdbMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)

async def close(self):
await self._http_client.close()
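For context, a small usage sketch of the renamed profiler driven through the index. The import path is taken from the new test module in this compare view and may need adjusting; `sample.fasta` is a placeholder input, while the database name and schema ID match the ones used by the tests:

```python
import asyncio

from Bio import SeqIO

# Import path assumed from the new tests in this diff; adjust if the package layout differs.
from automlst.engine.data.remote.databases.bigsdb import BIGSdbIndex


async def demo() -> None:
    sequence = str(SeqIO.read("sample.fasta", "fasta").seq)  # placeholder FASTA
    async with BIGSdbIndex() as index:
        # Resolve which BIGSdb API hosts the seqdef database, then profile against schema 3 (MLST).
        async with await index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
            profile = await profiler.profile_string(sequence, exact=False)
            print(profile.sequence_type, profile.clonal_complex)


asyncio.run(demo())
```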
src/automlst/engine/data/structures/__init__.py (new empty file)

src/automlst/engine/data/structures/mlst.py (new file): 21 lines

@@ -0,0 +1,21 @@
from dataclasses import dataclass
from typing import Mapping, Sequence, Union

@dataclass(frozen=True)
class PartialAllelicMatchProfile:
    percent_identity: float
    mismatches: int
    bitscore: float
    gaps: int

@dataclass(frozen=True)
class Allele:
    allele_loci: str
    allele_variant: str
    partial_match_profile: Union[None, PartialAllelicMatchProfile]

@dataclass(frozen=True)
class MLSTProfile:
    alleles: Mapping[str, Sequence[Allele]]
    sequence_type: str
    clonal_complex: str
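A brief sketch of how the new frozen dataclasses compose; the field names come from the definitions above, while the numeric values and locus choices are illustrative only:

```python
from automlst.engine.data.structures.mlst import (
    Allele,
    MLSTProfile,
    PartialAllelicMatchProfile,
)

# Illustrative values; a real PartialAllelicMatchProfile comes from a BIGSdb partial-match response.
partial = PartialAllelicMatchProfile(percent_identity=98.7, mismatches=6, bitscore=512.0, gaps=1)

near_match = Allele(allele_loci="adk", allele_variant="2", partial_match_profile=partial)
exact_match = Allele(allele_loci="fumC", allele_variant="1", partial_match_profile=None)

profile = MLSTProfile(
    alleles={"adk": [near_match], "fumC": [exact_match]},
    sequence_type="1",
    clonal_complex="ST-2 complex",
)
```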
src/automlst/engine/exceptions/database.py (new file): 21 lines

@@ -0,0 +1,21 @@
from typing import Union

class BIGSDbDatabaseAPIException(Exception):
    pass


class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
    def __init__(self, database_name: str, database_schema_id: int, *args):
        super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)

class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException):
    def __init__(self, database_name: str, database_schema_id: int, *args):
        super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)

class NoSuchBIGSdbDatabaseException(BIGSDbDatabaseAPIException):
    def __init__(self, database_name: str, *args):
        super().__init__(f"No database \"{database_name}\" found.", *args)

class NoSuchBigSdbSchemaException(BIGSDbDatabaseAPIException):
    def __init__(self, database_name: str, database_schema_id: int, *args):
        super().__init__(f"No schema with ID {database_schema_id} in \"{database_name}\" found.", *args)
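A sketch of the intended handling pattern for the new exception hierarchy, modelled on how `profile_multiple_strings` uses it elsewhere in this diff; `profiler` is assumed to be a `BIGSdbMLSTProfiler` instance:

```python
from typing import Optional

from automlst.engine.data.structures.mlst import MLSTProfile
from automlst.engine.exceptions.database import NoBIGSdbMatchesException


async def profile_or_none(profiler, name: str, sequence: str) -> Optional[MLSTProfile]:
    """Return an MLST profile for `sequence`, or None when the database has no match."""
    try:
        return await profiler.profile_string(sequence, exact=True)
    except NoBIGSdbMatchesException:
        # NoBIGSdbExactMatchesException is a subclass, so both failure modes land here.
        print(f"No matches for {name}; skipping.")
        return None
```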
@@ -1,126 +0,0 @@
|
||||
import asyncio
|
||||
from numbers import Number
|
||||
from os import path
|
||||
from typing import Any, AsyncGenerator, Collection, Iterable, Sequence, Union
|
||||
from automlst.engine.data.genomics import NamedString, SangerTraceData
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
from Bio import SeqIO, Align
|
||||
|
||||
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
|
||||
|
||||
|
||||
def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
|
||||
with open(seq_path, "rb") as seq_handle:
|
||||
return SeqIO.read(seq_handle, "abi")
|
||||
|
||||
|
||||
async def read_abif(seq_path: str) -> SangerTraceData:
|
||||
ext = path.splitext(seq_path)[1]
|
||||
if ext.lower() != ".ab1" and ext.lower() != "abi":
|
||||
raise ValueError(
|
||||
'seq_path must have file extension of "ab1", or "abi".')
|
||||
biopython_seq = await asyncio.to_thread(_biopython_read_abif_sequence, seq_path)
|
||||
biopython_annotations = biopython_seq.annotations
|
||||
|
||||
# Lot of type ignoring since Biopython did not define their typing.
|
||||
biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore
|
||||
trace_data = SangerTraceData(
|
||||
path.basename(seq_path),
|
||||
biopython_seq.seq,
|
||||
biopython_abif_raw.get("APFN2"), # type: ignore
|
||||
biopython_abif_raw.get("APrN1"), # type: ignore
|
||||
biopython_abif_raw.get("APrV1"), # type: ignore
|
||||
biopython_abif_raw.get("APrX1"), # type: ignore
|
||||
biopython_abif_raw.get("APXV1"), # type: ignore
|
||||
biopython_abif_raw.get("CMNT1"), # type: ignore
|
||||
biopython_abif_raw.get("CpEP1"), # type: ignore
|
||||
biopython_abif_raw.get("CTID1"), # type: ignore
|
||||
biopython_abif_raw.get("CTNM1"), # type: ignore
|
||||
biopython_abif_raw.get("CTTL1"), # type: ignore
|
||||
biopython_abif_raw.get("DATA1"), # type: ignore
|
||||
biopython_abif_raw.get("DATA2"), # type: ignore
|
||||
biopython_abif_raw.get("DATA3"), # type: ignore
|
||||
biopython_abif_raw.get("DATA4"), # type: ignore
|
||||
biopython_abif_raw.get("DATA5"), # type: ignore
|
||||
biopython_abif_raw.get("DATA6"), # type: ignore
|
||||
biopython_abif_raw.get("DATA7"), # type: ignore
|
||||
biopython_abif_raw.get("DATA8"), # type: ignore
|
||||
biopython_abif_raw.get("DSam1"), # type: ignore
|
||||
biopython_abif_raw.get("DyeN1"), # type: ignore
|
||||
biopython_abif_raw.get("DyeN2"), # type: ignore
|
||||
biopython_abif_raw.get("DyeN3"), # type: ignore
|
||||
biopython_abif_raw.get("DyeN4"), # type: ignore
|
||||
biopython_abif_raw.get("DyeW1"), # type: ignore
|
||||
biopython_abif_raw.get("DyeW2"), # type: ignore
|
||||
biopython_abif_raw.get("DyeW3"), # type: ignore
|
||||
biopython_abif_raw.get("DyeW4"), # type: ignore
|
||||
biopython_abif_raw.get("DySN1"), # type: ignore
|
||||
biopython_abif_raw.get("EPVt1"), # type: ignore
|
||||
biopython_abif_raw.get("EVNT1"), # type: ignore
|
||||
biopython_abif_raw.get("EVNT2"), # type: ignore
|
||||
biopython_abif_raw.get("EVNT3"), # type: ignore
|
||||
biopython_abif_raw.get("EVNT4"), # type: ignore
|
||||
biopython_abif_raw.get("FWO_1"), # type: ignore
|
||||
biopython_abif_raw.get("GTyp1"), # type: ignore
|
||||
biopython_abif_raw.get("InSc1"), # type: ignore
|
||||
biopython_abif_raw.get("InVt1"), # type: ignore
|
||||
biopython_abif_raw.get("LANE1"), # type: ignore
|
||||
biopython_abif_raw.get("LIMS1"), # type: ignore
|
||||
biopython_abif_raw.get("LNTD1"), # type: ignore
|
||||
biopython_abif_raw.get("LsrP1"), # type: ignore
|
||||
biopython_abif_raw.get("MCHN1"), # type: ignore
|
||||
biopython_abif_raw.get("MODF1"), # type: ignore
|
||||
biopython_abif_raw.get("MODL1"), # type: ignore
|
||||
biopython_abif_raw.get("NAVG1"), # type: ignore
|
||||
biopython_abif_raw.get("NLNE1"), # type: ignore
|
||||
biopython_abif_raw.get("OfSc1"), # type: ignore
|
||||
biopython_abif_raw.get("PDMF1"), # type: ignore
|
||||
biopython_abif_raw.get("PXLB1"), # type: ignore
|
||||
biopython_abif_raw.get("RGCm1"), # type: ignore
|
||||
biopython_abif_raw.get("RGNm1"), # type: ignore
|
||||
biopython_abif_raw.get("RMdV1"), # type: ignore
|
||||
biopython_abif_raw.get("RMdX1"), # type: ignore
|
||||
biopython_abif_raw.get("RMXV1"), # type: ignore
|
||||
biopython_abif_raw.get("RPrN1"), # type: ignore
|
||||
biopython_abif_raw.get("RPrV1"), # type: ignore
|
||||
biopython_abif_raw.get("RUND1"), # type: ignore
|
||||
biopython_abif_raw.get("RUND2"), # type: ignore
|
||||
biopython_abif_raw.get("RUND3"), # type: ignore
|
||||
biopython_abif_raw.get("RUND4"), # type: ignore
|
||||
biopython_abif_raw.get("RunN1"), # type: ignore
|
||||
biopython_abif_raw.get("RUNT1"), # type: ignore
|
||||
biopython_abif_raw.get("RUNT2"), # type: ignore
|
||||
biopython_abif_raw.get("RUNT3"), # type: ignore
|
||||
biopython_abif_raw.get("RUNT4"), # type: ignore
|
||||
biopython_abif_raw.get("Satd"), # type: ignore
|
||||
biopython_abif_raw.get("Scal1"), # type: ignore
|
||||
biopython_abif_raw.get("SCAN1"), # type: ignore
|
||||
biopython_abif_raw.get("SMED1"), # type: ignore
|
||||
biopython_abif_raw.get("SMLt"), # type: ignore
|
||||
biopython_abif_raw.get("SMPL1"), # type: ignore
|
||||
biopython_abif_raw.get("SVER1"), # type: ignore
|
||||
biopython_abif_raw.get("SVER3"), # type: ignore
|
||||
biopython_abif_raw.get("Tmpr1"), # type: ignore
|
||||
biopython_abif_raw.get("TUBE"), # type: ignore
|
||||
biopython_abif_raw.get("User") # type: ignore
|
||||
)
|
||||
return trace_data
|
||||
|
||||
|
||||
def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedString) -> tuple[NamedString, NamedString]:
|
||||
aligner = Align.PairwiseAligner(scoring="blastn")
|
||||
aligner.mode = "local"
|
||||
alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[
|
||||
0] # take the best alignment
|
||||
# TODO actually assemble the consensus sequence here
|
||||
raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
|
||||
|
||||
|
||||
async def reference_consensus_assembly(reference: Union[NamedString, str], sanger_traces: Iterable[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
|
||||
if isinstance(reference, str):
|
||||
reference_seq = NamedString(name=reference, sequence=(await fetch_ncbi_genbank(reference)).sequence)
|
||||
else:
|
||||
reference_seq: NamedString = reference
|
||||
for sanger_trace in sanger_traces:
|
||||
yield NamedString("NA", "NA")
|
||||
raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
|
@@ -1,27 +0,0 @@
import asyncio
from Bio import Entrez
from Bio import SeqIO

# TODO Change this out for a more professional approach
Entrez.email = "yunyangdeng@outlook.com"

from automlst.engine.data.genomics import AnnotatedString, StringAnnotation


async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
        record = SeqIO.read(fetch_stream, "genbank")
        sequence_features = list()
        for feature in record.features:
            start = int(feature.location.start)
            end = int(feature.location.end)
            qualifiers = feature.qualifiers
            for qualifier_key in qualifiers:
                qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
            sequence_features.append(StringAnnotation(
                type=feature.type,
                start=start,
                end=end+1, # Position is exclusive
                feature_properties=qualifiers
            ))
        return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
@@ -1,4 +1,4 @@
from automlst.engine.local.fasta import read_fasta
from automlst.engine.data.local.fasta import read_fasta


async def test_fasta_reader_not_none():
tests/automlst/engine/data/remote/databases/test_bigsdb.py (new file): 244 lines
@@ -0,0 +1,244 @@
|
||||
import random
|
||||
import re
|
||||
from typing import Collection, Sequence, Union
|
||||
from Bio import SeqIO
|
||||
import pytest
|
||||
from automlst.engine.data.structures.genomics import NamedString
|
||||
from automlst.engine.data.structures.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
|
||||
from automlst.engine.data.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
|
||||
|
||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
|
||||
rand = random.Random(gene)
|
||||
if isinstance(mutation_site_count, float):
|
||||
mutation_site_count = int(mutation_site_count * len(gene))
|
||||
random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
|
||||
scrambled = list(gene)
|
||||
for random_location in random_locations:
|
||||
scrambled[random_location] = rand.choice(alphabet)
|
||||
return "".join(scrambled)
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
|
||||
async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True):
|
||||
assert isinstance(exact_match, Allele)
|
||||
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
|
||||
targets_left.remove(exact_match.allele_loci)
|
||||
|
||||
assert len(targets_left) == 0
|
||||
|
||||
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
|
||||
sequences = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta"))
|
||||
mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
|
||||
for sequence in sequences:
|
||||
match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description)
|
||||
if match is None:
|
||||
continue
|
||||
gene = match.group(1)
|
||||
if gene.lower() not in mlst_targets:
|
||||
continue
|
||||
scrambled = gene_scrambler(str(sequence.seq), 0.125)
|
||||
async for partial_match in profiler.fetch_mlst_allele_variants(scrambled, False):
|
||||
assert partial_match.partial_match_profile is not None
|
||||
mlst_targets.remove(gene.lower())
|
||||
|
||||
assert len(mlst_targets) == 0
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
|
||||
async def dummy_allele_generator():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1", None),
|
||||
Allele("fumC", "1", None),
|
||||
Allele("glyA", "1", None),
|
||||
Allele("tyrB", "1", None),
|
||||
Allele("icd", "1", None),
|
||||
Allele("pepA", "1", None),
|
||||
Allele("pgm", "1", None),
|
||||
]
|
||||
for dummy_allele in dummy_alleles:
|
||||
yield dummy_allele
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
|
||||
assert mlst_st_data is not None
|
||||
assert isinstance(mlst_st_data, MLSTProfile)
|
||||
assert mlst_st_data.clonal_complex == "ST-2 complex"
|
||||
assert mlst_st_data.sequence_type == "1"
|
||||
|
||||
async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1", None),
|
||||
Allele("fumC", "2", None),
|
||||
Allele("glyA", "36", None),
|
||||
Allele("tyrB", "4", None),
|
||||
Allele("icd", "4", None),
|
||||
Allele("pepA", "1", None),
|
||||
Allele("pgm", "5", None),
|
||||
]
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
mlst_profile = await dummy_profiler.fetch_mlst_st(dummy_alleles)
|
||||
assert mlst_profile.clonal_complex == "unknown"
|
||||
assert mlst_profile.sequence_type == "unknown"
|
||||
|
||||
|
||||
async def test_institutpasteur_sequence_profiling_is_correct():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
profile = await dummy_profiler.profile_string(sequence)
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
|
||||
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
|
||||
dummy_alleles = {
|
||||
Allele("adk", "1", None),
|
||||
Allele("atpG", "1", None),
|
||||
Allele("frdB", "1", None),
|
||||
Allele("fucK", "1", None),
|
||||
Allele("mdh", "1", None),
|
||||
Allele("pgi", "1", None),
|
||||
Allele("recA", "5", None),
|
||||
}
|
||||
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True)
|
||||
async for exact_match in exact_matches:
|
||||
assert isinstance(exact_match, Allele)
|
||||
dummy_alleles.remove(exact_match)
|
||||
|
||||
assert len(dummy_alleles) == 0
|
||||
|
||||
async def test_pubmlst_profiling_results_in_correct_st():
|
||||
async def generate_dummy_targets():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1", None),
|
||||
Allele("atpG", "1", None),
|
||||
Allele("frdB", "1", None),
|
||||
Allele("fucK", "1", None),
|
||||
Allele("mdh", "1", None),
|
||||
Allele("pgi", "1", None),
|
||||
Allele("recA", "5", None),
|
||||
]
|
||||
for dummy_allele in dummy_alleles:
|
||||
yield dummy_allele
|
||||
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
|
||||
assert mlst_st_data is not None
|
||||
assert isinstance(mlst_st_data, MLSTProfile)
|
||||
assert mlst_st_data.clonal_complex == "ST-3 complex"
|
||||
assert mlst_st_data.sequence_type == "3"
|
||||
|
||||
async def test_pubmlst_sequence_profiling_is_correct():
|
||||
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
profile = await dummy_profiler.profile_string(sequence)
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-3 complex"
|
||||
assert profile.sequence_type == "3"
|
||||
|
||||
async def test_bigsdb_index_all_databases_is_not_empty():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert len(await bigsdb_index.get_known_seqdef_dbs()) > 0
|
||||
|
||||
async def test_bigsdb_index_references_pubmlst_correctly():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")) == "https://rest.pubmlst.org"
|
||||
|
||||
async def test_bigsdb_index_references_institutpasteur_correctly():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api"
|
||||
|
||||
|
||||
async def test_bigsdb_index_instantiates_correct_profiler():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
|
||||
profile = await profiler.profile_string(sequence)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_same_string_twice():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
dummy_sequences = [NamedString("seq1", sequence), NamedString("seq2", sequence)]
|
||||
async def generate_async_iterable_sequences():
|
||||
for dummy_sequence in dummy_sequences:
|
||||
yield dummy_sequence
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences()):
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop():
|
||||
valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
|
||||
async def generate_async_iterable_sequences():
|
||||
for dummy_sequence in dummy_sequences:
|
||||
yield dummy_sequence
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), True):
|
||||
if name == "should_fail":
|
||||
assert profile is None
|
||||
else:
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop():
|
||||
valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
|
||||
async def generate_async_iterable_sequences():
|
||||
for dummy_sequence in dummy_sequences:
|
||||
yield dummy_sequence
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), False):
|
||||
if name == "should_fail":
|
||||
assert profile is not None
|
||||
assert profile.clonal_complex == "unknown"
|
||||
assert profile.sequence_type == "unknown"
|
||||
assert len(profile.alleles) > 0
|
||||
else:
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_fail_second_stop():
|
||||
valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
invalid_seq = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", invalid_seq), NamedString("seq3", valid_seq)]
|
||||
async def generate_async_iterable_sequences():
|
||||
for dummy_sequence in dummy_sequences:
|
||||
yield dummy_sequence
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
with pytest.raises(NoBIGSdbMatchesException):
|
||||
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), exact=True, stop_on_fail=True):
|
||||
if name == "should_fail":
|
||||
pytest.fail("Exception should have been thrown, no exception was thrown.")
|
||||
else:
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_index_get_schemas_for_bordetella():
|
||||
async with BIGSdbIndex() as index:
|
||||
schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
|
||||
assert len(schemas.keys()) > 0
|
||||
assert "MLST" in schemas
|
||||
assert isinstance(schemas["MLST"], int)
|
||||
|
||||
async def test_bigsdb_index_get_databases_has_only_seqdef():
|
||||
async with BIGSdbIndex() as index:
|
||||
databases = await index.get_known_seqdef_dbs()
|
||||
assert len(databases.keys()) > 0
|
||||
for database_name in databases.keys():
|
||||
assert database_name.endswith("seqdef")
|
||||
assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
|
@@ -1,12 +0,0 @@
import os

from automlst.engine.local.abif import read_abif, reference_consensus_assembly

async def test_load_sanger_sequence_has_data():
    assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1")
    result_data = await read_abif("tests/resources/1I1_F_P1815443_047.ab1")
    assert result_data is not None

async def test_consensus_assembly_with_ncbi():
    consensus = reference_consensus_assembly("ON685494.1", [await read_abif("tests/resources/1I1_F_P1815443_047.ab1"), await read_abif("tests/resources/1I1_R_P1815443_094.ab1")])
    # TODO complete implementing this
@@ -1,5 +0,0 @@
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank


async def test_fetch_ncbi_genbank_with_id_works():
    assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0
@@ -1,115 +0,0 @@
|
||||
from Bio import SeqIO
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler
|
||||
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
||||
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
|
||||
async for exact_match in exact_matches:
|
||||
assert isinstance(exact_match, Allele)
|
||||
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
|
||||
targets_left.remove(exact_match.allele_loci)
|
||||
|
||||
assert len(targets_left) == 0
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
|
||||
async def dummy_allele_generator():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1"),
|
||||
Allele("fumC", "1"),
|
||||
Allele("glyA", "1"),
|
||||
Allele("tyrB", "1"),
|
||||
Allele("icd", "1"),
|
||||
Allele("pepA", "1"),
|
||||
Allele("pgm", "1"),
|
||||
]
|
||||
for dummy_allele in dummy_alleles:
|
||||
yield dummy_allele
|
||||
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
|
||||
assert mlst_st_data is not None
|
||||
assert isinstance(mlst_st_data, MLSTProfile)
|
||||
assert mlst_st_data.clonal_complex == "ST-2 complex"
|
||||
assert mlst_st_data.sequence_type == "1"
|
||||
|
||||
async def test_institutpasteur_sequence_profiling_is_correct():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
profile = await dummy_profiler.profile_string(sequence)
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
|
||||
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
|
||||
dummy_alleles = {
|
||||
Allele("adk", "1"),
|
||||
Allele("atpG", "1"),
|
||||
Allele("frdB", "1"),
|
||||
Allele("fucK", "1"),
|
||||
Allele("mdh", "1"),
|
||||
Allele("pgi", "1"),
|
||||
Allele("recA", "5"),
|
||||
}
|
||||
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
||||
async for exact_match in exact_matches:
|
||||
assert isinstance(exact_match, Allele)
|
||||
dummy_alleles.remove(exact_match)
|
||||
|
||||
assert len(dummy_alleles) == 0
|
||||
|
||||
async def test_pubmlst_profiling_results_in_correct_st():
|
||||
async def generate_dummy_targets():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1"),
|
||||
Allele("atpG", "1"),
|
||||
Allele("frdB", "1"),
|
||||
Allele("fucK", "1"),
|
||||
Allele("mdh", "1"),
|
||||
Allele("pgi", "1"),
|
||||
Allele("recA", "5"),
|
||||
]
|
||||
for dummy_allele in dummy_alleles:
|
||||
yield dummy_allele
|
||||
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
|
||||
assert mlst_st_data is not None
|
||||
assert isinstance(mlst_st_data, MLSTProfile)
|
||||
assert mlst_st_data.clonal_complex == "ST-3 complex"
|
||||
assert mlst_st_data.sequence_type == "3"
|
||||
|
||||
async def test_pubmlst_sequence_profiling_is_correct():
|
||||
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
profile = await dummy_profiler.profile_string(sequence)
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-3 complex"
|
||||
assert profile.sequence_type == "3"
|
||||
|
||||
async def test_bigsdb_index_all_databases_is_not_empty():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert len(await bigsdb_index.get_known_seqdef_dbs()) > 0
|
||||
|
||||
async def test_bigsdb_index_references_pubmlst_correctly():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")) == "https://rest.pubmlst.org"
|
||||
|
||||
async def test_bigsdb_index_references_institutpasteur_correctly():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api"
|
||||
|
||||
|
||||
async def test_bigsdb_index_instantiates_correct_profiler():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
|
||||
profile = await profiler.profile_string(sequence)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
tests/resources/12822.fasta (new file): 68196 lines — file diff suppressed because it is too large.

tests/resources/tohama_I_bpertussis_coding.fasta (new file): 59120 lines — file diff suppressed because it is too large.