Updated URL links

Changed FASTA reading to group based on file for merging partial targets
Merge branch 'features/improved-oop-architecture' into features/non-exact-notation
2025-02-14 20:37:13 +00:00 · 2025-02-14 14:35:53 +00:00 · 2025-02-12 17:53:25 +00:00 · 2025-02-12 17:52:53 +00:00 · 2025-02-12 17:46:55 +00:00 · 2025-02-12 17:43:26 +00:00
26 changed files with 56560 additions and 24128 deletions
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # autoBIGS.Engine

-A python library implementing common BIGSdb MLST schemes and databases. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
+A Python library implementing access to common BIGSdb MLST schemes and databases for the purpose of typing sequences automatically. The implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.

 ## Features

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,8 +15,9 @@ requires-python = ">=3.12"
 description = "A library to rapidly fetch fetch MLST profiles given sequences for various diseases."

 [project.urls]
-Repository = "https://github.com/RealYHD/autoBIGS.engine"
-Issues = "https://github.com/RealYHD/autoBIGS.engine/issues"
+Homepage = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
+Source = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
+Issues = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine/issues"

 [tool.setuptools_scm]

--- a/src/autobigs/engine/analysis/init.py
+++ b/src/autobigs/engine/analysis/init.py
--- a/src/autobigs/engine/analysis/bigsdb.py
+++ b/src/autobigs/engine/analysis/bigsdb.py
@@ -0,0 +1,202 @@
+from abc import abstractmethod
+import asyncio
+from collections import defaultdict
+from contextlib import AbstractAsyncContextManager
+import csv
+from os import path
+import os
+import shutil
+import tempfile
+from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequence, Set, Union
+
+from aiohttp import ClientSession, ClientTimeout
+
+from autobigs.engine.reading import read_fasta
+from autobigs.engine.structures.alignment import PairwiseAlignment
+from autobigs.engine.structures.genomics import NamedString
+from autobigs.engine.structures.mlst import Allele, NamedMLSTProfile, AlignmentStats, MLSTProfile
+from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
+
+from Bio.Align import PairwiseAligner
+
+class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
+
+    @abstractmethod
+    def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
+        """Yield the alleles matched for the given query sequence strings."""
+
+    @abstractmethod
+    async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
+        """Resolve the given alleles into an MLSTProfile."""
+
+    @abstractmethod
+    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
+        """Produce one MLSTProfile from the given query sequence strings."""
+
+    @abstractmethod
+    def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
+        """Yield a NamedMLSTProfile for the named strings in each group."""
+
+    @abstractmethod
+    async def close(self):
+        """Release any resources held by the profiler."""
+
+class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
+    """MLST profiler that queries one scheme of a remote BIGSdb database over its REST API."""
+
+    def __init__(self, database_api: str, database_name: str, schema_id: int):
+        self._database_name = database_name
+        self._schema_id = schema_id
+        self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
+        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60))
+
+    async def __aenter__(self):
+        return self
+
+    async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[str], str]) -> AsyncGenerator[Allele, Any]:
+        """Yield the alleles the scheme's "sequence" endpoint reports for each query sequence.
+
+        Exact matches win; partial matches (carrying AlignmentStats) are yielded only when a
+        response has none. Raises NoBIGSdbMatchesException if a response has neither.
+        """
+        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
+        uri_path = "sequence"
+        if isinstance(query_sequence_strings, str):
+            query_sequence_strings = [query_sequence_strings]
+        for sequence_string in query_sequence_strings:
+            async with self._http_client.post(uri_path, json={"sequence": sequence_string, "partial_matches": True}) as response:
+                sequence_response: dict = await response.json()
+                if "exact_matches" in sequence_response:
+                    # loci -> list of alleles with id and loci
+                    exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
+                    for allele_loci, alleles in exact_matches.items():
+                        for allele in alleles:
+                            yield Allele(allele_locus=allele_loci, allele_variant=allele["allele_id"], partial_match_profile=None)
+                elif "partial_matches" in sequence_response:
+                    partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
+                    for allele_loci, partial_match in partial_matches.items():
+                        if not partial_match:
+                            continue
+                        partial_match_profile = AlignmentStats(
+                            percent_identity=float(partial_match["identity"]),
+                            mismatches=int(partial_match["mismatches"]),
+                            gaps=int(partial_match["gaps"]),
+                            match_metric=int(partial_match["bitscore"])
+                        )
+                        yield Allele(allele_locus=allele_loci, allele_variant=str(partial_match["allele"]), partial_match_profile=partial_match_profile)
+                else:
+                    raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
+
+    async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
+        """Resolve alleles into an MLSTProfile through the scheme's "designations" endpoint.
+
+        ST and clonal complex default to "unknown" when the response omits them. Raises
+        ValueError if a locus does not resolve to exactly one allele or nothing matched.
+        """
+        uri_path = "designations"
+        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
+        if isinstance(alleles, AsyncIterable):
+            alleles = [allele async for allele in alleles]
+        for allele in alleles:
+            allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
+        async with self._http_client.post(uri_path, json={"designations": allele_request_dict}) as response:
+            response_json: dict = await response.json()
+            allele_set: Set[Allele] = set()
+            response_json.setdefault("fields", dict())
+            schema_fields_returned: dict[str, str] = response_json["fields"]
+            schema_fields_returned.setdefault("ST", "unknown")
+            schema_fields_returned.setdefault("clonal_complex", "unknown")
+            # A response without "exact_matches" must reach the ValueError below, not a KeyError.
+            schema_exact_matches: dict = response_json.get("exact_matches", dict())
+            for exact_match_locus, exact_match_alleles in schema_exact_matches.items():
+                if len(exact_match_alleles) != 1:
+                    raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(exact_match_alleles)})")
+                allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None))
+            if len(allele_set) == 0:
+                raise ValueError("Passed in no alleles.")
+            return MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
+
+    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
+        """Match the query sequences to alleles and resolve those alleles into one MLSTProfile."""
+        alleles = self.determine_mlst_allele_variants(query_sequence_strings)
+        return await self.determine_mlst_st(alleles)
+
+    async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
+        """Yield one NamedMLSTProfile per named string; a failed match yields a None profile unless stop_on_fail."""
+        async for named_strings in query_named_string_groups:
+            for named_string in named_strings:
+                try:
+                    yield NamedMLSTProfile(named_string.name, (await self.profile_string([named_string.sequence])))
+                except NoBIGSdbMatchesException:
+                    if stop_on_fail:
+                        raise  # bare raise keeps the original traceback intact
+                    yield NamedMLSTProfile(named_string.name, None)
+
+    async def close(self):
+        await self._http_client.close()
+
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        await self.close()
+
+class BIGSdbIndex(AbstractAsyncContextManager):
+    """Looks up seqdef databases and their schemes across the known BIGSdb REST APIs."""
+    KNOWN_BIGSDB_APIS = {"https://bigsdb.pasteur.fr/api", "https://rest.pubmlst.org"}
+
+    def __init__(self):
+        self._http_client = ClientSession()
+        self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
+        self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
+        super().__init__()
+
+    async def __aenter__(self):
+        return self
+
+    async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
+        """Map every "*seqdef" database name to the API hosting it; cached unless force is True."""
+        if self._known_seqdef_dbs_origin is not None and not force:
+            return self._known_seqdef_dbs_origin
+        known_seqdef_dbs: dict[str, str] = dict()
+        # Set order follows string hashing; sorting makes a name listed by several APIs resolve the same way every run.
+        for known_bigsdb in sorted(BIGSdbIndex.KNOWN_BIGSDB_APIS):
+            async with self._http_client.get(f"{known_bigsdb}/db") as response:
+                for database_group in await response.json():
+                    for database_info in database_group["databases"]:
+                        if str(database_info["name"]).endswith("seqdef"):
+                            known_seqdef_dbs[database_info["name"]] = known_bigsdb
+        self._known_seqdef_dbs_origin = known_seqdef_dbs
+        return self._known_seqdef_dbs_origin
+
+    async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
+        """Return the API URL hosting seqdef_db_name, or raise NoSuchBIGSdbDatabaseException."""
+        known_databases = await self.get_known_seqdef_dbs()
+        if seqdef_db_name not in known_databases:
+            raise NoSuchBIGSdbDatabaseException(seqdef_db_name)
+        return known_databases[seqdef_db_name]
+
+    async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
+        """Map scheme descriptions to ids (last "/"-separated segment of each "scheme" value); cached unless force."""
+        if seqdef_db_name in self._seqdefdb_schemas and not force:
+            return self._seqdefdb_schemas[seqdef_db_name]  # type: ignore  # only dicts are ever stored
+        uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)}/db/{seqdef_db_name}/schemes"
+        async with self._http_client.get(uri_path) as response:
+            schema_descriptions: dict[str, int] = {
+                scheme_definition["description"]: int(str(scheme_definition["scheme"]).split("/")[-1])
+                for scheme_definition in (await response.json())["schemes"]
+            }
+        self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
+        return schema_descriptions
+
+    async def build_profiler_from_seqdefdb(self, local: bool, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
+        """Create a profiler for the given scheme of dbseqdef_name, resolving the hosting API first."""
+        return get_BIGSdb_MLST_profiler(local, await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
+
+    async def close(self):
+        await self._http_client.close()
+
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        await self.close()
+
+def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
+    if local:
+        raise NotImplementedError()  # no local profiler is implemented; only the remote one below
+    return RemoteBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, schema_id=schema_id)
--- a/src/autobigs/engine/data/local/fasta.py
+++ b/src/autobigs/engine/data/local/fasta.py
@@ -1,16 +0,0 @@
-import asyncio
-from io import TextIOWrapper
-from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
-from Bio import SeqIO
-
-from autobigs.engine.data.structures.genomics import NamedString
-
-async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
-    fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
-    for fasta_sequence in await fasta_sequences:
-        yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
-
-async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
-    for handle in handles:
-        async for named_seq in read_fasta(handle):
-            yield named_seq
--- a/src/autobigs/engine/data/remote/databases/bigsdb.py
+++ b/src/autobigs/engine/data/remote/databases/bigsdb.py
@@ -1,166 +0,0 @@
-from collections import defaultdict
-from contextlib import AbstractAsyncContextManager
-from numbers import Number
-from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
-
-from aiohttp import ClientSession, ClientTimeout
-
-from autobigs.engine.data.structures.genomics import NamedString
-from autobigs.engine.data.structures.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
-from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
-
-class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
-
-    def __init__(self, database_api: str, database_name: str, schema_id: int):
-        self._database_name = database_name
-        self._schema_id = schema_id
-        self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
-        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
-
-    async def __aenter__(self):
-        return self
-
-    async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
-        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
-        uri_path = "sequence"
-        response = await self._http_client.post(uri_path, json={
-            "sequence": sequence_string,
-            "partial_matches": not exact
-        })
-        sequence_response: dict = await response.json()
-
-        if "exact_matches" in sequence_response:
-            # loci -> list of alleles with id and loci
-            exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]  
-            for allele_loci, alleles in exact_matches.items():
-                for allele in alleles:
-                    alelle_id = allele["allele_id"]
-                    yield Allele(allele_loci=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
-        elif "partial_matches" in sequence_response:
-            if exact:
-                raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
-            partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"] 
-            for allele_loci, partial_match in partial_matches.items():
-                if len(partial_match) <= 0:
-                    continue
-                partial_match_profile = PartialAllelicMatchProfile(
-                    percent_identity=float(partial_match["identity"]),
-                    mismatches=int(partial_match["mismatches"]),
-                    bitscore=float(partial_match["bitscore"]),
-                    gaps=int(partial_match["gaps"])
-                )
-                yield Allele(
-                    allele_loci=allele_loci,
-                    allele_variant=str(partial_match["allele"]),
-                    partial_match_profile=partial_match_profile
-                )
-        else:
-            raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
-
-
-
-    async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
-        uri_path = "designations"
-        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
-        if isinstance(alleles, AsyncIterable):
-            async for allele in alleles:
-                allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
-        else:
-            for allele in alleles:
-                allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
-        request_json = {
-            "designations": allele_request_dict
-        }
-        async with self._http_client.post(uri_path, json=request_json) as response:
-            response_json: dict = await response.json()
-            allele_map: dict[str, list[Allele]] = defaultdict(list)
-            response_json.setdefault("fields", dict())
-            schema_fields_returned: dict[str, str] = response_json["fields"]
-            schema_fields_returned.setdefault("ST", "unknown")
-            schema_fields_returned.setdefault("clonal_complex", "unknown")
-            schema_exact_matches: dict = response_json["exact_matches"]
-            for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
-                for exact_match_allele in exact_match_alleles:
-                    allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"], None))
-            if len(allele_map) == 0:
-                raise ValueError("Passed in no alleles.")
-            return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
-
-    async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
-        alleles = self.fetch_mlst_allele_variants(string, exact)
-        return await self.fetch_mlst_st(alleles)
-
-
-    async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
-        async for named_string in namedStrings:
-            try:
-                yield (named_string.name, await self.profile_string(named_string.sequence, exact))
-            except NoBIGSdbMatchesException as e:
-                if stop_on_fail:
-                    raise e
-                yield (named_string.name, None)
-
-    async def close(self):
-        await self._http_client.close()
-
-    async def __aexit__(self, exc_type, exc_value, traceback):
-        await self.close()
-
-class BIGSdbIndex(AbstractAsyncContextManager):
-    KNOWN_BIGSDB_APIS = {
-        "https://bigsdb.pasteur.fr/api",
-        "https://rest.pubmlst.org"
-    }
-
-    def __init__(self):
-        self._http_client = ClientSession()
-        self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
-        self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
-        super().__init__()
-
-    async def __aenter__(self):
-        return self
-    
-    async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
-        if self._known_seqdef_dbs_origin is not None and not force:
-            return self._known_seqdef_dbs_origin
-        known_seqdef_dbs = dict()
-        for known_bigsdb in BIGSdbIndex.KNOWN_BIGSDB_APIS:
-            async with self._http_client.get(f"{known_bigsdb}/db") as response:
-                response_json_databases = await response.json()
-                for database_group in response_json_databases:
-                    for database_info in database_group["databases"]:
-                        if str(database_info["name"]).endswith("seqdef"):
-                            known_seqdef_dbs[database_info["name"]] = known_bigsdb
-        self._known_seqdef_dbs_origin = dict(known_seqdef_dbs)
-        return self._known_seqdef_dbs_origin
-
-    async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
-        known_databases = await self.get_known_seqdef_dbs()
-        if seqdef_db_name not in known_databases:
-            raise NoSuchBIGSdbDatabaseException(seqdef_db_name)
-        return known_databases[seqdef_db_name]     
-
-    async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
-        if seqdef_db_name in self._seqdefdb_schemas and not force:
-            return self._seqdefdb_schemas[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional
-        uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)}/db/{seqdef_db_name}/schemes"
-        async with self._http_client.get(uri_path) as response: 
-            response_json = await response.json()
-            schema_descriptions: Mapping[str, int] = dict()
-            for scheme_definition in response_json["schemes"]:
-                scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1])
-                scheme_desc: str = scheme_definition["description"]
-                schema_descriptions[scheme_desc] = scheme_id
-            self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
-            return self._seqdefdb_schemas[seqdef_db_name] # type: ignore
-
-    async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
-        return BIGSdbMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
-
-    async def close(self):
-        await self._http_client.close()
-
-    async def __aexit__(self, exc_type, exc_value, traceback):
-        await self.close()
-    
--- a/src/autobigs/engine/data/structures/init.py
+++ b/src/autobigs/engine/data/structures/init.py
--- a/src/autobigs/engine/data/structures/mlst.py
+++ b/src/autobigs/engine/data/structures/mlst.py
@@ -1,21 +0,0 @@
-from dataclasses import dataclass
-from typing import Mapping, Sequence, Union
-
-@dataclass(frozen=True)
-class PartialAllelicMatchProfile:
-    percent_identity: float
-    mismatches: int
-    bitscore: float
-    gaps: int
-
-@dataclass(frozen=True)
-class Allele:
-    allele_loci: str
-    allele_variant: str
-    partial_match_profile: Union[None, PartialAllelicMatchProfile]
-
-@dataclass(frozen=True)
-class MLSTProfile:
-    alleles: Mapping[str, Sequence[Allele]]
-    sequence_type: str
-    clonal_complex: str
--- a/src/autobigs/engine/exceptions/init.py
+++ b/src/autobigs/engine/exceptions/init.py
--- a/src/autobigs/engine/reading.py
+++ b/src/autobigs/engine/reading.py
@@ -0,0 +1,17 @@
+import asyncio
+from io import TextIOWrapper
+from typing import Any, AsyncGenerator, Iterable, Union
+from Bio import SeqIO
+
+from autobigs.engine.structures.genomics import NamedString
+
+async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]:
+    """Parse every FASTA record of handle into NamedString objects inside a worker thread."""
+    def parse_all() -> list[NamedString]:
+        # SeqIO.parse only builds a lazy iterator, so it has to be drained here to keep file I/O off the event loop.
+        return [NamedString(record.id, str(record.seq)) for record in SeqIO.parse(handle=handle, format="fasta")]
+    return await asyncio.to_thread(parse_all)
+
+async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
+    for handle in handles:  # one yielded group per FASTA source, in the order given
+        yield await read_fasta(handle)  # each group holds every record read from that source
--- a/src/autobigs/engine/data/remote/init.py
+++ b/src/autobigs/engine/data/remote/init.py
--- a/src/autobigs/engine/structures/alignment.py
+++ b/src/autobigs/engine/structures/alignment.py
@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+from numbers import Number
+from typing import Sequence
+
+@dataclass(frozen=True)
+class AlignmentStats:  # immutable statistics attached to a partially matched allele
+    percent_identity: float
+    mismatches: int
+    gaps: int
+    match_metric: int  # the remote BIGSdb profiler stores the reported "bitscore" here
+
+@dataclass(frozen=True)
+class PairwiseAlignment:  # immutable record of one reference/query alignment and its statistics
+    reference: str
+    query: str
+    reference_indices: Sequence[Number]  # NOTE(review): presumably aligned positions in reference — confirm
+    query_indices: Sequence[Number]  # NOTE(review): presumably aligned positions in query — confirm
+    alignment_stats: AlignmentStats
--- a/src/autobigs/engine/data/structures/genomics.py
+++ b/src/autobigs/engine/data/structures/genomics.py
--- a/src/autobigs/engine/structures/mlst.py
+++ b/src/autobigs/engine/structures/mlst.py
@@ -0,0 +1,33 @@
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Collection, Iterable, Mapping, Sequence, Union
+
+from autobigs.engine.structures.alignment import AlignmentStats
+
+@dataclass(frozen=True)
+class Allele:  # frozen, hence hashable, so alleles can be collected in sets
+    allele_locus: str
+    allele_variant: str
+    partial_match_profile: Union[None, AlignmentStats]  # None for exact matches, stats for partial ones
+
+@dataclass(frozen=True)
+class MLSTProfile:
+    alleles: Collection[Allele]
+    sequence_type: str  # "unknown" when the remote profiler's response has no ST field
+    clonal_complex: str  # "unknown" when the remote profiler's response has no clonal_complex field
+
+@dataclass(frozen=True)
+class NamedMLSTProfile:
+    name: str
+    mlst_profile: Union[None, MLSTProfile]  # None when profiling found no matches (see profile_multiple_strings)
+
+
+def alleles_to_mapping(alleles: Iterable[Allele]):
+    """Group allele variants by locus; a locus with exactly one variant maps to that bare variant."""
+    grouped = defaultdict(list)
+    for allele in alleles:
+        grouped[allele.allele_locus].append(allele.allele_variant)
+    return {
+        locus: variants[0] if len(variants) == 1 else variants
+        for locus, variants in grouped.items()
+    }
--- a/src/autobigs/engine/data/local/csv.py
+++ b/src/autobigs/engine/data/local/csv.py
@@ -1,22 +1,21 @@
+from collections import defaultdict
 import csv
 from os import PathLike
-from typing import AsyncIterable, Mapping, Sequence, Union
+from typing import AsyncIterable, Collection, Mapping, Sequence, Union

-from autobigs.engine.data.structures.mlst import Allele, MLSTProfile
+from autobigs.engine.structures.mlst import Allele, MLSTProfile


-def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
-    result_dict: dict[str, Union[list[str], str]] = {}
-    for loci, alleles in alleles_map.items():
-        if len(alleles) == 1:
-            result_dict[loci] = alleles[0].allele_variant
+def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Sequence[str], str]]:
+    result = defaultdict(list)
+    for allele in alleles:
+        result[allele.allele_locus].append(allele.allele_variant + ("*" if allele.partial_match_profile is not None else ""))
+    for locus in result.keys():
+        if len(result[locus]) == 1:
+            result[locus] = result[locus][0] # Take the only one
        else:
-            result_locis = list()
-            for allele in alleles:
-                result_locis.append(allele.allele_variant)
-                result_dict[loci] = result_locis
-    return result_dict
-
+            result[locus] = tuple(result[locus]) # type: ignore
+    return dict(result)

 async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
    failed = list()
@@ -27,15 +26,16 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
            if mlst_profile is None:
                failed.append(name)
                continue
+            allele_mapping = alleles_to_text_map(mlst_profile.alleles)
            if writer is None:
-                header = ["id", "st", "clonal-complex", *mlst_profile.alleles.keys()]
+                header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
                writer = csv.DictWriter(filehandle, fieldnames=header)
                writer.writeheader()
            row_dictionary = {
                "st": mlst_profile.sequence_type,
                "clonal-complex": mlst_profile.clonal_complex,
                "id": name,
-                **dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
+                **allele_mapping
            }
            writer.writerow(rowdict=row_dictionary)
    return failed
--- a/tests/autobigs/engine/analysis/test_bigsdb.py
+++ b/tests/autobigs/engine/analysis/test_bigsdb.py
@@ -0,0 +1,211 @@
+from os import path
+import random
+import re
+from typing import Callable, Collection, Sequence, Union
+from Bio import SeqIO
+import pytest
+from autobigs.engine.analysis import bigsdb
+from autobigs.engine.structures import mlst
+from autobigs.engine.structures.genomics import NamedString
+from autobigs.engine.structures.mlst import Allele, MLSTProfile
+from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
+from autobigs.engine.analysis.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler, RemoteBIGSdbMLSTProfiler
+
+async def generate_async_iterable(normal_iterable):
+    """Expose a regular iterable through the async-iterator protocol.
+
+    Yields the items of ``normal_iterable`` one by one, in order, so plain
+    lists can be passed to code that consumes ``async for``.
+    """
+    for element in normal_iterable:
+        yield element
+
+def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ("A", "T", "C", "G")):
+    """Return a deterministically mutated copy of ``gene``.
+
+    The random generator is seeded with ``gene`` itself, so the same input
+    always produces the same scrambled output.
+
+    Args:
+        gene: The sequence to mutate.
+        mutation_site_count: Number of sites to draw. A ``float`` is treated
+            as a fraction of ``len(gene)`` and truncated to an int.
+        alphabet: Symbols a drawn site may be replaced with. The default is
+            an immutable tuple so it cannot be altered between calls.
+
+    Returns:
+        The mutated sequence, the same length as ``gene``.
+    """
+    rand = random.Random(gene)
+    if isinstance(mutation_site_count, float):
+        mutation_site_count = int(mutation_site_count * len(gene))
+    scrambled = list(gene)
+    # Sites are drawn with replacement and a replacement symbol may equal the
+    # original one, so the number of positions that actually differ can be
+    # lower than mutation_site_count.
+    for random_location in rand.choices(range(len(gene)), k=mutation_site_count):
+        scrambled[random_location] = rand.choice(alphabet)
+    return "".join(scrambled)
+
+def get_first_sequence_from_fasta(resource: str):
+    """Load a FASTA file from ``tests/resources/`` and return its sequence as ``str``.
+
+    The file is read with ``SeqIO.read`` and the record's ``seq`` is
+    converted to a plain string.
+    """
+    resource_path = path.join("tests/resources/", resource)
+    record = SeqIO.read(resource_path, "fasta")
+    return str(record.seq)
+
+def get_multiple_sequences_from_fasta(resource: str):
+    """Load every record of a FASTA file from ``tests/resources/`` as a tuple.
+
+    The records are whatever ``SeqIO.parse`` yields for the file, in file
+    order.
+    """
+    resource_path = path.join("tests/resources/", resource)
+    records = SeqIO.parse(resource_path, "fasta")
+    return tuple(records)
+
+# Fixture profiles used to parametrize TestBIGSdbMLSTProfiler below.
+# The positional arguments after the allele tuple are compared against
+# ``sequence_type`` and ``clonal_complex`` in the tests, in that order.
+
+# Expected profile for the tohama_I_bpertussis.fasta resource.
+bpertussis_tohamaI_profile = MLSTProfile((
+        Allele("adk", "1", None),
+        Allele("fumC", "1", None),
+        Allele("glyA", "1", None),
+        Allele("tyrB", "1", None),
+        Allele("icd", "1", None),
+        Allele("pepA", "1", None),
+        Allele("pgm", "1", None)), "1", "ST-2 complex")
+
+# Allele combination the tests expect to resolve to an "unknown" sequence
+# type and clonal complex.
+bpertussis_tohamaI_bad_profile = MLSTProfile((
+        Allele("adk", "1", None),
+        Allele("fumC", "2", None),
+        Allele("glyA", "36", None),
+        Allele("tyrB", "4", None),
+        Allele("icd", "4", None),
+        Allele("pepA", "1", None),
+        Allele("pgm", "5", None),
+    ), "unknown", "unknown")
+
+# Expected profile for the 2014-102_hinfluenza.fasta resource.
+hinfluenzae_2014_102_profile = MLSTProfile((
+        Allele("adk", "28", None),
+        Allele("atpG", "33", None),
+        Allele("frdB", "7", None),
+        Allele("fucK", "18", None),
+        Allele("mdh", "11", None),
+        Allele("pgi", "125", None),
+        Allele("recA", "89", None)
+    ), "478", "unknown")
+
+# Allele combination the tests expect to resolve to an "unknown" sequence
+# type and clonal complex.
+hinfluenzae_2014_102_bad_profile = MLSTProfile((
+        Allele("adk", "3", None),
+        Allele("atpG", "121", None),
+        Allele("frdB", "6", None),
+        Allele("fucK", "5", None),
+        Allele("mdh", "12", None),
+        Allele("pgi", "4", None),
+        Allele("recA", "5", None)
+    ), "unknown", "unknown")
+
+
+@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
+    (False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
+    (False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "2014-102_hinfluenza.fasta", "2014-102_hinfluenza_features.fasta", hinfluenzae_2014_102_profile, hinfluenzae_2014_102_bad_profile),
+])
+class TestBIGSdbMLSTProfiler:
+    """Profiler tests, run once per parametrized database/schema/resource set.
+
+    Every test obtains its profiler from ``bigsdb.get_BIGSdb_MLST_profiler``
+    using the parametrized API URL, database name and schema id.
+    NOTE(review): both parameter sets use ``local_db=False`` with remote API
+    URLs, so these presumably need network access - confirm.
+    """
+
+    async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """Every allele found for the unmodified sequence matches the expected profile."""
+        sequence = get_first_sequence_from_fasta(seq_path)
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            expected_alleles = mlst.alleles_to_mapping(expected_profile.alleles)
+            targets_left = set(mlst.alleles_to_mapping(expected_profile.alleles).keys())
+            async for exact_match in dummy_profiler.determine_mlst_allele_variants(query_sequence_strings=[sequence]):
+                assert isinstance(exact_match, Allele)
+                assert exact_match.allele_locus in expected_alleles
+                assert exact_match.allele_variant == expected_alleles[exact_match.allele_locus]
+                # remove() raises KeyError if the same locus is reported twice.
+                targets_left.remove(exact_match.allele_locus)
+
+            # Every expected locus must have been reported.
+            assert len(targets_left) == 0
+
+    async def test_sequence_profiling_non_exact_returns_non_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """Scrambled copies of each MLST locus sequence come back as partial matches."""
+        target_sequences = get_multiple_sequences_from_fasta(feature_seqs_path)
+        # Loci are compared case-insensitively against the FASTA gene tags.
+        mlst_targets = {x.lower() for x in mlst.alleles_to_mapping(expected_profile.alleles).keys()}
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as profiler:
+            for target_sequence in target_sequences:
+                # Only records whose description carries a [gene=...] tag are considered.
+                match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", target_sequence.description)
+                if match is None:
+                    continue
+                gene = match.group(1).lower()
+                if gene not in mlst_targets:
+                    continue
+                # A float count is a fraction of the sequence length (see gene_scrambler).
+                scrambled = gene_scrambler(str(target_sequence.seq), 0.125)
+                async for partial_match in profiler.determine_mlst_allele_variants([scrambled]):
+                    assert partial_match.partial_match_profile is not None
+                    mlst_targets.remove(gene)
+
+            # Every MLST locus must have produced a partial match.
+            assert len(mlst_targets) == 0
+
+    async def test_profiling_results_in_correct_mlst_st(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """The expected alleles resolve to the expected sequence type and clonal complex."""
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            mlst_st_data = await dummy_profiler.determine_mlst_st(expected_profile.alleles)
+            assert mlst_st_data is not None
+            assert isinstance(mlst_st_data, MLSTProfile)
+            assert mlst_st_data.clonal_complex == expected_profile.clonal_complex
+            assert mlst_st_data.sequence_type == expected_profile.sequence_type
+
+    async def test_profiling_non_exact_results_in_list_of_mlsts(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """The "bad" allele combination resolves to an unknown sequence type and clonal complex."""
+        dummy_alleles = bad_profile.alleles
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            mlst_profile = await dummy_profiler.determine_mlst_st(dummy_alleles)
+            assert mlst_profile.clonal_complex == "unknown"
+            assert mlst_profile.sequence_type == "unknown"
+
+
+    async def test_bigsdb_profile_multiple_strings_same_string_twice(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """Profiling the same sequence under two names yields the expected profile each time."""
+        sequence = get_first_sequence_from_fasta(seq_path)
+        # Each inner list is one group of NamedStrings handed to the profiler.
+        dummy_sequences = [[NamedString("seq1", sequence)], [NamedString("seq2", sequence)]]
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences)):
+                name, profile = named_profile.name, named_profile.mlst_profile
+                assert profile is not None
+                assert isinstance(profile, MLSTProfile)
+                assert profile.clonal_complex == expected_profile.clonal_complex
+                assert profile.sequence_type == expected_profile.sequence_type
+
+    async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """A scrambled middle sequence yields an unknown profile without stopping the run.
+
+        NOTE(review): the second positional argument (True) is presumably the
+        exact-match flag - confirm against profile_multiple_strings.
+        """
+        valid_seq = get_first_sequence_from_fasta(seq_path)
+        dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]]
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), True):
+                name, profile = name_profile.name, name_profile.mlst_profile
+
+                assert profile is not None
+                if name == "should_fail":
+                    assert profile.clonal_complex == "unknown"
+                    assert profile.sequence_type == "unknown"
+                    assert len(profile.alleles) > 0
+                else:
+                    assert isinstance(profile, MLSTProfile)
+                    assert profile.clonal_complex == expected_profile.clonal_complex
+                    assert profile.sequence_type == expected_profile.sequence_type
+
+    async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """Same scenario as the exact-match variant, but passing False as the second argument."""
+        valid_seq = get_first_sequence_from_fasta(seq_path)
+        dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]]
+
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), False):
+                name, profile = named_profile.name, named_profile.mlst_profile
+                
+                assert profile is not None
+                if name == "should_fail":
+                    assert profile.clonal_complex == "unknown"
+                    assert profile.sequence_type == "unknown"
+                    assert len(profile.alleles) > 0
+                else:
+                    assert isinstance(profile, MLSTProfile)
+                    assert profile.clonal_complex == expected_profile.clonal_complex
+                    assert profile.sequence_type == expected_profile.sequence_type
+
+class TestBIGSdbIndex:
+    """Tests for BIGSdbIndex lookups of seqdef databases, their APIs and schemas."""
+
+    async def test_bigsdb_index_all_databases_is_not_empty(self):
+        """The index reports at least one known seqdef database."""
+        async with BIGSdbIndex() as bigsdb_index:
+            assert len(await bigsdb_index.get_known_seqdef_dbs()) > 0
+
+    async def test_bigsdb_index_references_pubmlst_correctly(self):
+        """The H. influenzae seqdef database maps to the PubMLST REST API."""
+        async with BIGSdbIndex() as bigsdb_index:
+            assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")) == "https://rest.pubmlst.org"
+
+    async def test_bigsdb_index_references_institutpasteur_correctly(self):
+        """The Bordetella seqdef database maps to the Institut Pasteur API."""
+        async with BIGSdbIndex() as bigsdb_index:
+            assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api"
+
+    async def test_bigsdb_index_get_schemas_for_bordetella(self):
+        """The Bordetella database exposes an "MLST" schema with an integer id."""
+        async with BIGSdbIndex() as index:
+            schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
+            assert len(schemas.keys()) > 0
+            assert "MLST" in schemas
+            assert isinstance(schemas["MLST"], int)
+
+    async def test_bigsdb_index_get_databases_has_only_seqdef(self):
+        """Every known database name ends in "seqdef" and Bordetella maps to Pasteur."""
+        async with BIGSdbIndex() as index:
+            databases = await index.get_known_seqdef_dbs()
+            assert len(databases.keys()) > 0
+            for database_name in databases.keys():
+                assert database_name.endswith("seqdef")
+            assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
+
+    @pytest.mark.parametrize("local", [
+        (False)
+    ])
+    async def test_bigsdb_index_instantiates_correct_profiler(self, local):
+        """A profiler built through the index types the Tohama I sequence correctly."""
+        # Use the module's shared FASTA helper instead of repeating the
+        # SeqIO.read call; it resolves to the same tests/resources/ path.
+        sequence = get_first_sequence_from_fasta("tohama_I_bpertussis.fasta")
+        async with BIGSdbIndex() as bigsdb_index:
+            async with await bigsdb_index.build_profiler_from_seqdefdb(local, "pubmlst_bordetella_seqdef", 3) as profiler:
+                assert isinstance(profiler, BIGSdbMLSTProfiler)
+                profile = await profiler.profile_string(sequence)
+                assert profile.clonal_complex == "ST-2 complex"
+                assert profile.sequence_type == "1"
--- a/tests/autobigs/engine/data/local/test_csv.py
+++ b/tests/autobigs/engine/data/local/test_csv.py
@@ -1,21 +0,0 @@
-from autobigs.engine.data.local.csv import dict_loci_alleles_variants_from_loci
-from autobigs.engine.data.structures.mlst import Allele
-
-
-def test_dict_loci_alleles_variants_from_loci_single_loci_not_list():
-    alleles_map = {
-        "adk": [Allele("adk", "1", None)]
-    }
-    results = dict_loci_alleles_variants_from_loci(alleles_map)
-    for loci, variant in results.items():
-        assert isinstance(variant, str)
-        assert variant == "1"
-
-def test_dict_loci_alleles_variants_from_loci_multi_loci_is_list():
-    alleles_map = {
-        "adk": [Allele("adk", "1", None), Allele("adk", "2", None)]
-    }
-    results = dict_loci_alleles_variants_from_loci(alleles_map)
-    for loci, variant in results.items():
-        assert isinstance(variant, list)
-        assert len(variant) == 2
--- a/tests/autobigs/engine/data/local/test_fasta.py
+++ b/tests/autobigs/engine/data/local/test_fasta.py
@@ -1,7 +0,0 @@
-from autobigs.engine.data.local.fasta import read_fasta
-
-
-async def test_fasta_reader_not_none():
-    named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
-    async for named_string in named_strings:
-        assert named_string.name == "BX470248.1"
--- a/tests/autobigs/engine/data/remote/databases/test_bigsdb.py
+++ b/tests/autobigs/engine/data/remote/databases/test_bigsdb.py
@@ -1,244 +0,0 @@
-import random
-import re
-from typing import Collection, Sequence, Union
-from Bio import SeqIO
-import pytest
-from autobigs.engine.data.structures.genomics import NamedString
-from autobigs.engine.data.structures.mlst import Allele, MLSTProfile
-from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
-from autobigs.engine.data.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
-
-def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
-    rand = random.Random(gene)
-    if isinstance(mutation_site_count, float):
-        mutation_site_count = int(mutation_site_count * len(gene))
-    random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
-    scrambled = list(gene)
-    for random_location in random_locations:
-        scrambled[random_location] = rand.choice(alphabet)
-    return "".join(scrambled)
-
-async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
-    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
-        async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True):
-            assert isinstance(exact_match, Allele)
-            assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
-            targets_left.remove(exact_match.allele_loci)
-
-        assert len(targets_left) == 0
-
-async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
-    sequences = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta"))
-    mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
-    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
-        for sequence in sequences:
-            match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description)
-            if match is None:
-                continue
-            gene = match.group(1)
-            if gene.lower() not in mlst_targets:
-                continue
-            scrambled = gene_scrambler(str(sequence.seq), 0.125)
-            async for partial_match in profiler.fetch_mlst_allele_variants(scrambled, False):
-                assert partial_match.partial_match_profile is not None
-                mlst_targets.remove(gene.lower())
-
-        assert len(mlst_targets) == 0
-
-async def test_institutpasteur_profiling_results_in_correct_mlst_st():
-    async def dummy_allele_generator():
-        dummy_alleles = [
-        Allele("adk", "1", None),
-        Allele("fumC", "1", None),
-        Allele("glyA", "1", None),
-        Allele("tyrB", "1", None),
-        Allele("icd", "1", None),
-        Allele("pepA", "1", None),
-        Allele("pgm", "1", None),
-        ]
-        for dummy_allele in dummy_alleles:
-            yield dummy_allele
-    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
-        assert mlst_st_data is not None
-        assert isinstance(mlst_st_data, MLSTProfile)
-        assert mlst_st_data.clonal_complex == "ST-2 complex"
-        assert mlst_st_data.sequence_type == "1"
-
-async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
-    dummy_alleles = [
-    Allele("adk", "1", None),
-    Allele("fumC", "2", None),
-    Allele("glyA", "36", None),
-    Allele("tyrB", "4", None),
-    Allele("icd", "4", None),
-    Allele("pepA", "1", None),
-    Allele("pgm", "5", None),
-    ]
-    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        mlst_profile = await dummy_profiler.fetch_mlst_st(dummy_alleles)
-        assert mlst_profile.clonal_complex == "unknown"
-        assert mlst_profile.sequence_type == "unknown"
-
-
-async def test_institutpasteur_sequence_profiling_is_correct():
-    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        profile = await dummy_profiler.profile_string(sequence)
-        assert profile is not None
-        assert isinstance(profile, MLSTProfile)
-        assert profile.clonal_complex == "ST-2 complex"
-        assert profile.sequence_type == "1"
-    
-
-async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
-    dummy_alleles = {
-        Allele("adk", "1", None),
-        Allele("atpG", "1", None),
-        Allele("frdB", "1", None),
-        Allele("fucK", "1", None),
-        Allele("mdh", "1", None),
-        Allele("pgi", "1", None),
-        Allele("recA", "5", None),
-    }
-    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
-    async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
-        exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True)
-        async for exact_match in exact_matches:
-            assert isinstance(exact_match, Allele)
-            dummy_alleles.remove(exact_match)
-
-        assert len(dummy_alleles) == 0
-
-async def test_pubmlst_profiling_results_in_correct_st():
-    async def generate_dummy_targets():
-        dummy_alleles = [
-                Allele("adk", "1", None),
-                Allele("atpG", "1", None),
-                Allele("frdB", "1", None),
-                Allele("fucK", "1", None),
-                Allele("mdh", "1", None),
-                Allele("pgi", "1", None),
-                Allele("recA", "5", None),
-            ]
-        for dummy_allele in dummy_alleles:
-            yield dummy_allele
-    async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
-        mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
-        assert mlst_st_data is not None
-        assert isinstance(mlst_st_data, MLSTProfile)
-        assert mlst_st_data.clonal_complex == "ST-3 complex"
-        assert mlst_st_data.sequence_type == "3"
-
-async def test_pubmlst_sequence_profiling_is_correct():
-    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
-    async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
-        profile = await dummy_profiler.profile_string(sequence)
-        assert profile is not None
-        assert isinstance(profile, MLSTProfile)
-        assert profile.clonal_complex == "ST-3 complex"
-        assert profile.sequence_type == "3"
-
-async def test_bigsdb_index_all_databases_is_not_empty():
-    async with BIGSdbIndex() as bigsdb_index:
-        assert len(await bigsdb_index.get_known_seqdef_dbs()) > 0
-
-async def test_bigsdb_index_references_pubmlst_correctly():
-    async with BIGSdbIndex() as bigsdb_index:
-        assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")) == "https://rest.pubmlst.org"
-
-async def test_bigsdb_index_references_institutpasteur_correctly():
-    async with BIGSdbIndex() as bigsdb_index:
-        assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api"
-
-
-async def test_bigsdb_index_instantiates_correct_profiler():
-    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    async with BIGSdbIndex() as bigsdb_index:
-        async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
-            profile = await profiler.profile_string(sequence)
-            assert profile.clonal_complex == "ST-2 complex"
-            assert profile.sequence_type == "1"
-
-async def test_bigsdb_profile_multiple_strings_same_string_twice():
-    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    dummy_sequences = [NamedString("seq1", sequence), NamedString("seq2", sequence)]
-    async def generate_async_iterable_sequences():
-        for dummy_sequence in dummy_sequences:
-            yield dummy_sequence
-    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences()):
-            assert profile is not None
-            assert isinstance(profile, MLSTProfile)
-            assert profile.clonal_complex == "ST-2 complex"
-            assert profile.sequence_type == "1"
-
-async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop():
-    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
-    async def generate_async_iterable_sequences():
-        for dummy_sequence in dummy_sequences:
-            yield dummy_sequence
-    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), True):
-            if name == "should_fail":
-                assert profile is None
-            else:
-                assert profile is not None
-                assert isinstance(profile, MLSTProfile)
-                assert profile.clonal_complex == "ST-2 complex"
-                assert profile.sequence_type == "1"
-
-async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop():
-    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
-    async def generate_async_iterable_sequences():
-        for dummy_sequence in dummy_sequences:
-            yield dummy_sequence
-    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), False):
-            if name == "should_fail":
-                assert profile is not None
-                assert profile.clonal_complex == "unknown"
-                assert profile.sequence_type == "unknown"
-                assert len(profile.alleles) > 0
-            else:
-                assert profile is not None
-                assert isinstance(profile, MLSTProfile)
-                assert profile.clonal_complex == "ST-2 complex"
-                assert profile.sequence_type == "1"
-
-async def test_bigsdb_profile_multiple_strings_fail_second_stop():
-    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    invalid_seq = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
-    dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", invalid_seq), NamedString("seq3", valid_seq)]
-    async def generate_async_iterable_sequences():
-        for dummy_sequence in dummy_sequences:
-            yield dummy_sequence
-    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        with pytest.raises(NoBIGSdbMatchesException):
-            async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), exact=True, stop_on_fail=True):
-                if name == "should_fail":
-                    pytest.fail("Exception should have been thrown, no exception was thrown.")
-                else:
-                    assert profile is not None
-                    assert isinstance(profile, MLSTProfile)
-                    assert profile.clonal_complex == "ST-2 complex"
-                    assert profile.sequence_type == "1"
-
-async def test_bigsdb_index_get_schemas_for_bordetella():
-    async with BIGSdbIndex() as index:
-        schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
-        assert len(schemas.keys()) > 0
-        assert "MLST" in schemas
-        assert isinstance(schemas["MLST"], int)
-
-async def test_bigsdb_index_get_databases_has_only_seqdef():
-    async with BIGSdbIndex() as index:
-        databases = await index.get_known_seqdef_dbs()
-        assert len(databases.keys()) > 0
-        for database_name in databases.keys():
-            assert database_name.endswith("seqdef")
-        assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
--- a/tests/autobigs/engine/test_reading.py
+++ b/tests/autobigs/engine/test_reading.py
@@ -0,0 +1,7 @@
+from autobigs.engine.reading import read_fasta
+
+
+async def test_fasta_reader_not_none():
+    """Reading the Tohama I FASTA yields at least one record, each named BX470248.1."""
+    named_strings = list(await read_fasta("tests/resources/tohama_I_bpertussis.fasta"))
+    # Without this check an empty result would skip the loop and the test
+    # would pass without asserting anything.
+    assert len(named_strings) > 0
+    for named_string in named_strings:
+        assert named_string.name == "BX470248.1"
--- a/tests/autobigs/engine/test_writing.py
+++ b/tests/autobigs/engine/test_writing.py
@@ -0,0 +1,47 @@
+from typing import AsyncIterable, Iterable
+
+import pytest
+from autobigs.engine.structures.alignment import AlignmentStats
+from autobigs.engine.writing import alleles_to_text_map, write_mlst_profiles_as_csv
+from autobigs.engine.structures.mlst import Allele, MLSTProfile
+import tempfile
+from csv import reader
+from os import path
+
+
+@pytest.fixture
+def dummy_alphabet_mlst_profile():
+    """Provide an MLSTProfile with loci A, D, B, C, where C has two variants.
+
+    The loci are deliberately out of alphabetical order, and the second "C"
+    allele carries AlignmentStats as its partial-match profile.
+    """
+    alleles = (
+        Allele("A", "1", None),
+        Allele("D", "1", None),
+        Allele("B", "1", None),
+        Allele("C", "1", None),
+        Allele("C", "2", AlignmentStats(90, 10, 0, 90)),
+    )
+    return MLSTProfile(alleles, "mysterious", "very mysterious")
+
+async def iterable_to_asynciterable(iterable: Iterable):
+    """Yield the items of ``iterable`` in order through an async generator."""
+    for element in iterable:
+        yield element
+
+async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile: MLSTProfile):
+    """The locus columns of the written CSV header appear in sorted order."""
+    dummy_profiles = [("test_1", dummy_alphabet_mlst_profile)]
+    with tempfile.TemporaryDirectory() as temp_dir:
+        output_path = path.join(temp_dir, "out.csv")
+        await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
+        with open(output_path, newline="") as csv_handle:
+            csv_reader = reader(csv_handle)
+            lines = list(csv_reader)
+    # The column names live in the header row. Slicing the list of rows
+    # (lines[4:]) always gave an empty list for this two-row file, so the
+    # sort check could never fail.
+    header = lines[0]
+    # Skip the three fixed leading columns: id, st, clonal-complex.
+    target_columns = header[3:]
+    assert len(target_columns) > 0
+    assert target_columns == sorted(target_columns)
+
+async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: MLSTProfile):
+    """alleles_to_text_map groups variants per locus and stars partial matches."""
+    mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.alleles)
+    expected_mapping = {
+        "A": "1",
+        "B": "1",
+        "C": ("1", "2*"),
+        "D": "1"
+    }
+    # Compare whole dicts: looping over the result's items alone would not
+    # notice loci that are missing from it (or an empty result).
+    assert mapping == expected_mapping
--- a/tests/resources/2014-102_hinfluenza.fasta
+++ b/tests/resources/2014-102_hinfluenza.fasta
--- a/tests/resources/2014-102_hinfluenza_features.fasta
+++ b/tests/resources/2014-102_hinfluenza_features.fasta
--- a/tests/resources/FDAARGOS_1560.fasta
+++ b/tests/resources/FDAARGOS_1560.fasta
--- a/tests/resources/tohama_I_bpertussis_adk.fasta
+++ b/tests/resources/tohama_I_bpertussis_adk.fasta
@@ -0,0 +1,11 @@
+>lcl|BX640419.1_cds_CAE43044.1_2724 [gene=adK] [locus_tag=BP2769] [db_xref=GOA:P0DKX8,InterPro:IPR000850,InterPro:IPR006259,InterPro:IPR007862,InterPro:IPR027417] [protein=adenylate kinase] [protein_id=CAE43044.1] [location=164032..164688] [gbkey=CDS]
+ATGCGTCTCATTCTGCTCGGACCGCCCGGAGCCGGCAAAGGCACCCAAGCCGCCTTTCTCACCCAACACT
+ACGGCATCCCGCAGATATCCACCGGTGACATGCTGCGCGCCGCCGTCAAGGCCGGCACGCCGCTGGGCCT
+GGAAGCCAAGAAGGTCATGGACGCGGGCGGCCTGGTCTCGGACGACCTGATCATCGGCCTGGTGCGCGAT
+CGCCTGACCCAGCCCGATTGCGCCAACGGCTACCTGTTCGACGGTTTCCCGCGCACCATCCCGCAGGCCG
+ACGCGCTCAAGAGCGCCGGCATCGCGCTGGATTACGTGGTCGAGATCGAAGTGCCGGAAAGCGACATCAT
+CGAACGCATGAGCGAACGCCGCGTGCACCCGGCCAGCGGCCGCAGCTACCACGTACGCTTCAATCCGCCC
+AAGGCCGAAGGCGTGGACGACGTCACGGGCGAACCGCTGGTGCAGCGCGACGACGACCGCGAGGAAACCG
+TGCGCCATCGTCTCAACGTCTACCAGAACCAGACCCGCCCGCTGGTCGACTACTACTCGTCCTGGGCCCA
+GTCCGATGCCGCCGCGGCGCCCAAGTACCGCAAGATCTCCGGCGTCGGCTCGGTCGACGAAATCAAGAGC
+CGCCTGTCGCAGGCTCTGCAGAGCTAA
--- a/tests/resources/tohama_I_bpertussis_features.fasta
+++ b/tests/resources/tohama_I_bpertussis_features.fasta
Author	SHA1	Message	Date
Harrison Deng	2e8cdd8da9	Updated URL links All checks were successful automlst.engine/pipeline/head This commit looks good Details autoBIGS.engine/pipeline/tag This commit looks good Details	2025-02-14 20:37:13 +00:00
Harrison Deng	d0318536b2	Changed FASTA reading to group based on file for merging partial targets	2025-02-14 14:35:53 +00:00
Harrison Deng	765cf9d418	Merge branch 'features/improved-oop-architecture' into features/non-exact-notation	2025-02-12 17:53:25 +00:00
Harrison Deng	348c3d00b4	Updated README.md to be more clear	2025-02-12 17:52:53 +00:00
Harrison Deng	1c3f7f9ed8	Removed test for instantiating local MLST profiler	2025-02-12 17:46:55 +00:00
Harrison Deng	e4ddaf2e8c	Changed to a MLST typable sequence for pubMLST tests	2025-02-12 17:43:26 +00:00
Harrison Deng	73aade2bde	Merge branch 'features/improved-oop-architecture' into features/non-exact-notation	2025-02-12 17:07:51 +00:00
Harrison Deng	af8590baa7	Removed import of deleted feature	2025-02-12 17:07:10 +00:00
Harrison Deng	36bca1b70d	Merge branch 'features/improved-oop-architecture' into features/non-exact-notation	2025-02-12 17:02:22 +00:00
Harrison Deng	09a693b696	Removed features being worked on in separate branch	2025-02-12 17:02:00 +00:00
Harrison Deng	f76bf86ef6	Fixed bad profile for H. influenzae non-exact test case	2025-02-12 16:59:50 +00:00
Harrison Deng	a60daf3ee2	Updated H. influenzae database API url	2025-02-12 16:39:13 +00:00
Harrison Deng	fbfd993269	Copied tests over from CSV tests and updated to reflect current code base	2025-02-12 16:36:59 +00:00
Harrison Deng	ba606c35a9	conversion of collection of alleles to map now produces results with tuples instead of lists	2025-02-12 16:36:31 +00:00
Harrison Deng	4183840ba0	Added notation to indicate inexact matching in CSV	2025-02-12 15:59:19 +00:00
Harrison Deng	7fb3eab5b6	Added pubMLST test case to bigsdb tests and updated to reflect codebase changes	2025-02-12 15:53:14 +00:00
Harrison Deng	175a51f968	Replaced local profiler with a not implemented exception	2025-02-12 15:52:48 +00:00
Harrison Deng	897f7ee922	Merge branch 'develop' into features/local-typing Some checks reported errors automlst.engine/pipeline/head Something is wrong with the build of this commit Details	2025-02-12 15:01:12 +00:00
Harrison Deng	bfc286e6b0	Updated test cases to reflect changes in codebase MLSTProfile will always return a value, even if there were no exact matches. Removed a test case specifically testing for stopping on failure, which is a removed feature.	2025-02-12 14:57:51 +00:00
Harrison Deng	a88225fcff	Added check to wrap string into list to prevent decomposing string for querying	2025-02-12 14:46:29 +00:00
Harrison Deng	c18d817cd9	Added test to verify that CSV target columns are ordered	2025-02-12 14:38:12 +00:00
Harrison Deng	f462e6d5e0	Moved "LazyPersistentCachedBIGSdbMLSTProfiler" to separate branch and deleted from current branch	2025-02-11 19:24:23 +00:00
Harrison Deng	e568e9fb2c	Adapted latest merged reading codebase to current codebase	2025-02-11 19:13:29 +00:00
Harrison Deng	4b9eb8674d	Merge branch 'develop' into features/local-typing	2025-02-11 17:55:34 +00:00
Harrison Deng	f75707e4fe	CSV output column order is now predictable (sorted)	2025-02-11 17:54:48 +00:00
Harrison Deng	b4845fab34	Added automatic handling of strings instead of arrays of sequences to typing	2025-02-06 21:15:50 +00:00
Harrison Deng	fe999f1cab	Added a unit test for multithreaded alignments	2025-02-06 18:01:50 +00:00
Harrison Deng	85946eb110	Fixed match metric difference between remote and local	2025-02-06 17:12:31 +00:00
Harrison Deng	a27e09da31	Added code to retrieve sequences and annotations from NCBI GenBank	2025-02-06 17:11:20 +00:00
Harrison Deng	ba2b688e89	Removed sorting as it seems unecessary	2025-02-05 22:06:50 +00:00
Harrison Deng	49f31b7943	Async aligner work tracking issue fixed	2025-02-05 21:47:51 +00:00
Harrison Deng	1c6e1cfb35	Fixed issue with hashing a ndarray by using tuple.	2025-02-05 20:43:53 +00:00
Harrison Deng	fb99526162	Updated iteration on asynchronous aligner	2025-02-05 17:17:37 +00:00
Harrison Deng	ff8a1aff08	Implemented annotated local typing method without testing	2025-02-04 16:19:00 +00:00
Harrison Deng	341ca933a3	Fixed typo in CI script	2025-01-29 17:00:25 +00:00
Harrison Deng	3e3898334f	Began implementing LazyPersistentCachedBIGSdbMLSTProfiler	2025-01-27 22:03:49 +00:00
Harrison Deng	ba1f0aa318	Fixed potential memory leak	2025-01-27 22:02:52 +00:00
Harrison Deng	6d0157581f	Removed conda environment step for now	2025-01-24 21:43:55 +00:00
Harrison Deng	4bcbfa0c6a	Began adding conda steps for automatic PRs to Bioconda	2025-01-24 19:33:27 +00:00