Implemented annotated local typing method without testing

2025-02-04 16:19:00 +00:00
parent 341ca933a3
commit ff8a1aff08
21 changed files with 27726 additions and 374 deletions
--- a/src/autobigs/engine/analysis/init.py
+++ b/src/autobigs/engine/analysis/init.py
--- a/src/autobigs/engine/analysis/aligners.py
+++ b/src/autobigs/engine/analysis/aligners.py
@@ -0,0 +1,71 @@
+import asyncio
+from concurrent.futures import Future, ThreadPoolExecutor
+from contextlib import AbstractContextManager
+from typing import Any, Set, Union
+from Bio.Align import PairwiseAligner
+from queue import Queue
+
+from autobigs.engine.structures.alignment import AlignmentStats, PairwiseAlignment
+
+
+class AsyncPairwiseAlignmentEngine(AbstractContextManager):
+    """Run pairwise alignments on a thread pool and hand the results to asyncio code.
+
+    Jobs are queued with :meth:`align` and collected in completion order, either
+    through :meth:`next_completed` or by iterating the engine with ``async for``.
+    The engine must be entered as a context manager before :meth:`align` is
+    called, because the thread pool is only created in ``__enter__``.
+    """
+
+    def __init__(self, aligner: PairwiseAligner, max_threads: int = 4):
+        """Store the aligner and the pool size; no threads are started here.
+
+        :param aligner: the configured aligner every job is run with.
+        :param max_threads: maximum number of worker threads in the pool.
+        """
+        self._max_threads = max_threads
+        self._aligner = aligner
+        # Futures that were submitted but not yet handed back by next_completed().
+        self._work_left: Set[Future] = set()
+        # Finished futures, in the order their done-callbacks fired.
+        self._work_complete: Queue[Future] = Queue()
+
+    def __enter__(self):
+        """Create the thread pool and return the engine itself."""
+        self._thread_pool = ThreadPoolExecutor(self._max_threads)
+        return self
+
+    def align(self, reference: str, query: str, **associated_data):
+        """Queue an alignment of ``query`` against ``reference``.
+
+        ``associated_data`` is not interpreted; it is returned untouched next to
+        the alignment so the caller can tell results apart.
+        """
+        work = self._thread_pool.submit(
+            self.work, reference, query, **associated_data)
+        work.add_done_callback(self._on_complete)
+        self._work_left.add(work)
+
+    def _on_complete(self, future: Future):
+        """Done-callback: move a finished future onto the completion queue."""
+        self._work_complete.put(future)
+
+    def work(self, reference, query, **associated_data):
+        """Perform one alignment (runs on a worker thread).
+
+        :return: a ``(PairwiseAlignment, associated_data)`` tuple. The identity
+            stored in the stats is ``identities / alignment length``.
+        """
+        # The first alignment after sorting is the one that gets reported.
+        alignment_results = sorted(self._aligner.align(reference, query))[0]
+        top_alignment_stats = alignment_results.counts()
+        top_alignment_gaps = top_alignment_stats.gaps
+        top_alignment_identities = top_alignment_stats.identities
+        top_alignment_mismatches = top_alignment_stats.mismatches
+        top_alignment_score = alignment_results.score # type: ignore
+        return PairwiseAlignment(
+            alignment_results.sequences[0],
+            alignment_results.sequences[1],
+            alignment_results.indices[0],
+            alignment_results.indices[1],
+            AlignmentStats(
+                percent_identity=top_alignment_identities/alignment_results.length,
+                mismatches=top_alignment_mismatches,
+                gaps=top_alignment_gaps,
+                score=top_alignment_score
+            )), associated_data
+
+    async def next_completed(self) -> Union[tuple[PairwiseAlignment, dict[str, Any]], None]:
+        """Wait for the next finished alignment and return its result.
+
+        :return: the ``(PairwiseAlignment, associated_data)`` tuple of the next
+            job to finish, or ``None`` once every submitted job has been
+            handed back.
+        :raises Exception: whatever the alignment job itself raised.
+        """
+        if not self._work_left:
+            # Nothing outstanding: waiting on the queue would never return.
+            return None
+        # Queue.get() blocks, so wait on a helper thread to keep the event loop
+        # free. Every future in _work_left eventually reaches the queue through
+        # its done-callback, so this wait always terminates.
+        future_now: Future = await asyncio.to_thread(self._work_complete.get)
+        # Forget the job before reading its result so that a failed job does
+        # not leave the engine believing work is still outstanding.
+        self._work_left.discard(future_now)
+        return future_now.result()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """Shut the pool down; exceptions from the ``with`` body are not suppressed."""
+        self.shutdown()
+
+    def __aiter__(self):
+        """Allow ``async for`` over the engine; each item is one completed job."""
+        return self
+
+    async def __anext__(self):
+        """Return the next completed job, ending iteration when none are left."""
+        result = await self.next_completed()
+        if result is None:
+            raise StopAsyncIteration
+        return result
+
+    def shutdown(self):
+        """Cancel jobs that have not started and wait for running ones to finish."""
+        self._thread_pool.shutdown(wait=True, cancel_futures=True)
--- a/src/autobigs/engine/data/remote/databases/bigsdb.py
+++ b/src/autobigs/engine/data/remote/databases/bigsdb.py
@@ -1,15 +1,21 @@
 from abc import abstractmethod
+import asyncio
 from collections import defaultdict
 from contextlib import AbstractAsyncContextManager
 import csv
 from os import path
-from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequence, Union
+import os
+import shutil
+import tempfile
+from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequence, Set, Union

 from aiohttp import ClientSession, ClientTimeout

-from autobigs.engine.data.local.fasta import read_fasta
-from autobigs.engine.data.structures.genomics import NamedString
-from autobigs.engine.data.structures.mlst import Allele, NamedMLSTProfile, PartialAllelicMatchProfile, MLSTProfile
+from autobigs.engine.analysis.aligners import AsyncPairwiseAlignmentEngine
+from autobigs.engine.reading import read_fasta
+from autobigs.engine.structures.alignment import PairwiseAlignment
+from autobigs.engine.structures.genomics import NamedString
+from autobigs.engine.structures.mlst import Allele, NamedMLSTProfile, AlignmentStats, MLSTProfile
 from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException

 from Bio.Align import PairwiseAligner
@@ -17,26 +23,26 @@ from Bio.Align import PairwiseAligner
 class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
+    """Abstract interface shared by the BIGSdb-based MLST profilers in this module."""

    @abstractmethod
-    def fetch_mlst_allele_variants(self, sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
+    def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
+        """Yield the alleles identified in the given query sequences."""
        pass

    @abstractmethod
-    async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
+    async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
+        """Resolve a (sync or async) iterable of alleles into an MLSTProfile."""
        pass

    @abstractmethod
-    async def profile_string(self, sequence_strings: Iterable[str]) -> MLSTProfile:
+    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
+        """Profile the query sequences; the visible subclasses chain the two methods above."""
        pass

    @abstractmethod
-    def profile_multiple_strings(self, named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
+    def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
+        """Yield a NamedMLSTProfile for the named sequences of every group."""
        pass

    @abstractmethod
    async def close(self):
+        """Release the resources held by the profiler."""
        pass

-class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
+class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):

    def __init__(self, database_api: str, database_name: str, schema_id: int):
        self._database_name = database_name
@@ -47,11 +53,13 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
    async def __aenter__(self):
        return self

-    async def fetch_mlst_allele_variants(self, sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
+    async def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        uri_path = "sequence"
+        if not isinstance(query_sequence_strings, Iterable):
+            raise ValueError("Invalid data type for parameter \"sequence_strings\".")

-        for sequence_string in sequence_strings:
+        for sequence_string in query_sequence_strings:
            async with self._http_client.post(uri_path, json={
                "sequence": sequence_string,
                "partial_matches": True
@@ -70,10 +78,11 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
                    for allele_loci, partial_match in partial_matches.items():
                        if len(partial_match) <= 0:
                            continue
-                        partial_match_profile = PartialAllelicMatchProfile(
+                        partial_match_profile = AlignmentStats(
                            percent_identity=float(partial_match["identity"]),
                            mismatches=int(partial_match["mismatches"]),
-                            gaps=int(partial_match["gaps"])
+                            gaps=int(partial_match["gaps"]),
+                            score=int(partial_match["score"])
                        )
                        yield Allele(
                            allele_locus=allele_loci,
@@ -83,7 +92,7 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
                else:
                    raise NoBIGSdbMatchesException(self._database_name, self._schema_id)

-    async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
+    async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
        uri_path = "designations"
        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
        if isinstance(alleles, AsyncIterable):
@@ -97,7 +106,7 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
        }
        async with self._http_client.post(uri_path, json=request_json) as response:
            response_json: dict = await response.json()
-            allele_map: dict[str, Allele] = {}
+            allele_set: Set[Allele] = set()
            response_json.setdefault("fields", dict())
            schema_fields_returned: dict[str, str] = response_json["fields"]
            schema_fields_returned.setdefault("ST", "unknown")
@@ -106,17 +115,17 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
            for exact_match_locus, exact_match_alleles in schema_exact_matches.items():
                if len(exact_match_alleles) > 1:
                    raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(exact_match_alleles)})")
-                allele_map[exact_match_locus] = Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None)
-            if len(allele_map) == 0:
+                allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None))
+            if len(allele_set) == 0:
                raise ValueError("Passed in no alleles.")
-            return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
+            return MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])

-    async def profile_string(self, sequence_strings: Iterable[str]) -> MLSTProfile:
-        alleles = self.fetch_mlst_allele_variants(sequence_strings)
-        return await self.fetch_mlst_st(alleles)
+    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
+        """Determine the alleles of the query sequences and resolve them to an MLST profile."""
+        return await self.determine_mlst_st(
+            self.determine_mlst_allele_variants(query_sequence_strings)
+        )

-    async def profile_multiple_strings(self, named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
-        async for named_strings in named_string_groups:
+    async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
+        async for named_strings in query_named_string_groups:
            for named_string in named_strings:
                try:
                    yield NamedMLSTProfile(named_string.name, (await self.profile_string([named_string.sequence])))
@@ -131,20 +140,36 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()

-class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
-    def __init__(self, database_api: str, database_name: str, schema_id: int, cache_path: str):
+class LocalBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
+    async def __aenter__(self):
+        """Enter the profiler, filling the local cache first when ``prepare`` was requested.
+
+        The locus list is refreshed, then the allele FASTA files and the scheme
+        profiles are downloaded concurrently, and finally the profiles are loaded.
+        """
+        if not self._prepare:
+            return self
+        await self.update_scheme_locis()
+        pending_downloads = (
+            self.download_alleles_cache_data(),
+            self.download_scheme_profiles(),
+        )
+        await asyncio.gather(*pending_downloads)
+        await self.load_scheme_profiles()
+        return self
+    
+    def __init__(self, database_api: str, database_name: str, schema_id: int, cache_path: Union[str, None] = None, prepare: bool =True):
+        """Set up the HTTP session and the on-disk cache location.
+
+        :param database_api: base URL of the BIGSdb API.
+        :param database_name: name of the database to query.
+        :param schema_id: identifier of the scheme to profile against.
+        :param cache_path: directory for cached data; a temporary directory is
+            created (and removed again by ``close``) when omitted.
+        :param prepare: whether ``__aenter__`` should download and load the cache.
+        """
        self._database_api = database_api
        self._database_name = database_name
        self._schema_id = schema_id
-        self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
+        self._base_url = f"{self._database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
-        self._cache_path = cache_path
+        if cache_path is None:
+            # NOTE(review): the first positional parameter of mkdtemp is the
+            # suffix, so the directory name ends (not starts) with "BIGSdb".
+            self._cache_path = tempfile.mkdtemp("BIGSdb")
+            self._cleanup_required = True
+        else:
+            self._cache_path = cache_path
+            self._cleanup_required = False
        self._loci: list[str] = []
-        self._profiles = {}
+        # Maps a tuple of allele variants (ordered like self._loci) to (ST, clonal complex).
+        self._profiles_st_map = {}
+        self._prepare = prepare

-    async def load_scheme_locis(self):
+    async def update_scheme_locis(self):
        self._loci.clear()
-        async with self._http_client.get("") as schema_response:
+        async with self._http_client.get(f"/api/db/{self._database_name}/schemes/{self._schema_id}") as schema_response:
            schema_json = await schema_response.json()
            for locus in schema_json["loci"]:
                locus_name = path.basename(locus)
@@ -152,14 +177,14 @@ class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
        self._loci.sort()
    
    async def load_scheme_profiles(self):
+        """Rebuild the allele-tuple to (ST, clonal complex) map from the cached profile file."""
-        self._profiles.clear()
+        self._profiles_st_map.clear()
        with open(self.get_scheme_profile_path()) as profile_cache_handle:
            reader = csv.DictReader(profile_cache_handle, delimiter="\t")
            for line in reader:
                alleles = []
                for locus in self._loci:
                    alleles.append(line[locus])
-                self._profiles[tuple(alleles)] = (line["ST"], line["clonal_complex"])
+                # Key order follows self._loci, so lookups must build their tuple in the same order.
+                self._profiles_st_map[tuple(alleles)] = (line["ST"], line["clonal_complex"])
            
    def get_locus_cache_path(self, locus) -> str:
        return path.join(self._cache_path, locus + "." + "fasta")
@@ -170,8 +195,8 @@ class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
    async def download_alleles_cache_data(self):
+        """Download the allele FASTA of every locus in ``self._loci`` into its cache file."""
        for locus in self._loci:
            with open(self.get_locus_cache_path(locus), "wb") as fasta_handle:
-                async with self._http_client.get(f"/db/{self._database_name}/loci/{locus}/alleles_fasta") as fasta_response:
-                    async for chunk, eof in fasta_response.content.iter_chunks(): # TODO maybe allow chunking to be configurable
+                async with self._http_client.get(f"/api/db/{self._database_name}/loci/{locus}/alleles_fasta") as fasta_response:
+                    # Stream the response to disk chunk by chunk instead of buffering it whole.
+                    async for chunk, eof in fasta_response.content.iter_chunks():
                        fasta_handle.write(chunk)

    async def download_scheme_profiles(self):
@@ -179,34 +204,41 @@ class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
            async with self._http_client.get("profiles_csv") as profiles_response:
                async for chunk, eof in profiles_response.content.iter_chunks():
                    profile_cache_handle.write(chunk)
+        await self.load_scheme_profiles()
    
-    async def fetch_mlst_allele_variants(self, sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
+    async def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
+        """Yield the allele of each locus that best matches the query sequences.
+
+        The first cached variant of every locus is aligned against each whole
+        query. A gap-free, mismatch-free hit is yielded immediately as an exact
+        allele (no alignment stats). Otherwise the matched region of the query is
+        aligned against the remaining variants of that locus, and the
+        highest-scoring variant is yielded together with its alignment stats.
+        """
        aligner = PairwiseAligner("blastn")
        aligner.mode = "local"
-        for sequence_string in sequence_strings:
-            for locus in self._loci:
-                async for fasta_seq in read_fasta(self.get_locus_cache_path(locus)):
-                    allele_variant = fasta_seq.name
-                    alignment_results = aligner.align(sequence_string, fasta_seq.sequence)
-                    top_alignment = sorted(alignment_results)[0]
-                    top_alignment_stats = top_alignment.counts()
-                    top_alignment_gaps = top_alignment_stats.gaps
-                    top_alignment_identities = top_alignment_stats.identities
-                    top_alignment_mismatches = top_alignment_stats.mismatches
-                    if top_alignment_gaps == 0 and top_alignment_mismatches == 0:
-                        yield Allele(locus, allele_variant, None)
+        with AsyncPairwiseAlignmentEngine(aligner) as aligner_engine:
+            for query_sequence_string in query_sequence_strings:
+                for locus in self._loci:
+                    async for allele_variant in read_fasta(self.get_locus_cache_path(locus)):
+                        aligner_engine.align(allele_variant.sequence, query_sequence_string, variant_name=allele_variant.name, full=True)
+                        # Only the first variant is aligned against the whole query;
+                        # its hit locates the region the other variants are compared to.
+                        break
+            # A list, not a set: PairwiseAlignment carries index sequences and is
+            # therefore not reliably hashable.
+            alignment_rankings: dict[str, list[tuple[PairwiseAlignment, str]]] = defaultdict(list)
+            async for alignment_result, additional_information in aligner_engine:
+                result_variant_name = additional_information["variant_name"]
+                # Split on the last underscore only, so locus names that contain
+                # an underscore themselves still parse.
+                result_locus, variant_id = result_variant_name.rsplit("_", 1)
+                full_alignment = additional_information["full"]
+                if full_alignment:
+                    if alignment_result.alignment_stats.gaps == 0 and alignment_result.alignment_stats.mismatches == 0:
+                        # I.e., 100% exactly the same
+                        yield Allele(result_locus, variant_id, None)
+                        continue
                    else:
-                        yield Allele(
-                            locus,
-                            allele_variant,
-                            PartialAllelicMatchProfile(
-                                percent_identity=top_alignment_identities/top_alignment.length,
-                                mismatches=top_alignment_mismatches,
-                                gaps=top_alignment_gaps
-                            )
-                        )
+                        alignment_rankings[result_locus].append((alignment_result, variant_id))
+                    # Cut the matched region out of the query. The indices are
+                    # positions of aligned letters, so the end is inclusive.
+                    # NOTE(review): assumes the first and last aligned columns are
+                    # not gaps in the query (index -1) - confirm for this aligner.
+                    region_start = int(alignment_result.query_indices[0])
+                    region_end = int(alignment_result.query_indices[-1]) + 1
+                    interest_sequence = alignment_result.query[region_start:region_end]
+                    async for allele_variant in read_fasta(self.get_locus_cache_path(result_locus)):
+                        if result_variant_name == allele_variant.name:
+                            continue # Skip if we just finished aligning this
+                        aligner_engine.align(allele_variant.sequence, interest_sequence, variant_name=allele_variant.name, full=False)
+                else:
+                    alignment_rankings[result_locus].append((alignment_result, variant_id))
+            for final_locus, alignments in alignment_rankings.items():
+                # The closest variant is the one with the highest alignment score.
+                closest_alignment, closest_variant_id = max(alignments, key=lambda ranked: ranked[0].alignment_stats.score)
+                yield Allele(final_locus, closest_variant_id, closest_alignment.alignment_stats)
+

-    async def fetch_mlst_st(self, alleles):
+    async def determine_mlst_st(self, alleles):
        allele_variants: dict[str, Allele] = {}
        if isinstance(alleles, AsyncIterable):
            async for allele in alleles:
@@ -218,15 +250,15 @@ class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
        for locus in self._loci:
               ordered_profile.append(allele_variants[locus].allele_variant)

-        st, clonal_complex = self._profiles[tuple(ordered_profile)]
-        return MLSTProfile(allele_variants, st, clonal_complex)
+        st, clonal_complex = self._profiles_st_map[tuple(ordered_profile)]
+        return MLSTProfile(set(allele_variants.values()), st, clonal_complex)

-    async def profile_string(self, sequence_strings: Iterable[str]) -> MLSTProfile:
-        alleles = self.fetch_mlst_allele_variants(sequence_strings)
-        return await self.fetch_mlst_st(alleles)
+    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
+        """Determine the alleles of the query sequences and resolve them to an MLST profile."""
+        return await self.determine_mlst_st(
+            self.determine_mlst_allele_variants(query_sequence_strings)
+        )

-    async def profile_multiple_strings(self, named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
-        async for named_strings in named_string_groups:
+    async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
+        async for named_strings in query_named_string_groups:
            for named_string in named_strings:
                try:
                    yield NamedMLSTProfile(named_string.name, await self.profile_string([named_string.sequence]))
@@ -237,6 +269,8 @@ class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):

    async def close(self):
+        """Close the HTTP session and delete the cache directory if this profiler created it."""
        await self._http_client.close()
+        if self._cleanup_required:
+            shutil.rmtree(self._cache_path)

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
@@ -290,12 +324,16 @@ class BIGSdbIndex(AbstractAsyncContextManager):
            self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
            return self._seqdefdb_schemas[seqdef_db_name] # type: ignore

-    async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> OnlineBIGSdbMLSTProfiler:
-        return OnlineBIGSdbMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
+    async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> RemoteBIGSdbMLSTProfiler:
+        return RemoteBIGSdbMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)

    async def close(self):
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
-    
+
+def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, schema_id: int):
+    """Build a profiler for the given database and scheme.
+
+    :param local: ``True`` for the locally cached profiler, ``False`` for the
+        one that queries the remote API.
+    """
+    profiler_type = LocalBIGSdbMLSTProfiler if local else RemoteBIGSdbMLSTProfiler
+    return profiler_type(database_api=database_api, database_name=database_name, schema_id=schema_id)
--- a/src/autobigs/engine/data/structures/mlst.py
+++ b/src/autobigs/engine/data/structures/mlst.py
@@ -1,25 +0,0 @@
-from dataclasses import dataclass
-from typing import Mapping, Sequence, Union
-
-@dataclass(frozen=True)
-class PartialAllelicMatchProfile:
-    percent_identity: float
-    mismatches: int
-    gaps: int
-
-@dataclass(frozen=True)
-class Allele:
-    allele_locus: str
-    allele_variant: str
-    partial_match_profile: Union[None, PartialAllelicMatchProfile]
-
-@dataclass(frozen=True)
-class MLSTProfile:
-    alleles: Mapping[str, Allele]
-    sequence_type: str
-    clonal_complex: str
-
-@dataclass(frozen=True)
-class NamedMLSTProfile:
-    name: str
-    mlst_profile: Union[None, MLSTProfile]
--- a/src/autobigs/engine/exceptions/init.py
+++ b/src/autobigs/engine/exceptions/init.py
--- a/src/autobigs/engine/data/local/fasta.py
+++ b/src/autobigs/engine/data/local/fasta.py
@@ -1,9 +1,9 @@
 import asyncio
 from io import TextIOWrapper
-from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
+from typing import Any, AsyncGenerator, Iterable, Union
 from Bio import SeqIO

-from autobigs.engine.data.structures.genomics import NamedString
+from autobigs.engine.structures.genomics import NamedString

 async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
    fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
--- a/src/autobigs/engine/data/remote/init.py
+++ b/src/autobigs/engine/data/remote/init.py
--- a/src/autobigs/engine/structures/alignment.py
+++ b/src/autobigs/engine/structures/alignment.py
@@ -0,0 +1,17 @@
+from dataclasses import dataclass
+from numbers import Number
+
+@dataclass(frozen=True)
+class AlignmentStats:
+    """Summary statistics of one sequence alignment."""
+    # The local alignment engine stores identities / alignment length here;
+    # the remote profiler stores the "identity" value returned by BIGSdb.
+    percent_identity: float
+    # Number of mismatched positions.
+    mismatches: int
+    # Number of gaps.
+    gaps: int
+    # Alignment score. NOTE(review): the local engine stores the aligner's
+    # score here, which may not be an integer - confirm the annotation.
+    score: int
+
+@dataclass(frozen=True)
+class PairwiseAlignment:
+    """One alignment of a query sequence against a reference sequence.
+
+    NOTE(review): a frozen dataclass derives ``__hash__`` from its fields, so an
+    instance is only hashable when both index sequences are hashable.
+    """
+    # The reference sequence that was aligned against.
+    reference: str
+    # The query sequence that was aligned.
+    query: str
+    # Index information for the reference side of the alignment.
+    reference_indices: list[Number]
+    # Index information for the query side of the alignment.
+    query_indices: list[Number]
+    # Identity, mismatch, gap and score figures of this alignment.
+    alignment_stats: AlignmentStats
--- a/src/autobigs/engine/data/structures/genomics.py
+++ b/src/autobigs/engine/data/structures/genomics.py
--- a/src/autobigs/engine/structures/mlst.py
+++ b/src/autobigs/engine/structures/mlst.py
@@ -0,0 +1,33 @@
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Collection, Iterable, Mapping, Sequence, Union
+
+from autobigs.engine.structures.alignment import AlignmentStats
+
+@dataclass(frozen=True)
+class Allele:
+    """An allele variant assigned to a locus."""
+    # Name of the locus the allele belongs to.
+    allele_locus: str
+    # Identifier of the variant at that locus.
+    allele_variant: str
+    # Alignment statistics for a non-exact match; None is used for exact matches.
+    partial_match_profile: Union[None, AlignmentStats]
+
+@dataclass(frozen=True)
+class MLSTProfile:
+    """A set of alleles together with the sequence type they resolve to.
+
+    NOTE(review): equality compares ``alleles`` as-is, so two profiles holding
+    the same alleles in different container types (e.g. set vs tuple) are not
+    equal - confirm callers and tests use the same container.
+    """
+    # The alleles making up the profile.
+    alleles: Collection[Allele]
+    # Sequence type (ST) assigned to this allele combination.
+    sequence_type: str
+    # Clonal complex assigned to this allele combination.
+    clonal_complex: str
+
+@dataclass(frozen=True)
+class NamedMLSTProfile:
+    """An MLST profile labelled with the name of the sequence it was derived from."""
+    # Name of the profiled sequence.
+    name: str
+    # The resulting profile; the annotation allows None when no profile is available.
+    mlst_profile: Union[None, MLSTProfile]
+
+
+def alleles_to_mapping(alleles: Iterable[Allele]):
+    """Group allele variants by locus.
+
+    :return: a dict keyed by locus, in first-seen order. A locus with a single
+        variant maps to that variant directly; a locus with several variants
+        maps to the list of them.
+    """
+    variants_by_locus = {}
+    for allele in alleles:
+        variants_by_locus.setdefault(allele.allele_locus, []).append(allele.allele_variant)
+    return {
+        locus: variants[0] if len(variants) == 1 else variants
+        for locus, variants in variants_by_locus.items()
+    }
--- a/src/autobigs/engine/data/local/csv.py
+++ b/src/autobigs/engine/data/local/csv.py
@@ -2,19 +2,13 @@ import csv
 from os import PathLike
 from typing import AsyncIterable, Mapping, Sequence, Union

-from autobigs.engine.data.structures.mlst import Allele, MLSTProfile
+from autobigs.engine.structures.mlst import Allele, MLSTProfile


-def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
+def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Allele]):
+    """Return a dict mapping each locus to the variant of the allele stored for it."""
    result_dict: dict[str, Union[list[str], str]] = {}
    for loci, alleles in alleles_map.items():
-        if len(alleles) == 1:
-            result_dict[loci] = alleles[0].allele_variant
-        else:
-            result_locis = list()
-            for allele in alleles:
-                result_locis.append(allele.allele_variant)
-                result_dict[loci] = result_locis
+        # Each locus now carries exactly one Allele, so its variant is stored directly.
+        result_dict[loci] = alleles.allele_variant
    return result_dict


--- a/tests/autobigs/engine/analysis/test_aligners.py
+++ b/tests/autobigs/engine/analysis/test_aligners.py
@@ -0,0 +1,27 @@
+from Bio import SeqIO
+from Bio.Align import PairwiseAligner
+from pytest import mark
+from pytest import fixture
+from autobigs.engine.analysis.aligners import AsyncPairwiseAlignmentEngine
+from autobigs.engine.structures.alignment import PairwiseAlignment
+
+@fixture
+def tohamaI_bpertussis_adk():
+    """Sequence of the single record in the adk test resource, as a plain string."""
+    record = SeqIO.read("tests/resources/tohama_I_bpertussis_adk.fasta", format="fasta")
+    return str(record.seq)
+
+@fixture
+def tohamaI_bpertussis_genome():
+    """Sequence of the single record in the genome test resource, as a plain string."""
+    record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", format="fasta")
+    return str(record.seq)
+
+@fixture(params=[1, 2])
+def dummy_engine(request):
+    """Yield an entered alignment engine, once per parametrised thread count."""
+    local_aligner = PairwiseAligner("blastn")
+    local_aligner.mode = "local"
+    thread_count = request.param
+    with AsyncPairwiseAlignmentEngine(local_aligner, thread_count) as entered_engine:
+        yield entered_engine
+
+class TestAsyncPairwiseAlignmentEngine:
+    """Tests for the thread-pool backed pairwise alignment engine."""
+
+    async def test_single_alignment_no_errors(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk: str, dummy_engine: AsyncPairwiseAlignmentEngine):
+        """One submitted alignment must come back exactly once, as a PairwiseAlignment."""
+        dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk)
+        completed_alignments = 0
+        async for alignment, additional_information in dummy_engine:
+            assert isinstance(alignment, PairwiseAlignment)
+            # No keyword data was attached to the job above.
+            assert additional_information == {}
+            completed_alignments += 1
+        # Without this check the test passes even if the engine yields nothing.
+        assert completed_alignments == 1
--- a/tests/autobigs/engine/analysis/test_bigsdb.py
+++ b/tests/autobigs/engine/analysis/test_bigsdb.py
@@ -0,0 +1,210 @@
+from os import path
+import random
+import re
+from typing import Callable, Collection, Sequence, Union
+from Bio import SeqIO
+import pytest
+from autobigs.engine.analysis import bigsdb
+from autobigs.engine.structures import mlst
+from autobigs.engine.structures.genomics import NamedString
+from autobigs.engine.structures.mlst import Allele, MLSTProfile
+from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
+from autobigs.engine.analysis.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler, LocalBIGSdbMLSTProfiler, RemoteBIGSdbMLSTProfiler
+
+async def generate_async_iterable(normal_iterable):
+    """Expose a regular iterable as an async generator that yields its items in order."""
+    for element in normal_iterable:
+        yield element
+
+def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ("A", "T", "C", "G")):
+    """Return a deterministically mutated copy of ``gene``.
+
+    The random generator is seeded with ``gene`` itself, so the same
+    arguments always produce the same output.
+
+    Args:
+        gene: Sequence to mutate.
+        mutation_site_count: Number of positions to draw. A float is treated
+            as a fraction of ``len(gene)`` and truncated to an int.
+        alphabet: Symbols a drawn position may be replaced with.
+
+    Returns:
+        A string of the same length as ``gene``. Positions are drawn with
+        replacement and a replacement may equal the original symbol, so
+        fewer than ``mutation_site_count`` characters may actually differ.
+    """
+    rand = random.Random(gene)
+    if isinstance(mutation_site_count, float):
+        mutation_site_count = int(mutation_site_count * len(gene))
+    scrambled = list(gene)
+    # All positions are drawn before any replacement symbol, keeping the RNG call order fixed.
+    for random_location in rand.choices(range(len(gene)), k=mutation_site_count):
+        scrambled[random_location] = rand.choice(alphabet)
+    return "".join(scrambled)
+
+def get_first_sequence_from_fasta(resource: str):
+    """Read ``resource`` from tests/resources/ with SeqIO.read and return the record's sequence as a str."""
+    fasta_path = path.join("tests/resources/", resource)
+    record = SeqIO.read(fasta_path, "fasta")
+    return str(record.seq)
+
+def get_multiple_sequences_from_fasta(resource: str):
+    """Parse ``resource`` from tests/resources/ as FASTA and return all of its records as a tuple."""
+    fasta_path = path.join("tests/resources/", resource)
+    records = SeqIO.parse(fasta_path, "fasta")
+    return tuple(records)
+
+# Profile with sequence type "1" and clonal complex "ST-2 complex"; passed as
+# expected_profile in the TestBIGSdbMLSTProfiler parametrization.
+bpertussis_tohamaI_profile = MLSTProfile((
+        Allele("adk", "1", None),
+        Allele("fumC", "1", None),
+        Allele("glyA", "1", None),
+        Allele("tyrB", "1", None),
+        Allele("icd", "1", None),
+        Allele("pepA", "1", None),
+        Allele("pgm", "1", None)), "1", "ST-2 complex")
+
+# Allele combination labelled "unknown"/"unknown"; passed as bad_profile in the
+# TestBIGSdbMLSTProfiler parametrization.
+bpertussis_tohamaI_bad_profile = MLSTProfile((
+        Allele("adk", "1", None),
+        Allele("fumC", "2", None),
+        Allele("glyA", "36", None),
+        Allele("tyrB", "4", None),
+        Allele("icd", "4", None),
+        Allele("pepA", "1", None),
+        Allele("pgm", "5", None),
+    ), "unknown", "unknown")
+
+# Profile with sequence type "3" and clonal complex "ST-3 complex".
+# NOTE(review): not referenced by the parametrization in this module - confirm whether a case is missing.
+hinfluenzae_fdaargos_profile = MLSTProfile((
+        Allele("adk", "1", None),
+        Allele("atpG", "1", None),
+        Allele("frdB", "1", None),
+        Allele("fucK", "1", None),
+        Allele("mdh", "1", None),
+        Allele("pgi", "1", None),
+        Allele("recA", "5", None)
+    ), "3", "ST-3 complex")
+
+# NOTE(review): identical to hinfluenzae_fdaargos_profile (same alleles, sequence type and
+# clonal complex); looks like a copy-paste placeholder - confirm the intended "bad" values.
+hinfluenzae_fdaargos_bad_profile = MLSTProfile((
+        Allele("adk", "1", None),
+        Allele("atpG", "1", None),
+        Allele("frdB", "1", None),
+        Allele("fucK", "1", None),
+        Allele("mdh", "1", None),
+        Allele("pgi", "1", None),
+        Allele("recA", "5", None)
+    ), "3", "ST-3 complex")
+
+# Sequence of the single record in the FDAARGOS 1560 H. influenzae resource, as a str.
+hinfluenzae_fdaargos_sequence = str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", "fasta").seq)
+
+# All records of the matching H. influenzae features resource. This previously parsed
+# tohama_I_bpertussis_features.fasta, which belongs to the B. pertussis fixtures.
+hinfluenzae_fdaargos_fragmented_sequence = tuple(SeqIO.parse("tests/resources/fdaargos_1560_hinfluenza_features.fasta", "fasta"))
+
+@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
+    (False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
+    (True, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
+])
+class TestBIGSdbMLSTProfiler:
+    """Profiler tests run twice: once with local_db False and once with local_db True.
+
+    Both parameter sets use the same API URL, database name, schema id,
+    FASTA resources and expected/bad profiles.
+    """
+
+    async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """Every allele found in the full sequence must match the expected profile, and every expected locus must be seen."""
+        sequence = get_first_sequence_from_fasta(seq_path)
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            expected_alleles = mlst.alleles_to_mapping(expected_profile.alleles)
+            targets_left = set(mlst.alleles_to_mapping(expected_profile.alleles).keys())
+            async for exact_match in dummy_profiler.determine_mlst_allele_variants(query_sequence_strings=[sequence]):
+                assert isinstance(exact_match, Allele)
+                assert exact_match.allele_locus in expected_alleles
+                assert exact_match.allele_variant == expected_alleles[exact_match.allele_locus]
+                # set.remove raises KeyError if the same locus is reported a second time.
+                targets_left.remove(exact_match.allele_locus)
+
+            assert len(targets_left) == 0
+
+    async def test_sequence_profiling_non_exact_returns_non_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """Scrambled copies of each expected locus' feature sequence must come back with a partial-match profile."""
+        target_sequences = get_multiple_sequences_from_fasta(feature_seqs_path)
+        mlst_targets = {x.lower() for x in mlst.alleles_to_mapping(expected_profile.alleles).keys()}
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as profiler:
+            for target_sequence in target_sequences:
+                # The gene name is read from the "[gene=...]" tag in the FASTA description.
+                match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", target_sequence.description)
+                if match is None:
+                    continue
+                gene = match.group(1).lower()
+                if gene not in mlst_targets:
+                    continue
+                # 0.125 is a float, so gene_scrambler draws int(0.125 * len) positions to mutate.
+                scrambled = gene_scrambler(str(target_sequence.seq), 0.125)
+                async for partial_match in profiler.determine_mlst_allele_variants([scrambled]):
+                    assert partial_match.partial_match_profile is not None
+                    # set.remove raises KeyError if more than one match is yielded for the same gene.
+                    mlst_targets.remove(gene)
+
+            assert len(mlst_targets) == 0
+
+    async def test_profiling_results_in_correct_mlst_st(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """determine_mlst_st on the expected alleles must return the expected clonal complex and sequence type."""
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            mlst_st_data = await dummy_profiler.determine_mlst_st(expected_profile.alleles)
+            assert mlst_st_data is not None
+            assert isinstance(mlst_st_data, MLSTProfile)
+            assert mlst_st_data.clonal_complex == expected_profile.clonal_complex
+            assert mlst_st_data.sequence_type == expected_profile.sequence_type
+
+    async def test_profiling_non_exact_results_in_list_of_mlsts(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """determine_mlst_st on bad_profile's alleles must report "unknown" for clonal complex and sequence type."""
+        dummy_alleles = bad_profile.alleles
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            mlst_profile = await dummy_profiler.determine_mlst_st(dummy_alleles)
+            assert mlst_profile.clonal_complex == "unknown"
+            assert mlst_profile.sequence_type == "unknown"
+
+
+    async def test_bigsdb_profile_multiple_strings_same_string_twice(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """Profiling the same sequence under two names must give the expected profile for each."""
+        sequence = get_first_sequence_from_fasta(seq_path)
+        dummy_sequences = [[NamedString("seq1", sequence)], [NamedString("seq2", sequence)]]
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences)):
+                name, profile = named_profile.name, named_profile.mlst_profile
+                assert profile is not None
+                assert isinstance(profile, MLSTProfile)
+                assert profile.clonal_complex == expected_profile.clonal_complex
+                assert profile.sequence_type == expected_profile.sequence_type
+
+    async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """With True as the second argument, the scrambled entry must still yield an "unknown" profile and the others the expected one."""
+        valid_seq = get_first_sequence_from_fasta(seq_path)
+        dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]]
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            # NOTE(review): the meaning of the positional True is defined by profile_multiple_strings, not shown here - confirm.
+            async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), True):
+                name, profile = name_profile.name, name_profile.mlst_profile
+
+                assert profile is not None
+                if name == "should_fail":
+                    assert profile.clonal_complex == "unknown"
+                    assert profile.sequence_type == "unknown"
+                    assert len(profile.alleles) > 0
+                else:
+                    assert isinstance(profile, MLSTProfile)
+                    assert profile.clonal_complex == expected_profile.clonal_complex
+                    assert profile.sequence_type == expected_profile.sequence_type
+
+    async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
+        """With False as the second argument, the scrambled entry must still yield an "unknown" profile and the others the expected one."""
+        valid_seq = get_first_sequence_from_fasta(seq_path)
+        dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]]
+
+        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
+            # NOTE(review): the meaning of the positional False is defined by profile_multiple_strings, not shown here - confirm.
+            async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), False):
+                name, profile = named_profile.name, named_profile.mlst_profile
+                
+                assert profile is not None
+                if name == "should_fail":
+                    assert profile.clonal_complex == "unknown"
+                    assert profile.sequence_type == "unknown"
+                    assert len(profile.alleles) > 0
+                else:
+                    assert isinstance(profile, MLSTProfile)
+                    assert profile.clonal_complex == expected_profile.clonal_complex
+                    assert profile.sequence_type == expected_profile.sequence_type
+
+class TestBIGSdbIndex:
+    """Checks of BIGSdbIndex lookups: known databases, API URLs, schemas and profiler construction."""
+
+    async def test_bigsdb_index_all_databases_is_not_empty(self):
+        """The index must report at least one seqdef database."""
+        async with BIGSdbIndex() as index:
+            known_dbs = await index.get_known_seqdef_dbs()
+            assert len(known_dbs) > 0
+
+    async def test_bigsdb_index_references_pubmlst_correctly(self):
+        """pubmlst_hinfluenzae_seqdef must resolve to https://rest.pubmlst.org."""
+        async with BIGSdbIndex() as index:
+            api_url = await index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")
+            assert api_url == "https://rest.pubmlst.org"
+
+    async def test_bigsdb_index_references_institutpasteur_correctly(self):
+        """pubmlst_bordetella_seqdef must resolve to https://bigsdb.pasteur.fr/api."""
+        async with BIGSdbIndex() as index:
+            api_url = await index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")
+            assert api_url == "https://bigsdb.pasteur.fr/api"
+
+    async def test_bigsdb_index_get_schemas_for_bordetella(self):
+        """The Bordetella seqdef database must list an "MLST" schema with an int id."""
+        async with BIGSdbIndex() as bigsdb_index:
+            bordetella_schemas = await bigsdb_index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
+            assert len(bordetella_schemas.keys()) > 0
+            assert "MLST" in bordetella_schemas
+            mlst_schema_id = bordetella_schemas["MLST"]
+            assert isinstance(mlst_schema_id, int)
+
+    async def test_bigsdb_index_get_databases_has_only_seqdef(self):
+        """Every known database name must end in "seqdef", and Bordetella must map to the Pasteur API."""
+        async with BIGSdbIndex() as bigsdb_index:
+            known_dbs = await bigsdb_index.get_known_seqdef_dbs()
+            assert len(known_dbs.keys()) > 0
+            for db_name in known_dbs.keys():
+                assert db_name.endswith("seqdef")
+            bordetella_api = known_dbs["pubmlst_bordetella_seqdef"]
+            assert bordetella_api == "https://bigsdb.pasteur.fr/api"
+
+    async def test_bigsdb_index_instantiates_correct_profiler(self):
+        """A profiler built from the index must type the Tohama I sequence as ST "1" in "ST-2 complex"."""
+        genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
+        genome = str(genome_record.seq)
+        async with BIGSdbIndex() as index:
+            profiler_context = await index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3)
+            async with profiler_context as built_profiler:
+                result = await built_profiler.profile_string(genome)
+                assert result.clonal_complex == "ST-2 complex"
+                assert result.sequence_type == "1"
--- a/tests/autobigs/engine/data/local/test_csv.py
+++ b/tests/autobigs/engine/data/local/test_csv.py
@@ -1,21 +0,0 @@
-from autobigs.engine.data.local.csv import dict_loci_alleles_variants_from_loci
-from autobigs.engine.data.structures.mlst import Allele
-
-
-def test_dict_loci_alleles_variants_from_loci_single_loci_not_list():
-    alleles_map = {
-        "adk": [Allele("adk", "1", None)]
-    }
-    results = dict_loci_alleles_variants_from_loci(alleles_map)
-    for loci, variant in results.items():
-        assert isinstance(variant, str)
-        assert variant == "1"
-
-def test_dict_loci_alleles_variants_from_loci_multi_loci_is_list():
-    alleles_map = {
-        "adk": [Allele("adk", "1", None), Allele("adk", "2", None)]
-    }
-    results = dict_loci_alleles_variants_from_loci(alleles_map)
-    for loci, variant in results.items():
-        assert isinstance(variant, list)
-        assert len(variant) == 2
--- a/tests/autobigs/engine/data/remote/databases/test_bigsdb.py
+++ b/tests/autobigs/engine/data/remote/databases/test_bigsdb.py
@@ -1,249 +0,0 @@
-import random
-import re
-from typing import Collection, Sequence, Union
-from Bio import SeqIO
-import pytest
-from autobigs.engine.data.structures.genomics import NamedString
-from autobigs.engine.data.structures.mlst import Allele, MLSTProfile
-from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
-from autobigs.engine.data.remote.databases.bigsdb import BIGSdbIndex, OnlineBIGSdbMLSTProfiler
-
-def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
-    rand = random.Random(gene)
-    if isinstance(mutation_site_count, float):
-        mutation_site_count = int(mutation_site_count * len(gene))
-    random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
-    scrambled = list(gene)
-    for random_location in random_locations:
-        scrambled[random_location] = rand.choice(alphabet)
-    return "".join(scrambled)
-
-async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
-    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
-        async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_strings=[sequence]):
-            assert isinstance(exact_match, Allele)
-            assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
-            targets_left.remove(exact_match.allele_locus)
-
-        assert len(targets_left) == 0
-
-async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
-    sequences = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta"))
-    mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
-        for sequence in sequences:
-            match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description)
-            if match is None:
-                continue
-            gene = match.group(1)
-            if gene.lower() not in mlst_targets:
-                continue
-            scrambled = gene_scrambler(str(sequence.seq), 0.125)
-            async for partial_match in profiler.fetch_mlst_allele_variants(scrambled):
-                assert partial_match.partial_match_profile is not None
-                mlst_targets.remove(gene.lower())
-
-        assert len(mlst_targets) == 0
-
-async def test_institutpasteur_profiling_results_in_correct_mlst_st():
-    async def dummy_allele_generator():
-        dummy_alleles = [
-        Allele("adk", "1", None),
-        Allele("fumC", "1", None),
-        Allele("glyA", "1", None),
-        Allele("tyrB", "1", None),
-        Allele("icd", "1", None),
-        Allele("pepA", "1", None),
-        Allele("pgm", "1", None),
-        ]
-        for dummy_allele in dummy_alleles:
-            yield dummy_allele
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
-        assert mlst_st_data is not None
-        assert isinstance(mlst_st_data, MLSTProfile)
-        assert mlst_st_data.clonal_complex == "ST-2 complex"
-        assert mlst_st_data.sequence_type == "1"
-
-async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
-    dummy_alleles = [
-    Allele("adk", "1", None),
-    Allele("fumC", "2", None),
-    Allele("glyA", "36", None),
-    Allele("tyrB", "4", None),
-    Allele("icd", "4", None),
-    Allele("pepA", "1", None),
-    Allele("pgm", "5", None),
-    ]
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        mlst_profile = await dummy_profiler.fetch_mlst_st(dummy_alleles)
-        assert mlst_profile.clonal_complex == "unknown"
-        assert mlst_profile.sequence_type == "unknown"
-
-
-async def test_institutpasteur_sequence_profiling_is_correct():
-    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        profile = await dummy_profiler.profile_string(sequence)
-        assert profile is not None
-        assert isinstance(profile, MLSTProfile)
-        assert profile.clonal_complex == "ST-2 complex"
-        assert profile.sequence_type == "1"
-    
-
-async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
-    dummy_alleles = {
-        Allele("adk", "1", None),
-        Allele("atpG", "1", None),
-        Allele("frdB", "1", None),
-        Allele("fucK", "1", None),
-        Allele("mdh", "1", None),
-        Allele("pgi", "1", None),
-        Allele("recA", "5", None),
-    }
-    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
-        exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_strings=sequence)
-        async for exact_match in exact_matches:
-            assert isinstance(exact_match, Allele)
-            dummy_alleles.remove(exact_match)
-
-        assert len(dummy_alleles) == 0
-
-async def test_pubmlst_profiling_results_in_correct_st():
-    async def generate_dummy_targets():
-        dummy_alleles = [
-                Allele("adk", "1", None),
-                Allele("atpG", "1", None),
-                Allele("frdB", "1", None),
-                Allele("fucK", "1", None),
-                Allele("mdh", "1", None),
-                Allele("pgi", "1", None),
-                Allele("recA", "5", None),
-            ]
-        for dummy_allele in dummy_alleles:
-            yield dummy_allele
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
-        mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
-        assert mlst_st_data is not None
-        assert isinstance(mlst_st_data, MLSTProfile)
-        assert mlst_st_data.clonal_complex == "ST-3 complex"
-        assert mlst_st_data.sequence_type == "3"
-
-async def test_pubmlst_sequence_profiling_is_correct():
-    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
-        profile = await dummy_profiler.profile_string(sequence)
-        assert profile is not None
-        assert isinstance(profile, MLSTProfile)
-        assert profile.clonal_complex == "ST-3 complex"
-        assert profile.sequence_type == "3"
-
-async def test_bigsdb_index_all_databases_is_not_empty():
-    async with BIGSdbIndex() as bigsdb_index:
-        assert len(await bigsdb_index.get_known_seqdef_dbs()) > 0
-
-async def test_bigsdb_index_references_pubmlst_correctly():
-    async with BIGSdbIndex() as bigsdb_index:
-        assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")) == "https://rest.pubmlst.org"
-
-async def test_bigsdb_index_references_institutpasteur_correctly():
-    async with BIGSdbIndex() as bigsdb_index:
-        assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api"
-
-
-async def test_bigsdb_index_instantiates_correct_profiler():
-    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    async with BIGSdbIndex() as bigsdb_index:
-        async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
-            profile = await profiler.profile_string(sequence)
-            assert profile.clonal_complex == "ST-2 complex"
-            assert profile.sequence_type == "1"
-
-async def test_bigsdb_profile_multiple_strings_same_string_twice():
-    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    dummy_sequences = [NamedString("seq1", sequence), NamedString("seq2", sequence)]
-    async def generate_async_iterable_sequences():
-        for dummy_sequence in dummy_sequences:
-            yield [dummy_sequence]
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences()):
-            name, profile = named_profile.name, named_profile.mlst_profile
-            assert profile is not None
-            assert isinstance(profile, MLSTProfile)
-            assert profile.clonal_complex == "ST-2 complex"
-            assert profile.sequence_type == "1"
-
-async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop():
-    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
-    async def generate_async_iterable_sequences():
-        for dummy_sequence in dummy_sequences:
-            yield [dummy_sequence]
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), True):
-            name, profile = name_profile.name, name_profile.mlst_profile
-
-            if name == "should_fail":
-                assert profile is None
-            else:
-                assert profile is not None
-                assert isinstance(profile, MLSTProfile)
-                assert profile.clonal_complex == "ST-2 complex"
-                assert profile.sequence_type == "1"
-
-async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop():
-    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
-    async def generate_async_iterable_sequences():
-        for dummy_sequence in dummy_sequences:
-            yield [dummy_sequence]
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), False):
-            name, profile = named_profile.name, named_profile.mlst_profile
-            if name == "should_fail":
-                assert profile is not None
-                assert profile.clonal_complex == "unknown"
-                assert profile.sequence_type == "unknown"
-                assert len(profile.alleles) > 0
-            else:
-                assert profile is not None
-                assert isinstance(profile, MLSTProfile)
-                assert profile.clonal_complex == "ST-2 complex"
-                assert profile.sequence_type == "1"
-
-async def test_bigsdb_profile_multiple_strings_fail_second_stop():
-    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    invalid_seq = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
-    dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", invalid_seq), NamedString("seq3", valid_seq)]
-    async def generate_async_iterable_sequences():
-        for dummy_sequence in dummy_sequences:
-            yield [dummy_sequence]
-    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
-        with pytest.raises(NoBIGSdbMatchesException):
-            async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), stop_on_fail=True):
-                name, profile = named_profile.name, named_profile.mlst_profile
-                if name == "should_fail":
-                    pytest.fail("Exception should have been thrown, no exception was thrown.")
-                else:
-                    assert profile is not None
-                    assert isinstance(profile, MLSTProfile)
-                    assert profile.clonal_complex == "ST-2 complex"
-                    assert profile.sequence_type == "1"
-
-async def test_bigsdb_index_get_schemas_for_bordetella():
-    async with BIGSdbIndex() as index:
-        schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
-        assert len(schemas.keys()) > 0
-        assert "MLST" in schemas
-        assert isinstance(schemas["MLST"], int)
-
-async def test_bigsdb_index_get_databases_has_only_seqdef():
-    async with BIGSdbIndex() as index:
-        databases = await index.get_known_seqdef_dbs()
-        assert len(databases.keys()) > 0
-        for database_name in databases.keys():
-            assert database_name.endswith("seqdef")
-        assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
--- a/tests/autobigs/engine/data/local/test_fasta.py
+++ b/tests/autobigs/engine/data/local/test_fasta.py
@@ -1,4 +1,4 @@
-from autobigs.engine.data.local.fasta import read_fasta
+from autobigs.engine.reading import read_fasta


 async def test_fasta_reader_not_none():
--- a/src/autobigs/engine/data/structures/init.py
+++ b/src/autobigs/engine/data/structures/init.py
--- a/tests/resources/fdaargos_1560_hinfluenza.fasta
+++ b/tests/resources/fdaargos_1560_hinfluenza.fasta
--- a/tests/resources/fdaargos_1560_hinfluenza_features.fasta
+++ b/tests/resources/fdaargos_1560_hinfluenza_features.fasta
--- a/tests/resources/tohama_I_bpertussis_adk.fasta
+++ b/tests/resources/tohama_I_bpertussis_adk.fasta
@@ -0,0 +1,11 @@
+>lcl|BX640419.1_cds_CAE43044.1_2724 [gene=adK] [locus_tag=BP2769] [db_xref=GOA:P0DKX8,InterPro:IPR000850,InterPro:IPR006259,InterPro:IPR007862,InterPro:IPR027417] [protein=adenylate kinase] [protein_id=CAE43044.1] [location=164032..164688] [gbkey=CDS]
+ATGCGTCTCATTCTGCTCGGACCGCCCGGAGCCGGCAAAGGCACCCAAGCCGCCTTTCTCACCCAACACT
+ACGGCATCCCGCAGATATCCACCGGTGACATGCTGCGCGCCGCCGTCAAGGCCGGCACGCCGCTGGGCCT
+GGAAGCCAAGAAGGTCATGGACGCGGGCGGCCTGGTCTCGGACGACCTGATCATCGGCCTGGTGCGCGAT
+CGCCTGACCCAGCCCGATTGCGCCAACGGCTACCTGTTCGACGGTTTCCCGCGCACCATCCCGCAGGCCG
+ACGCGCTCAAGAGCGCCGGCATCGCGCTGGATTACGTGGTCGAGATCGAAGTGCCGGAAAGCGACATCAT
+CGAACGCATGAGCGAACGCCGCGTGCACCCGGCCAGCGGCCGCAGCTACCACGTACGCTTCAATCCGCCC
+AAGGCCGAAGGCGTGGACGACGTCACGGGCGAACCGCTGGTGCAGCGCGACGACGACCGCGAGGAAACCG
+TGCGCCATCGTCTCAACGTCTACCAGAACCAGACCCGCCCGCTGGTCGACTACTACTCGTCCTGGGCCCA
+GTCCGATGCCGCCGCGGCGCCCAAGTACCGCAAGATCTCCGGCGTCGGCTCGGTCGACGAAATCAAGAGC
+CGCCTGTCGCAGGCTCTGCAGAGCTAA
--- a/tests/resources/tohama_I_bpertussis_features.fasta
+++ b/tests/resources/tohama_I_bpertussis_features.fasta