Updated URL links

Changed FASTA reading to group based on file for merging partial targets
Merge branch 'features/improved-oop-architecture' into features/non-exact-notation
2025-02-14 20:37:13 +00:00 · 2025-02-14 14:35:53 +00:00 · 2025-02-12 17:53:25 +00:00 · 2025-02-12 17:52:53 +00:00 · 2025-02-12 17:46:55 +00:00 · 2025-02-12 17:43:26 +00:00
16 changed files with 56080 additions and 51205 deletions
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # autoBIGS.Engine

-A python library implementing common BIGSdb MLST schemes and databases. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
+A Python library implementing access to common BIGSdb MLST schemes and databases for the purpose of automatically typing sequences. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.

 ## Features

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,9 +15,9 @@ requires-python = ">=3.12"
 description = "A library to rapidly fetch fetch MLST profiles given sequences for various diseases."

 [project.urls]
-Homepage = "https://github.com/RealYHD/autoBIGS.engine"
-Source = "https://github.com/RealYHD/autoBIGS.engine"
-Issues = "https://github.com/RealYHD/autoBIGS.engine/issues"
+Homepage = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
+Source = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
+Issues = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine/issues"

 [tool.setuptools_scm]

--- a/src/autobigs/engine/analysis/aligners.py
+++ b/src/autobigs/engine/analysis/aligners.py
@@ -1,70 +0,0 @@
-import asyncio
-from concurrent.futures import Future, ThreadPoolExecutor
-from contextlib import AbstractContextManager
-from typing import Any, Set, Union
-from Bio.Align import PairwiseAligner
-from queue import Queue
-
-from autobigs.engine.structures.alignment import AlignmentStats, PairwiseAlignment
-
-class AsyncBiopythonPairwiseAlignmentEngine(AbstractContextManager):
-    def __enter__(self):
-        self._thread_pool = ThreadPoolExecutor(self._max_threads, thread_name_prefix="async-pairwise-alignment")
-        return self
-
-    def __init__(self, aligner: PairwiseAligner, max_threads: int = 4):
-        self._max_threads = max_threads
-        self._aligner = aligner
-        self._work_left: Set[Future] = set()
-        self._work_complete: Queue[Future] = Queue()
-
-    def align(self, reference: str, query: str, **associated_data):
-        work = self._thread_pool.submit(
-            self.work, reference, query, **associated_data)
-        work.add_done_callback(self._on_complete)
-        self._work_left.add(work)
-        
-    def _on_complete(self, future: Future):
-        self._work_left.remove(future)
-        self._work_complete.put(future)
-
-    def work(self, reference, query, **associated_data):
-        alignments = self._aligner.align(reference, query)
-        top_alignment = alignments[0]
-        top_alignment_stats = top_alignment.counts()
-        top_alignment_gaps = top_alignment_stats.gaps
-        top_alignment_identities = top_alignment_stats.identities
-        top_alignment_mismatches = top_alignment_stats.mismatches
-        top_alignment_score = top_alignment.score # type: ignore
-        return PairwiseAlignment(
-            top_alignment.sequences[0],
-            top_alignment.sequences[1],
-            tuple(top_alignment.indices[0]),
-            tuple(top_alignment.indices[1]),
-            AlignmentStats(
-                percent_identity=top_alignment_identities/top_alignment.length,
-                mismatches=top_alignment_mismatches,
-                gaps=top_alignment_gaps,
-                match_metric=top_alignment_score
-            )), associated_data
-
-    async def next_completed(self) -> Union[tuple[PairwiseAlignment, dict[str, Any]], None]:
-        if self._work_complete.empty() and len(self._work_left):
-            return None
-        completed_alignment = await asyncio.wrap_future(self._work_complete.get())
-        return completed_alignment
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.shutdown()
-
-    def __aiter__(self):
-        return self
-    
-    async def __anext__(self):
-        result = await self.next_completed()
-        if result is None:
-            raise StopAsyncIteration
-        return result
-
-    def shutdown(self):
-        self._thread_pool.shutdown(wait=True, cancel_futures=True)
--- a/src/autobigs/engine/analysis/bigsdb.py
+++ b/src/autobigs/engine/analysis/bigsdb.py
@@ -11,7 +11,6 @@ from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequen

 from aiohttp import ClientSession, ClientTimeout

-from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
 from autobigs.engine.reading import read_fasta
 from autobigs.engine.structures.alignment import PairwiseAlignment
 from autobigs.engine.structures.genomics import NamedString
@@ -139,141 +138,6 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()

-class LocalBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
-    async def __aenter__(self):
-        if self._prepare:
-            await self.update_scheme_locis()
-            await asyncio.gather(
-                self.download_alleles_cache_data(),
-                self.download_scheme_profiles()
-            )
-            await self.load_scheme_profiles()
-        return self
-    
-    def __init__(self, database_api: str, database_name: str, schema_id: int, cache_path: Union[str, None] = None, prepare: bool =True):
-        self._database_api = database_api
-        self._database_name = database_name
-        self._schema_id = schema_id
-        self._base_url = f"{self._database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
-        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60))
-        if cache_path is None:
-            self._cache_path = tempfile.mkdtemp("BIGSdb")
-            self._cleanup_required = True
-        else:
-            self._cache_path = cache_path
-            self._cleanup_required = False
-        self._loci: list[str] = []
-        self._profiles_st_map = {}
-        self._prepare = prepare
-
-    async def update_scheme_locis(self):
-        self._loci.clear()
-        async with self._http_client.get(f"/api/db/{self._database_name}/schemes/{self._schema_id}") as schema_response:
-            schema_json = await schema_response.json()
-            for locus in schema_json["loci"]:
-                locus_name = path.basename(locus)
-                self._loci.append(locus_name)
-        self._loci.sort()
-    
-    async def load_scheme_profiles(self):
-        self._profiles_st_map.clear()
-        with open(self.get_scheme_profile_path()) as profile_cache_handle:
-            reader = csv.DictReader(profile_cache_handle, delimiter="\t")
-            for line in reader:
-                alleles = []
-                for locus in self._loci:
-                    alleles.append(line[locus])
-                self._profiles_st_map[tuple(alleles)] = (line["ST"], line["clonal_complex"])
-            
-    def get_locus_cache_path(self, locus) -> str:
-        return path.join(self._cache_path, locus + "." + "fasta")
-
-    def get_scheme_profile_path(self):
-        return path.join(self._cache_path, "profiles.csv")
-
-    async def download_alleles_cache_data(self):
-        for locus in self._loci:
-            with open(self.get_locus_cache_path(locus), "wb") as fasta_handle:
-                async with self._http_client.get(f"/api/db/{self._database_name}/loci/{locus}/alleles_fasta") as fasta_response:
-                    async for chunk, eof in fasta_response.content.iter_chunks():
-                        fasta_handle.write(chunk)
-
-    async def download_scheme_profiles(self):
-        with open(self.get_scheme_profile_path(), "wb") as profile_cache_handle:
-            async with self._http_client.get("profiles_csv") as profiles_response:
-                async for chunk, eof in profiles_response.content.iter_chunks():
-                    profile_cache_handle.write(chunk)
-        await self.load_scheme_profiles()
-    
-    async def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
-        aligner = PairwiseAligner("blastn")
-        aligner.mode = "local"
-        with AsyncBiopythonPairwiseAlignmentEngine(aligner, max_threads=4) as aligner_engine:
-            for query_sequence_string in query_sequence_strings:
-                for locus in self._loci:
-                    async for allele_variant in read_fasta(self.get_locus_cache_path(locus)):
-                        aligner_engine.align(allele_variant.sequence, query_sequence_string, variant_name=allele_variant.name, full=True)
-                        break # start a bunch of full alignments for each variant to select segments
-            alignment_rankings: dict[str, set[tuple[PairwiseAlignment, str]]] = defaultdict(set)
-            async for alignment_result, additional_information in aligner_engine:
-                result_variant_name = additional_information["variant_name"]
-                result_locus, variant_id = result_variant_name.split("_")
-                full_alignment = additional_information["full"]
-                if full_alignment:
-                    if alignment_result.alignment_stats.gaps == 0 and alignment_result.alignment_stats.mismatches == 0:
-                        # I.e., 100% exactly the same
-                        yield Allele(result_locus, variant_id, None)
-                        continue
-                    else:
-                        alignment_rankings[result_locus].add((alignment_result, variant_id))
-                    interest_sequence = full_alignment[alignment_result.query_indices[0]:alignment_result.query_indices[-1]]
-                    async for allele_variant in read_fasta(self.get_locus_cache_path(result_locus)):
-                        if result_variant_name == allele_variant.name:
-                            continue # Skip if we just finished aligning this
-                        aligner_engine.align(allele_variant.sequence, interest_sequence, variant_name=result_variant_name.name, full=False)
-                else:
-                    alignment_rankings[result_locus].add((alignment_result, variant_id))
-            for final_locus, alignments in alignment_rankings.items():
-                closest_alignment, closest_variant_id = sorted(alignments, key=lambda index: index[0].alignment_stats.match_metric)[0]
-                yield Allele(final_locus, closest_variant_id, closest_alignment.alignment_stats)
-
-    async def determine_mlst_st(self, alleles):
-        allele_variants: dict[str, Allele] = {}
-        if isinstance(alleles, AsyncIterable):
-            async for allele in alleles:
-                allele_variants[allele.allele_locus] = allele
-        else:
-            for allele in alleles:
-                allele_variants[allele.allele_locus] = allele
-        ordered_profile = []
-        for locus in self._loci:
-               ordered_profile.append(allele_variants[locus].allele_variant)
-
-        st, clonal_complex = self._profiles_st_map[tuple(ordered_profile)]
-        return MLSTProfile(set(allele_variants.values()), st, clonal_complex)
-
-    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
-        alleles = self.determine_mlst_allele_variants(query_sequence_strings)
-        return await self.determine_mlst_st(alleles)
-
-    async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
-        async for named_strings in query_named_string_groups:
-            for named_string in named_strings:
-                try:
-                    yield NamedMLSTProfile(named_string.name, await self.profile_string([named_string.sequence]))
-                except NoBIGSdbMatchesException as e:
-                    if stop_on_fail:
-                        raise e
-                    yield NamedMLSTProfile(named_string.name, None)
-
-    async def close(self):
-        await self._http_client.close()
-        if self._cleanup_required:
-            shutil.rmtree(self._cache_path)
-
-    async def __aexit__(self, exc_type, exc_value, traceback):
-        await self.close()
-
 class BIGSdbIndex(AbstractAsyncContextManager):
    KNOWN_BIGSDB_APIS = {
        "https://bigsdb.pasteur.fr/api",
@@ -334,5 +198,5 @@ class BIGSdbIndex(AbstractAsyncContextManager):

 def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, schema_id: int):
    if local:
-        return LocalBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, schema_id=schema_id)
+        raise NotImplementedError()
    return RemoteBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, schema_id=schema_id)
--- a/src/autobigs/engine/analysis/genbank.py
+++ b/src/autobigs/engine/analysis/genbank.py
@@ -1,26 +0,0 @@
-import asyncio
-from contextlib import AbstractAsyncContextManager
-import tempfile
-from typing import Iterable, Union
-from Bio import Entrez
-from Bio import SeqIO
-
-from autobigs.engine.structures.genomics import AnnotatedString, StringAnnotation
-
-async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
-    with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
-        record = SeqIO.read(fetch_stream, "genbank")
-        sequence_features = list()
-        for feature in record.features:
-            start = int(feature.location.start)
-            end = int(feature.location.end)
-            qualifiers = feature.qualifiers
-            for qualifier_key in qualifiers:
-                qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
-            sequence_features.append(StringAnnotation(
-                type=feature.type,
-                start=start,
-                end=end+1,  # Position is exclusive
-                feature_properties=qualifiers
-            ))
-        return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
--- a/src/autobigs/engine/reading.py
+++ b/src/autobigs/engine/reading.py
@@ -5,12 +5,13 @@ from Bio import SeqIO

 from autobigs.engine.structures.genomics import NamedString

-async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
+async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]:
    fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
+    results = []
    for fasta_sequence in await fasta_sequences:
-        yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
+        results.append(NamedString(fasta_sequence.id, str(fasta_sequence.seq)))
+    return results

-async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
+async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
    for handle in handles:
-        async for named_seq in read_fasta(handle):
-            yield named_seq
+        yield await read_fasta(handle)
--- a/src/autobigs/engine/writing.py
+++ b/src/autobigs/engine/writing.py
@@ -6,13 +6,15 @@ from typing import AsyncIterable, Collection, Mapping, Sequence, Union
 from autobigs.engine.structures.mlst import Allele, MLSTProfile


-def alleles_to_map(alleles: Collection[Allele]) -> Mapping[str, Union[list[str], str]]:
+def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Sequence[str], str]]:
    result = defaultdict(list)
    for allele in alleles:
-        result[allele.allele_locus].append(allele.allele_variant)
+        result[allele.allele_locus].append(allele.allele_variant + ("*" if allele.partial_match_profile is not None else ""))
    for locus in result.keys():
        if len(result[locus]) == 1:
            result[locus] = result[locus][0] # Take the only one
+        else:
+            result[locus] = tuple(result[locus]) # type: ignore
    return dict(result)

 async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
@@ -24,7 +26,7 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
            if mlst_profile is None:
                failed.append(name)
                continue
-            allele_mapping = alleles_to_map(mlst_profile.alleles)
+            allele_mapping = alleles_to_text_map(mlst_profile.alleles)
            if writer is None:
                header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
                writer = csv.DictWriter(filehandle, fieldnames=header)
--- a/tests/autobigs/engine/analysis/test_aligners.py
+++ b/tests/autobigs/engine/analysis/test_aligners.py
@@ -1,42 +0,0 @@
-from Bio import SeqIO
-from Bio.Align import PairwiseAligner
-from pytest import mark
-from pytest import fixture
-from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
-from autobigs.engine.structures.alignment import PairwiseAlignment
-
-@fixture
-def tohamaI_bpertussis_adk():
-    return str(SeqIO.read("tests/resources/tohama_I_bpertussis_adk.fasta", format="fasta").seq)
-
-@fixture
-def tohamaI_bpertussis_genome():
-    return str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", format="fasta").seq)
-
-@fixture
-def fdaargos_1560_hinfluenza_adk():
-    return str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza_adk.fasta", format="fasta").seq)
-
-@fixture
-def fdaargos_1560_hinfluenza_genome():
-    return str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", format="fasta").seq)
-
-
-@fixture(params=[1, 2])
-def dummy_engine(request):
-    aligner = PairwiseAligner("blastn")
-    aligner.mode = "local"
-    with AsyncBiopythonPairwiseAlignmentEngine(aligner, request.param) as engine:
-        yield engine
-
-class TestAsyncPairwiseAlignmentEngine:
-    async def test_single_alignment_no_errors_single_alignment(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk: str, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine):
-        dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk)
-        async for alignment, additional_information in dummy_engine:
-            assert isinstance(alignment, PairwiseAlignment)
-
-    async def test_single_alignment_no_errors_multiple(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk, fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine):
-        dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk)
-        dummy_engine.align(fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk)
-        async for alignment, additional_information in dummy_engine:
-            assert isinstance(alignment, PairwiseAlignment)
--- a/tests/autobigs/engine/analysis/test_bigsdb.py
+++ b/tests/autobigs/engine/analysis/test_bigsdb.py
@@ -9,7 +9,7 @@ from autobigs.engine.structures import mlst
 from autobigs.engine.structures.genomics import NamedString
 from autobigs.engine.structures.mlst import Allele, MLSTProfile
 from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
-from autobigs.engine.analysis.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler, LocalBIGSdbMLSTProfiler, RemoteBIGSdbMLSTProfiler
+from autobigs.engine.analysis.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler, RemoteBIGSdbMLSTProfiler

 async def generate_async_iterable(normal_iterable):
    for dummy_sequence in normal_iterable:
@@ -50,33 +50,30 @@ bpertussis_tohamaI_bad_profile = MLSTProfile((
        Allele("pgm", "5", None),
    ), "unknown", "unknown")

-hinfluenzae_fdaargos_profile = MLSTProfile((
-        Allele("adk", "1", None),
-        Allele("atpG", "1", None),
-        Allele("frdB", "1", None),
-        Allele("fucK", "1", None),
-        Allele("mdh", "1", None),
-        Allele("pgi", "1", None),
+hinfluenzae_2014_102_profile = MLSTProfile((
+        Allele("adk", "28", None),
+        Allele("atpG", "33", None),
+        Allele("frdB", "7", None),
+        Allele("fucK", "18", None),
+        Allele("mdh", "11", None),
+        Allele("pgi", "125", None),
+        Allele("recA", "89", None)
+    ), "478", "unknown")
+
+hinfluenzae_2014_102_bad_profile = MLSTProfile((
+        Allele("adk", "3", None),
+        Allele("atpG", "121", None),
+        Allele("frdB", "6", None),
+        Allele("fucK", "5", None),
+        Allele("mdh", "12", None),
+        Allele("pgi", "4", None),
        Allele("recA", "5", None)
-    ), "3", "ST-3 complex")
+    ), "unknown", "unknown")

-hinfluenzae_fdaargos_bad_profile = MLSTProfile((
-        Allele("adk", "1", None),
-        Allele("atpG", "1", None),
-        Allele("frdB", "1", None),
-        Allele("fucK", "1", None),
-        Allele("mdh", "1", None),
-        Allele("pgi", "1", None),
-        Allele("recA", "5", None)
-    ), "3", "ST-3 complex")
-
-hinfluenzae_fdaargos_sequence = str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", "fasta").seq)
-
-hinfluenzae_fdaargos_fragmented_sequence = tuple(SeqIO.parse("tests/resources/tohama_I_bpertussis_features.fasta", "fasta"))

@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
    (False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
-    (True, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
+    (False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "2014-102_hinfluenza.fasta", "2014-102_hinfluenza_features.fasta", hinfluenzae_2014_102_profile, hinfluenzae_2014_102_bad_profile),
 ])
 class TestBIGSdbMLSTProfiler:
    async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
@@ -202,7 +199,6 @@ class TestBIGSdbIndex:
            assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"

    @pytest.mark.parametrize("local", [
-        (True),
        (False)
    ])
    async def test_bigsdb_index_instantiates_correct_profiler(self, local):
--- a/tests/autobigs/engine/test_reading.py
+++ b/tests/autobigs/engine/test_reading.py
@@ -2,6 +2,6 @@ from autobigs.engine.reading import read_fasta


 async def test_fasta_reader_not_none():
-    named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
-    async for named_string in named_strings:
+    named_strings = await read_fasta("tests/resources/tohama_I_bpertussis.fasta")
+    for named_string in named_strings:
        assert named_string.name == "BX470248.1"
--- a/tests/autobigs/engine/test_writing.py
+++ b/tests/autobigs/engine/test_writing.py
@@ -0,0 +1,47 @@
+from typing import AsyncIterable, Iterable
+
+import pytest
+from autobigs.engine.structures.alignment import AlignmentStats
+from autobigs.engine.writing import alleles_to_text_map, write_mlst_profiles_as_csv
+from autobigs.engine.structures.mlst import Allele, MLSTProfile
+import tempfile
+from csv import reader
+from os import path
+
+
+@pytest.fixture
+def dummy_alphabet_mlst_profile():
+    return MLSTProfile((
+        Allele("A", "1", None),
+        Allele("D", "1", None),
+        Allele("B", "1", None),
+        Allele("C", "1", None),
+        Allele("C", "2", AlignmentStats(90, 10, 0, 90))
+    ), "mysterious", "very mysterious")
+
+async def iterable_to_asynciterable(iterable: Iterable):
+    for iterated in iterable:
+        yield iterated
+
+async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile: MLSTProfile):
+    dummy_profiles = [("test_1", dummy_alphabet_mlst_profile)]
+    with tempfile.TemporaryDirectory() as temp_dir:
+        output_path = path.join(temp_dir, "out.csv")
+        await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
+        with open(output_path) as csv_handle:
+            csv_reader = reader(csv_handle)
+            lines = list(csv_reader)
+            target_columns = lines[4:]
+            assert target_columns == sorted(target_columns)
+
+async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: MLSTProfile):
+    mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.alleles)
+    expected_mapping = {
+        "A": "1",
+        "B": "1",
+        "C": ("1", "2*"),
+        "D": "1"
+    }
+    for allele_name, allele_ids in mapping.items():
+        assert allele_name in expected_mapping
+        assert allele_ids == expected_mapping[allele_name]
--- a/tests/resources/2014-102_hinfluenza.fasta
+++ b/tests/resources/2014-102_hinfluenza.fasta
--- a/tests/resources/2014-102_hinfluenza_features.fasta
+++ b/tests/resources/2014-102_hinfluenza_features.fasta
--- a/tests/resources/fdaargos_1560_hinfluenza.fasta
+++ b/tests/resources/fdaargos_1560_hinfluenza.fasta
--- a/tests/resources/fdaargos_1560_hinfluenza_adk.fasta
+++ b/tests/resources/fdaargos_1560_hinfluenza_adk.fasta
@@ -1,11 +0,0 @@
->lcl|CP085952.1_gene_371 [gene=adk] [locus_tag=LK401_01855] [location=complement(365128..365772)] [gbkey=Gene]
-ATGAAAATTATTCTTTTAGGTGCACCGGGTGCAGGTAAAGGCACTCAAGCACAATTTATTATGAACAAAT
-TTGGTATCCCGCAAATTTCAACTGGTGATATGTTCCGTGCTGCAATCAAAGCGGGGACTGAACTTGGCAA
-ACAAGCTAAAGCATTAATGGATGAAGGTAAATTAGTGCCAGATGAATTAACCGTTGCCCTTGTAAAAGAT
-CGTATTGCTCAAGCTGACTGCACAAATGGTTTCTTGTTAGATGGTTTCCCTCGTACTATTCCACAAGCGG
-ATGCACTGAAAGATTCAGGTGTTAAAATTGACTTTGTTTTAGAATTTGATGTGCCAGACGAAGTGATTGT
-TGAACGTATGAGTGGCCGTCGCGTACACCAAGCGTCTGGCCGTTCTTACCACATCGTTTATAATCCACCA
-AAAGTGGAAGGTAAAGATGATGTAACAGGCGAAGATTTAATTATTCGTGCAGACGATAAACCAGAAACTG
-TATTAGATCGTTTAGCCGTATATCATAAACAAACTAGCCCATTAATTGATTATTACCAAGCAGAAGCGAA
-AGCGGGGAATACTCAATATTTCCGTTTAGACGGTACACAAAAAGTAGAAGAAGTTAGCCAAGAGTTAGAT
-AAAATCTTAGGCTAA
--- a/tests/resources/fdaargos_1560_hinfluenza_features.fasta
+++ b/tests/resources/fdaargos_1560_hinfluenza_features.fasta
Author	SHA1	Message	Date
Harrison Deng	2e8cdd8da9	Updated URL links All checks were successful automlst.engine/pipeline/head This commit looks good Details autoBIGS.engine/pipeline/tag This commit looks good Details	2025-02-14 20:37:13 +00:00
Harrison Deng	d0318536b2	Changed FASTA reading to group based on file for merging partial targets	2025-02-14 14:35:53 +00:00
Harrison Deng	765cf9d418	Merge branch 'features/improved-oop-architecture' into features/non-exact-notation	2025-02-12 17:53:25 +00:00
Harrison Deng	348c3d00b4	Updated README.md to be more clear	2025-02-12 17:52:53 +00:00
Harrison Deng	1c3f7f9ed8	Removed test for instantiating local MLST profiler	2025-02-12 17:46:55 +00:00
Harrison Deng	e4ddaf2e8c	Changed to a MLST typable sequence for pubMLST tests	2025-02-12 17:43:26 +00:00
Harrison Deng	73aade2bde	Merge branch 'features/improved-oop-architecture' into features/non-exact-notation	2025-02-12 17:07:51 +00:00
Harrison Deng	af8590baa7	Removed import of deleted feature	2025-02-12 17:07:10 +00:00
Harrison Deng	36bca1b70d	Merge branch 'features/improved-oop-architecture' into features/non-exact-notation	2025-02-12 17:02:22 +00:00
Harrison Deng	09a693b696	Removed features being worked on in separate branch	2025-02-12 17:02:00 +00:00
Harrison Deng	f76bf86ef6	Fixed bad profile for H. influenzae non-exact test case	2025-02-12 16:59:50 +00:00
Harrison Deng	a60daf3ee2	Updated H. influenzae database API url	2025-02-12 16:39:13 +00:00
Harrison Deng	fbfd993269	Copied tests over from CSV tests and updated to reflect current code base	2025-02-12 16:36:59 +00:00
Harrison Deng	ba606c35a9	conversion of collection of alleles to map now produces results with tuples instead of lists	2025-02-12 16:36:31 +00:00
Harrison Deng	4183840ba0	Added notation to indicate inexact matching in CSV	2025-02-12 15:59:19 +00:00
Harrison Deng	7fb3eab5b6	Added pubMLST test case to bigsdb tests and updated to reflect codebase changes	2025-02-12 15:53:14 +00:00
Harrison Deng	175a51f968	Replaced local profiler with a not implemented exception	2025-02-12 15:52:48 +00:00