6 Commits

SHA1        Message  Date
b8cebb8ba4  Infrastructure for concurrent processing implemented  2025-02-19 15:49:46 +00:00
7384895578  Writing now uses named MLST profile  2025-02-18 16:03:17 +00:00
5a03c7e8d8  Multiple string profiling now respects grouped queries (for non-WGS)  2025-02-18 15:34:18 +00:00
ddf9cde175  Added a license text to pyproject.toml  2025-02-14 20:47:06 +00:00
2e8cdd8da9  Updated URL links  2025-02-14 20:37:13 +00:00
d0318536b2  Changed FASTA reading to group based on file for merging partial targets  2025-02-14 14:35:53 +00:00
7 changed files with 69 additions and 40 deletions

File 1 of 7 (pyproject.toml)

@@ -13,11 +13,12 @@ dependencies = [
]
requires-python = ">=3.12"
description = "A library to rapidly fetch MLST profiles given sequences for various diseases."
license = {text = "GPL-3.0-or-later"}
[project.urls]
Homepage = "https://github.com/RealYHD/autoBIGS.engine"
Source = "https://github.com/RealYHD/autoBIGS.engine"
Issues = "https://github.com/RealYHD/autoBIGS.engine/issues"
Homepage = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
Source = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
Issues = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine/issues"
[tool.setuptools_scm]

File 2 of 7

@@ -22,15 +22,15 @@ from Bio.Align import PairwiseAligner
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
@abstractmethod
def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[Union[NamedString, str]], Union[NamedString, str]]) -> AsyncGenerator[Union[Allele, tuple[str, Allele]], Any]:
pass
@abstractmethod
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]:
pass
@abstractmethod
async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
async def profile_string(self, query_sequence_strings: Iterable[Union[NamedString, str]]) -> Union[NamedMLSTProfile, MLSTProfile]:
pass
@abstractmethod
@@ -52,14 +52,14 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
async def __aenter__(self):
return self
async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[str], str]) -> AsyncGenerator[Allele, Any]:
async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[Union[NamedString, str]], Union[NamedString, str]]) -> AsyncGenerator[Union[Allele, tuple[str, Allele]], Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "sequence"
if isinstance(query_sequence_strings, str):
if isinstance(query_sequence_strings, str) or isinstance(query_sequence_strings, NamedString):
query_sequence_strings = [query_sequence_strings]
for sequence_string in query_sequence_strings:
async with self._http_client.post(uri_path, json={
"sequence": sequence_string,
"sequence": sequence_string if isinstance(sequence_string, str) else sequence_string.sequence,
"partial_matches": True
}) as response:
sequence_response: dict = await response.json()
@@ -70,7 +70,8 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
result_allele = Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
elif "partial_matches" in sequence_response:
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
for allele_loci, partial_match in partial_matches.items():
@@ -82,23 +83,33 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
gaps=int(partial_match["gaps"]),
match_metric=int(partial_match["bitscore"])
)
yield Allele(
result_allele = Allele(
allele_locus=allele_loci,
allele_variant=str(partial_match["allele"]),
partial_match_profile=partial_match_profile
)
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
else:
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
raise NoBIGSdbMatchesException(self._database_name, self._schema_id, sequence_string.name if isinstance(sequence_string, NamedString) else None)
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]:
uri_path = "designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
names_list = []
def insert_allele_to_request_dict(allele: Union[Allele, tuple[str, Allele]]):
if isinstance(allele, Allele):
allele_val = allele
else:
allele_val = allele[1]
names_list.append(allele[0])
allele_request_dict[allele_val.allele_locus].append({"allele": str(allele_val.allele_variant)})
if isinstance(alleles, AsyncIterable):
async for allele in alleles:
allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
insert_allele_to_request_dict(allele)
else:
for allele in alleles:
allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
insert_allele_to_request_dict(allele)
request_json = {
"designations": allele_request_dict
}
@@ -111,26 +122,33 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
schema_fields_returned.setdefault("clonal_complex", "unknown")
schema_exact_matches: dict = response_json["exact_matches"]
for exact_match_locus, exact_match_alleles in schema_exact_matches.items():
if len(exact_match_alleles) > 1:
raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(exact_match_alleles)})")
allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None))
if len(allele_set) == 0:
raise ValueError("Passed in no alleles.")
return MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
result_mlst_profile = MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
if len(names_list) > 0:
result_mlst_profile = NamedMLSTProfile(str(tuple(names_list)), result_mlst_profile)
return result_mlst_profile
async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
async def profile_string(self, query_sequence_strings: Iterable[Union[NamedString, str]]) -> Union[NamedMLSTProfile, MLSTProfile]:
alleles = self.determine_mlst_allele_variants(query_sequence_strings)
return await self.determine_mlst_st(alleles)
async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
tasks = []
async for named_strings in query_named_string_groups:
for named_string in named_strings:
tasks.append(self.profile_string(named_strings))
for task in asyncio.as_completed(tasks):
try:
yield NamedMLSTProfile(named_string.name, (await self.profile_string([named_string.sequence])))
yield await task
except NoBIGSdbMatchesException as e:
if stop_on_fail:
raise e
yield NamedMLSTProfile(named_string.name, None)
causal_name = e.get_causal_query_name()
if causal_name is None:
raise ValueError("Missing query name despite requiring names.")
else:
yield NamedMLSTProfile(causal_name, None)
async def close(self):
await self._http_client.close()
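Taken together, these changes let named, per-file groups of sequences flow through allele calling, ST assignment, and concurrent multi-string profiling. A minimal usage sketch follows; it assumes the profiler has already been constructed (its constructor is not part of this diff), uses read_multiple_fastas from the reading module changed later in this set, and profile_fasta_files itself is a hypothetical helper name.

from autobigs.engine.reading import read_multiple_fastas

async def profile_fasta_files(profiler, fasta_paths, stop_on_fail=False):
    # Each FASTA file yields one group of NamedString records, so partial
    # targets from the same file are submitted together as a grouped query.
    groups = read_multiple_fastas(fasta_paths)
    # profile_multiple_strings now launches one profiling task per group and
    # yields NamedMLSTProfile results as the tasks complete.
    async for named_profile in profiler.profile_multiple_strings(groups, stop_on_fail=stop_on_fail):
        if named_profile.mlst_profile is None:
            print(f"{named_profile.name}: no BIGSdb match")
        else:
            print(f"{named_profile.name}: profile obtained")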

File 3 of 7

@@ -5,8 +5,12 @@ class BIGSDbDatabaseAPIException(Exception):
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_schema_id: int, *args):
def __init__(self, database_name: str, database_schema_id: int, query_name: Union[None, str], *args):
self._query_name = query_name
super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
def get_causal_query_name(self) -> Union[str, None]:
return self._query_name
class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException):
def __init__(self, database_name: str, database_schema_id: int, *args):
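Because the exception now records which query failed, callers can report the offending input even when profiling several sequences at once. A small hedged sketch (profiler and named_strings are assumed to be in scope; the exception's import path is not shown in this hunk):

async def profile_or_report(profiler, named_strings):
    # NoBIGSdbMatchesException import omitted: its module path is not shown in this diff.
    try:
        return await profiler.profile_string(named_strings)
    except NoBIGSdbMatchesException as e:
        # get_causal_query_name() returns None when plain, unnamed strings were profiled.
        print(f"No matches found for {e.get_causal_query_name() or 'an unnamed query'}")
        return None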

File 4 of 7

@@ -5,12 +5,16 @@ from Bio import SeqIO
from autobigs.engine.structures.genomics import NamedString
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]:
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
results = []
for fasta_sequence in await fasta_sequences:
yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
results.append(NamedString(fasta_sequence.id, str(fasta_sequence.seq)))
return results
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
tasks = []
for handle in handles:
async for named_seq in read_fasta(handle):
yield named_seq
tasks.append(read_fasta(handle))
for task in asyncio.as_completed(tasks):
yield await task
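The readers change shape here: read_fasta becomes an awaitable that returns a whole file's records, and read_multiple_fastas yields one group per file as each parse finishes. A short consumption sketch (the B. pertussis path is taken from the test suite; the other paths are placeholders):

from autobigs.engine.reading import read_fasta, read_multiple_fastas

async def preview_reads():
    # read_fasta is now a plain coroutine returning an Iterable[NamedString],
    # so it is awaited once rather than iterated with "async for".
    records = await read_fasta("tests/resources/tohama_I_bpertussis.fasta")
    print([record.name for record in records])

    # read_multiple_fastas yields one per-file group of records, in completion order.
    async for group in read_multiple_fastas(["contigs_a.fasta", "contigs_b.fasta"]):
        print(f"group of {len(list(group))} sequences")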

File 5 of 7

@@ -3,7 +3,7 @@ import csv
from os import PathLike
from typing import AsyncIterable, Collection, Mapping, Sequence, Union
from autobigs.engine.structures.mlst import Allele, MLSTProfile
from autobigs.engine.structures.mlst import Allele, MLSTProfile, NamedMLSTProfile
def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Sequence[str], str]]:
@@ -17,12 +17,14 @@ def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Seque
result[locus] = tuple(result[locus]) # type: ignore
return dict(result)
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[NamedMLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
failed = list()
with open(handle, "w", newline='') as filehandle:
header = None
writer: Union[csv.DictWriter, None] = None
async for name, mlst_profile in mlst_profiles_iterable:
async for named_mlst_profile in mlst_profiles_iterable:
name = named_mlst_profile.name
mlst_profile = named_mlst_profile.mlst_profile
if mlst_profile is None:
failed.append(name)
continue
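A brief sketch of the updated call site; the iterable of NamedMLSTProfile objects would typically come from profile_multiple_strings, and, judging by the signature above, the returned sequence holds the names that could not be written. The helper name and output path are hypothetical.

from autobigs.engine.writing import write_mlst_profiles_as_csv

async def write_results(profiler, named_string_groups, out_path="profiles.csv"):
    # The writer now consumes NamedMLSTProfile objects directly instead of
    # (name, profile) tuples, pulling name and profile off each object itself.
    failed = await write_mlst_profiles_as_csv(
        profiler.profile_multiple_strings(named_string_groups), out_path)
    if failed:
        print(f"No profiles written for: {', '.join(failed)}")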

File 6 of 7

@@ -2,6 +2,6 @@ from autobigs.engine.reading import read_fasta
async def test_fasta_reader_not_none():
named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
async for named_string in named_strings:
named_strings = await read_fasta("tests/resources/tohama_I_bpertussis.fasta")
for named_string in named_strings:
assert named_string.name == "BX470248.1"

File 7 of 7

@@ -3,7 +3,7 @@ from typing import AsyncIterable, Iterable
import pytest
from autobigs.engine.structures.alignment import AlignmentStats
from autobigs.engine.writing import alleles_to_text_map, write_mlst_profiles_as_csv
from autobigs.engine.structures.mlst import Allele, MLSTProfile
from autobigs.engine.structures.mlst import Allele, MLSTProfile, NamedMLSTProfile
import tempfile
from csv import reader
from os import path
@@ -11,20 +11,20 @@ from os import path
@pytest.fixture
def dummy_alphabet_mlst_profile():
return MLSTProfile((
return NamedMLSTProfile("name", MLSTProfile((
Allele("A", "1", None),
Allele("D", "1", None),
Allele("B", "1", None),
Allele("C", "1", None),
Allele("C", "2", AlignmentStats(90, 10, 0, 90))
), "mysterious", "very mysterious")
), "mysterious", "very mysterious"))
async def iterable_to_asynciterable(iterable: Iterable):
for iterated in iterable:
yield iterated
async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile: MLSTProfile):
dummy_profiles = [("test_1", dummy_alphabet_mlst_profile)]
dummy_profiles = [dummy_alphabet_mlst_profile]
with tempfile.TemporaryDirectory() as temp_dir:
output_path = path.join(temp_dir, "out.csv")
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
@@ -34,8 +34,8 @@ async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile
target_columns = lines[4:]
assert target_columns == sorted(target_columns)
async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: MLSTProfile):
mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.alleles)
async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: NamedMLSTProfile):
mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.mlst_profile.alleles) # type: ignore
expected_mapping = {
"A": "1",
"B": "1",
@@ -44,4 +44,4 @@ async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profil
}
for allele_name, allele_ids in mapping.items():
assert allele_name in expected_mapping
assert allele_ids == expected_mapping[allele_name]