Multiple string profiling now respects grouped queries (for non-WGS)

Added a license text to pyproject.toml
Updated URL links
2025-02-18 15:34:18 +00:00 · 2025-02-14 20:47:06 +00:00 · 2025-02-14 20:37:13 +00:00 · 2025-02-14 14:35:53 +00:00 · 2025-02-12 17:53:25 +00:00 · 2025-02-12 17:52:53 +00:00
13 changed files with 56082 additions and 50929 deletions
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # autoBIGS.Engine
-A python library implementing common BIGSdb MLST schemes and databases. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
+A python library implementing common BIGSdb MLST schemes and databases accesses for the purpose of typing sequences automatically. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
 ## Features
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,11 +13,12 @@ dependencies = [
 ]
 requires-python = ">=3.12"
 description = "A library to rapidly fetch fetch MLST profiles given sequences for various diseases."
 license = {text = "GPL-3.0-or-later"}
 [project.urls]
-Homepage = "https://github.com/RealYHD/autoBIGS.engine"
+Homepage = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
-Source = "https://github.com/RealYHD/autoBIGS.engine"
+Source = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
-Issues = "https://github.com/RealYHD/autoBIGS.engine/issues"
+Issues = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine/issues"
 [tool.setuptools_scm]
--- a/src/autobigs/engine/analysis/bigsdb.py
+++ b/src/autobigs/engine/analysis/bigsdb.py
@@ -11,7 +11,6 @@ from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequen
 from aiohttp import ClientSession, ClientTimeout
 from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
 from autobigs.engine.reading import read_fasta
 from autobigs.engine.structures.alignment import PairwiseAlignment
 from autobigs.engine.structures.genomics import NamedString
@@ -125,13 +124,17 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
    async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
        async for named_strings in query_named_string_groups:
            names: list[str] = list()
            sequences: list[str] = list()
            for named_string in named_strings:
                names.append(named_string.name)
                sequences.append(named_string.sequence)
            try:
-                    yield NamedMLSTProfile(named_string.name, (await self.profile_string([named_string.sequence])))
+                yield NamedMLSTProfile("-".join(names), (await self.profile_string(sequences)))
            except NoBIGSdbMatchesException as e:
                if stop_on_fail:
                    raise e
-                    yield NamedMLSTProfile(named_string.name, None)
+                yield NamedMLSTProfile("-".join(names), None)
    async def close(self):
        await self._http_client.close()
--- a/src/autobigs/engine/reading.py
+++ b/src/autobigs/engine/reading.py
@@ -5,12 +5,13 @@ from Bio import SeqIO
 from autobigs.engine.structures.genomics import NamedString
-async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
+async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]:
    fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
    results = []
    for fasta_sequence in await fasta_sequences:
-        yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
+        results.append(NamedString(fasta_sequence.id, str(fasta_sequence.seq)))
    return results
-async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
+async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
    for handle in handles:
-        async for named_seq in read_fasta(handle):
+        yield await read_fasta(handle)
            yield named_seq
--- a/src/autobigs/engine/writing.py
+++ b/src/autobigs/engine/writing.py
@@ -6,13 +6,15 @@ from typing import AsyncIterable, Collection, Mapping, Sequence, Union
 from autobigs.engine.structures.mlst import Allele, MLSTProfile
-def alleles_to_map(alleles: Collection[Allele]) -> Mapping[str, Union[list[str], str]]:
+def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Sequence[str], str]]:
    result = defaultdict(list)
    for allele in alleles:
-        result[allele.allele_locus].append(allele.allele_variant)
+        result[allele.allele_locus].append(allele.allele_variant + ("*" if allele.partial_match_profile is not None else ""))
    for locus in result.keys():
        if len(result[locus]) == 1:
            result[locus] = result[locus][0] # Take the only one
        else:
            result[locus] = tuple(result[locus]) # type: ignore
    return dict(result)
 async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
@@ -24,7 +26,7 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
            if mlst_profile is None:
                failed.append(name)
                continue
-            allele_mapping = alleles_to_map(mlst_profile.alleles)
+            allele_mapping = alleles_to_text_map(mlst_profile.alleles)
            if writer is None:
                header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
                writer = csv.DictWriter(filehandle, fieldnames=header)
--- a/tests/autobigs/engine/analysis/test_bigsdb.py
+++ b/tests/autobigs/engine/analysis/test_bigsdb.py
@@ -50,17 +50,17 @@ bpertussis_tohamaI_bad_profile = MLSTProfile((
        Allele("pgm", "5", None),
    ), "unknown", "unknown")
-hinfluenzae_fdaargos_profile = MLSTProfile((
+hinfluenzae_2014_102_profile = MLSTProfile((
-        Allele("adk", "1", None),
+        Allele("adk", "28", None),
-        Allele("atpG", "1", None),
+        Allele("atpG", "33", None),
-        Allele("frdB", "1", None),
+        Allele("frdB", "7", None),
-        Allele("fucK", "1", None),
+        Allele("fucK", "18", None),
-        Allele("mdh", "1", None),
+        Allele("mdh", "11", None),
-        Allele("pgi", "1", None),
+        Allele("pgi", "125", None),
-        Allele("recA", "5", None)
+        Allele("recA", "89", None)
-    ), "3", "ST-3 complex")
+    ), "478", "unknown")
-hinfluenzae_fdaargos_bad_profile = MLSTProfile((
+hinfluenzae_2014_102_bad_profile = MLSTProfile((
        Allele("adk", "3", None),
        Allele("atpG", "121", None),
        Allele("frdB", "6", None),
@@ -68,15 +68,12 @@ hinfluenzae_fdaargos_bad_profile = MLSTProfile((
        Allele("mdh", "12", None),
        Allele("pgi", "4", None),
        Allele("recA", "5", None)
-    ), "3", "ST-3 complex")
+    ), "unknown", "unknown")
 hinfluenzae_fdaargos_sequence = str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", "fasta").seq)
 hinfluenzae_fdaargos_fragmented_sequence = tuple(SeqIO.parse("tests/resources/tohama_I_bpertussis_features.fasta", "fasta"))
@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
    (False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
-    (False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "fdaargos_1560_hinfluenza.fasta", "fdaargos_1560_hinfluenza_features.fasta", hinfluenzae_fdaargos_profile, hinfluenzae_fdaargos_bad_profile),
+    (False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "2014-102_hinfluenza.fasta", "2014-102_hinfluenza_features.fasta", hinfluenzae_2014_102_profile, hinfluenzae_2014_102_bad_profile),
 ])
 class TestBIGSdbMLSTProfiler:
    async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
@@ -202,7 +199,6 @@ class TestBIGSdbIndex:
            assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
    @pytest.mark.parametrize("local", [
        (True),
        (False)
    ])
    async def test_bigsdb_index_instantiates_correct_profiler(self, local):
--- a/tests/autobigs/engine/test_reading.py
+++ b/tests/autobigs/engine/test_reading.py
@@ -2,6 +2,6 @@ from autobigs.engine.reading import read_fasta
 async def test_fasta_reader_not_none():
-    named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
+    named_strings = await read_fasta("tests/resources/tohama_I_bpertussis.fasta")
-    async for named_string in named_strings:
+    for named_string in named_strings:
        assert named_string.name == "BX470248.1"
--- a/tests/autobigs/engine/test_writing.py
+++ b/tests/autobigs/engine/test_writing.py
@@ -0,0 +1,47 @@
 from typing import AsyncIterable, Iterable
 import pytest
 from autobigs.engine.structures.alignment import AlignmentStats
 from autobigs.engine.writing import alleles_to_text_map, write_mlst_profiles_as_csv
 from autobigs.engine.structures.mlst import Allele, MLSTProfile
 import tempfile
 from csv import reader
 from os import path
@pytest.fixture
 def dummy_alphabet_mlst_profile():
    return MLSTProfile((
        Allele("A", "1", None),
        Allele("D", "1", None),
        Allele("B", "1", None),
        Allele("C", "1", None),
        Allele("C", "2", AlignmentStats(90, 10, 0, 90))
    ), "mysterious", "very mysterious")
 async def iterable_to_asynciterable(iterable: Iterable):
    for iterated in iterable:
        yield iterated
 async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile: MLSTProfile):
    dummy_profiles = [("test_1", dummy_alphabet_mlst_profile)]
    with tempfile.TemporaryDirectory() as temp_dir:
        output_path = path.join(temp_dir, "out.csv")
        await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
        with open(output_path) as csv_handle:
            csv_reader = reader(csv_handle)
            lines = list(csv_reader)
            target_columns = lines[4:]
            assert target_columns == sorted(target_columns)
 async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: MLSTProfile):
    mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.alleles)
    expected_mapping = {
        "A": "1",
        "B": "1",
        "C": ("1", "2*"),
        "D": "1"
    }
    for allele_name, allele_ids in mapping.items():
        assert allele_name in expected_mapping
        assert allele_ids == expected_mapping[allele_name]
--- a/tests/resources/2014-102_hinfluenza.fasta
+++ b/tests/resources/2014-102_hinfluenza.fasta
--- a/tests/resources/2014-102_hinfluenza_features.fasta
+++ b/tests/resources/2014-102_hinfluenza_features.fasta
--- a/tests/resources/fdaargos_1560_hinfluenza.fasta
+++ b/tests/resources/fdaargos_1560_hinfluenza.fasta
--- a/tests/resources/fdaargos_1560_hinfluenza_adk.fasta
+++ b/tests/resources/fdaargos_1560_hinfluenza_adk.fasta
@@ -1,11 +0,0 @@
 >lcl|CP085952.1_gene_371 [gene=adk] [locus_tag=LK401_01855] [location=complement(365128..365772)] [gbkey=Gene]
 ATGAAAATTATTCTTTTAGGTGCACCGGGTGCAGGTAAAGGCACTCAAGCACAATTTATTATGAACAAAT
 TTGGTATCCCGCAAATTTCAACTGGTGATATGTTCCGTGCTGCAATCAAAGCGGGGACTGAACTTGGCAA
 ACAAGCTAAAGCATTAATGGATGAAGGTAAATTAGTGCCAGATGAATTAACCGTTGCCCTTGTAAAAGAT
 CGTATTGCTCAAGCTGACTGCACAAATGGTTTCTTGTTAGATGGTTTCCCTCGTACTATTCCACAAGCGG
 ATGCACTGAAAGATTCAGGTGTTAAAATTGACTTTGTTTTAGAATTTGATGTGCCAGACGAAGTGATTGT
 TGAACGTATGAGTGGCCGTCGCGTACACCAAGCGTCTGGCCGTTCTTACCACATCGTTTATAATCCACCA
 AAAGTGGAAGGTAAAGATGATGTAACAGGCGAAGATTTAATTATTCGTGCAGACGATAAACCAGAAACTG
 TATTAGATCGTTTAGCCGTATATCATAAACAAACTAGCCCATTAATTGATTATTACCAAGCAGAAGCGAA
 AGCGGGGAATACTCAATATTTCCGTTTAGACGGTACACAAAAAGTAGAAGAAGTTAGCCAAGAGTTAGAT
 AAAATCTTAGGCTAA
--- a/tests/resources/fdaargos_1560_hinfluenza_features.fasta
+++ b/tests/resources/fdaargos_1560_hinfluenza_features.fasta
Author	SHA1	Message	Date
Harrison Deng	5a03c7e8d8	Multiple string profiling now respects grouped queries (for non-WGS) [CI: All checks were successful; automlst.engine/pipeline/head: This commit looks good]	2025-02-18 15:34:18 +00:00
Harrison Deng	ddf9cde175	Added a license text to pyproject.toml	2025-02-14 20:47:06 +00:00
Harrison Deng	2e8cdd8da9	Updated URL links [CI: All checks were successful; automlst.engine/pipeline/head: This commit looks good; autoBIGS.engine/pipeline/tag: This commit looks good]	2025-02-14 20:37:13 +00:00
Harrison Deng	d0318536b2	Changed FASTA reading to group based on file for merging partial targets	2025-02-14 14:35:53 +00:00
Harrison Deng	765cf9d418	Merge branch 'features/improved-oop-architecture' into features/non-exact-notation	2025-02-12 17:53:25 +00:00
Harrison Deng	348c3d00b4	Updated README.md to be more clear	2025-02-12 17:52:53 +00:00
Harrison Deng	1c3f7f9ed8	Removed test for instantiating local MLST profiler	2025-02-12 17:46:55 +00:00
Harrison Deng	e4ddaf2e8c	Changed to a MLST typable sequence for pubMLST tests	2025-02-12 17:43:26 +00:00
Harrison Deng	73aade2bde	Merge branch 'features/improved-oop-architecture' into features/non-exact-notation	2025-02-12 17:07:51 +00:00
Harrison Deng	af8590baa7	Removed import of deleted feature	2025-02-12 17:07:10 +00:00
Harrison Deng	36bca1b70d	Merge branch 'features/improved-oop-architecture' into features/non-exact-notation	2025-02-12 17:02:22 +00:00
Harrison Deng	fbfd993269	Copied tests over from CSV tests and updated to reflect current code base	2025-02-12 16:36:59 +00:00
Harrison Deng	ba606c35a9	conversion of collection of alleles to map now produces results with tuples instead of lists	2025-02-12 16:36:31 +00:00
Harrison Deng	4183840ba0	Added notation to indicate inexact matching in CSV	2025-02-12 15:59:19 +00:00