allele profiling partial matching works

2025-01-09 21:44:28 +00:00
parent e60dba936c
commit 03fbbe542e
9 changed files with 59246 additions and 48 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -357,3 +357,4 @@ package
 # Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)
 output
 *.private.*
--- a/src/automlst/cli/program.py
+++ b/src/automlst/cli/program.py
@@ -4,7 +4,7 @@ import datetime
 from os import path
 import os
-from automlst.cli import exactmatch, info
+from automlst.cli import info, type
 from automlst.cli.meta import get_module_base_name
 from automlst.engine.data.genomics import NamedString
 from automlst.engine.local.abif import read_abif
@@ -16,7 +16,7 @@ root_parser = argparse.ArgumentParser()
 subparsers = root_parser.add_subparsers(required=True)
 info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
-exactmatch.setup_parser(subparsers.add_parser(get_module_base_name(exactmatch.__name__)))
+type.setup_parser(subparsers.add_parser(get_module_base_name(type.__name__)))
 def run():
--- a/src/automlst/cli/exactmatch.py
+++ b/src/automlst/cli/exactmatch.py
@@ -34,6 +34,14 @@ def setup_parser(parser: ArgumentParser):
        default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
        help="The output CSV name (.csv will be appended)."
    )
    parser.add_argument(
        "--exact", "-ex",
        action="store_true",
        required=False,
        default=False,
        help="Should run exact matching rather than returning all similar ones"
    )
    parser.set_defaults(func=run_asynchronously)
 async def run(args):
--- a/src/automlst/engine/data/mlst.py
+++ b/src/automlst/engine/data/mlst.py
@@ -1,13 +1,21 @@
 from dataclasses import dataclass
-from typing import Mapping, Sequence
+from typing import Mapping, Sequence, Union
@dataclass(frozen=True)
 class PartialAllelicMatchProfile:
    percent_identity: float
    mismatches: int
    bitscore: float
    gaps: int
@dataclass(frozen=True)
 class Allele:
    allele_loci: str
    allele_variant: str
    partial_match_profile: Union[None, PartialAllelicMatchProfile]
@dataclass(frozen=True)
 class MLSTProfile:
    alleles: Mapping[str, Sequence[Allele]]
    sequence_type: str
-    clonal_complex: str
+    clonal_complex: str
--- a/src/automlst/engine/exceptions/database.py
+++ b/src/automlst/engine/exceptions/database.py
@@ -3,6 +3,11 @@ from typing import Union
 class BIGSDbDatabaseAPIException(Exception):
    pass
 class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
    def __init__(self, database_name: str, database_schema_id: int, *args):
        super().__init__(f"No exact match found with schema with ID {database_schema_id}  in the database \"{database_name}\".", *args)
 class NoBIGSdbExactMatchesException(BIGSDbDatabaseAPIException):
    def __init__(self, database_name: str, database_schema_id: int, *args):
        super().__init__(f"No exact match found with schema with ID {database_schema_id}  in the database \"{database_name}\".", *args)
--- a/src/automlst/engine/local/csv.py
+++ b/src/automlst/engine/local/csv.py
@@ -7,11 +7,14 @@ from automlst.engine.data.mlst import Allele, MLSTProfile
 def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
-    result_dict: dict[str, list[str]] = {}
+    result_dict: dict[str, Union[list[str], str]] = {}
    for loci, alleles in alleles_map.items():
-        result_dict[loci] = list()
+        if len(alleles) == 1:
            result_dict[loci] = alleles[0].allele_variant
        for allele in alleles:
-            result_dict[loci].append(allele.allele_variant)
+            result_locis = list()
            result_locis.append(allele.allele_variant)
            result_dict[loci] = result_locis
    return result_dict
--- a/src/automlst/engine/remote/databases/bigsdb.py
+++ b/src/automlst/engine/remote/databases/bigsdb.py
@@ -1,12 +1,13 @@
 from collections import defaultdict
 from contextlib import AbstractAsyncContextManager
 from numbers import Number
 from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
 from aiohttp import ClientSession, ClientTimeout
 from automlst.engine.data.genomics import NamedString
-from automlst.engine.data.mlst import Allele, MLSTProfile
+from automlst.engine.data.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
-from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoSuchBIGSdbDatabaseException
+from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
 class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
@@ -19,22 +20,44 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
    async def __aenter__(self):
        return self
-    async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
+    async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        uri_path = "sequence"
        response = await self._http_client.post(uri_path, json={
-            "sequence": sequence_string
+            "sequence": sequence_string,
            "partial_matches": not exact
        })
        sequence_response: dict = await response.json()
-        if "exact_matches" not in sequence_response:
+        if "exact_matches" in sequence_response:
-            raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
+            # loci -> list of alleles with id and loci
-        
+            exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]  
-        exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]  
+            for allele_loci, alleles in exact_matches.items():
-        for allele_loci, alleles in exact_matches.items():
+                for allele in alleles:
-            for allele in alleles:
+                    alelle_id = allele["allele_id"]
-                alelle_id = allele["allele_id"]
+                    yield Allele(allele_loci=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
-                yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
+        elif "partial_matches" in sequence_response:
            if exact:
                raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
            partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"] 
            for allele_loci, partial_match in partial_matches.items():
                if len(partial_match) <= 0:
                    continue
                partial_match_profile = PartialAllelicMatchProfile(
                    percent_identity=float(partial_match["identity"]),
                    mismatches=int(partial_match["mismatches"]),
                    bitscore=float(partial_match["bitscore"]),
                    gaps=int(partial_match["gaps"])
                )
                yield Allele(
                    allele_loci=allele_loci,
                    allele_variant=str(partial_match["allele"]),
                    partial_match_profile=partial_match_profile
                )
        else:
            raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
    async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
        uri_path = "designations"
@@ -60,15 +83,15 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
                    allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
            return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
-    async def profile_string(self, string: str) -> MLSTProfile:
+    async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
-        alleles = self.fetch_mlst_allele_variants(string)
+        alleles = self.fetch_mlst_allele_variants(string, exact)
        return await self.fetch_mlst_st(alleles)
-    async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
+    async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
        async for named_string in namedStrings:
            try:
-                yield (named_string.name, await self.profile_string(named_string.sequence))
+                yield (named_string.name, await self.profile_string(named_string.sequence, exact))
            except NoBIGSdbExactMatchesException as e:
                if stop_on_fail:
                    raise e
--- a/tests/automlst/engine/remote/databases/test_bigsdb.py
+++ b/tests/automlst/engine/remote/databases/test_bigsdb.py
@@ -1,3 +1,6 @@
 import random
 import re
 from typing import Collection, Sequence, Union
 from Bio import SeqIO
 import pytest
 from automlst.engine.data.genomics import NamedString
@@ -5,29 +8,56 @@ from automlst.engine.data.mlst import Allele, MLSTProfile
 from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException
 from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
 def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
    rand = random.Random(gene)
    if isinstance(mutation_site_count, float):
        mutation_site_count = int(mutation_site_count * len(gene))
    random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
    scrambled = list(gene)
    for random_location in random_locations:
        scrambled[random_location] = rand.choice(alphabet)
    return "".join(scrambled)
 async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
        targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
-        async for exact_match in exact_matches:
+        async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True):
            assert isinstance(exact_match, Allele)
            assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
            targets_left.remove(exact_match.allele_loci)
        assert len(targets_left) == 0
 async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
    sequences = SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta")
    mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
    async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
        for sequence in sequences:
            match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description)
            if match is None:
                continue
            gene = match.group(1)
            if gene.lower() not in mlst_targets:
                continue
            scrambled = gene_scrambler(str(sequence.seq), 0.2)
            async for partial_match in profiler.fetch_mlst_allele_variants(scrambled, False):
                assert partial_match.partial_match_profile is not None
                assert partial_match.allele_variant == '1'
                mlst_targets.remove(gene.lower())
            assert len(mlst_targets) == 0
 async def test_institutpasteur_profiling_results_in_correct_mlst_st():
    async def dummy_allele_generator():
        dummy_alleles = [
-        Allele("adk", "1"),
+        Allele("adk", "1", None),
-        Allele("fumC", "1"),
+        Allele("fumC", "1", None),
-        Allele("glyA", "1"),
+        Allele("glyA", "1", None),
-        Allele("tyrB", "1"),
+        Allele("tyrB", "1", None),
-        Allele("icd", "1"),
+        Allele("icd", "1", None),
-        Allele("pepA", "1"),
+        Allele("pepA", "1", None),
-        Allele("pgm", "1"),
+        Allele("pgm", "1", None),
        ]
        for dummy_allele in dummy_alleles:
            yield dummy_allele
@@ -46,21 +76,21 @@ async def test_institutpasteur_sequence_profiling_is_correct():
        assert isinstance(profile, MLSTProfile)
        assert profile.clonal_complex == "ST-2 complex"
        assert profile.sequence_type == "1"
-
+    
 async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
    dummy_alleles = {
-        Allele("adk", "1"),
+        Allele("adk", "1", None),
-        Allele("atpG", "1"),
+        Allele("atpG", "1", None),
-        Allele("frdB", "1"),
+        Allele("frdB", "1", None),
-        Allele("fucK", "1"),
+        Allele("fucK", "1", None),
-        Allele("mdh", "1"),
+        Allele("mdh", "1", None),
-        Allele("pgi", "1"),
+        Allele("pgi", "1", None),
-        Allele("recA", "5"),
+        Allele("recA", "5", None),
    }
    sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
-        exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
+        exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True)
        async for exact_match in exact_matches:
            assert isinstance(exact_match, Allele)
            dummy_alleles.remove(exact_match)
@@ -70,13 +100,13 @@ async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
 async def test_pubmlst_profiling_results_in_correct_st():
    async def generate_dummy_targets():
        dummy_alleles = [
-                Allele("adk", "1"),
+                Allele("adk", "1", None),
-                Allele("atpG", "1"),
+                Allele("atpG", "1", None),
-                Allele("frdB", "1"),
+                Allele("frdB", "1", None),
-                Allele("fucK", "1"),
+                Allele("fucK", "1", None),
-                Allele("mdh", "1"),
+                Allele("mdh", "1", None),
-                Allele("pgi", "1"),
+                Allele("pgi", "1", None),
-                Allele("recA", "5"),
+                Allele("recA", "5", None),
            ]
        for dummy_allele in dummy_alleles:
            yield dummy_allele
--- a/tests/resources/tohama_I_bpertussis_coding.fasta
+++ b/tests/resources/tohama_I_bpertussis_coding.fasta