allele profiling partial matching works

This commit is contained in:
2025-01-09 21:44:28 +00:00
parent e60dba936c
commit 03fbbe542e
9 changed files with 59246 additions and 48 deletions

View File

@@ -4,7 +4,7 @@ import datetime
from os import path
import os
from automlst.cli import exactmatch, info
from automlst.cli import info, type
from automlst.cli.meta import get_module_base_name
from automlst.engine.data.genomics import NamedString
from automlst.engine.local.abif import read_abif
@@ -16,7 +16,7 @@ root_parser = argparse.ArgumentParser()
subparsers = root_parser.add_subparsers(required=True)
info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
exactmatch.setup_parser(subparsers.add_parser(get_module_base_name(exactmatch.__name__)))
type.setup_parser(subparsers.add_parser(get_module_base_name(type.__name__)))
def run():

View File

@@ -34,6 +34,14 @@ def setup_parser(parser: ArgumentParser):
default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
help="The output CSV name (.csv will be appended)."
)
parser.add_argument(
"--exact", "-ex",
action="store_true",
required=False,
default=False,
help="Should run exact matching rather than returning all similar ones"
)
parser.set_defaults(func=run_asynchronously)
async def run(args):

View File

@@ -1,13 +1,21 @@
from dataclasses import dataclass
from typing import Mapping, Sequence
from typing import Mapping, Sequence, Union
@dataclass(frozen=True)
class PartialAllelicMatchProfile:
percent_identity: float
mismatches: int
bitscore: float
gaps: int
@dataclass(frozen=True)
class Allele:
allele_loci: str
allele_variant: str
partial_match_profile: Union[None, PartialAllelicMatchProfile]
@dataclass(frozen=True)
class MLSTProfile:
alleles: Mapping[str, Sequence[Allele]]
sequence_type: str
clonal_complex: str
clonal_complex: str

View File

@@ -3,6 +3,11 @@ from typing import Union
class BIGSDbDatabaseAPIException(Exception):
pass
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_schema_id: int, *args):
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
class NoBIGSdbExactMatchesException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_schema_id: int, *args):
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)

View File

@@ -7,11 +7,14 @@ from automlst.engine.data.mlst import Allele, MLSTProfile
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
result_dict: dict[str, list[str]] = {}
result_dict: dict[str, Union[list[str], str]] = {}
for loci, alleles in alleles_map.items():
result_dict[loci] = list()
if len(alleles) == 1:
result_dict[loci] = alleles[0].allele_variant
for allele in alleles:
result_dict[loci].append(allele.allele_variant)
result_locis = list()
result_locis.append(allele.allele_variant)
result_dict[loci] = result_locis
return result_dict

View File

@@ -1,12 +1,13 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
from numbers import Number
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.genomics import NamedString
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoSuchBIGSdbDatabaseException
from automlst.engine.data.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
@@ -19,22 +20,44 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
async def __aenter__(self):
return self
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
"sequence": sequence_string,
"partial_matches": not exact
})
sequence_response: dict = await response.json()
if "exact_matches" not in sequence_response:
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
if "exact_matches" in sequence_response:
# loci -> list of alleles with id and loci
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
elif "partial_matches" in sequence_response:
if exact:
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
for allele_loci, partial_match in partial_matches.items():
if len(partial_match) <= 0:
continue
partial_match_profile = PartialAllelicMatchProfile(
percent_identity=float(partial_match["identity"]),
mismatches=int(partial_match["mismatches"]),
bitscore=float(partial_match["bitscore"]),
gaps=int(partial_match["gaps"])
)
yield Allele(
allele_loci=allele_loci,
allele_variant=str(partial_match["allele"]),
partial_match_profile=partial_match_profile
)
else:
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
uri_path = "designations"
@@ -60,15 +83,15 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, string: str) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string)
async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string, exact)
return await self.fetch_mlst_st(alleles)
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
async for named_string in namedStrings:
try:
yield (named_string.name, await self.profile_string(named_string.sequence))
yield (named_string.name, await self.profile_string(named_string.sequence, exact))
except NoBIGSdbExactMatchesException as e:
if stop_on_fail:
raise e