allele profiling partial matching works

This commit is contained in:
Harrison Deng 2025-01-09 21:44:28 +00:00
parent e60dba936c
commit 03fbbe542e
9 changed files with 59246 additions and 48 deletions

1
.gitignore vendored
View File

@ -357,3 +357,4 @@ package
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) # Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)
output output
*.private.*

View File

@ -4,7 +4,7 @@ import datetime
from os import path from os import path
import os import os
from automlst.cli import exactmatch, info from automlst.cli import info, type
from automlst.cli.meta import get_module_base_name from automlst.cli.meta import get_module_base_name
from automlst.engine.data.genomics import NamedString from automlst.engine.data.genomics import NamedString
from automlst.engine.local.abif import read_abif from automlst.engine.local.abif import read_abif
@ -16,7 +16,7 @@ root_parser = argparse.ArgumentParser()
subparsers = root_parser.add_subparsers(required=True) subparsers = root_parser.add_subparsers(required=True)
info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__))) info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
exactmatch.setup_parser(subparsers.add_parser(get_module_base_name(exactmatch.__name__))) type.setup_parser(subparsers.add_parser(get_module_base_name(type.__name__)))
def run(): def run():

View File

@ -34,6 +34,14 @@ def setup_parser(parser: ArgumentParser):
default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}', default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
help="The output CSV name (.csv will be appended)." help="The output CSV name (.csv will be appended)."
) )
parser.add_argument(
"--exact", "-ex",
action="store_true",
required=False,
default=False,
help="Should run exact matching rather than returning all similar ones"
)
parser.set_defaults(func=run_asynchronously) parser.set_defaults(func=run_asynchronously)
async def run(args): async def run(args):

View File

@ -1,13 +1,21 @@
from dataclasses import dataclass from dataclasses import dataclass
from typing import Mapping, Sequence from typing import Mapping, Sequence, Union
@dataclass(frozen=True)
class PartialAllelicMatchProfile:
percent_identity: float
mismatches: int
bitscore: float
gaps: int
@dataclass(frozen=True) @dataclass(frozen=True)
class Allele: class Allele:
allele_loci: str allele_loci: str
allele_variant: str allele_variant: str
partial_match_profile: Union[None, PartialAllelicMatchProfile]
@dataclass(frozen=True) @dataclass(frozen=True)
class MLSTProfile: class MLSTProfile:
alleles: Mapping[str, Sequence[Allele]] alleles: Mapping[str, Sequence[Allele]]
sequence_type: str sequence_type: str
clonal_complex: str clonal_complex: str

View File

@ -3,6 +3,11 @@ from typing import Union
class BIGSDbDatabaseAPIException(Exception): class BIGSDbDatabaseAPIException(Exception):
pass pass
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_schema_id: int, *args):
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
class NoBIGSdbExactMatchesException(BIGSDbDatabaseAPIException): class NoBIGSdbExactMatchesException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_schema_id: int, *args): def __init__(self, database_name: str, database_schema_id: int, *args):
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args) super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)

View File

@ -7,11 +7,14 @@ from automlst.engine.data.mlst import Allele, MLSTProfile
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]): def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
result_dict: dict[str, list[str]] = {} result_dict: dict[str, Union[list[str], str]] = {}
for loci, alleles in alleles_map.items(): for loci, alleles in alleles_map.items():
result_dict[loci] = list() if len(alleles) == 1:
result_dict[loci] = alleles[0].allele_variant
for allele in alleles: for allele in alleles:
result_dict[loci].append(allele.allele_variant) result_locis = list()
result_locis.append(allele.allele_variant)
result_dict[loci] = result_locis
return result_dict return result_dict

View File

@ -1,12 +1,13 @@
from collections import defaultdict from collections import defaultdict
from contextlib import AbstractAsyncContextManager from contextlib import AbstractAsyncContextManager
from numbers import Number
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.genomics import NamedString from automlst.engine.data.genomics import NamedString
from automlst.engine.data.mlst import Allele, MLSTProfile from automlst.engine.data.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoSuchBIGSdbDatabaseException from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
class BIGSdbMLSTProfiler(AbstractAsyncContextManager): class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
@ -19,22 +20,44 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
async def __aenter__(self): async def __aenter__(self):
return self return self
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]: async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "sequence" uri_path = "sequence"
response = await self._http_client.post(uri_path, json={ response = await self._http_client.post(uri_path, json={
"sequence": sequence_string "sequence": sequence_string,
"partial_matches": not exact
}) })
sequence_response: dict = await response.json() sequence_response: dict = await response.json()
if "exact_matches" not in sequence_response: if "exact_matches" in sequence_response:
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id) # loci -> list of alleles with id and loci
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"] for allele_loci, alleles in exact_matches.items():
for allele_loci, alleles in exact_matches.items(): for allele in alleles:
for allele in alleles: alelle_id = allele["allele_id"]
alelle_id = allele["allele_id"] yield Allele(allele_loci=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id) elif "partial_matches" in sequence_response:
if exact:
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
for allele_loci, partial_match in partial_matches.items():
if len(partial_match) <= 0:
continue
partial_match_profile = PartialAllelicMatchProfile(
percent_identity=float(partial_match["identity"]),
mismatches=int(partial_match["mismatches"]),
bitscore=float(partial_match["bitscore"]),
gaps=int(partial_match["gaps"])
)
yield Allele(
allele_loci=allele_loci,
allele_variant=str(partial_match["allele"]),
partial_match_profile=partial_match_profile
)
else:
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile: async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
uri_path = "designations" uri_path = "designations"
@ -60,15 +83,15 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"])) allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, string: str) -> MLSTProfile: async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string) alleles = self.fetch_mlst_allele_variants(string, exact)
return await self.fetch_mlst_st(alleles) return await self.fetch_mlst_st(alleles)
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]: async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
async for named_string in namedStrings: async for named_string in namedStrings:
try: try:
yield (named_string.name, await self.profile_string(named_string.sequence)) yield (named_string.name, await self.profile_string(named_string.sequence, exact))
except NoBIGSdbExactMatchesException as e: except NoBIGSdbExactMatchesException as e:
if stop_on_fail: if stop_on_fail:
raise e raise e

View File

@ -1,3 +1,6 @@
import random
import re
from typing import Collection, Sequence, Union
from Bio import SeqIO from Bio import SeqIO
import pytest import pytest
from automlst.engine.data.genomics import NamedString from automlst.engine.data.genomics import NamedString
@ -5,29 +8,56 @@ from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
rand = random.Random(gene)
if isinstance(mutation_site_count, float):
mutation_site_count = int(mutation_site_count * len(gene))
random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
scrambled = list(gene)
for random_location in random_locations:
scrambled[random_location] = rand.choice(alphabet)
return "".join(scrambled)
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact(): async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"} targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
async for exact_match in exact_matches: async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True):
assert isinstance(exact_match, Allele) assert isinstance(exact_match, Allele)
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
targets_left.remove(exact_match.allele_loci) targets_left.remove(exact_match.allele_loci)
assert len(targets_left) == 0 assert len(targets_left) == 0
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
sequences = SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta")
mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
for sequence in sequences:
match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description)
if match is None:
continue
gene = match.group(1)
if gene.lower() not in mlst_targets:
continue
scrambled = gene_scrambler(str(sequence.seq), 0.2)
async for partial_match in profiler.fetch_mlst_allele_variants(scrambled, False):
assert partial_match.partial_match_profile is not None
assert partial_match.allele_variant == '1'
mlst_targets.remove(gene.lower())
assert len(mlst_targets) == 0
async def test_institutpasteur_profiling_results_in_correct_mlst_st(): async def test_institutpasteur_profiling_results_in_correct_mlst_st():
async def dummy_allele_generator(): async def dummy_allele_generator():
dummy_alleles = [ dummy_alleles = [
Allele("adk", "1"), Allele("adk", "1", None),
Allele("fumC", "1"), Allele("fumC", "1", None),
Allele("glyA", "1"), Allele("glyA", "1", None),
Allele("tyrB", "1"), Allele("tyrB", "1", None),
Allele("icd", "1"), Allele("icd", "1", None),
Allele("pepA", "1"), Allele("pepA", "1", None),
Allele("pgm", "1"), Allele("pgm", "1", None),
] ]
for dummy_allele in dummy_alleles: for dummy_allele in dummy_alleles:
yield dummy_allele yield dummy_allele
@ -46,21 +76,21 @@ async def test_institutpasteur_sequence_profiling_is_correct():
assert isinstance(profile, MLSTProfile) assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex" assert profile.clonal_complex == "ST-2 complex"
assert profile.sequence_type == "1" assert profile.sequence_type == "1"
async def test_pubmlst_profiling_results_in_exact_matches_when_exact(): async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
dummy_alleles = { dummy_alleles = {
Allele("adk", "1"), Allele("adk", "1", None),
Allele("atpG", "1"), Allele("atpG", "1", None),
Allele("frdB", "1"), Allele("frdB", "1", None),
Allele("fucK", "1"), Allele("fucK", "1", None),
Allele("mdh", "1"), Allele("mdh", "1", None),
Allele("pgi", "1"), Allele("pgi", "1", None),
Allele("recA", "5"), Allele("recA", "5", None),
} }
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq) sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler: async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence) exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True)
async for exact_match in exact_matches: async for exact_match in exact_matches:
assert isinstance(exact_match, Allele) assert isinstance(exact_match, Allele)
dummy_alleles.remove(exact_match) dummy_alleles.remove(exact_match)
@ -70,13 +100,13 @@ async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
async def test_pubmlst_profiling_results_in_correct_st(): async def test_pubmlst_profiling_results_in_correct_st():
async def generate_dummy_targets(): async def generate_dummy_targets():
dummy_alleles = [ dummy_alleles = [
Allele("adk", "1"), Allele("adk", "1", None),
Allele("atpG", "1"), Allele("atpG", "1", None),
Allele("frdB", "1"), Allele("frdB", "1", None),
Allele("fucK", "1"), Allele("fucK", "1", None),
Allele("mdh", "1"), Allele("mdh", "1", None),
Allele("pgi", "1"), Allele("pgi", "1", None),
Allele("recA", "5"), Allele("recA", "5", None),
] ]
for dummy_allele in dummy_alleles: for dummy_allele in dummy_alleles:
yield dummy_allele yield dummy_allele

File diff suppressed because it is too large Load Diff