allele profiling partial matching works
This commit is contained in:
parent
e60dba936c
commit
03fbbe542e
1
.gitignore
vendored
1
.gitignore
vendored
@ -357,3 +357,4 @@ package
|
||||
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)
|
||||
|
||||
output
|
||||
*.private.*
|
@ -4,7 +4,7 @@ import datetime
|
||||
from os import path
|
||||
import os
|
||||
|
||||
from automlst.cli import exactmatch, info
|
||||
from automlst.cli import info, type
|
||||
from automlst.cli.meta import get_module_base_name
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
from automlst.engine.local.abif import read_abif
|
||||
@ -16,7 +16,7 @@ root_parser = argparse.ArgumentParser()
|
||||
subparsers = root_parser.add_subparsers(required=True)
|
||||
|
||||
info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
|
||||
exactmatch.setup_parser(subparsers.add_parser(get_module_base_name(exactmatch.__name__)))
|
||||
type.setup_parser(subparsers.add_parser(get_module_base_name(type.__name__)))
|
||||
|
||||
|
||||
def run():
|
||||
|
@ -34,6 +34,14 @@ def setup_parser(parser: ArgumentParser):
|
||||
default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
|
||||
help="The output CSV name (.csv will be appended)."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--exact", "-ex",
|
||||
action="store_true",
|
||||
required=False,
|
||||
default=False,
|
||||
help="Should run exact matching rather than returning all similar ones"
|
||||
)
|
||||
parser.set_defaults(func=run_asynchronously)
|
||||
|
||||
async def run(args):
|
@ -1,13 +1,21 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Mapping, Sequence
|
||||
from typing import Mapping, Sequence, Union
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class PartialAllelicMatchProfile:
|
||||
percent_identity: float
|
||||
mismatches: int
|
||||
bitscore: float
|
||||
gaps: int
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Allele:
|
||||
allele_loci: str
|
||||
allele_variant: str
|
||||
partial_match_profile: Union[None, PartialAllelicMatchProfile]
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class MLSTProfile:
|
||||
alleles: Mapping[str, Sequence[Allele]]
|
||||
sequence_type: str
|
||||
clonal_complex: str
|
||||
clonal_complex: str
|
||||
|
@ -3,6 +3,11 @@ from typing import Union
|
||||
class BIGSDbDatabaseAPIException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
|
||||
def __init__(self, database_name: str, database_schema_id: int, *args):
|
||||
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
|
||||
|
||||
class NoBIGSdbExactMatchesException(BIGSDbDatabaseAPIException):
|
||||
def __init__(self, database_name: str, database_schema_id: int, *args):
|
||||
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
|
||||
|
@ -7,11 +7,14 @@ from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
|
||||
|
||||
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
|
||||
result_dict: dict[str, list[str]] = {}
|
||||
result_dict: dict[str, Union[list[str], str]] = {}
|
||||
for loci, alleles in alleles_map.items():
|
||||
result_dict[loci] = list()
|
||||
if len(alleles) == 1:
|
||||
result_dict[loci] = alleles[0].allele_variant
|
||||
for allele in alleles:
|
||||
result_dict[loci].append(allele.allele_variant)
|
||||
result_locis = list()
|
||||
result_locis.append(allele.allele_variant)
|
||||
result_dict[loci] = result_locis
|
||||
return result_dict
|
||||
|
||||
|
||||
|
@ -1,12 +1,13 @@
|
||||
from collections import defaultdict
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
from numbers import Number
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
|
||||
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoSuchBIGSdbDatabaseException
|
||||
from automlst.engine.data.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
|
||||
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
|
||||
|
||||
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
||||
|
||||
@ -19,22 +20,44 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
|
||||
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
|
||||
async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
|
||||
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
|
||||
uri_path = "sequence"
|
||||
response = await self._http_client.post(uri_path, json={
|
||||
"sequence": sequence_string
|
||||
"sequence": sequence_string,
|
||||
"partial_matches": not exact
|
||||
})
|
||||
sequence_response: dict = await response.json()
|
||||
|
||||
if "exact_matches" not in sequence_response:
|
||||
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
|
||||
|
||||
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
|
||||
for allele_loci, alleles in exact_matches.items():
|
||||
for allele in alleles:
|
||||
alelle_id = allele["allele_id"]
|
||||
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
|
||||
if "exact_matches" in sequence_response:
|
||||
# loci -> list of alleles with id and loci
|
||||
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
|
||||
for allele_loci, alleles in exact_matches.items():
|
||||
for allele in alleles:
|
||||
alelle_id = allele["allele_id"]
|
||||
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
|
||||
elif "partial_matches" in sequence_response:
|
||||
if exact:
|
||||
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
|
||||
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
|
||||
for allele_loci, partial_match in partial_matches.items():
|
||||
if len(partial_match) <= 0:
|
||||
continue
|
||||
partial_match_profile = PartialAllelicMatchProfile(
|
||||
percent_identity=float(partial_match["identity"]),
|
||||
mismatches=int(partial_match["mismatches"]),
|
||||
bitscore=float(partial_match["bitscore"]),
|
||||
gaps=int(partial_match["gaps"])
|
||||
)
|
||||
yield Allele(
|
||||
allele_loci=allele_loci,
|
||||
allele_variant=str(partial_match["allele"]),
|
||||
partial_match_profile=partial_match_profile
|
||||
)
|
||||
else:
|
||||
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
|
||||
|
||||
|
||||
|
||||
async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
|
||||
uri_path = "designations"
|
||||
@ -60,15 +83,15 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
||||
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
|
||||
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
|
||||
|
||||
async def profile_string(self, string: str) -> MLSTProfile:
|
||||
alleles = self.fetch_mlst_allele_variants(string)
|
||||
async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
|
||||
alleles = self.fetch_mlst_allele_variants(string, exact)
|
||||
return await self.fetch_mlst_st(alleles)
|
||||
|
||||
|
||||
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
|
||||
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
|
||||
async for named_string in namedStrings:
|
||||
try:
|
||||
yield (named_string.name, await self.profile_string(named_string.sequence))
|
||||
yield (named_string.name, await self.profile_string(named_string.sequence, exact))
|
||||
except NoBIGSdbExactMatchesException as e:
|
||||
if stop_on_fail:
|
||||
raise e
|
||||
|
@ -1,3 +1,6 @@
|
||||
import random
|
||||
import re
|
||||
from typing import Collection, Sequence, Union
|
||||
from Bio import SeqIO
|
||||
import pytest
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
@ -5,29 +8,56 @@ from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException
|
||||
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
|
||||
|
||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
|
||||
rand = random.Random(gene)
|
||||
if isinstance(mutation_site_count, float):
|
||||
mutation_site_count = int(mutation_site_count * len(gene))
|
||||
random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
|
||||
scrambled = list(gene)
|
||||
for random_location in random_locations:
|
||||
scrambled[random_location] = rand.choice(alphabet)
|
||||
return "".join(scrambled)
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
||||
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
|
||||
async for exact_match in exact_matches:
|
||||
async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True):
|
||||
assert isinstance(exact_match, Allele)
|
||||
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
|
||||
targets_left.remove(exact_match.allele_loci)
|
||||
|
||||
assert len(targets_left) == 0
|
||||
|
||||
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
|
||||
sequences = SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta")
|
||||
mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
|
||||
for sequence in sequences:
|
||||
match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description)
|
||||
if match is None:
|
||||
continue
|
||||
gene = match.group(1)
|
||||
if gene.lower() not in mlst_targets:
|
||||
continue
|
||||
scrambled = gene_scrambler(str(sequence.seq), 0.2)
|
||||
async for partial_match in profiler.fetch_mlst_allele_variants(scrambled, False):
|
||||
assert partial_match.partial_match_profile is not None
|
||||
assert partial_match.allele_variant == '1'
|
||||
mlst_targets.remove(gene.lower())
|
||||
|
||||
assert len(mlst_targets) == 0
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
|
||||
async def dummy_allele_generator():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1"),
|
||||
Allele("fumC", "1"),
|
||||
Allele("glyA", "1"),
|
||||
Allele("tyrB", "1"),
|
||||
Allele("icd", "1"),
|
||||
Allele("pepA", "1"),
|
||||
Allele("pgm", "1"),
|
||||
Allele("adk", "1", None),
|
||||
Allele("fumC", "1", None),
|
||||
Allele("glyA", "1", None),
|
||||
Allele("tyrB", "1", None),
|
||||
Allele("icd", "1", None),
|
||||
Allele("pepA", "1", None),
|
||||
Allele("pgm", "1", None),
|
||||
]
|
||||
for dummy_allele in dummy_alleles:
|
||||
yield dummy_allele
|
||||
@ -46,21 +76,21 @@ async def test_institutpasteur_sequence_profiling_is_correct():
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
|
||||
|
||||
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
|
||||
dummy_alleles = {
|
||||
Allele("adk", "1"),
|
||||
Allele("atpG", "1"),
|
||||
Allele("frdB", "1"),
|
||||
Allele("fucK", "1"),
|
||||
Allele("mdh", "1"),
|
||||
Allele("pgi", "1"),
|
||||
Allele("recA", "5"),
|
||||
Allele("adk", "1", None),
|
||||
Allele("atpG", "1", None),
|
||||
Allele("frdB", "1", None),
|
||||
Allele("fucK", "1", None),
|
||||
Allele("mdh", "1", None),
|
||||
Allele("pgi", "1", None),
|
||||
Allele("recA", "5", None),
|
||||
}
|
||||
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True)
|
||||
async for exact_match in exact_matches:
|
||||
assert isinstance(exact_match, Allele)
|
||||
dummy_alleles.remove(exact_match)
|
||||
@ -70,13 +100,13 @@ async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
|
||||
async def test_pubmlst_profiling_results_in_correct_st():
|
||||
async def generate_dummy_targets():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1"),
|
||||
Allele("atpG", "1"),
|
||||
Allele("frdB", "1"),
|
||||
Allele("fucK", "1"),
|
||||
Allele("mdh", "1"),
|
||||
Allele("pgi", "1"),
|
||||
Allele("recA", "5"),
|
||||
Allele("adk", "1", None),
|
||||
Allele("atpG", "1", None),
|
||||
Allele("frdB", "1", None),
|
||||
Allele("fucK", "1", None),
|
||||
Allele("mdh", "1", None),
|
||||
Allele("pgi", "1", None),
|
||||
Allele("recA", "5", None),
|
||||
]
|
||||
for dummy_allele in dummy_alleles:
|
||||
yield dummy_allele
|
||||
|
59120
tests/resources/tohama_I_bpertussis_coding.fasta
Normal file
59120
tests/resources/tohama_I_bpertussis_coding.fasta
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user