allele profiling partial matching works
This commit is contained in:
parent
e60dba936c
commit
03fbbe542e
1
.gitignore
vendored
1
.gitignore
vendored
@ -357,3 +357,4 @@ package
|
|||||||
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)
|
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)
|
||||||
|
|
||||||
output
|
output
|
||||||
|
*.private.*
|
@ -4,7 +4,7 @@ import datetime
|
|||||||
from os import path
|
from os import path
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from automlst.cli import exactmatch, info
|
from automlst.cli import info, type
|
||||||
from automlst.cli.meta import get_module_base_name
|
from automlst.cli.meta import get_module_base_name
|
||||||
from automlst.engine.data.genomics import NamedString
|
from automlst.engine.data.genomics import NamedString
|
||||||
from automlst.engine.local.abif import read_abif
|
from automlst.engine.local.abif import read_abif
|
||||||
@ -16,7 +16,7 @@ root_parser = argparse.ArgumentParser()
|
|||||||
subparsers = root_parser.add_subparsers(required=True)
|
subparsers = root_parser.add_subparsers(required=True)
|
||||||
|
|
||||||
info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
|
info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
|
||||||
exactmatch.setup_parser(subparsers.add_parser(get_module_base_name(exactmatch.__name__)))
|
type.setup_parser(subparsers.add_parser(get_module_base_name(type.__name__)))
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
|
@ -34,6 +34,14 @@ def setup_parser(parser: ArgumentParser):
|
|||||||
default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
|
default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
|
||||||
help="The output CSV name (.csv will be appended)."
|
help="The output CSV name (.csv will be appended)."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--exact", "-ex",
|
||||||
|
action="store_true",
|
||||||
|
required=False,
|
||||||
|
default=False,
|
||||||
|
help="Should run exact matching rather than returning all similar ones"
|
||||||
|
)
|
||||||
parser.set_defaults(func=run_asynchronously)
|
parser.set_defaults(func=run_asynchronously)
|
||||||
|
|
||||||
async def run(args):
|
async def run(args):
|
@ -1,13 +1,21 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Mapping, Sequence
|
from typing import Mapping, Sequence, Union
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class PartialAllelicMatchProfile:
|
||||||
|
percent_identity: float
|
||||||
|
mismatches: int
|
||||||
|
bitscore: float
|
||||||
|
gaps: int
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class Allele:
|
class Allele:
|
||||||
allele_loci: str
|
allele_loci: str
|
||||||
allele_variant: str
|
allele_variant: str
|
||||||
|
partial_match_profile: Union[None, PartialAllelicMatchProfile]
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class MLSTProfile:
|
class MLSTProfile:
|
||||||
alleles: Mapping[str, Sequence[Allele]]
|
alleles: Mapping[str, Sequence[Allele]]
|
||||||
sequence_type: str
|
sequence_type: str
|
||||||
clonal_complex: str
|
clonal_complex: str
|
||||||
|
@ -3,6 +3,11 @@ from typing import Union
|
|||||||
class BIGSDbDatabaseAPIException(Exception):
|
class BIGSDbDatabaseAPIException(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
|
||||||
|
def __init__(self, database_name: str, database_schema_id: int, *args):
|
||||||
|
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
|
||||||
|
|
||||||
class NoBIGSdbExactMatchesException(BIGSDbDatabaseAPIException):
|
class NoBIGSdbExactMatchesException(BIGSDbDatabaseAPIException):
|
||||||
def __init__(self, database_name: str, database_schema_id: int, *args):
|
def __init__(self, database_name: str, database_schema_id: int, *args):
|
||||||
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
|
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
|
||||||
|
@ -7,11 +7,14 @@ from automlst.engine.data.mlst import Allele, MLSTProfile
|
|||||||
|
|
||||||
|
|
||||||
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
|
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
|
||||||
result_dict: dict[str, list[str]] = {}
|
result_dict: dict[str, Union[list[str], str]] = {}
|
||||||
for loci, alleles in alleles_map.items():
|
for loci, alleles in alleles_map.items():
|
||||||
result_dict[loci] = list()
|
if len(alleles) == 1:
|
||||||
|
result_dict[loci] = alleles[0].allele_variant
|
||||||
for allele in alleles:
|
for allele in alleles:
|
||||||
result_dict[loci].append(allele.allele_variant)
|
result_locis = list()
|
||||||
|
result_locis.append(allele.allele_variant)
|
||||||
|
result_dict[loci] = result_locis
|
||||||
return result_dict
|
return result_dict
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,12 +1,13 @@
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from contextlib import AbstractAsyncContextManager
|
from contextlib import AbstractAsyncContextManager
|
||||||
|
from numbers import Number
|
||||||
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
|
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
|
||||||
|
|
||||||
from aiohttp import ClientSession, ClientTimeout
|
from aiohttp import ClientSession, ClientTimeout
|
||||||
|
|
||||||
from automlst.engine.data.genomics import NamedString
|
from automlst.engine.data.genomics import NamedString
|
||||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
from automlst.engine.data.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
|
||||||
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoSuchBIGSdbDatabaseException
|
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
|
||||||
|
|
||||||
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
||||||
|
|
||||||
@ -19,22 +20,44 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
|||||||
async def __aenter__(self):
|
async def __aenter__(self):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
|
async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
|
||||||
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
|
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
|
||||||
uri_path = "sequence"
|
uri_path = "sequence"
|
||||||
response = await self._http_client.post(uri_path, json={
|
response = await self._http_client.post(uri_path, json={
|
||||||
"sequence": sequence_string
|
"sequence": sequence_string,
|
||||||
|
"partial_matches": not exact
|
||||||
})
|
})
|
||||||
sequence_response: dict = await response.json()
|
sequence_response: dict = await response.json()
|
||||||
|
|
||||||
if "exact_matches" not in sequence_response:
|
if "exact_matches" in sequence_response:
|
||||||
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
|
# loci -> list of alleles with id and loci
|
||||||
|
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
|
||||||
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
|
for allele_loci, alleles in exact_matches.items():
|
||||||
for allele_loci, alleles in exact_matches.items():
|
for allele in alleles:
|
||||||
for allele in alleles:
|
alelle_id = allele["allele_id"]
|
||||||
alelle_id = allele["allele_id"]
|
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
|
||||||
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
|
elif "partial_matches" in sequence_response:
|
||||||
|
if exact:
|
||||||
|
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
|
||||||
|
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
|
||||||
|
for allele_loci, partial_match in partial_matches.items():
|
||||||
|
if len(partial_match) <= 0:
|
||||||
|
continue
|
||||||
|
partial_match_profile = PartialAllelicMatchProfile(
|
||||||
|
percent_identity=float(partial_match["identity"]),
|
||||||
|
mismatches=int(partial_match["mismatches"]),
|
||||||
|
bitscore=float(partial_match["bitscore"]),
|
||||||
|
gaps=int(partial_match["gaps"])
|
||||||
|
)
|
||||||
|
yield Allele(
|
||||||
|
allele_loci=allele_loci,
|
||||||
|
allele_variant=str(partial_match["allele"]),
|
||||||
|
partial_match_profile=partial_match_profile
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
|
async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
|
||||||
uri_path = "designations"
|
uri_path = "designations"
|
||||||
@ -60,15 +83,15 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
|||||||
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
|
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
|
||||||
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
|
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
|
||||||
|
|
||||||
async def profile_string(self, string: str) -> MLSTProfile:
|
async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
|
||||||
alleles = self.fetch_mlst_allele_variants(string)
|
alleles = self.fetch_mlst_allele_variants(string, exact)
|
||||||
return await self.fetch_mlst_st(alleles)
|
return await self.fetch_mlst_st(alleles)
|
||||||
|
|
||||||
|
|
||||||
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
|
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
|
||||||
async for named_string in namedStrings:
|
async for named_string in namedStrings:
|
||||||
try:
|
try:
|
||||||
yield (named_string.name, await self.profile_string(named_string.sequence))
|
yield (named_string.name, await self.profile_string(named_string.sequence, exact))
|
||||||
except NoBIGSdbExactMatchesException as e:
|
except NoBIGSdbExactMatchesException as e:
|
||||||
if stop_on_fail:
|
if stop_on_fail:
|
||||||
raise e
|
raise e
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
|
import random
|
||||||
|
import re
|
||||||
|
from typing import Collection, Sequence, Union
|
||||||
from Bio import SeqIO
|
from Bio import SeqIO
|
||||||
import pytest
|
import pytest
|
||||||
from automlst.engine.data.genomics import NamedString
|
from automlst.engine.data.genomics import NamedString
|
||||||
@ -5,29 +8,56 @@ from automlst.engine.data.mlst import Allele, MLSTProfile
|
|||||||
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException
|
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException
|
||||||
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
|
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
|
||||||
|
|
||||||
|
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
|
||||||
|
rand = random.Random(gene)
|
||||||
|
if isinstance(mutation_site_count, float):
|
||||||
|
mutation_site_count = int(mutation_site_count * len(gene))
|
||||||
|
random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
|
||||||
|
scrambled = list(gene)
|
||||||
|
for random_location in random_locations:
|
||||||
|
scrambled[random_location] = rand.choice(alphabet)
|
||||||
|
return "".join(scrambled)
|
||||||
|
|
||||||
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
|
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
|
||||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
|
||||||
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
|
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
|
||||||
async for exact_match in exact_matches:
|
async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True):
|
||||||
assert isinstance(exact_match, Allele)
|
assert isinstance(exact_match, Allele)
|
||||||
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
|
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
|
||||||
targets_left.remove(exact_match.allele_loci)
|
targets_left.remove(exact_match.allele_loci)
|
||||||
|
|
||||||
assert len(targets_left) == 0
|
assert len(targets_left) == 0
|
||||||
|
|
||||||
|
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
|
||||||
|
sequences = SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta")
|
||||||
|
mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
|
||||||
|
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
|
||||||
|
for sequence in sequences:
|
||||||
|
match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description)
|
||||||
|
if match is None:
|
||||||
|
continue
|
||||||
|
gene = match.group(1)
|
||||||
|
if gene.lower() not in mlst_targets:
|
||||||
|
continue
|
||||||
|
scrambled = gene_scrambler(str(sequence.seq), 0.2)
|
||||||
|
async for partial_match in profiler.fetch_mlst_allele_variants(scrambled, False):
|
||||||
|
assert partial_match.partial_match_profile is not None
|
||||||
|
assert partial_match.allele_variant == '1'
|
||||||
|
mlst_targets.remove(gene.lower())
|
||||||
|
|
||||||
|
assert len(mlst_targets) == 0
|
||||||
|
|
||||||
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
|
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
|
||||||
async def dummy_allele_generator():
|
async def dummy_allele_generator():
|
||||||
dummy_alleles = [
|
dummy_alleles = [
|
||||||
Allele("adk", "1"),
|
Allele("adk", "1", None),
|
||||||
Allele("fumC", "1"),
|
Allele("fumC", "1", None),
|
||||||
Allele("glyA", "1"),
|
Allele("glyA", "1", None),
|
||||||
Allele("tyrB", "1"),
|
Allele("tyrB", "1", None),
|
||||||
Allele("icd", "1"),
|
Allele("icd", "1", None),
|
||||||
Allele("pepA", "1"),
|
Allele("pepA", "1", None),
|
||||||
Allele("pgm", "1"),
|
Allele("pgm", "1", None),
|
||||||
]
|
]
|
||||||
for dummy_allele in dummy_alleles:
|
for dummy_allele in dummy_alleles:
|
||||||
yield dummy_allele
|
yield dummy_allele
|
||||||
@ -46,21 +76,21 @@ async def test_institutpasteur_sequence_profiling_is_correct():
|
|||||||
assert isinstance(profile, MLSTProfile)
|
assert isinstance(profile, MLSTProfile)
|
||||||
assert profile.clonal_complex == "ST-2 complex"
|
assert profile.clonal_complex == "ST-2 complex"
|
||||||
assert profile.sequence_type == "1"
|
assert profile.sequence_type == "1"
|
||||||
|
|
||||||
|
|
||||||
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
|
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
|
||||||
dummy_alleles = {
|
dummy_alleles = {
|
||||||
Allele("adk", "1"),
|
Allele("adk", "1", None),
|
||||||
Allele("atpG", "1"),
|
Allele("atpG", "1", None),
|
||||||
Allele("frdB", "1"),
|
Allele("frdB", "1", None),
|
||||||
Allele("fucK", "1"),
|
Allele("fucK", "1", None),
|
||||||
Allele("mdh", "1"),
|
Allele("mdh", "1", None),
|
||||||
Allele("pgi", "1"),
|
Allele("pgi", "1", None),
|
||||||
Allele("recA", "5"),
|
Allele("recA", "5", None),
|
||||||
}
|
}
|
||||||
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||||
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True)
|
||||||
async for exact_match in exact_matches:
|
async for exact_match in exact_matches:
|
||||||
assert isinstance(exact_match, Allele)
|
assert isinstance(exact_match, Allele)
|
||||||
dummy_alleles.remove(exact_match)
|
dummy_alleles.remove(exact_match)
|
||||||
@ -70,13 +100,13 @@ async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
|
|||||||
async def test_pubmlst_profiling_results_in_correct_st():
|
async def test_pubmlst_profiling_results_in_correct_st():
|
||||||
async def generate_dummy_targets():
|
async def generate_dummy_targets():
|
||||||
dummy_alleles = [
|
dummy_alleles = [
|
||||||
Allele("adk", "1"),
|
Allele("adk", "1", None),
|
||||||
Allele("atpG", "1"),
|
Allele("atpG", "1", None),
|
||||||
Allele("frdB", "1"),
|
Allele("frdB", "1", None),
|
||||||
Allele("fucK", "1"),
|
Allele("fucK", "1", None),
|
||||||
Allele("mdh", "1"),
|
Allele("mdh", "1", None),
|
||||||
Allele("pgi", "1"),
|
Allele("pgi", "1", None),
|
||||||
Allele("recA", "5"),
|
Allele("recA", "5", None),
|
||||||
]
|
]
|
||||||
for dummy_allele in dummy_alleles:
|
for dummy_allele in dummy_alleles:
|
||||||
yield dummy_allele
|
yield dummy_allele
|
||||||
|
59120
tests/resources/tohama_I_bpertussis_coding.fasta
Normal file
59120
tests/resources/tohama_I_bpertussis_coding.fasta
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user