Began implementing LazyPersistentCachedBIGSdbMLSTProfiler

This commit is contained in:
Harrison Deng 2025-01-27 22:03:49 +00:00
parent ba1f0aa318
commit 3e3898334f
5 changed files with 230 additions and 86 deletions

2
Jenkinsfile vendored
View File

@ -49,7 +49,7 @@ pipeline {
steps { steps {
sh returnStatus: true, script: 'python -m twine upload -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*' sh returnStatus: true, script: 'python -m twine upload -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*'
} }
} }-
} }
} }
} }

View File

@ -15,7 +15,8 @@ requires-python = ">=3.12"
description = "A library to rapidly fetch fetch MLST profiles given sequences for various diseases." description = "A library to rapidly fetch fetch MLST profiles given sequences for various diseases."
[project.urls] [project.urls]
Repository = "https://github.com/RealYHD/autoBIGS.engine" Homepage = "https://github.com/RealYHD/autoBIGS.engine"
Source = "https://github.com/RealYHD/autoBIGS.engine"
Issues = "https://github.com/RealYHD/autoBIGS.engine/issues" Issues = "https://github.com/RealYHD/autoBIGS.engine/issues"
[tool.setuptools_scm] [tool.setuptools_scm]

View File

@ -1,16 +1,43 @@
from abc import abstractmethod
from collections import defaultdict from collections import defaultdict
from contextlib import AbstractAsyncContextManager from contextlib import AbstractAsyncContextManager
from numbers import Number import csv
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union from os import path
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout from aiohttp import ClientSession, ClientTimeout
from autobigs.engine.data.local.fasta import read_fasta
from autobigs.engine.data.structures.genomics import NamedString from autobigs.engine.data.structures.genomics import NamedString
from autobigs.engine.data.structures.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile from autobigs.engine.data.structures.mlst import Allele, NamedMLSTProfile, PartialAllelicMatchProfile, MLSTProfile
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
from Bio.Align import PairwiseAligner
class BIGSdbMLSTProfiler(AbstractAsyncContextManager): class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
@abstractmethod
def fetch_mlst_allele_variants(self, sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
pass
@abstractmethod
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
pass
@abstractmethod
async def profile_string(self, sequence_strings: Iterable[str]) -> MLSTProfile:
pass
@abstractmethod
def profile_multiple_strings(self, named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
pass
@abstractmethod
async def close(self):
pass
class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
def __init__(self, database_api: str, database_name: str, schema_id: int): def __init__(self, database_api: str, database_name: str, schema_id: int):
self._database_name = database_name self._database_name = database_name
self._schema_id = schema_id self._schema_id = schema_id
@ -20,86 +47,193 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
async def __aenter__(self): async def __aenter__(self):
return self return self
async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]: async def fetch_mlst_allele_variants(self, sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "sequence" uri_path = "sequence"
async with self._http_client.post(uri_path, json={ for sequence_string in sequence_strings:
"sequence": sequence_string, async with self._http_client.post(uri_path, json={
"partial_matches": not exact "sequence": sequence_string,
}) as response: "partial_matches": True
sequence_response: dict = await response.json() }) as response:
sequence_response: dict = await response.json()
if "exact_matches" in sequence_response:
# loci -> list of alleles with id and loci
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
elif "partial_matches" in sequence_response:
if exact:
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
for allele_loci, partial_match in partial_matches.items():
if len(partial_match) <= 0:
continue
partial_match_profile = PartialAllelicMatchProfile(
percent_identity=float(partial_match["identity"]),
mismatches=int(partial_match["mismatches"]),
bitscore=float(partial_match["bitscore"]),
gaps=int(partial_match["gaps"])
)
yield Allele(
allele_loci=allele_loci,
allele_variant=str(partial_match["allele"]),
partial_match_profile=partial_match_profile
)
else:
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
if "exact_matches" in sequence_response:
# loci -> list of alleles with id and loci
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
elif "partial_matches" in sequence_response:
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
for allele_loci, partial_match in partial_matches.items():
if len(partial_match) <= 0:
continue
partial_match_profile = PartialAllelicMatchProfile(
percent_identity=float(partial_match["identity"]),
mismatches=int(partial_match["mismatches"]),
gaps=int(partial_match["gaps"])
)
yield Allele(
allele_locus=allele_loci,
allele_variant=str(partial_match["allele"]),
partial_match_profile=partial_match_profile
)
else:
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile: async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
uri_path = "designations" uri_path = "designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list) allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
if isinstance(alleles, AsyncIterable): if isinstance(alleles, AsyncIterable):
async for allele in alleles: async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)}) allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
else: else:
for allele in alleles: for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)}) allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
request_json = { request_json = {
"designations": allele_request_dict "designations": allele_request_dict
} }
async with self._http_client.post(uri_path, json=request_json) as response: async with self._http_client.post(uri_path, json=request_json) as response:
response_json: dict = await response.json() response_json: dict = await response.json()
allele_map: dict[str, list[Allele]] = defaultdict(list) allele_map: dict[str, Allele] = {}
response_json.setdefault("fields", dict()) response_json.setdefault("fields", dict())
schema_fields_returned: dict[str, str] = response_json["fields"] schema_fields_returned: dict[str, str] = response_json["fields"]
schema_fields_returned.setdefault("ST", "unknown") schema_fields_returned.setdefault("ST", "unknown")
schema_fields_returned.setdefault("clonal_complex", "unknown") schema_fields_returned.setdefault("clonal_complex", "unknown")
schema_exact_matches: dict = response_json["exact_matches"] schema_exact_matches: dict = response_json["exact_matches"]
for exact_match_loci, exact_match_alleles in schema_exact_matches.items(): for exact_match_locus, exact_match_alleles in schema_exact_matches.items():
for exact_match_allele in exact_match_alleles: if len(exact_match_alleles) > 1:
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"], None)) raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(exact_match_alleles)})")
allele_map[exact_match_locus] = Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None)
if len(allele_map) == 0: if len(allele_map) == 0:
raise ValueError("Passed in no alleles.") raise ValueError("Passed in no alleles.")
return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile: async def profile_string(self, sequence_strings: Iterable[str]) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string, exact) alleles = self.fetch_mlst_allele_variants(sequence_strings)
return await self.fetch_mlst_st(alleles) return await self.fetch_mlst_st(alleles)
async def profile_multiple_strings(self, named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
async for named_strings in named_string_groups:
for named_string in named_strings:
try:
yield NamedMLSTProfile(named_string.name, (await self.profile_string([named_string.sequence])))
except NoBIGSdbMatchesException as e:
if stop_on_fail:
raise e
yield NamedMLSTProfile(named_string.name, None)
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]: async def close(self):
async for named_string in namedStrings: await self._http_client.close()
try:
yield (named_string.name, await self.profile_string(named_string.sequence, exact)) async def __aexit__(self, exc_type, exc_value, traceback):
except NoBIGSdbMatchesException as e: await self.close()
if stop_on_fail:
raise e class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
yield (named_string.name, None) def __init__(self, database_api: str, database_name: str, schema_id: int, cache_path: str):
self._database_api = database_api
self._database_name = database_name
self._schema_id = schema_id
self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
self._cache_path = cache_path
self._loci: list[str] = []
self._profiles = {}
async def load_scheme_locis(self):
self._loci.clear()
async with self._http_client.get("") as schema_response:
schema_json = await schema_response.json()
for locus in schema_json["loci"]:
locus_name = path.basename(locus)
self._loci.append(locus_name)
self._loci.sort()
async def load_scheme_profiles(self):
self._profiles.clear()
with open(self.get_scheme_profile_path()) as profile_cache_handle:
reader = csv.DictReader(profile_cache_handle, delimiter="\t")
for line in reader:
alleles = []
for locus in self._loci:
alleles.append(line[locus])
self._profiles[tuple(alleles)] = (line["ST"], line["clonal_complex"])
def get_locus_cache_path(self, locus) -> str:
return path.join(self._cache_path, locus + "." + "fasta")
def get_scheme_profile_path(self):
return path.join(self._cache_path, "profiles.csv")
async def download_alleles_cache_data(self):
for locus in self._loci:
with open(self.get_locus_cache_path(locus), "wb") as fasta_handle:
async with self._http_client.get(f"/db/{self._database_name}/loci/{locus}/alleles_fasta") as fasta_response:
async for chunk, eof in fasta_response.content.iter_chunks(): # TODO maybe allow chunking to be configurable
fasta_handle.write(chunk)
async def download_scheme_profiles(self):
with open(self.get_scheme_profile_path(), "wb") as profile_cache_handle:
async with self._http_client.get("profiles_csv") as profiles_response:
async for chunk, eof in profiles_response.content.iter_chunks():
profile_cache_handle.write(chunk)
async def fetch_mlst_allele_variants(self, sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
aligner = PairwiseAligner("blastn")
aligner.mode = "local"
for sequence_string in sequence_strings:
for locus in self._loci:
async for fasta_seq in read_fasta(self.get_locus_cache_path(locus)):
allele_variant = fasta_seq.name
alignment_results = aligner.align(sequence_string, fasta_seq.sequence)
top_alignment = sorted(alignment_results)[0]
top_alignment_stats = top_alignment.counts()
top_alignment_gaps = top_alignment_stats.gaps
top_alignment_identities = top_alignment_stats.identities
top_alignment_mismatches = top_alignment_stats.mismatches
if top_alignment_gaps == 0 and top_alignment_mismatches == 0:
yield Allele(locus, allele_variant, None)
else:
yield Allele(
locus,
allele_variant,
PartialAllelicMatchProfile(
percent_identity=top_alignment_identities/top_alignment.length,
mismatches=top_alignment_mismatches,
gaps=top_alignment_gaps
)
)
async def fetch_mlst_st(self, alleles):
allele_variants: dict[str, Allele] = {}
if isinstance(alleles, AsyncIterable):
async for allele in alleles:
allele_variants[allele.allele_locus] = allele
else:
for allele in alleles:
allele_variants[allele.allele_locus] = allele
ordered_profile = []
for locus in self._loci:
ordered_profile.append(allele_variants[locus].allele_variant)
st, clonal_complex = self._profiles[tuple(ordered_profile)]
return MLSTProfile(allele_variants, st, clonal_complex)
async def profile_string(self, sequence_strings: Iterable[str]) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(sequence_strings)
return await self.fetch_mlst_st(alleles)
async def profile_multiple_strings(self, named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
async for named_strings in named_string_groups:
for named_string in named_strings:
try:
yield NamedMLSTProfile(named_string.name, await self.profile_string([named_string.sequence]))
except NoBIGSdbMatchesException as e:
if stop_on_fail:
raise e
yield NamedMLSTProfile(named_string.name, None)
async def close(self): async def close(self):
await self._http_client.close() await self._http_client.close()
@ -156,8 +290,8 @@ class BIGSdbIndex(AbstractAsyncContextManager):
self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
return self._seqdefdb_schemas[seqdef_db_name] # type: ignore return self._seqdefdb_schemas[seqdef_db_name] # type: ignore
async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler: async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> OnlineBIGSdbMLSTProfiler:
return BIGSdbMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id) return OnlineBIGSdbMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
async def close(self): async def close(self):
await self._http_client.close() await self._http_client.close()

View File

@ -5,17 +5,21 @@ from typing import Mapping, Sequence, Union
class PartialAllelicMatchProfile: class PartialAllelicMatchProfile:
percent_identity: float percent_identity: float
mismatches: int mismatches: int
bitscore: float
gaps: int gaps: int
@dataclass(frozen=True) @dataclass(frozen=True)
class Allele: class Allele:
allele_loci: str allele_locus: str
allele_variant: str allele_variant: str
partial_match_profile: Union[None, PartialAllelicMatchProfile] partial_match_profile: Union[None, PartialAllelicMatchProfile]
@dataclass(frozen=True) @dataclass(frozen=True)
class MLSTProfile: class MLSTProfile:
alleles: Mapping[str, Sequence[Allele]] alleles: Mapping[str, Allele]
sequence_type: str sequence_type: str
clonal_complex: str clonal_complex: str
@dataclass(frozen=True)
class NamedMLSTProfile:
name: str
mlst_profile: Union[None, MLSTProfile]

View File

@ -6,7 +6,7 @@ import pytest
from autobigs.engine.data.structures.genomics import NamedString from autobigs.engine.data.structures.genomics import NamedString
from autobigs.engine.data.structures.mlst import Allele, MLSTProfile from autobigs.engine.data.structures.mlst import Allele, MLSTProfile
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
from autobigs.engine.data.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler from autobigs.engine.data.remote.databases.bigsdb import BIGSdbIndex, OnlineBIGSdbMLSTProfiler
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]): def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
rand = random.Random(gene) rand = random.Random(gene)
@ -20,19 +20,19 @@ def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet:
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact(): async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"} targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True): async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_strings=[sequence]):
assert isinstance(exact_match, Allele) assert isinstance(exact_match, Allele)
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
targets_left.remove(exact_match.allele_loci) targets_left.remove(exact_match.allele_locus)
assert len(targets_left) == 0 assert len(targets_left) == 0
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact(): async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
sequences = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta")) sequences = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta"))
mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"} mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
for sequence in sequences: for sequence in sequences:
match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description) match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description)
if match is None: if match is None:
@ -41,7 +41,7 @@ async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
if gene.lower() not in mlst_targets: if gene.lower() not in mlst_targets:
continue continue
scrambled = gene_scrambler(str(sequence.seq), 0.125) scrambled = gene_scrambler(str(sequence.seq), 0.125)
async for partial_match in profiler.fetch_mlst_allele_variants(scrambled, False): async for partial_match in profiler.fetch_mlst_allele_variants(scrambled):
assert partial_match.partial_match_profile is not None assert partial_match.partial_match_profile is not None
mlst_targets.remove(gene.lower()) mlst_targets.remove(gene.lower())
@ -60,7 +60,7 @@ async def test_institutpasteur_profiling_results_in_correct_mlst_st():
] ]
for dummy_allele in dummy_alleles: for dummy_allele in dummy_alleles:
yield dummy_allele yield dummy_allele
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator()) mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
assert mlst_st_data is not None assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile) assert isinstance(mlst_st_data, MLSTProfile)
@ -77,7 +77,7 @@ async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
Allele("pepA", "1", None), Allele("pepA", "1", None),
Allele("pgm", "5", None), Allele("pgm", "5", None),
] ]
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
mlst_profile = await dummy_profiler.fetch_mlst_st(dummy_alleles) mlst_profile = await dummy_profiler.fetch_mlst_st(dummy_alleles)
assert mlst_profile.clonal_complex == "unknown" assert mlst_profile.clonal_complex == "unknown"
assert mlst_profile.sequence_type == "unknown" assert mlst_profile.sequence_type == "unknown"
@ -85,7 +85,7 @@ async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
async def test_institutpasteur_sequence_profiling_is_correct(): async def test_institutpasteur_sequence_profiling_is_correct():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
profile = await dummy_profiler.profile_string(sequence) profile = await dummy_profiler.profile_string(sequence)
assert profile is not None assert profile is not None
assert isinstance(profile, MLSTProfile) assert isinstance(profile, MLSTProfile)
@ -104,8 +104,8 @@ async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
Allele("recA", "5", None), Allele("recA", "5", None),
} }
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq) sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True) exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_strings=sequence)
async for exact_match in exact_matches: async for exact_match in exact_matches:
assert isinstance(exact_match, Allele) assert isinstance(exact_match, Allele)
dummy_alleles.remove(exact_match) dummy_alleles.remove(exact_match)
@ -125,7 +125,7 @@ async def test_pubmlst_profiling_results_in_correct_st():
] ]
for dummy_allele in dummy_alleles: for dummy_allele in dummy_alleles:
yield dummy_allele yield dummy_allele
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets()) mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
assert mlst_st_data is not None assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile) assert isinstance(mlst_st_data, MLSTProfile)
@ -134,7 +134,7 @@ async def test_pubmlst_profiling_results_in_correct_st():
async def test_pubmlst_sequence_profiling_is_correct(): async def test_pubmlst_sequence_profiling_is_correct():
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq) sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
profile = await dummy_profiler.profile_string(sequence) profile = await dummy_profiler.profile_string(sequence)
assert profile is not None assert profile is not None
assert isinstance(profile, MLSTProfile) assert isinstance(profile, MLSTProfile)
@ -167,9 +167,10 @@ async def test_bigsdb_profile_multiple_strings_same_string_twice():
dummy_sequences = [NamedString("seq1", sequence), NamedString("seq2", sequence)] dummy_sequences = [NamedString("seq1", sequence), NamedString("seq2", sequence)]
async def generate_async_iterable_sequences(): async def generate_async_iterable_sequences():
for dummy_sequence in dummy_sequences: for dummy_sequence in dummy_sequences:
yield dummy_sequence yield [dummy_sequence]
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences()): async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences()):
name, profile = named_profile.name, named_profile.mlst_profile
assert profile is not None assert profile is not None
assert isinstance(profile, MLSTProfile) assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex" assert profile.clonal_complex == "ST-2 complex"
@ -180,9 +181,11 @@ async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop():
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)] dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
async def generate_async_iterable_sequences(): async def generate_async_iterable_sequences():
for dummy_sequence in dummy_sequences: for dummy_sequence in dummy_sequences:
yield dummy_sequence yield [dummy_sequence]
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), True): async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), True):
name, profile = name_profile.name, name_profile.mlst_profile
if name == "should_fail": if name == "should_fail":
assert profile is None assert profile is None
else: else:
@ -196,9 +199,10 @@ async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop():
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)] dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
async def generate_async_iterable_sequences(): async def generate_async_iterable_sequences():
for dummy_sequence in dummy_sequences: for dummy_sequence in dummy_sequences:
yield dummy_sequence yield [dummy_sequence]
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), False): async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), False):
name, profile = named_profile.name, named_profile.mlst_profile
if name == "should_fail": if name == "should_fail":
assert profile is not None assert profile is not None
assert profile.clonal_complex == "unknown" assert profile.clonal_complex == "unknown"
@ -216,10 +220,11 @@ async def test_bigsdb_profile_multiple_strings_fail_second_stop():
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", invalid_seq), NamedString("seq3", valid_seq)] dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", invalid_seq), NamedString("seq3", valid_seq)]
async def generate_async_iterable_sequences(): async def generate_async_iterable_sequences():
for dummy_sequence in dummy_sequences: for dummy_sequence in dummy_sequences:
yield dummy_sequence yield [dummy_sequence]
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
with pytest.raises(NoBIGSdbMatchesException): with pytest.raises(NoBIGSdbMatchesException):
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), exact=True, stop_on_fail=True): async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), stop_on_fail=True):
name, profile = named_profile.name, named_profile.mlst_profile
if name == "should_fail": if name == "should_fail":
pytest.fail("Exception should have been thrown, no exception was thrown.") pytest.fail("Exception should have been thrown, no exception was thrown.")
else: else: