From f462e6d5e0ea117ad51296ee93851d373e0ca5f2 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Tue, 11 Feb 2025 19:24:23 +0000 Subject: [PATCH 1/4] Moved "LazyPersistentCachedBIGSdbMLSTProfiler" to separate branch and deleted from current branch --- .../engine/data/remote/databases/bigsdb.py | 110 ------------------ 1 file changed, 110 deletions(-) diff --git a/src/autobigs/engine/data/remote/databases/bigsdb.py b/src/autobigs/engine/data/remote/databases/bigsdb.py index f7ba79a..206a9f1 100644 --- a/src/autobigs/engine/data/remote/databases/bigsdb.py +++ b/src/autobigs/engine/data/remote/databases/bigsdb.py @@ -131,116 +131,6 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): async def __aexit__(self, exc_type, exc_value, traceback): await self.close() -class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): - def __init__(self, database_api: str, database_name: str, schema_id: int, cache_path: str): - self._database_api = database_api - self._database_name = database_name - self._schema_id = schema_id - self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/" - self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000)) - self._cache_path = cache_path - self._loci: list[str] = [] - self._profiles = {} - - async def load_scheme_locis(self): - self._loci.clear() - async with self._http_client.get("") as schema_response: - schema_json = await schema_response.json() - for locus in schema_json["loci"]: - locus_name = path.basename(locus) - self._loci.append(locus_name) - self._loci.sort() - - async def load_scheme_profiles(self): - self._profiles.clear() - with open(self.get_scheme_profile_path()) as profile_cache_handle: - reader = csv.DictReader(profile_cache_handle, delimiter="\t") - for line in reader: - alleles = [] - for locus in self._loci: - alleles.append(line[locus]) - self._profiles[tuple(alleles)] = (line["ST"], line["clonal_complex"]) - - def get_locus_cache_path(self, locus) -> str: - return path.join(self._cache_path, locus + "." + "fasta") - - def get_scheme_profile_path(self): - return path.join(self._cache_path, "profiles.csv") - - async def download_alleles_cache_data(self): - for locus in self._loci: - with open(self.get_locus_cache_path(locus), "wb") as fasta_handle: - async with self._http_client.get(f"/db/{self._database_name}/loci/{locus}/alleles_fasta") as fasta_response: - async for chunk, eof in fasta_response.content.iter_chunks(): # TODO maybe allow chunking to be configurable - fasta_handle.write(chunk) - - async def download_scheme_profiles(self): - with open(self.get_scheme_profile_path(), "wb") as profile_cache_handle: - async with self._http_client.get("profiles_csv") as profiles_response: - async for chunk, eof in profiles_response.content.iter_chunks(): - profile_cache_handle.write(chunk) - - async def fetch_mlst_allele_variants(self, sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]: - aligner = PairwiseAligner("blastn") - aligner.mode = "local" - for sequence_string in sequence_strings: - for locus in self._loci: - async for fasta_seq in read_fasta(self.get_locus_cache_path(locus)): - allele_variant = fasta_seq.name - alignment_results = aligner.align(sequence_string, fasta_seq.sequence) - top_alignment = sorted(alignment_results)[0] - top_alignment_stats = top_alignment.counts() - top_alignment_gaps = top_alignment_stats.gaps - top_alignment_identities = top_alignment_stats.identities - top_alignment_mismatches = top_alignment_stats.mismatches - if top_alignment_gaps == 0 and top_alignment_mismatches == 0: - yield Allele(locus, allele_variant, None) - else: - yield Allele( - locus, - allele_variant, - PartialAllelicMatchProfile( - percent_identity=top_alignment_identities/top_alignment.length, - mismatches=top_alignment_mismatches, - gaps=top_alignment_gaps - ) - ) - - async def fetch_mlst_st(self, alleles): - allele_variants: dict[str, Allele] = {} - if isinstance(alleles, AsyncIterable): - async for allele in alleles: - allele_variants[allele.allele_locus] = allele - else: - for allele in alleles: - allele_variants[allele.allele_locus] = allele - ordered_profile = [] - for locus in self._loci: - ordered_profile.append(allele_variants[locus].allele_variant) - - st, clonal_complex = self._profiles[tuple(ordered_profile)] - return MLSTProfile(allele_variants, st, clonal_complex) - - async def profile_string(self, sequence_strings: Iterable[str]) -> MLSTProfile: - alleles = self.fetch_mlst_allele_variants(sequence_strings) - return await self.fetch_mlst_st(alleles) - - async def profile_multiple_strings(self, named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]: - async for named_strings in named_string_groups: - for named_string in named_strings: - try: - yield NamedMLSTProfile(named_string.name, await self.profile_string([named_string.sequence])) - except NoBIGSdbMatchesException as e: - if stop_on_fail: - raise e - yield NamedMLSTProfile(named_string.name, None) - - async def close(self): - await self._http_client.close() - - async def __aexit__(self, exc_type, exc_value, traceback): - await self.close() - class BIGSdbIndex(AbstractAsyncContextManager): KNOWN_BIGSDB_APIS = { "https://bigsdb.pasteur.fr/api", From c18d817cd9a05869c960644dbe10632d1b914d5c Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Wed, 12 Feb 2025 14:38:12 +0000 Subject: [PATCH 2/4] Added test to verify that CSV target columns are ordered --- tests/autobigs/engine/data/local/test_csv.py | 29 ++++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tests/autobigs/engine/data/local/test_csv.py b/tests/autobigs/engine/data/local/test_csv.py index 084df9e..9218873 100644 --- a/tests/autobigs/engine/data/local/test_csv.py +++ b/tests/autobigs/engine/data/local/test_csv.py @@ -1,6 +1,13 @@ -from autobigs.engine.data.local.csv import dict_loci_alleles_variants_from_loci -from autobigs.engine.data.structures.mlst import Allele +from typing import AsyncIterable, Iterable +from autobigs.engine.data.local.csv import dict_loci_alleles_variants_from_loci, write_mlst_profiles_as_csv +from autobigs.engine.data.structures.mlst import Allele, MLSTProfile +import tempfile +from csv import reader +from os import path +async def iterable_to_asynciterable(iterable: Iterable): + for iterated in iterable: + yield iterated def test_dict_loci_alleles_variants_from_loci_single_loci_not_list(): alleles_map = { @@ -18,4 +25,20 @@ def test_dict_loci_alleles_variants_from_loci_multi_loci_is_list(): results = dict_loci_alleles_variants_from_loci(alleles_map) for loci, variant in results.items(): assert isinstance(variant, list) - assert len(variant) == 2 \ No newline at end of file + assert len(variant) == 2 + +async def test_column_order_is_same_as_expected_file(): + dummy_profiles = [("test_1", MLSTProfile({ + "A": Allele("A", "1", None), + "D": Allele("D", "1", None), + "B": Allele("B", "1", None), + "C": Allele("C", "1", None) + }, "mysterious", "very mysterious"))] + with tempfile.TemporaryDirectory() as temp_dir: + output_path = path.join(temp_dir, "out.csv") + await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path) + with open(output_path) as csv_handle: + csv_reader = reader(csv_handle) + lines = list(csv_reader) + target_columns = lines[4:] + assert target_columns == sorted(target_columns) \ No newline at end of file From a88225fcffe893c9b265bd0cbf3a5e166c2bae6f Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Wed, 12 Feb 2025 14:46:29 +0000 Subject: [PATCH 3/4] Added check to wrap string into list to prevent decomposing string for querying --- src/autobigs/engine/data/remote/databases/bigsdb.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/autobigs/engine/data/remote/databases/bigsdb.py b/src/autobigs/engine/data/remote/databases/bigsdb.py index 206a9f1..116a73f 100644 --- a/src/autobigs/engine/data/remote/databases/bigsdb.py +++ b/src/autobigs/engine/data/remote/databases/bigsdb.py @@ -47,10 +47,13 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): async def __aenter__(self): return self - async def fetch_mlst_allele_variants(self, sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]: + async def fetch_mlst_allele_variants(self, sequence_strings: Union[Iterable[str], str]) -> AsyncGenerator[Allele, Any]: # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes uri_path = "sequence" + if isinstance(sequence_strings, str): + sequence_strings = [sequence_strings] + for sequence_string in sequence_strings: async with self._http_client.post(uri_path, json={ "sequence": sequence_string, From bfc286e6b02e5dd45468ccbfc13d837d46f5d618 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Wed, 12 Feb 2025 14:57:51 +0000 Subject: [PATCH 4/4] Updated test cases to reflect changes in codebase MLSTProfile will always return a value, even if there were no exact matches. Removed a test case specifically testing for stopping on failure, which is a removed feature. --- .../data/remote/databases/test_bigsdb.py | 26 +++---------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/tests/autobigs/engine/data/remote/databases/test_bigsdb.py b/tests/autobigs/engine/data/remote/databases/test_bigsdb.py index 74414a8..17676df 100644 --- a/tests/autobigs/engine/data/remote/databases/test_bigsdb.py +++ b/tests/autobigs/engine/data/remote/databases/test_bigsdb.py @@ -186,11 +186,12 @@ async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(): async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), True): name, profile = name_profile.name, name_profile.mlst_profile + assert profile is not None + assert isinstance(profile, MLSTProfile) if name == "should_fail": - assert profile is None + assert profile.clonal_complex == "unknown" + assert profile.sequence_type == "unknown" else: - assert profile is not None - assert isinstance(profile, MLSTProfile) assert profile.clonal_complex == "ST-2 complex" assert profile.sequence_type == "1" @@ -214,25 +215,6 @@ async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(): assert profile.clonal_complex == "ST-2 complex" assert profile.sequence_type == "1" -async def test_bigsdb_profile_multiple_strings_fail_second_stop(): - valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq) - invalid_seq = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq) - dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", invalid_seq), NamedString("seq3", valid_seq)] - async def generate_async_iterable_sequences(): - for dummy_sequence in dummy_sequences: - yield [dummy_sequence] - async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler: - with pytest.raises(NoBIGSdbMatchesException): - async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), stop_on_fail=True): - name, profile = named_profile.name, named_profile.mlst_profile - if name == "should_fail": - pytest.fail("Exception should have been thrown, no exception was thrown.") - else: - assert profile is not None - assert isinstance(profile, MLSTProfile) - assert profile.clonal_complex == "ST-2 complex" - assert profile.sequence_type == "1" - async def test_bigsdb_index_get_schemas_for_bordetella(): async with BIGSdbIndex() as index: schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")