diff --git a/src/autobigs/engine/analysis/aligners.py b/src/autobigs/engine/analysis/aligners.py deleted file mode 100644 index 8909902..0000000 --- a/src/autobigs/engine/analysis/aligners.py +++ /dev/null @@ -1,70 +0,0 @@ -import asyncio -from concurrent.futures import Future, ThreadPoolExecutor -from contextlib import AbstractContextManager -from typing import Any, Set, Union -from Bio.Align import PairwiseAligner -from queue import Queue - -from autobigs.engine.structures.alignment import AlignmentStats, PairwiseAlignment - -class AsyncBiopythonPairwiseAlignmentEngine(AbstractContextManager): - def __enter__(self): - self._thread_pool = ThreadPoolExecutor(self._max_threads, thread_name_prefix="async-pairwise-alignment") - return self - - def __init__(self, aligner: PairwiseAligner, max_threads: int = 4): - self._max_threads = max_threads - self._aligner = aligner - self._work_left: Set[Future] = set() - self._work_complete: Queue[Future] = Queue() - - def align(self, reference: str, query: str, **associated_data): - work = self._thread_pool.submit( - self.work, reference, query, **associated_data) - work.add_done_callback(self._on_complete) - self._work_left.add(work) - - def _on_complete(self, future: Future): - self._work_left.remove(future) - self._work_complete.put(future) - - def work(self, reference, query, **associated_data): - alignments = self._aligner.align(reference, query) - top_alignment = alignments[0] - top_alignment_stats = top_alignment.counts() - top_alignment_gaps = top_alignment_stats.gaps - top_alignment_identities = top_alignment_stats.identities - top_alignment_mismatches = top_alignment_stats.mismatches - top_alignment_score = top_alignment.score # type: ignore - return PairwiseAlignment( - top_alignment.sequences[0], - top_alignment.sequences[1], - tuple(top_alignment.indices[0]), - tuple(top_alignment.indices[1]), - AlignmentStats( - percent_identity=top_alignment_identities/top_alignment.length, - mismatches=top_alignment_mismatches, - gaps=top_alignment_gaps, - match_metric=top_alignment_score - )), associated_data - - async def next_completed(self) -> Union[tuple[PairwiseAlignment, dict[str, Any]], None]: - if self._work_complete.empty() and len(self._work_left): - return None - completed_alignment = await asyncio.wrap_future(self._work_complete.get()) - return completed_alignment - - def __exit__(self, exc_type, exc_value, traceback): - self.shutdown() - - def __aiter__(self): - return self - - async def __anext__(self): - result = await self.next_completed() - if result is None: - raise StopAsyncIteration - return result - - def shutdown(self): - self._thread_pool.shutdown(wait=True, cancel_futures=True) diff --git a/src/autobigs/engine/analysis/genbank.py b/src/autobigs/engine/analysis/genbank.py deleted file mode 100644 index 8d771ad..0000000 --- a/src/autobigs/engine/analysis/genbank.py +++ /dev/null @@ -1,26 +0,0 @@ -import asyncio -from contextlib import AbstractAsyncContextManager -import tempfile -from typing import Iterable, Union -from Bio import Entrez -from Bio import SeqIO - -from autobigs.engine.structures.genomics import AnnotatedString, StringAnnotation - -async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString: - with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream: - record = SeqIO.read(fetch_stream, "genbank") - sequence_features = list() - for feature in record.features: - start = int(feature.location.start) - end = int(feature.location.end) - qualifiers = feature.qualifiers - for qualifier_key in qualifiers: - qualifiers[qualifier_key] = set(qualifiers[qualifier_key]) - sequence_features.append(StringAnnotation( - type=feature.type, - start=start, - end=end+1, # Position is exclusive - feature_properties=qualifiers - )) - return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features) diff --git a/tests/autobigs/engine/analysis/test_aligners.py b/tests/autobigs/engine/analysis/test_aligners.py deleted file mode 100644 index 6285b98..0000000 --- a/tests/autobigs/engine/analysis/test_aligners.py +++ /dev/null @@ -1,42 +0,0 @@ -from Bio import SeqIO -from Bio.Align import PairwiseAligner -from pytest import mark -from pytest import fixture -from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine -from autobigs.engine.structures.alignment import PairwiseAlignment - -@fixture -def tohamaI_bpertussis_adk(): - return str(SeqIO.read("tests/resources/tohama_I_bpertussis_adk.fasta", format="fasta").seq) - -@fixture -def tohamaI_bpertussis_genome(): - return str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", format="fasta").seq) - -@fixture -def fdaargos_1560_hinfluenza_adk(): - return str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza_adk.fasta", format="fasta").seq) - -@fixture -def fdaargos_1560_hinfluenza_genome(): - return str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", format="fasta").seq) - - -@fixture(params=[1, 2]) -def dummy_engine(request): - aligner = PairwiseAligner("blastn") - aligner.mode = "local" - with AsyncBiopythonPairwiseAlignmentEngine(aligner, request.param) as engine: - yield engine - -class TestAsyncPairwiseAlignmentEngine: - async def test_single_alignment_no_errors_single_alignment(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk: str, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine): - dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk) - async for alignment, additional_information in dummy_engine: - assert isinstance(alignment, PairwiseAlignment) - - async def test_single_alignment_no_errors_multiple(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk, fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine): - dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk) - dummy_engine.align(fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk) - async for alignment, additional_information in dummy_engine: - assert isinstance(alignment, PairwiseAlignment) \ No newline at end of file diff --git a/tests/autobigs/engine/analysis/test_bigsdb.py b/tests/autobigs/engine/analysis/test_bigsdb.py index d689d65..81cfa32 100644 --- a/tests/autobigs/engine/analysis/test_bigsdb.py +++ b/tests/autobigs/engine/analysis/test_bigsdb.py @@ -61,12 +61,12 @@ hinfluenzae_fdaargos_profile = MLSTProfile(( ), "3", "ST-3 complex") hinfluenzae_fdaargos_bad_profile = MLSTProfile(( - Allele("adk", "1", None), - Allele("atpG", "1", None), - Allele("frdB", "1", None), - Allele("fucK", "1", None), - Allele("mdh", "1", None), - Allele("pgi", "1", None), + Allele("adk", "3", None), + Allele("atpG", "121", None), + Allele("frdB", "6", None), + Allele("fucK", "5", None), + Allele("mdh", "12", None), + Allele("pgi", "4", None), Allele("recA", "5", None) ), "3", "ST-3 complex") @@ -76,7 +76,7 @@ hinfluenzae_fdaargos_fragmented_sequence = tuple(SeqIO.parse("tests/resources/to @pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [ (False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile), - (False, "https://bigsdb.pasteur.fr/api", "pubmlst_hinfluenzae_seqdef", 1, "fdaargos_1560_hinfluenza.fasta", "fdaargos_1560_hinfluenza_features.fasta", hinfluenzae_fdaargos_profile, hinfluenzae_fdaargos_bad_profile), + (False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "fdaargos_1560_hinfluenza.fasta", "fdaargos_1560_hinfluenza_features.fasta", hinfluenzae_fdaargos_profile, hinfluenzae_fdaargos_bad_profile), ]) class TestBIGSdbMLSTProfiler: async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):