Merge branch 'features/improved-oop-architecture' into features/non-exact-notation

This commit is contained in:
Harrison Deng 2025-02-12 17:02:22 +00:00
commit 36bca1b70d
4 changed files with 7 additions and 145 deletions

View File

@ -1,70 +0,0 @@
import asyncio
from concurrent.futures import Future, ThreadPoolExecutor
from contextlib import AbstractContextManager
from typing import Any, Set, Union
from Bio.Align import PairwiseAligner
from queue import Queue
from autobigs.engine.structures.alignment import AlignmentStats, PairwiseAlignment
class AsyncBiopythonPairwiseAlignmentEngine(AbstractContextManager):
    """Run pairwise alignments on a thread pool and hand results back asynchronously.

    Use as a context manager: entering creates the thread pool and exiting
    shuts it down. Queue jobs with :meth:`align`, then drain them with
    ``async for alignment, associated_data in engine`` or with repeated calls
    to :meth:`next_completed`. Results are delivered in completion order.
    The engine is written for a single consumer at a time.
    """

    def __init__(self, aligner: "PairwiseAligner", max_threads: int = 4):
        """Store the aligner and thread count; no threads exist until ``__enter__``.

        :param aligner: object whose ``align(reference, query)`` is called per job
        :param max_threads: worker count passed to the thread pool
        """
        self._max_threads = max_threads
        self._aligner = aligner
        # Futures that were submitted and whose completion callback has not
        # finished yet.
        self._work_left: Set[Future] = set()
        # Finished futures waiting to be handed to the consumer.
        self._work_complete: Queue[Future] = Queue()

    def __enter__(self):
        self._thread_pool = ThreadPoolExecutor(
            self._max_threads, thread_name_prefix="async-pairwise-alignment")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.shutdown()

    def align(self, reference: str, query: str, **associated_data):
        """Queue one alignment of ``query`` against ``reference``.

        ``associated_data`` is returned untouched next to the alignment so the
        caller can tell results apart.
        """
        work = self._thread_pool.submit(
            self.work, reference, query, **associated_data)
        # Register the job as outstanding BEFORE attaching the callback: on an
        # already finished future, add_done_callback runs the callback
        # immediately, and it must find the future in the set.
        self._work_left.add(work)
        work.add_done_callback(self._on_complete)

    def _on_complete(self, future: Future):
        # Publish first, then drop from the outstanding set, so a job is never
        # absent from both containers at once (next_completed relies on this).
        self._work_complete.put(future)
        self._work_left.discard(future)

    def work(self, reference, query, **associated_data):
        """Align synchronously and package the first alignment with its stats.

        :returns: ``(PairwiseAlignment, associated_data)``
        """
        alignments = self._aligner.align(reference, query)
        top_alignment = alignments[0]
        top_alignment_stats = top_alignment.counts()
        return PairwiseAlignment(
            top_alignment.sequences[0],
            top_alignment.sequences[1],
            tuple(top_alignment.indices[0]),
            tuple(top_alignment.indices[1]),
            AlignmentStats(
                percent_identity=top_alignment_stats.identities / top_alignment.length,
                mismatches=top_alignment_stats.mismatches,
                gaps=top_alignment_stats.gaps,
                match_metric=top_alignment.score,  # type: ignore
            )), associated_data

    async def next_completed(self) -> "Union[tuple[PairwiseAlignment, dict[str, Any]], None]":
        """Return the next finished ``(alignment, associated_data)`` pair.

        Returns ``None`` only when nothing is queued and nothing is still
        running. An exception raised inside a job is re-raised here.
        """
        # Check the outstanding set first. Because _on_complete enqueues before
        # it removes, an empty set guarantees every result is already queued,
        # so the emptiness check on the queue that follows is reliable.
        if not self._work_left and self._work_complete.empty():
            return None
        # Queue.get() blocks, so wait for it on a helper thread instead of
        # stalling the event loop while alignments are still running.
        finished = await asyncio.to_thread(self._work_complete.get)
        return await asyncio.wrap_future(finished)

    def __aiter__(self):
        return self

    async def __anext__(self):
        result = await self.next_completed()
        if result is None:
            raise StopAsyncIteration
        return result

    def shutdown(self):
        """Cancel jobs that have not started and wait for running ones to end."""
        self._thread_pool.shutdown(wait=True, cancel_futures=True)

View File

@ -1,26 +0,0 @@
import asyncio
from contextlib import AbstractAsyncContextManager
import tempfile
from typing import Iterable, Union
from Bio import Entrez
from Bio import SeqIO
from autobigs.engine.structures.genomics import AnnotatedString, StringAnnotation
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    """Download one GenBank record from NCBI and convert it to an AnnotatedString.

    :param genbank_id: identifier passed to ``Entrez.efetch`` as ``id``; it is
        also used as the name of the returned string
    :returns: the record's sequence with one ``StringAnnotation`` per feature
    """

    def _download_record():
        # Reading from the efetch handle is blocking I/O as well, so download
        # and parse together in a single worker-thread call rather than parsing
        # on the event loop.
        with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
            return SeqIO.read(fetch_stream, "genbank")

    record = await asyncio.to_thread(_download_record)
    sequence_features = []
    for feature in record.features:
        start = int(feature.location.start)
        end = int(feature.location.end)
        # Qualifier values are converted to sets; a fresh dict is built so the
        # parsed record itself is left unmodified.
        qualifiers = {
            qualifier_key: set(qualifier_values)
            for qualifier_key, qualifier_values in feature.qualifiers.items()
        }
        # NOTE(review): Biopython documents location.end as already
        # end-exclusive; confirm that the +1 below is what StringAnnotation
        # expects and not an off-by-one.
        sequence_features.append(StringAnnotation(
            type=feature.type,
            start=start,
            end=end+1,  # Position is exclusive
            feature_properties=qualifiers
        ))
    return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)

View File

@ -1,42 +0,0 @@
from Bio import SeqIO
from Bio.Align import PairwiseAligner
from pytest import mark
from pytest import fixture
from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
from autobigs.engine.structures.alignment import PairwiseAlignment
@fixture
def tohamaI_bpertussis_adk():
    """Sequence string read from the Tohama I B. pertussis adk FASTA resource."""
    fasta_record = SeqIO.read(
        "tests/resources/tohama_I_bpertussis_adk.fasta",
        format="fasta",
    )
    return str(fasta_record.seq)
@fixture
def tohamaI_bpertussis_genome():
    """Sequence string read from the Tohama I B. pertussis genome FASTA resource."""
    fasta_record = SeqIO.read(
        "tests/resources/tohama_I_bpertussis.fasta",
        format="fasta",
    )
    return str(fasta_record.seq)
@fixture
def fdaargos_1560_hinfluenza_adk():
    """Sequence string read from the FDAARGOS 1560 H. influenzae adk FASTA resource."""
    fasta_record = SeqIO.read(
        "tests/resources/fdaargos_1560_hinfluenza_adk.fasta",
        format="fasta",
    )
    return str(fasta_record.seq)
@fixture
def fdaargos_1560_hinfluenza_genome():
    """Sequence string read from the FDAARGOS 1560 H. influenzae genome FASTA resource."""
    fasta_record = SeqIO.read(
        "tests/resources/fdaargos_1560_hinfluenza.fasta",
        format="fasta",
    )
    return str(fasta_record.seq)
@fixture(params=[1, 2])
def dummy_engine(request):
    """Yield an entered alignment engine, once per parametrized thread count."""
    thread_count = request.param
    # Local-mode aligner built from the "blastn" scoring preset.
    local_aligner = PairwiseAligner("blastn")
    local_aligner.mode = "local"
    engine_under_test = AsyncBiopythonPairwiseAlignmentEngine(local_aligner, thread_count)
    with engine_under_test as entered_engine:
        yield entered_engine
class TestAsyncPairwiseAlignmentEngine:
    """Smoke tests: every queued alignment must come back as a PairwiseAlignment.

    Each test counts the yielded results. Without the count, a loop that
    yields nothing would make the test pass without checking anything.
    """

    async def test_single_alignment_no_errors_single_alignment(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk: str, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine):
        dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk)
        completed = 0
        async for alignment, additional_information in dummy_engine:
            assert isinstance(alignment, PairwiseAlignment)
            completed += 1
        # One job was queued, so exactly one result must have been produced.
        assert completed == 1

    async def test_single_alignment_no_errors_multiple(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk, fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine):
        dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk)
        dummy_engine.align(fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk)
        completed = 0
        async for alignment, additional_information in dummy_engine:
            assert isinstance(alignment, PairwiseAlignment)
            completed += 1
        # Two jobs were queued, so exactly two results must have been produced.
        assert completed == 2

View File

@ -61,12 +61,12 @@ hinfluenzae_fdaargos_profile = MLSTProfile((
), "3", "ST-3 complex")
# Profile passed as the `bad_profile` argument in the parametrized cases below.
# NOTE(review): this text comes from a diff view that shows no +/- markers. The
# first six alleles (all "1") and the six that follow look like the old and new
# sides of one hunk rather than thirteen entries of a single tuple -- confirm
# against the repository before relying on the contents.
hinfluenzae_fdaargos_bad_profile = MLSTProfile((
Allele("adk", "1", None),
Allele("atpG", "1", None),
Allele("frdB", "1", None),
Allele("fucK", "1", None),
Allele("mdh", "1", None),
Allele("pgi", "1", None),
Allele("adk", "3", None),
Allele("atpG", "121", None),
Allele("frdB", "6", None),
Allele("fucK", "5", None),
Allele("mdh", "12", None),
Allele("pgi", "4", None),
Allele("recA", "5", None)
), "3", "ST-3 complex")
@ -76,7 +76,7 @@ hinfluenzae_fdaargos_fragmented_sequence = tuple(SeqIO.parse("tests/resources/to
@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
(False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
(False, "https://bigsdb.pasteur.fr/api", "pubmlst_hinfluenzae_seqdef", 1, "fdaargos_1560_hinfluenza.fasta", "fdaargos_1560_hinfluenza_features.fasta", hinfluenzae_fdaargos_profile, hinfluenzae_fdaargos_bad_profile),
(False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "fdaargos_1560_hinfluenza.fasta", "fdaargos_1560_hinfluenza_features.fasta", hinfluenzae_fdaargos_profile, hinfluenzae_fdaargos_bad_profile),
])
class TestBIGSdbMLSTProfiler:
async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):