Implemented annotated local typing method without testing
This commit is contained in:
		
							
								
								
									
										71
									
								
								src/autobigs/engine/analysis/aligners.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								src/autobigs/engine/analysis/aligners.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,71 @@ | ||||
| import asyncio | ||||
| from concurrent.futures import Future, ThreadPoolExecutor | ||||
| from contextlib import AbstractContextManager | ||||
| from typing import Any, Set, Union | ||||
| from Bio.Align import PairwiseAligner | ||||
| from queue import Queue | ||||
|  | ||||
| from autobigs.engine.structures.alignment import AlignmentStats, PairwiseAlignment | ||||
|  | ||||
|  | ||||
class AsyncPairwiseAlignmentEngine(AbstractContextManager):
    """Run pairwise alignments on a thread pool and consume the results asynchronously.

    Use the engine as a context manager: entering creates the thread pool and
    exiting shuts it down. Alignments are queued with :meth:`align` and their
    results are retrieved, in completion order, either through
    :meth:`next_completed` or by iterating the engine with ``async for``.
    """

    def __init__(self, aligner: "PairwiseAligner", max_threads: int = 4):
        """Store the aligner and the pool size; the pool itself is created on ``__enter__``.

        Args:
            aligner: Aligner whose ``align(reference, query)`` performs each alignment.
            max_threads: Maximum number of worker threads in the pool.
        """
        self._max_threads = max_threads
        self._aligner = aligner
        # Futures that were submitted but not yet handed back to the consumer.
        self._work_left: Set[Future] = set()
        # Futures that finished (or were cancelled), in completion order.
        self._work_complete: Queue[Future] = Queue()

    def __enter__(self):
        self._thread_pool = ThreadPoolExecutor(self._max_threads)
        return self

    def align(self, reference: str, query: str, **associated_data):
        """Queue an alignment of ``query`` against ``reference``.

        Any keyword arguments are carried along untouched and returned next to
        the alignment, so callers can tag each request.
        """
        work = self._thread_pool.submit(
            self.work, reference, query, **associated_data)
        # Register the future before attaching the callback so that a future
        # which completes immediately is never queued while still untracked.
        self._work_left.add(work)
        work.add_done_callback(self._on_complete)

    def _on_complete(self, future: Future):
        # Invoked by concurrent.futures once the future is finished or
        # cancelled; Queue is thread-safe, so this is safe from any thread.
        self._work_complete.put(future)

    def work(self, reference, query, **associated_data):
        """Align ``query`` to ``reference`` and summarise the first-ranked alignment.

        Returns:
            A ``(PairwiseAlignment, associated_data)`` tuple.
        """
        # min() selects the same element as sorted(...)[0] without
        # materialising and ordering every alignment.
        alignment_results = min(self._aligner.align(reference, query))
        top_alignment_stats = alignment_results.counts()
        top_alignment_gaps = top_alignment_stats.gaps
        top_alignment_identities = top_alignment_stats.identities
        top_alignment_mismatches = top_alignment_stats.mismatches
        top_alignment_score = alignment_results.score # type: ignore
        return PairwiseAlignment(
            alignment_results.sequences[0],
            alignment_results.sequences[1],
            alignment_results.indices[0],
            alignment_results.indices[1],
            AlignmentStats(
                percent_identity=top_alignment_identities/alignment_results.length,
                mismatches=top_alignment_mismatches,
                gaps=top_alignment_gaps,
                score=top_alignment_score
            )), associated_data

    async def next_completed(self) -> "Union[tuple[PairwiseAlignment, dict[str, Any]], None]":
        """Wait for the next finished alignment.

        Returns:
            The ``(PairwiseAlignment, associated_data)`` tuple of the next
            alignment to finish, or ``None`` once every queued alignment has
            already been returned.

        Raises:
            Whatever exception the alignment raised in its worker thread
            (including cancellation), re-raised by ``Future.result()``.
        """
        if not self._work_left:
            # Nothing outstanding: waiting on the queue would block forever.
            return None
        # Queue.get() blocks, so wait in a helper thread and keep the event
        # loop free while alignments are still running.
        finished: Future = await asyncio.to_thread(self._work_complete.get)
        self._work_left.discard(finished)
        return finished.result()

    def __exit__(self, exc_type, exc_value, traceback):
        self.shutdown()

    def __aiter__(self):
        return self

    async def __anext__(self):
        result = await self.next_completed()
        if result is None:
            raise StopAsyncIteration
        return result

    def shutdown(self):
        """Stop the pool: cancel alignments not yet started and wait for running ones."""
        self._thread_pool.shutdown(wait=True, cancel_futures=True)
| @@ -1,15 +1,21 @@ | ||||
| from abc import abstractmethod | ||||
| import asyncio | ||||
| from collections import defaultdict | ||||
| from contextlib import AbstractAsyncContextManager | ||||
| import csv | ||||
| from os import path | ||||
| from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequence, Union | ||||
| import os | ||||
| import shutil | ||||
| import tempfile | ||||
| from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequence, Set, Union | ||||
| 
 | ||||
| from aiohttp import ClientSession, ClientTimeout | ||||
| 
 | ||||
| from autobigs.engine.data.local.fasta import read_fasta | ||||
| from autobigs.engine.data.structures.genomics import NamedString | ||||
| from autobigs.engine.data.structures.mlst import Allele, NamedMLSTProfile, PartialAllelicMatchProfile, MLSTProfile | ||||
| from autobigs.engine.analysis.aligners import AsyncPairwiseAlignmentEngine | ||||
| from autobigs.engine.reading import read_fasta | ||||
| from autobigs.engine.structures.alignment import PairwiseAlignment | ||||
| from autobigs.engine.structures.genomics import NamedString | ||||
| from autobigs.engine.structures.mlst import Allele, NamedMLSTProfile, AlignmentStats, MLSTProfile | ||||
| from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException | ||||
| 
 | ||||
| from Bio.Align import PairwiseAligner | ||||
| @@ -17,26 +23,26 @@ from Bio.Align import PairwiseAligner | ||||
| class BIGSdbMLSTProfiler(AbstractAsyncContextManager): | ||||
| 
 | ||||
|     @abstractmethod | ||||
|     def fetch_mlst_allele_variants(self, sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]: | ||||
|     def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]: | ||||
|         pass | ||||
| 
 | ||||
|     @abstractmethod | ||||
|     async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile: | ||||
|     async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile: | ||||
|         pass | ||||
| 
 | ||||
|     @abstractmethod | ||||
|     async def profile_string(self, sequence_strings: Iterable[str]) -> MLSTProfile: | ||||
|     async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile: | ||||
|         pass | ||||
| 
 | ||||
|     @abstractmethod | ||||
|     def profile_multiple_strings(self, named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]: | ||||
|     def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]: | ||||
|         pass | ||||
| 
 | ||||
|     @abstractmethod | ||||
|     async def close(self): | ||||
|         pass | ||||
| 
 | ||||
| class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
| class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
| 
 | ||||
|     def __init__(self, database_api: str, database_name: str, schema_id: int): | ||||
|         self._database_name = database_name | ||||
| @@ -47,11 +53,13 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|     async def __aenter__(self): | ||||
|         return self | ||||
| 
 | ||||
|     async def fetch_mlst_allele_variants(self, sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]: | ||||
|     async def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]: | ||||
|         # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes | ||||
|         uri_path = "sequence" | ||||
|         if not isinstance(query_sequence_strings, Iterable): | ||||
|             raise ValueError("Invalid data type for parameter \"sequence_strings\".") | ||||
| 
 | ||||
|         for sequence_string in sequence_strings: | ||||
|         for sequence_string in query_sequence_strings: | ||||
|             async with self._http_client.post(uri_path, json={ | ||||
|                 "sequence": sequence_string, | ||||
|                 "partial_matches": True | ||||
| @@ -70,10 +78,11 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|                     for allele_loci, partial_match in partial_matches.items(): | ||||
|                         if len(partial_match) <= 0: | ||||
|                             continue | ||||
|                         partial_match_profile = PartialAllelicMatchProfile( | ||||
|                         partial_match_profile = AlignmentStats( | ||||
|                             percent_identity=float(partial_match["identity"]), | ||||
|                             mismatches=int(partial_match["mismatches"]), | ||||
|                             gaps=int(partial_match["gaps"]) | ||||
|                             gaps=int(partial_match["gaps"]), | ||||
|                             score=int(partial_match["score"]) | ||||
|                         ) | ||||
|                         yield Allele( | ||||
|                             allele_locus=allele_loci, | ||||
| @@ -83,7 +92,7 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|                 else: | ||||
|                     raise NoBIGSdbMatchesException(self._database_name, self._schema_id) | ||||
| 
 | ||||
|     async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile: | ||||
|     async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile: | ||||
|         uri_path = "designations" | ||||
|         allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list) | ||||
|         if isinstance(alleles, AsyncIterable): | ||||
| @@ -97,7 +106,7 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|         } | ||||
|         async with self._http_client.post(uri_path, json=request_json) as response: | ||||
|             response_json: dict = await response.json() | ||||
|             allele_map: dict[str, Allele] = {} | ||||
|             allele_set: Set[Allele] = set() | ||||
|             response_json.setdefault("fields", dict()) | ||||
|             schema_fields_returned: dict[str, str] = response_json["fields"] | ||||
|             schema_fields_returned.setdefault("ST", "unknown") | ||||
| @@ -106,17 +115,17 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|             for exact_match_locus, exact_match_alleles in schema_exact_matches.items(): | ||||
|                 if len(exact_match_alleles) > 1: | ||||
|                     raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(exact_match_alleles)})") | ||||
|                 allele_map[exact_match_locus] = Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None) | ||||
|             if len(allele_map) == 0: | ||||
|                 allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None)) | ||||
|             if len(allele_set) == 0: | ||||
|                 raise ValueError("Passed in no alleles.") | ||||
|             return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) | ||||
|             return MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) | ||||
| 
 | ||||
|     async def profile_string(self, sequence_strings: Iterable[str]) -> MLSTProfile: | ||||
|         alleles = self.fetch_mlst_allele_variants(sequence_strings) | ||||
|         return await self.fetch_mlst_st(alleles) | ||||
|     async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile: | ||||
|         alleles = self.determine_mlst_allele_variants(query_sequence_strings) | ||||
|         return await self.determine_mlst_st(alleles) | ||||
| 
 | ||||
|     async def profile_multiple_strings(self, named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]: | ||||
|         async for named_strings in named_string_groups: | ||||
|     async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]: | ||||
|         async for named_strings in query_named_string_groups: | ||||
|             for named_string in named_strings: | ||||
|                 try: | ||||
|                     yield NamedMLSTProfile(named_string.name, (await self.profile_string([named_string.sequence]))) | ||||
| @@ -131,20 +140,36 @@ class OnlineBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|     async def __aexit__(self, exc_type, exc_value, traceback): | ||||
|         await self.close() | ||||
| 
 | ||||
| class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|     def __init__(self, database_api: str, database_name: str, schema_id: int, cache_path: str): | ||||
| class LocalBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|     async def __aenter__(self): | ||||
|         if self._prepare: | ||||
|             await self.update_scheme_locis() | ||||
|             await asyncio.gather( | ||||
|                 self.download_alleles_cache_data(), | ||||
|                 self.download_scheme_profiles() | ||||
|             ) | ||||
|             await self.load_scheme_profiles() | ||||
|         return self | ||||
|      | ||||
|     def __init__(self, database_api: str, database_name: str, schema_id: int, cache_path: Union[str, None] = None, prepare: bool =True): | ||||
|         self._database_api = database_api | ||||
|         self._database_name = database_name | ||||
|         self._schema_id = schema_id | ||||
|         self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/" | ||||
|         self._base_url = f"{self._database_api}/db/{self._database_name}/schemes/{self._schema_id}/" | ||||
|         self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000)) | ||||
|         self._cache_path = cache_path | ||||
|         if cache_path is None: | ||||
|             self._cache_path = tempfile.mkdtemp("BIGSdb") | ||||
|             self._cleanup_required = True | ||||
|         else: | ||||
|             self._cache_path = cache_path | ||||
|             self._cleanup_required = False | ||||
|         self._loci: list[str] = [] | ||||
|         self._profiles = {} | ||||
|         self._profiles_st_map = {} | ||||
|         self._prepare = prepare | ||||
| 
 | ||||
|     async def load_scheme_locis(self): | ||||
|     async def update_scheme_locis(self): | ||||
|         self._loci.clear() | ||||
|         async with self._http_client.get("") as schema_response: | ||||
|         async with self._http_client.get(f"/api/db/{self._database_name}/schemes/{self._schema_id}") as schema_response: | ||||
|             schema_json = await schema_response.json() | ||||
|             for locus in schema_json["loci"]: | ||||
|                 locus_name = path.basename(locus) | ||||
| @@ -152,14 +177,14 @@ class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|         self._loci.sort() | ||||
|      | ||||
|     async def load_scheme_profiles(self): | ||||
|         self._profiles.clear() | ||||
|         self._profiles_st_map.clear() | ||||
|         with open(self.get_scheme_profile_path()) as profile_cache_handle: | ||||
|             reader = csv.DictReader(profile_cache_handle, delimiter="\t") | ||||
|             for line in reader: | ||||
|                 alleles = [] | ||||
|                 for locus in self._loci: | ||||
|                     alleles.append(line[locus]) | ||||
|                 self._profiles[tuple(alleles)] = (line["ST"], line["clonal_complex"]) | ||||
|                 self._profiles_st_map[tuple(alleles)] = (line["ST"], line["clonal_complex"]) | ||||
|              | ||||
|     def get_locus_cache_path(self, locus) -> str: | ||||
|         return path.join(self._cache_path, locus + "." + "fasta") | ||||
| @@ -170,8 +195,8 @@ class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|     async def download_alleles_cache_data(self): | ||||
|         for locus in self._loci: | ||||
|             with open(self.get_locus_cache_path(locus), "wb") as fasta_handle: | ||||
|                 async with self._http_client.get(f"/db/{self._database_name}/loci/{locus}/alleles_fasta") as fasta_response: | ||||
|                     async for chunk, eof in fasta_response.content.iter_chunks(): # TODO maybe allow chunking to be configurable | ||||
|                 async with self._http_client.get(f"/api/db/{self._database_name}/loci/{locus}/alleles_fasta") as fasta_response: | ||||
|                     async for chunk, eof in fasta_response.content.iter_chunks(): | ||||
|                         fasta_handle.write(chunk) | ||||
| 
 | ||||
|     async def download_scheme_profiles(self): | ||||
| @@ -179,34 +204,41 @@ class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|             async with self._http_client.get("profiles_csv") as profiles_response: | ||||
|                 async for chunk, eof in profiles_response.content.iter_chunks(): | ||||
|                     profile_cache_handle.write(chunk) | ||||
|         await self.load_scheme_profiles() | ||||
|      | ||||
|     async def fetch_mlst_allele_variants(self, sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]: | ||||
|     async def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]: | ||||
|         aligner = PairwiseAligner("blastn") | ||||
|         aligner.mode = "local" | ||||
|         for sequence_string in sequence_strings: | ||||
|             for locus in self._loci: | ||||
|                 async for fasta_seq in read_fasta(self.get_locus_cache_path(locus)): | ||||
|                     allele_variant = fasta_seq.name | ||||
|                     alignment_results = aligner.align(sequence_string, fasta_seq.sequence) | ||||
|                     top_alignment = sorted(alignment_results)[0] | ||||
|                     top_alignment_stats = top_alignment.counts() | ||||
|                     top_alignment_gaps = top_alignment_stats.gaps | ||||
|                     top_alignment_identities = top_alignment_stats.identities | ||||
|                     top_alignment_mismatches = top_alignment_stats.mismatches | ||||
|                     if top_alignment_gaps == 0 and top_alignment_mismatches == 0: | ||||
|                         yield Allele(locus, allele_variant, None) | ||||
|         with AsyncPairwiseAlignmentEngine(aligner) as aligner_engine: | ||||
|             for query_sequence_string in query_sequence_strings: | ||||
|                 for locus in self._loci: | ||||
|                     async for allele_variant in read_fasta(self.get_locus_cache_path(locus)): | ||||
|                         aligner_engine.align(allele_variant.sequence, query_sequence_string, variant_name=allele_variant.name, full=True) | ||||
|                         break # start a bunch of full alignments for each variant to select segments | ||||
|             alignment_rankings: dict[str, set[tuple[PairwiseAlignment, str]]] = defaultdict(set) | ||||
|             async for alignment_result, additional_information in aligner_engine: | ||||
|                 result_variant_name = additional_information["variant_name"] | ||||
|                 result_locus, variant_id = result_variant_name.split("_") | ||||
|                 full_alignment = additional_information["full"] | ||||
|                 if full_alignment: | ||||
|                     if alignment_result.alignment_stats.gaps == 0 and alignment_result.alignment_stats.mismatches == 0: | ||||
|                         # I.e., 100% exactly the same | ||||
|                         yield Allele(result_locus, variant_id, None) | ||||
|                         continue | ||||
|                     else: | ||||
|                         yield Allele( | ||||
|                             locus, | ||||
|                             allele_variant, | ||||
|                             PartialAllelicMatchProfile( | ||||
|                                 percent_identity=top_alignment_identities/top_alignment.length, | ||||
|                                 mismatches=top_alignment_mismatches, | ||||
|                                 gaps=top_alignment_gaps | ||||
|                             ) | ||||
|                         ) | ||||
|                         alignment_rankings[result_locus].add((alignment_result, variant_id)) | ||||
|                     interest_sequence = full_alignment[alignment_result.query_indices[0]:alignment_result.query_indices[-1]] | ||||
|                     async for allele_variant in read_fasta(self.get_locus_cache_path(result_locus)): | ||||
|                         if result_variant_name == allele_variant.name: | ||||
|                             continue # Skip if we just finished aligning this | ||||
|                         aligner_engine.align(allele_variant.sequence, interest_sequence, variant_name=result_variant_name.name, full=False) | ||||
|                 else: | ||||
|                     alignment_rankings[result_locus].add((alignment_result, variant_id)) | ||||
|             for final_locus, alignments in alignment_rankings.items(): | ||||
|                 closest_alignment, closest_variant_id = sorted(alignments, key=lambda index: index[0].alignment_stats.score)[0] | ||||
|                 yield Allele(final_locus, closest_variant_id, closest_alignment.alignment_stats) | ||||
| 
 | ||||
|     async def fetch_mlst_st(self, alleles): | ||||
|     async def determine_mlst_st(self, alleles): | ||||
|         allele_variants: dict[str, Allele] = {} | ||||
|         if isinstance(alleles, AsyncIterable): | ||||
|             async for allele in alleles: | ||||
| @@ -218,15 +250,15 @@ class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
|         for locus in self._loci: | ||||
|                ordered_profile.append(allele_variants[locus].allele_variant) | ||||
| 
 | ||||
|         st, clonal_complex = self._profiles[tuple(ordered_profile)] | ||||
|         return MLSTProfile(allele_variants, st, clonal_complex) | ||||
|         st, clonal_complex = self._profiles_st_map[tuple(ordered_profile)] | ||||
|         return MLSTProfile(set(allele_variants.values()), st, clonal_complex) | ||||
| 
 | ||||
|     async def profile_string(self, sequence_strings: Iterable[str]) -> MLSTProfile: | ||||
|         alleles = self.fetch_mlst_allele_variants(sequence_strings) | ||||
|         return await self.fetch_mlst_st(alleles) | ||||
|     async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile: | ||||
|         alleles = self.determine_mlst_allele_variants(query_sequence_strings) | ||||
|         return await self.determine_mlst_st(alleles) | ||||
| 
 | ||||
|     async def profile_multiple_strings(self, named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]: | ||||
|         async for named_strings in named_string_groups: | ||||
|     async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]: | ||||
|         async for named_strings in query_named_string_groups: | ||||
|             for named_string in named_strings: | ||||
|                 try: | ||||
|                     yield NamedMLSTProfile(named_string.name, await self.profile_string([named_string.sequence])) | ||||
| @@ -237,6 +269,8 @@ class LazyPersistentCachedBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): | ||||
| 
 | ||||
|     async def close(self): | ||||
|         await self._http_client.close() | ||||
|         if self._cleanup_required: | ||||
|             shutil.rmtree(self._cache_path) | ||||
| 
 | ||||
|     async def __aexit__(self, exc_type, exc_value, traceback): | ||||
|         await self.close() | ||||
| @@ -290,8 +324,8 @@ class BIGSdbIndex(AbstractAsyncContextManager): | ||||
|             self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions | ||||
|             return self._seqdefdb_schemas[seqdef_db_name] # type: ignore | ||||
| 
 | ||||
|     async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> OnlineBIGSdbMLSTProfiler: | ||||
|         return OnlineBIGSdbMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id) | ||||
|     async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> RemoteBIGSdbMLSTProfiler: | ||||
|         return RemoteBIGSdbMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id) | ||||
| 
 | ||||
|     async def close(self): | ||||
|         await self._http_client.close() | ||||
| @@ -299,3 +333,7 @@ class BIGSdbIndex(AbstractAsyncContextManager): | ||||
|     async def __aexit__(self, exc_type, exc_value, traceback): | ||||
|         await self.close() | ||||
| 
 | ||||
def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, schema_id: int):
    """Build an MLST profiler for the given BIGSdb database and schema.

    Args:
        local: When truthy, build a ``LocalBIGSdbMLSTProfiler``; otherwise a
            ``RemoteBIGSdbMLSTProfiler``.
        database_api: Forwarded to the profiler as ``database_api``.
        database_name: Forwarded to the profiler as ``database_name``.
        schema_id: Forwarded to the profiler as ``schema_id``.

    Returns:
        The newly constructed profiler instance.
    """
    profiler_type = LocalBIGSdbMLSTProfiler if local else RemoteBIGSdbMLSTProfiler
    return profiler_type(
        database_api=database_api,
        database_name=database_name,
        schema_id=schema_id,
    )
| @@ -1,25 +0,0 @@ | ||||
| from dataclasses import dataclass | ||||
| from typing import Mapping, Sequence, Union | ||||
|  | ||||
| @dataclass(frozen=True) | ||||
| class PartialAllelicMatchProfile: | ||||
|     percent_identity: float | ||||
|     mismatches: int | ||||
|     gaps: int | ||||
|  | ||||
| @dataclass(frozen=True) | ||||
| class Allele: | ||||
|     allele_locus: str | ||||
|     allele_variant: str | ||||
|     partial_match_profile: Union[None, PartialAllelicMatchProfile] | ||||
|  | ||||
| @dataclass(frozen=True) | ||||
| class MLSTProfile: | ||||
|     alleles: Mapping[str, Allele] | ||||
|     sequence_type: str | ||||
|     clonal_complex: str | ||||
|  | ||||
| @dataclass(frozen=True) | ||||
| class NamedMLSTProfile: | ||||
|     name: str | ||||
|     mlst_profile: Union[None, MLSTProfile] | ||||
| @@ -1,9 +1,9 @@ | ||||
| import asyncio | ||||
| from io import TextIOWrapper | ||||
| from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union | ||||
| from typing import Any, AsyncGenerator, Iterable, Union | ||||
| from Bio import SeqIO | ||||
| 
 | ||||
| from autobigs.engine.data.structures.genomics import NamedString | ||||
| from autobigs.engine.structures.genomics import NamedString | ||||
| 
 | ||||
| async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]: | ||||
|     fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta") | ||||
							
								
								
									
										17
									
								
								src/autobigs/engine/structures/alignment.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								src/autobigs/engine/structures/alignment.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,17 @@ | ||||
| from dataclasses import dataclass | ||||
| from numbers import Number | ||||
|  | ||||
@dataclass(frozen=True)
class AlignmentStats:
    """Immutable summary statistics for a single sequence alignment."""

    # Identity of the alignment. NOTE(review): the local aligner stores
    # identities / alignment length, while the remote profiler stores the
    # API's "identity" value — confirm both use the same scale.
    percent_identity: float
    # Number of mismatched positions.
    mismatches: int
    # Number of gaps.
    gaps: int
    # Alignment score.
    score: int
|  | ||||
@dataclass(frozen=True)
class PairwiseAlignment:
    """Immutable record of one reference/query alignment and its statistics."""

    # The reference sequence of the alignment.
    reference: str
    # The query sequence of the alignment.
    query: str
    # Index row of the alignment belonging to the reference sequence.
    reference_indices: list[Number]
    # Index row of the alignment belonging to the query sequence.
    query_indices: list[Number]
    # Identity, mismatch, gap and score summary of the alignment.
    alignment_stats: AlignmentStats
							
								
								
									
										33
									
								
								src/autobigs/engine/structures/mlst.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										33
									
								
								src/autobigs/engine/structures/mlst.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,33 @@ | ||||
| from collections import defaultdict | ||||
| from dataclasses import dataclass | ||||
| from typing import Collection, Iterable, Mapping, Sequence, Union | ||||
|  | ||||
| from autobigs.engine.structures.alignment import AlignmentStats | ||||
|  | ||||
@dataclass(frozen=True)
class Allele:
    """Immutable allele call: a locus, the variant found there, and optional match statistics."""

    # Name of the locus the allele belongs to.
    allele_locus: str
    # Identifier of the variant at that locus.
    allele_variant: str
    # Alignment statistics describing the match, or None when none are attached.
    partial_match_profile: Union[None, AlignmentStats]
|  | ||||
@dataclass(frozen=True)
class MLSTProfile:
    """Immutable MLST result: the alleles found plus the scheme's classification."""

    # The alleles that make up this profile.
    alleles: Collection[Allele]
    # Sequence type (ST) assigned to the profile.
    sequence_type: str
    # Clonal complex assigned to the profile.
    clonal_complex: str
|  | ||||
@dataclass(frozen=True)
class NamedMLSTProfile:
    """Immutable pairing of a name with its MLST profile."""

    # Name identifying what was profiled.
    name: str
    # The resulting profile, or None when no profile is available.
    mlst_profile: Union[None, MLSTProfile]
|  | ||||
|  | ||||
def alleles_to_mapping(alleles: Iterable[Allele]):
    """Group allele variants by locus.

    Args:
        alleles: Alleles to group; each contributes its ``allele_variant``
            under its ``allele_locus``.

    Returns:
        A plain ``dict`` keyed by locus. A locus seen exactly once maps to
        that single variant; a locus seen several times maps to the list of
        its variants in encounter order.
    """
    variants_by_locus = defaultdict(list)
    for allele in alleles:
        variants_by_locus[allele.allele_locus].append(allele.allele_variant)
    # Collapse single-element lists to the bare variant.
    return {
        locus: variants[0] if len(variants) == 1 else variants
        for locus, variants in variants_by_locus.items()
    }
| @@ -2,19 +2,13 @@ import csv | ||||
| from os import PathLike | ||||
| from typing import AsyncIterable, Mapping, Sequence, Union | ||||
| 
 | ||||
| from autobigs.engine.data.structures.mlst import Allele, MLSTProfile | ||||
| from autobigs.engine.structures.mlst import Allele, MLSTProfile | ||||
| 
 | ||||
| 
 | ||||
| def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]): | ||||
| def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Allele]): | ||||
|     result_dict: dict[str, Union[list[str], str]] = {} | ||||
|     for loci, alleles in alleles_map.items(): | ||||
|         if len(alleles) == 1: | ||||
|             result_dict[loci] = alleles[0].allele_variant | ||||
|         else: | ||||
|             result_locis = list() | ||||
|             for allele in alleles: | ||||
|                 result_locis.append(allele.allele_variant) | ||||
|                 result_dict[loci] = result_locis | ||||
|         result_dict[loci] = alleles.allele_variant | ||||
|     return result_dict | ||||
| 
 | ||||
| 
 | ||||
							
								
								
									
										27
									
								
								tests/autobigs/engine/analysis/test_aligners.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								tests/autobigs/engine/analysis/test_aligners.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | ||||
| from Bio import SeqIO | ||||
| from Bio.Align import PairwiseAligner | ||||
| from pytest import mark | ||||
| from pytest import fixture | ||||
| from autobigs.engine.analysis.aligners import AsyncPairwiseAlignmentEngine | ||||
| from autobigs.engine.structures.alignment import PairwiseAlignment | ||||
|  | ||||
@fixture
def tohamaI_bpertussis_adk():
    """Return the sequence stored in the Tohama I adk FASTA test resource as a string."""
    record = SeqIO.read("tests/resources/tohama_I_bpertussis_adk.fasta", format="fasta")
    return str(record.seq)
|  | ||||
@fixture
def tohamaI_bpertussis_genome():
    """Return the sequence stored in the Tohama I FASTA test resource as a string."""
    record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", format="fasta")
    return str(record.seq)
|  | ||||
@fixture(params=[1, 2])
def dummy_engine(request):
    """Yield a running alignment engine, parametrized over 1 and 2 worker threads.

    The engine wraps a "blastn" aligner switched to local mode and is shut
    down when the ``with`` block exits after the test.
    """
    local_blastn_aligner = PairwiseAligner("blastn")
    local_blastn_aligner.mode = "local"
    with AsyncPairwiseAlignmentEngine(local_blastn_aligner, request.param) as running_engine:
        yield running_engine
|  | ||||
class TestAsyncPairwiseAlignmentEngine:
    """Smoke tests for submitting work to the asynchronous alignment engine."""

    async def test_single_alignment_no_errors(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk: str, dummy_engine: AsyncPairwiseAlignmentEngine):
        """Submit one alignment and check that a typed result is produced for it."""
        dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk)
        results_seen = 0
        async for alignment, additional_information in dummy_engine:
            assert isinstance(alignment, PairwiseAlignment)
            results_seen += 1
        # Without this check the test would pass vacuously if the engine
        # never yielded the submitted alignment.
        assert results_seen > 0, "engine yielded no results for the submitted alignment"
							
								
								
									
										210
									
								
								tests/autobigs/engine/analysis/test_bigsdb.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										210
									
								
								tests/autobigs/engine/analysis/test_bigsdb.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,210 @@ | ||||
| from os import path | ||||
| import random | ||||
| import re | ||||
| from typing import Callable, Collection, Sequence, Union | ||||
| from Bio import SeqIO | ||||
| import pytest | ||||
| from autobigs.engine.analysis import bigsdb | ||||
| from autobigs.engine.structures import mlst | ||||
| from autobigs.engine.structures.genomics import NamedString | ||||
| from autobigs.engine.structures.mlst import Allele, MLSTProfile | ||||
| from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException | ||||
| from autobigs.engine.analysis.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler, LocalBIGSdbMLSTProfiler, RemoteBIGSdbMLSTProfiler | ||||
|  | ||||
async def generate_async_iterable(normal_iterable):
    """Expose an ordinary iterable as an async generator, preserving item order."""
    for item in normal_iterable:
        yield item
|  | ||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ("A", "T", "C", "G")):
    """Return a deterministically mutated copy of ``gene``.

    The random generator is seeded with ``gene`` itself, so the same input
    always produces the same output.

    Args:
        gene: The sequence to mutate.
        mutation_site_count: Number of positions to overwrite. A ``float`` is
            interpreted as a fraction of ``len(gene)`` (truncated to an int).
        alphabet: Symbols to draw replacements from.

    Returns:
        A string of the same length as ``gene``. Positions are drawn with
        replacement and a replacement may equal the original symbol, so the
        number of characters that actually differ can be lower than requested.
    """
    rand = random.Random(gene)
    if isinstance(mutation_site_count, float):
        mutation_site_count = int(mutation_site_count * len(gene))
    # Nothing to mutate: avoids drawing positions from an empty population.
    if not gene or mutation_site_count <= 0:
        return gene
    random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
    scrambled = list(gene)
    for random_location in random_locations:
        scrambled[random_location] = rand.choice(alphabet)
    return "".join(scrambled)
|  | ||||
def get_first_sequence_from_fasta(resource: str):
    """Read the FASTA file ``resource`` under tests/resources/ and return its sequence as a string."""
    fasta_path = path.join("tests/resources/", resource)
    record = SeqIO.read(fasta_path, "fasta")
    return str(record.seq)
|  | ||||
def get_multiple_sequences_from_fasta(resource: str):
    """Parse the FASTA file ``resource`` under tests/resources/ and return all records as a tuple."""
    fasta_path = path.join("tests/resources/", resource)
    records = SeqIO.parse(fasta_path, "fasta")
    return tuple(records)
|  | ||||
# Expected MLST profile for the B. pertussis Tohama I genome resource.
bpertussis_tohamaI_profile = MLSTProfile((
        Allele("adk", "1", None),
        Allele("fumC", "1", None),
        Allele("glyA", "1", None),
        Allele("tyrB", "1", None),
        Allele("icd", "1", None),
        Allele("pepA", "1", None),
        Allele("pgm", "1", None)), "1", "ST-2 complex")

# Allele combination used by the tests that expect "unknown" ST / clonal complex.
bpertussis_tohamaI_bad_profile = MLSTProfile((
        Allele("adk", "1", None),
        Allele("fumC", "2", None),
        Allele("glyA", "36", None),
        Allele("tyrB", "4", None),
        Allele("icd", "4", None),
        Allele("pepA", "1", None),
        Allele("pgm", "5", None),
    ), "unknown", "unknown")

# Expected MLST profile for the H. influenzae FDAARGOS 1560 genome resource.
hinfluenzae_fdaargos_profile = MLSTProfile((
        Allele("adk", "1", None),
        Allele("atpG", "1", None),
        Allele("frdB", "1", None),
        Allele("fucK", "1", None),
        Allele("mdh", "1", None),
        Allele("pgi", "1", None),
        Allele("recA", "5", None)
    ), "3", "ST-3 complex")

# NOTE(review): this "bad" profile is identical to hinfluenzae_fdaargos_profile
# above; it presumably should contain a non-matching allele combination with
# "unknown" ST / clonal complex like the B. pertussis one -- confirm intended values.
hinfluenzae_fdaargos_bad_profile = MLSTProfile((
        Allele("adk", "1", None),
        Allele("atpG", "1", None),
        Allele("frdB", "1", None),
        Allele("fucK", "1", None),
        Allele("mdh", "1", None),
        Allele("pgi", "1", None),
        Allele("recA", "5", None)
    ), "3", "ST-3 complex")

hinfluenzae_fdaargos_sequence = str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", "fasta").seq)

# Load the H. influenzae feature records; this previously pointed at the
# B. pertussis feature file, which does not match the variable's organism.
hinfluenzae_fdaargos_fragmented_sequence = tuple(SeqIO.parse("tests/resources/fdaargos_1560_hinfluenza_features.fasta", "fasta"))
|  | ||||
@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
    (False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
    (True, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
])
class TestBIGSdbMLSTProfiler:
    """Profiler tests, run once per parametrized (local_db, database, schema) combination."""

    async def test_profiling_results_in_exact_matches_when_exact(
        self, local_db, database_api, database_name, schema_id,
        seq_path: str, feature_seqs_path: str,
        expected_profile: MLSTProfile, bad_profile: MLSTProfile,
    ):
        """Each allele found in the full genome must match the expected profile, covering every locus."""
        genome = get_first_sequence_from_fasta(seq_path)
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            expected_alleles = mlst.alleles_to_mapping(expected_profile.alleles)
            unseen_loci = set(mlst.alleles_to_mapping(expected_profile.alleles).keys())
            found_alleles = dummy_profiler.determine_mlst_allele_variants(query_sequence_strings=[genome])
            async for found in found_alleles:
                assert isinstance(found, Allele)
                locus = found.allele_locus
                assert locus in expected_alleles
                assert found.allele_variant == expected_alleles[locus]
                unseen_loci.remove(locus)

            assert len(unseen_loci) == 0

    async def test_sequence_profiling_non_exact_returns_non_exact(
        self, local_db, database_api, database_name, schema_id,
        seq_path: str, feature_seqs_path: str,
        expected_profile: MLSTProfile, bad_profile: MLSTProfile,
    ):
        """Scrambled copies of the MLST genes must be reported as partial matches."""
        feature_records = get_multiple_sequences_from_fasta(feature_seqs_path)
        expected_mapping = mlst.alleles_to_mapping(expected_profile.alleles)
        pending_genes = {locus.lower() for locus in expected_mapping.keys()}
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as profiler:
            for record in feature_records:
                gene_tag = re.fullmatch(r".*\[gene=([\w\d]+)\].*", record.description)
                # Skip records without a gene tag or whose gene is not (or no longer) pending.
                if gene_tag is None:
                    continue
                gene_key = gene_tag.group(1).lower()
                if gene_key not in pending_genes:
                    continue
                mutated = gene_scrambler(str(record.seq), 0.125)
                async for partial_match in profiler.determine_mlst_allele_variants([mutated]):
                    assert partial_match.partial_match_profile is not None
                    pending_genes.remove(gene_key)

            assert len(pending_genes) == 0

    async def test_profiling_results_in_correct_mlst_st(
        self, local_db, database_api, database_name, schema_id,
        seq_path: str, feature_seqs_path: str,
        expected_profile: MLSTProfile, bad_profile: MLSTProfile,
    ):
        """Resolving the expected alleles must give the expected ST and clonal complex."""
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            resolved = await dummy_profiler.determine_mlst_st(expected_profile.alleles)
            assert resolved is not None
            assert isinstance(resolved, MLSTProfile)
            assert resolved.clonal_complex == expected_profile.clonal_complex
            assert resolved.sequence_type == expected_profile.sequence_type

    async def test_profiling_non_exact_results_in_list_of_mlsts(
        self, local_db, database_api, database_name, schema_id,
        seq_path: str, feature_seqs_path: str,
        expected_profile: MLSTProfile, bad_profile: MLSTProfile,
    ):
        """Resolving the bad allele combination must give "unknown" for ST and clonal complex."""
        unmatched_alleles = bad_profile.alleles
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            resolved = await dummy_profiler.determine_mlst_st(unmatched_alleles)
            assert resolved.clonal_complex == "unknown"
            assert resolved.sequence_type == "unknown"

    async def test_bigsdb_profile_multiple_strings_same_string_twice(
        self, local_db, database_api, database_name, schema_id,
        seq_path: str, feature_seqs_path: str,
        expected_profile: MLSTProfile, bad_profile: MLSTProfile,
    ):
        """Profiling the same genome under two names must give the expected profile each time."""
        genome = get_first_sequence_from_fasta(seq_path)
        named_batches = [[NamedString("seq1", genome)], [NamedString("seq2", genome)]]
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            batches = generate_async_iterable(named_batches)
            async for named_profile in dummy_profiler.profile_multiple_strings(batches):
                name = named_profile.name
                profile = named_profile.mlst_profile
                assert profile is not None
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == expected_profile.clonal_complex
                assert profile.sequence_type == expected_profile.sequence_type

    async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(
        self, local_db, database_api, database_name, schema_id,
        seq_path: str, feature_seqs_path: str,
        expected_profile: MLSTProfile, bad_profile: MLSTProfile,
    ):
        """With the second positional argument True, a scrambled middle genome yields an "unknown" profile."""
        valid_seq = get_first_sequence_from_fasta(seq_path)
        named_batches = [
            [NamedString("seq1", valid_seq)],
            [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))],
            [NamedString("seq3", valid_seq)],
        ]
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            batches = generate_async_iterable(named_batches)
            async for name_profile in dummy_profiler.profile_multiple_strings(batches, True):
                name = name_profile.name
                profile = name_profile.mlst_profile

                assert profile is not None
                if name == "should_fail":
                    assert profile.clonal_complex == "unknown"
                    assert profile.sequence_type == "unknown"
                    assert len(profile.alleles) > 0
                    continue
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == expected_profile.clonal_complex
                assert profile.sequence_type == expected_profile.sequence_type

    async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(
        self, local_db, database_api, database_name, schema_id,
        seq_path: str, feature_seqs_path: str,
        expected_profile: MLSTProfile, bad_profile: MLSTProfile,
    ):
        """With the second positional argument False, a scrambled middle genome yields an "unknown" profile."""
        valid_seq = get_first_sequence_from_fasta(seq_path)
        named_batches = [
            [NamedString("seq1", valid_seq)],
            [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))],
            [NamedString("seq3", valid_seq)],
        ]

        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            batches = generate_async_iterable(named_batches)
            async for named_profile in dummy_profiler.profile_multiple_strings(batches, False):
                name = named_profile.name
                profile = named_profile.mlst_profile

                assert profile is not None
                if name == "should_fail":
                    assert profile.clonal_complex == "unknown"
                    assert profile.sequence_type == "unknown"
                    assert len(profile.alleles) > 0
                    continue
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == expected_profile.clonal_complex
                assert profile.sequence_type == expected_profile.sequence_type
|  | ||||
class TestBIGSdbIndex:
    """Tests for database/schema lookups and profiler construction through BIGSdbIndex."""

    async def test_bigsdb_index_all_databases_is_not_empty(self):
        """The index must know at least one seqdef database."""
        async with BIGSdbIndex() as bigsdb_index:
            known_dbs = await bigsdb_index.get_known_seqdef_dbs()
            assert len(known_dbs) > 0

    async def test_bigsdb_index_references_pubmlst_correctly(self):
        """The H. influenzae seqdef database must resolve to the PubMLST REST API."""
        async with BIGSdbIndex() as bigsdb_index:
            api_url = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")
            assert api_url == "https://rest.pubmlst.org"

    async def test_bigsdb_index_references_institutpasteur_correctly(self):
        """The Bordetella seqdef database must resolve to the Institut Pasteur API."""
        async with BIGSdbIndex() as bigsdb_index:
            api_url = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")
            assert api_url == "https://bigsdb.pasteur.fr/api"

    async def test_bigsdb_index_get_schemas_for_bordetella(self):
        """The Bordetella database must expose an integer-valued "MLST" schema entry."""
        async with BIGSdbIndex() as index:
            schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
            assert len(schemas.keys()) > 0
            assert "MLST" in schemas
            mlst_schema_id = schemas["MLST"]
            assert isinstance(mlst_schema_id, int)

    async def test_bigsdb_index_get_databases_has_only_seqdef(self):
        """Every known database name must end in "seqdef", and Bordetella must map to Pasteur."""
        async with BIGSdbIndex() as index:
            databases = await index.get_known_seqdef_dbs()
            assert len(databases.keys()) > 0
            for db_name in databases.keys():
                assert db_name.endswith("seqdef")
            assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"

    async def test_bigsdb_index_instantiates_correct_profiler(self):
        """A profiler built from the index must profile Tohama I as ST 1 / ST-2 complex."""
        genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
        sequence = str(genome_record.seq)
        async with BIGSdbIndex() as bigsdb_index:
            async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
                profile = await profiler.profile_string(sequence)
                assert profile.clonal_complex == "ST-2 complex"
                assert profile.sequence_type == "1"
| @@ -1,21 +0,0 @@ | ||||
| from autobigs.engine.data.local.csv import dict_loci_alleles_variants_from_loci | ||||
| from autobigs.engine.data.structures.mlst import Allele | ||||
|  | ||||
|  | ||||
def test_dict_loci_alleles_variants_from_loci_single_loci_not_list():
    """A locus with exactly one allele must be flattened to its variant string."""
    single_allele_map = {"adk": [Allele("adk", "1", None)]}
    flattened = dict_loci_alleles_variants_from_loci(single_allele_map)
    for variant in flattened.values():
        assert isinstance(variant, str)
        assert variant == "1"
|  | ||||
def test_dict_loci_alleles_variants_from_loci_multi_loci_is_list():
    """A locus with two alleles must be reported as a two-element list."""
    multi_allele_map = {"adk": [Allele("adk", "1", None), Allele("adk", "2", None)]}
    flattened = dict_loci_alleles_variants_from_loci(multi_allele_map)
    for variant in flattened.values():
        assert isinstance(variant, list)
        assert len(variant) == 2
| @@ -1,249 +0,0 @@ | ||||
| import random | ||||
| import re | ||||
| from typing import Collection, Sequence, Union | ||||
| from Bio import SeqIO | ||||
| import pytest | ||||
| from autobigs.engine.data.structures.genomics import NamedString | ||||
| from autobigs.engine.data.structures.mlst import Allele, MLSTProfile | ||||
| from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException | ||||
| from autobigs.engine.data.remote.databases.bigsdb import BIGSdbIndex, OnlineBIGSdbMLSTProfiler | ||||
|  | ||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ("A", "T", "C", "G")):
    """Return a deterministically mutated copy of ``gene``.

    The random generator is seeded with ``gene``, so identical input always
    gives identical output. A ``float`` ``mutation_site_count`` is treated as
    a fraction of ``len(gene)``; an ``int`` is the number of positions drawn.
    Positions are drawn with replacement and a replacement symbol may equal
    the original, so fewer characters than requested may actually change.
    """
    rand = random.Random(gene)
    if isinstance(mutation_site_count, float):
        mutation_site_count = int(mutation_site_count * len(gene))
    # Guard: drawing positions from an empty gene would raise.
    if not gene or mutation_site_count <= 0:
        return gene
    random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
    scrambled = list(gene)
    for random_location in random_locations:
        scrambled[random_location] = rand.choice(alphabet)
    return "".join(scrambled)
|  | ||||
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
    """Each allele found in the Tohama I genome must be variant '1', covering all seven loci."""
    genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    genome = str(genome_record.seq)
    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        unseen_loci = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
        found_alleles = dummy_profiler.fetch_mlst_allele_variants(sequence_strings=[genome])
        async for found in found_alleles:
            assert isinstance(found, Allele)
            # Every locus in this test is expected to resolve to variant 1.
            assert found.allele_variant == '1'
            unseen_loci.remove(found.allele_locus)

        assert len(unseen_loci) == 0
|  | ||||
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
    """Scrambled copies of the seven MLST genes must be reported as partial matches."""
    coding_records = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta"))
    pending_genes = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
        for record in coding_records:
            gene_tag = re.fullmatch(r".*\[gene=([\w\d]+)\].*", record.description)
            if gene_tag is None:
                continue
            gene_key = gene_tag.group(1).lower()
            if gene_key not in pending_genes:
                continue
            mutated = gene_scrambler(str(record.seq), 0.125)
            # NOTE(review): a bare string is passed here, whereas the exact-match
            # test passes a list of strings -- confirm which form the API expects.
            async for partial_match in profiler.fetch_mlst_allele_variants(mutated):
                assert partial_match.partial_match_profile is not None
                pending_genes.remove(gene_key)

        assert len(pending_genes) == 0
|  | ||||
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
    """Seven variant-'1' alleles, fed as an async stream, must resolve to ST 1 / ST-2 complex."""
    async def dummy_allele_generator():
        loci = ("adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm")
        for allele in [Allele(locus, "1", None) for locus in loci]:
            yield allele

    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        resolved = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
        assert resolved is not None
        assert isinstance(resolved, MLSTProfile)
        assert resolved.clonal_complex == "ST-2 complex"
        assert resolved.sequence_type == "1"
|  | ||||
async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
    """This allele combination must resolve to "unknown" for ST and clonal complex."""
    locus_variants = (
        ("adk", "1"),
        ("fumC", "2"),
        ("glyA", "36"),
        ("tyrB", "4"),
        ("icd", "4"),
        ("pepA", "1"),
        ("pgm", "5"),
    )
    unmatched_alleles = [Allele(locus, variant, None) for locus, variant in locus_variants]
    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        resolved = await dummy_profiler.fetch_mlst_st(unmatched_alleles)
        assert resolved.clonal_complex == "unknown"
        assert resolved.sequence_type == "unknown"
|  | ||||
|  | ||||
async def test_institutpasteur_sequence_profiling_is_correct():
    """Profiling the Tohama I genome must give ST 1 / ST-2 complex."""
    genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    genome = str(genome_record.seq)
    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        resolved = await dummy_profiler.profile_string(genome)
        assert resolved is not None
        assert isinstance(resolved, MLSTProfile)
        assert resolved.clonal_complex == "ST-2 complex"
        assert resolved.sequence_type == "1"
|      | ||||
|  | ||||
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
    """Alleles found in the FDAARGOS 1560 genome must be exactly the expected set."""
    locus_variants = (
        ("adk", "1"),
        ("atpG", "1"),
        ("frdB", "1"),
        ("fucK", "1"),
        ("mdh", "1"),
        ("pgi", "1"),
        ("recA", "5"),
    )
    remaining_alleles = {Allele(locus, variant, None) for locus, variant in locus_variants}
    genome = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with OnlineBIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        # NOTE(review): a bare string is passed as sequence_strings here, while
        # the Institut Pasteur exact-match test passes a list -- confirm intent.
        async for found in dummy_profiler.fetch_mlst_allele_variants(sequence_strings=genome):
            assert isinstance(found, Allele)
            remaining_alleles.remove(found)

        assert len(remaining_alleles) == 0
|  | ||||
async def test_pubmlst_profiling_results_in_correct_st():
    """The expected H. influenzae alleles, fed as an async stream, must resolve to ST 3 / ST-3 complex."""
    async def generate_dummy_targets():
        locus_variants = (
            ("adk", "1"),
            ("atpG", "1"),
            ("frdB", "1"),
            ("fucK", "1"),
            ("mdh", "1"),
            ("pgi", "1"),
            ("recA", "5"),
        )
        for allele in [Allele(locus, variant, None) for locus, variant in locus_variants]:
            yield allele

    async with OnlineBIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        resolved = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
        assert resolved is not None
        assert isinstance(resolved, MLSTProfile)
        assert resolved.clonal_complex == "ST-3 complex"
        assert resolved.sequence_type == "3"
|  | ||||
async def test_pubmlst_sequence_profiling_is_correct():
    """Profiling the FDAARGOS 1560 genome must give ST 3 / ST-3 complex."""
    genome_record = SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta")
    genome = str(genome_record.seq)
    async with OnlineBIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
        resolved = await dummy_profiler.profile_string(genome)
        assert resolved is not None
        assert isinstance(resolved, MLSTProfile)
        assert resolved.clonal_complex == "ST-3 complex"
        assert resolved.sequence_type == "3"
|  | ||||
async def test_bigsdb_index_all_databases_is_not_empty():
    """The index must know at least one seqdef database."""
    async with BIGSdbIndex() as bigsdb_index:
        known_dbs = await bigsdb_index.get_known_seqdef_dbs()
        assert len(known_dbs) > 0
|  | ||||
async def test_bigsdb_index_references_pubmlst_correctly():
    """The H. influenzae seqdef database must resolve to the PubMLST REST API."""
    async with BIGSdbIndex() as bigsdb_index:
        api_url = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")
        assert api_url == "https://rest.pubmlst.org"
|  | ||||
async def test_bigsdb_index_references_institutpasteur_correctly():
    """The Bordetella seqdef database must resolve to the Institut Pasteur API."""
    async with BIGSdbIndex() as bigsdb_index:
        api_url = await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")
        assert api_url == "https://bigsdb.pasteur.fr/api"
|  | ||||
|  | ||||
async def test_bigsdb_index_instantiates_correct_profiler():
    """A profiler built from the index must profile Tohama I as ST 1 / ST-2 complex."""
    genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    genome = str(genome_record.seq)
    async with BIGSdbIndex() as bigsdb_index:
        async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
            resolved = await profiler.profile_string(genome)
            assert resolved.clonal_complex == "ST-2 complex"
            assert resolved.sequence_type == "1"
|  | ||||
async def test_bigsdb_profile_multiple_strings_same_string_twice():
    """Profiling the same genome under two names must give ST 1 / ST-2 complex each time."""
    genome = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    named_genomes = [NamedString("seq1", genome), NamedString("seq2", genome)]

    async def generate_async_iterable_sequences():
        # Each named sequence is delivered as its own single-element batch.
        for named_genome in named_genomes:
            yield [named_genome]

    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences()):
            name = named_profile.name
            profile = named_profile.mlst_profile
            assert profile is not None
            assert isinstance(profile, MLSTProfile)
            assert profile.clonal_complex == "ST-2 complex"
            assert profile.sequence_type == "1"
|  | ||||
async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop():
    """With the second positional argument True, the scrambled entry yields no profile and the rest still resolve."""
    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    named_genomes = [
        NamedString("seq1", valid_seq),
        NamedString("should_fail", gene_scrambler(valid_seq, 0.3)),
        NamedString("seq3", valid_seq),
    ]

    async def generate_async_iterable_sequences():
        # Each named sequence is delivered as its own single-element batch.
        for named_genome in named_genomes:
            yield [named_genome]

    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), True):
            name = name_profile.name
            profile = name_profile.mlst_profile

            if name == "should_fail":
                assert profile is None
                continue
            assert profile is not None
            assert isinstance(profile, MLSTProfile)
            assert profile.clonal_complex == "ST-2 complex"
            assert profile.sequence_type == "1"
|  | ||||
async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop():
    """With the second positional argument False, the scrambled entry yields an "unknown" profile with alleles."""
    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    named_genomes = [
        NamedString("seq1", valid_seq),
        NamedString("should_fail", gene_scrambler(valid_seq, 0.3)),
        NamedString("seq3", valid_seq),
    ]

    async def generate_async_iterable_sequences():
        # Each named sequence is delivered as its own single-element batch.
        for named_genome in named_genomes:
            yield [named_genome]

    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), False):
            name = named_profile.name
            profile = named_profile.mlst_profile
            if name == "should_fail":
                assert profile is not None
                assert profile.clonal_complex == "unknown"
                assert profile.sequence_type == "unknown"
                assert len(profile.alleles) > 0
                continue
            assert profile is not None
            assert isinstance(profile, MLSTProfile)
            assert profile.clonal_complex == "ST-2 complex"
            assert profile.sequence_type == "1"
|  | ||||
async def test_bigsdb_profile_multiple_strings_fail_second_stop():
    """With stop_on_fail=True, a genome from another organism must raise NoBIGSdbMatchesException."""
    valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    invalid_seq = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    named_genomes = [
        NamedString("seq1", valid_seq),
        NamedString("should_fail", invalid_seq),
        NamedString("seq3", valid_seq),
    ]

    async def generate_async_iterable_sequences():
        # Each named sequence is delivered as its own single-element batch.
        for named_genome in named_genomes:
            yield [named_genome]

    async with OnlineBIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
        with pytest.raises(NoBIGSdbMatchesException):
            async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), stop_on_fail=True):
                name = named_profile.name
                profile = named_profile.mlst_profile
                if name == "should_fail":
                    # Receiving a result for the failing entry means no exception was raised.
                    pytest.fail("Exception should have been thrown, no exception was thrown.")
                else:
                    assert profile is not None
                    assert isinstance(profile, MLSTProfile)
                    assert profile.clonal_complex == "ST-2 complex"
                    assert profile.sequence_type == "1"
|  | ||||
async def test_bigsdb_index_get_schemas_for_bordetella():
    """The Bordetella database must expose an integer-valued "MLST" schema entry."""
    async with BIGSdbIndex() as index:
        schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
        assert len(schemas.keys()) > 0
        assert "MLST" in schemas
        mlst_schema_id = schemas["MLST"]
        assert isinstance(mlst_schema_id, int)
|  | ||||
| async def test_bigsdb_index_get_databases_has_only_seqdef(): | ||||
|     async with BIGSdbIndex() as index: | ||||
|         databases = await index.get_known_seqdef_dbs() | ||||
|         assert len(databases.keys()) > 0 | ||||
|         for database_name in databases.keys(): | ||||
|             assert database_name.endswith("seqdef") | ||||
|         assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api" | ||||
| @@ -1,4 +1,4 @@ | ||||
| from autobigs.engine.data.local.fasta import read_fasta | ||||
| from autobigs.engine.reading import read_fasta | ||||
| 
 | ||||
| 
 | ||||
| async def test_fasta_reader_not_none(): | ||||
							
								
								
									
										27246
									
								
								tests/resources/fdaargos_1560_hinfluenza_features.fasta
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27246
									
								
								tests/resources/fdaargos_1560_hinfluenza_features.fasta
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										11
									
								
								tests/resources/tohama_I_bpertussis_adk.fasta
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								tests/resources/tohama_I_bpertussis_adk.fasta
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,11 @@ | ||||
| >lcl|BX640419.1_cds_CAE43044.1_2724 [gene=adK] [locus_tag=BP2769] [db_xref=GOA:P0DKX8,InterPro:IPR000850,InterPro:IPR006259,InterPro:IPR007862,InterPro:IPR027417] [protein=adenylate kinase] [protein_id=CAE43044.1] [location=164032..164688] [gbkey=CDS] | ||||
| ATGCGTCTCATTCTGCTCGGACCGCCCGGAGCCGGCAAAGGCACCCAAGCCGCCTTTCTCACCCAACACT | ||||
| ACGGCATCCCGCAGATATCCACCGGTGACATGCTGCGCGCCGCCGTCAAGGCCGGCACGCCGCTGGGCCT | ||||
| GGAAGCCAAGAAGGTCATGGACGCGGGCGGCCTGGTCTCGGACGACCTGATCATCGGCCTGGTGCGCGAT | ||||
| CGCCTGACCCAGCCCGATTGCGCCAACGGCTACCTGTTCGACGGTTTCCCGCGCACCATCCCGCAGGCCG | ||||
| ACGCGCTCAAGAGCGCCGGCATCGCGCTGGATTACGTGGTCGAGATCGAAGTGCCGGAAAGCGACATCAT | ||||
| CGAACGCATGAGCGAACGCCGCGTGCACCCGGCCAGCGGCCGCAGCTACCACGTACGCTTCAATCCGCCC | ||||
| AAGGCCGAAGGCGTGGACGACGTCACGGGCGAACCGCTGGTGCAGCGCGACGACGACCGCGAGGAAACCG | ||||
| TGCGCCATCGTCTCAACGTCTACCAGAACCAGACCCGCCCGCTGGTCGACTACTACTCGTCCTGGGCCCA | ||||
| GTCCGATGCCGCCGCGGCGCCCAAGTACCGCAAGATCTCCGGCGTCGGCTCGGTCGACGAAATCAAGAGC | ||||
| CGCCTGTCGCAGGCTCTGCAGAGCTAA | ||||
		Reference in New Issue
	
	Block a user