Compare commits
	
		
			26 Commits
		
	
	
		
			0.7.0
			...
			features/l
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 897f7ee922 | |||
| bfc286e6b0 | |||
| a88225fcff | |||
| c18d817cd9 | |||
| f462e6d5e0 | |||
| e568e9fb2c | |||
| 4b9eb8674d | |||
| f75707e4fe | |||
| b4845fab34 | |||
| fe999f1cab | |||
| 85946eb110 | |||
| a27e09da31 | |||
| ba2b688e89 | |||
| 49f31b7943 | |||
| 1c6e1cfb35 | |||
| fb99526162 | |||
| ff8a1aff08 | |||
| 341ca933a3 | |||
| 3e3898334f | |||
| ba1f0aa318 | |||
| 6d0157581f | |||
| 4bcbfa0c6a | |||
| ca0f9673b0 | |||
| 5048fa8057 | |||
| 39125c848e | |||
| 744a6c2009 | 
@@ -8,15 +8,16 @@ dynamic = ["version"]
 | 
			
		||||
readme = "README.md"
 | 
			
		||||
 | 
			
		||||
dependencies = [
 | 
			
		||||
    "biopython",
 | 
			
		||||
    "aiohttp[speedups]",
 | 
			
		||||
    "biopython==1.85",
 | 
			
		||||
    "aiohttp[speedups]==3.11.*",
 | 
			
		||||
]
 | 
			
		||||
requires-python = ">=3.11"
 | 
			
		||||
requires-python = ">=3.12"
 | 
			
		||||
description = "A library to rapidly fetch MLST profiles given sequences for various diseases."
 | 
			
		||||
 | 
			
		||||
[project.urls]
 | 
			
		||||
Repository = "https://github.com/RealYHD/autobigs.engine.git"
 | 
			
		||||
Issues = "https://github.com/RealYHD/autobigs.engine/issues"
 | 
			
		||||
Homepage = "https://github.com/RealYHD/autoBIGS.engine"
 | 
			
		||||
Source = "https://github.com/RealYHD/autoBIGS.engine"
 | 
			
		||||
Issues = "https://github.com/RealYHD/autoBIGS.engine/issues"
 | 
			
		||||
 | 
			
		||||
[tool.setuptools_scm]
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,5 @@
 | 
			
		||||
aiohttp[speedups]
 | 
			
		||||
biopython
 | 
			
		||||
aiohttp[speedups]==3.11.*
 | 
			
		||||
biopython==1.85
 | 
			
		||||
pytest
 | 
			
		||||
pytest-asyncio
 | 
			
		||||
build
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										70
									
								
								src/autobigs/engine/analysis/aligners.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										70
									
								
								src/autobigs/engine/analysis/aligners.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,70 @@
 | 
			
		||||
import asyncio
 | 
			
		||||
from concurrent.futures import Future, ThreadPoolExecutor
 | 
			
		||||
from contextlib import AbstractContextManager
 | 
			
		||||
from typing import Any, Set, Union
 | 
			
		||||
from Bio.Align import PairwiseAligner
 | 
			
		||||
from queue import Queue
 | 
			
		||||
 | 
			
		||||
from autobigs.engine.structures.alignment import AlignmentStats, PairwiseAlignment
 | 
			
		||||
 | 
			
		||||
class AsyncBiopythonPairwiseAlignmentEngine(AbstractContextManager):
    """Run Biopython pairwise alignments on a thread pool and consume them from asyncio.

    The thread pool is created in ``__enter__`` and shut down in ``__exit__``,
    so the engine must be used as a context manager (``with`` block).
    Alignments are queued with :meth:`align`; finished results are retrieved
    with :meth:`next_completed` or by iterating the engine with ``async for``.
    More work may be queued while results are being consumed. Results are
    expected to be consumed by a single task at a time.
    """

    def __enter__(self):
        self._thread_pool = ThreadPoolExecutor(self._max_threads, thread_name_prefix="async-pairwise-alignment")
        return self

    def __init__(self, aligner: "PairwiseAligner", max_threads: int = 4):
        """Store the aligner and prepare the bookkeeping containers.

        :param aligner: the aligner whose ``align(reference, query)`` is called by :meth:`work`.
        :param max_threads: maximum number of worker threads for the pool.
        """
        self._max_threads = max_threads
        self._aligner = aligner
        # Futures that have been submitted but whose completion has not been published yet.
        self._work_left: Set[Future] = set()
        # Futures that have finished (successfully, with an error, or cancelled).
        self._work_complete: Queue[Future] = Queue()

    def align(self, reference: str, query: str, **associated_data):
        """Queue an alignment of ``query`` against ``reference``.

        ``associated_data`` is passed through untouched and returned alongside
        the alignment result, so callers can tag each job.
        """
        work = self._thread_pool.submit(
            self.work, reference, query, **associated_data)
        # Register the future BEFORE attaching the callback: if the job has
        # already finished, ``add_done_callback`` invokes the callback
        # immediately, and the callback expects the future to be registered.
        self._work_left.add(work)
        work.add_done_callback(self._on_complete)

    def _on_complete(self, future: Future):
        # Publish the result first and only then drop it from the pending set,
        # so there is never a moment where the engine looks finished while a
        # result is still on its way into the queue.
        self._work_complete.put(future)
        self._work_left.discard(future)

    def work(self, reference, query, **associated_data):
        """Align ``query`` to ``reference`` and summarise the first alignment returned.

        :return: a ``(PairwiseAlignment, associated_data)`` tuple.
        """
        alignments = self._aligner.align(reference, query)
        top_alignment = alignments[0]
        top_alignment_stats = top_alignment.counts()
        top_alignment_gaps = top_alignment_stats.gaps
        top_alignment_identities = top_alignment_stats.identities
        top_alignment_mismatches = top_alignment_stats.mismatches
        top_alignment_score = top_alignment.score # type: ignore
        return PairwiseAlignment(
            top_alignment.sequences[0],
            top_alignment.sequences[1],
            tuple(top_alignment.indices[0]),
            tuple(top_alignment.indices[1]),
            AlignmentStats(
                # Identities divided by the alignment length (a fraction).
                percent_identity=top_alignment_identities/top_alignment.length,
                mismatches=top_alignment_mismatches,
                gaps=top_alignment_gaps,
                match_metric=top_alignment_score
            )), associated_data

    async def next_completed(self) -> Union[tuple["PairwiseAlignment", dict[str, Any]], None]:
        """Return the next finished result, waiting for one if work is still running.

        :return: the ``(alignment, associated_data)`` tuple produced by
            :meth:`work`, or ``None`` once every queued job has been consumed.
        :raises Exception: whatever the underlying job raised, re-raised here.
        """
        # Sample the pending set BEFORE looking at the queue. ``_on_complete``
        # publishes to the queue before un-registering, so "nothing pending"
        # observed here guarantees every result is already in the queue.
        work_pending = len(self._work_left) > 0
        if self._work_complete.empty():
            if not work_pending:
                return None
            # A job is still running: wait for it in a helper thread so the
            # event loop is not blocked by the synchronous ``Queue.get``.
            completed_work = await asyncio.to_thread(self._work_complete.get)
        else:
            completed_work = self._work_complete.get_nowait()
        return await asyncio.wrap_future(completed_work)

    def __exit__(self, exc_type, exc_value, traceback):
        self.shutdown()

    def __aiter__(self):
        return self

    async def __anext__(self):
        result = await self.next_completed()
        if result is None:
            raise StopAsyncIteration
        return result

    def shutdown(self):
        """Cancel jobs that have not started and wait for running ones to finish."""
        self._thread_pool.shutdown(wait=True, cancel_futures=True)
 | 
			
		||||
							
								
								
									
										338
									
								
								src/autobigs/engine/analysis/bigsdb.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										338
									
								
								src/autobigs/engine/analysis/bigsdb.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,338 @@
 | 
			
		||||
from abc import abstractmethod
 | 
			
		||||
import asyncio
 | 
			
		||||
from collections import defaultdict
 | 
			
		||||
from contextlib import AbstractAsyncContextManager
 | 
			
		||||
import csv
 | 
			
		||||
from os import path
 | 
			
		||||
import os
 | 
			
		||||
import shutil
 | 
			
		||||
import tempfile
 | 
			
		||||
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequence, Set, Union
 | 
			
		||||
 | 
			
		||||
from aiohttp import ClientSession, ClientTimeout
 | 
			
		||||
 | 
			
		||||
from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
 | 
			
		||||
from autobigs.engine.reading import read_fasta
 | 
			
		||||
from autobigs.engine.structures.alignment import PairwiseAlignment
 | 
			
		||||
from autobigs.engine.structures.genomics import NamedString
 | 
			
		||||
from autobigs.engine.structures.mlst import Allele, NamedMLSTProfile, AlignmentStats, MLSTProfile
 | 
			
		||||
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
 | 
			
		||||
 | 
			
		||||
from Bio.Align import PairwiseAligner
 | 
			
		||||
 | 
			
		||||
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
    """Abstract interface for objects that derive MLST profiles from sequences.

    Implementations are asynchronous context managers and must provide every
    method declared here.
    """

    @abstractmethod
    def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
        """Yield the alleles found for the given query sequence strings."""

    @abstractmethod
    async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
        """Build an MLST profile from a (sync or async) iterable of alleles."""

    @abstractmethod
    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
        """Produce a single MLST profile for the given query sequence strings."""

    @abstractmethod
    def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
        """Yield one named MLST profile per named string in the given groups."""

    @abstractmethod
    async def close(self):
        """Release any resources held by the profiler."""
 | 
			
		||||
 | 
			
		||||
class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
    """MLST profiler that sends every query to a remote BIGSdb REST API."""

    def __init__(self, database_api: str, database_name: str, schema_id: int):
        """Open an HTTP session rooted at the scheme's URL.

        :param database_api: root URL of the BIGSdb API.
        :param database_name: name of the sequence definition database.
        :param schema_id: identifier of the scheme to query.
        """
        self._database_name = database_name
        self._schema_id = schema_id
        self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60))

    async def __aenter__(self):
        return self

    async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[str], str]) -> AsyncGenerator[Allele, Any]:
        """Yield the alleles the remote database reports for each query sequence.

        A single string is treated as a one-element collection. Exact matches
        are yielded without a partial match profile; otherwise partial matches
        are yielded together with their alignment statistics.

        :raises NoBIGSdbMatchesException: when a response contains neither
            exact nor partial matches.
        """
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        uri_path = "sequence"
        if isinstance(query_sequence_strings, str):
            query_sequence_strings = [query_sequence_strings]
        for sequence_string in query_sequence_strings:
            request_body = {
                "sequence": sequence_string,
                "partial_matches": True
            }
            async with self._http_client.post(uri_path, json=request_body) as response:
                sequence_response: dict = await response.json()

                if "exact_matches" in sequence_response:
                    # locus -> list of matching alleles (each carrying an allele id)
                    exact_hits: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
                    for locus_name, locus_hits in exact_hits.items():
                        for hit in locus_hits:
                            yield Allele(allele_locus=locus_name, allele_variant=hit["allele_id"], partial_match_profile=None)
                    continue

                if "partial_matches" not in sequence_response:
                    raise NoBIGSdbMatchesException(self._database_name, self._schema_id)

                partial_hits: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
                for locus_name, hit in partial_hits.items():
                    if len(hit) == 0:
                        # Nothing reported for this locus.
                        continue
                    hit_stats = AlignmentStats(
                        percent_identity=float(hit["identity"]),
                        mismatches=int(hit["mismatches"]),
                        gaps=int(hit["gaps"]),
                        match_metric=int(hit["bitscore"])
                    )
                    yield Allele(
                        allele_locus=locus_name,
                        allele_variant=str(hit["allele"]),
                        partial_match_profile=hit_stats
                    )

    async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
        """Ask the remote database which profile the given alleles designate.

        "ST" and "clonal_complex" default to ``"unknown"`` when the response
        does not provide them.

        :raises ValueError: when a locus comes back with more than one exact
            match, or when no exact matches are returned at all.
        """
        uri_path = "designations"
        if isinstance(alleles, AsyncIterable):
            gathered_alleles = [allele async for allele in alleles]
        else:
            gathered_alleles = list(alleles)

        designations: dict[str, list[dict[str, str]]] = defaultdict(list)
        for allele in gathered_alleles:
            designations[allele.allele_locus].append({"allele": str(allele.allele_variant)})

        async with self._http_client.post(uri_path, json={"designations": designations}) as response:
            response_json: dict = await response.json()
            returned_fields: dict[str, str] = response_json.setdefault("fields", dict())
            sequence_type = returned_fields.setdefault("ST", "unknown")
            clonal_complex = returned_fields.setdefault("clonal_complex", "unknown")

            matched_alleles: Set[Allele] = set()
            for locus_name, locus_matches in response_json["exact_matches"].items():
                if len(locus_matches) > 1:
                    raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(locus_matches)})")
                matched_alleles.add(Allele(locus_name, locus_matches[0]["allele_id"], None))
            if len(matched_alleles) == 0:
                raise ValueError("Passed in no alleles.")
            return MLSTProfile(matched_alleles, sequence_type, clonal_complex)

    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
        """Determine the alleles of the query strings and resolve them to a profile."""
        return await self.determine_mlst_st(self.determine_mlst_allele_variants(query_sequence_strings))

    async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
        """Yield a named profile for every named string in every group.

        When no match is found for a string, a named profile holding ``None``
        is yielded instead, unless ``stop_on_fail`` is set, in which case the
        exception propagates.
        """
        async for group in query_named_string_groups:
            for entry in group:
                try:
                    profile = await self.profile_string([entry.sequence])
                    yield NamedMLSTProfile(entry.name, profile)
                except NoBIGSdbMatchesException:
                    if stop_on_fail:
                        raise
                    yield NamedMLSTProfile(entry.name, None)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
 | 
			
		||||
 | 
			
		||||
class LocalBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
    """MLST profiler that downloads a scheme's data once and aligns queries locally.

    Allele FASTA files and the scheme's profile table are stored under a cache
    directory. Queries are aligned against the cached alleles with Biopython
    and resolved to an ST through the cached profile table.
    """

    async def __aenter__(self):
        # When preparation is enabled, refresh the locus list first (the
        # downloads depend on it), then fetch alleles and profiles together.
        if self._prepare:
            await self.update_scheme_locis()
            await asyncio.gather(
                self.download_alleles_cache_data(),
                self.download_scheme_profiles()
            )
            await self.load_scheme_profiles()
        return self

    def __init__(self, database_api: str, database_name: str, schema_id: int, cache_path: Union[str, None] = None, prepare: bool = True):
        """Create the HTTP session and decide where cached data lives.

        :param database_api: root URL of the BIGSdb API.
        :param database_name: name of the sequence definition database.
        :param schema_id: identifier of the scheme to profile against.
        :param cache_path: directory for cached data. When ``None`` a temporary
            directory is created and removed again by :meth:`close`.
        :param prepare: whether ``__aenter__`` downloads and loads the scheme data.
        """
        self._database_api = database_api
        self._database_name = database_name
        self._schema_id = schema_id
        self._base_url = f"{self._database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60))
        if cache_path is None:
            self._cache_path = tempfile.mkdtemp("BIGSdb")
            self._cleanup_required = True
        else:
            self._cache_path = cache_path
            self._cleanup_required = False
        self._loci: list[str] = []
        # Maps a tuple of allele ids (ordered like ``self._loci``) to (ST, clonal complex).
        self._profiles_st_map = {}
        self._prepare = prepare

    async def update_scheme_locis(self):
        """Replace the known locus names with the scheme's current, sorted list."""
        self._loci.clear()
        # NOTE(review): the request path hard-codes an "/api" prefix; presumably
        # this only resolves for API roots served under /api - verify against
        # roots such as https://rest.pubmlst.org.
        async with self._http_client.get(f"/api/db/{self._database_name}/schemes/{self._schema_id}") as schema_response:
            schema_json = await schema_response.json()
            for locus in schema_json["loci"]:
                # Each entry is a path-like reference; its last component is the locus name.
                self._loci.append(path.basename(locus))
        self._loci.sort()

    async def load_scheme_profiles(self):
        """Rebuild the allele-tuple -> (ST, clonal complex) map from the cached table.

        Missing "ST" or "clonal_complex" columns are recorded as ``"unknown"``,
        matching the defaults used by the remote profiler.
        """
        self._profiles_st_map.clear()
        with open(self.get_scheme_profile_path()) as profile_cache_handle:
            reader = csv.DictReader(profile_cache_handle, delimiter="\t")
            for line in reader:
                profile_key = tuple(line[locus] for locus in self._loci)
                self._profiles_st_map[profile_key] = (
                    line.get("ST", "unknown"),
                    line.get("clonal_complex", "unknown"),
                )

    def get_locus_cache_path(self, locus) -> str:
        """Return the path of the cached FASTA file for ``locus``."""
        return path.join(self._cache_path, locus + "." + "fasta")

    def get_scheme_profile_path(self):
        """Return the path of the cached (tab-delimited) profile table."""
        return path.join(self._cache_path, "profiles.csv")

    async def download_alleles_cache_data(self):
        """Download the allele FASTA of every known locus into the cache."""
        for locus in self._loci:
            with open(self.get_locus_cache_path(locus), "wb") as fasta_handle:
                # NOTE(review): same hard-coded "/api" prefix as in update_scheme_locis.
                async with self._http_client.get(f"/api/db/{self._database_name}/loci/{locus}/alleles_fasta") as fasta_response:
                    async for chunk, _ in fasta_response.content.iter_chunks():
                        fasta_handle.write(chunk)

    async def download_scheme_profiles(self):
        """Download the scheme's profile table into the cache and load it."""
        with open(self.get_scheme_profile_path(), "wb") as profile_cache_handle:
            async with self._http_client.get("profiles_csv") as profiles_response:
                async for chunk, _ in profiles_response.content.iter_chunks():
                    profile_cache_handle.write(chunk)
        await self.load_scheme_profiles()

    async def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
        """Yield the best matching allele per locus for the query sequences.

        Each query is first aligned against the first cached variant of every
        locus. A gap-free, mismatch-free alignment is yielded immediately as an
        exact match. Otherwise the aligned region of the query is re-aligned
        against the remaining variants of that locus, and the highest scoring
        alignment per locus is yielded with its statistics.
        """
        aligner = PairwiseAligner("blastn")
        aligner.mode = "local"
        with AsyncBiopythonPairwiseAlignmentEngine(aligner, max_threads=4) as aligner_engine:
            for query_sequence_string in query_sequence_strings:
                for locus in self._loci:
                    async for allele_variant in read_fasta(self.get_locus_cache_path(locus)):
                        aligner_engine.align(
                            allele_variant.sequence,
                            query_sequence_string,
                            variant_name=allele_variant.name,
                            full=True,
                            # Carried along so the aligned region can be cut out later.
                            query_sequence=query_sequence_string,
                        )
                        break # only the first variant: the full alignment locates the segment of interest

            alignment_rankings: dict[str, set[tuple[PairwiseAlignment, str]]] = defaultdict(set)
            async for alignment_result, additional_information in aligner_engine:
                result_variant_name = additional_information["variant_name"]
                # Split on the LAST underscore so locus names that themselves
                # contain underscores are kept intact.
                result_locus, variant_id = result_variant_name.rsplit("_", 1)

                if not additional_information["full"]:
                    alignment_rankings[result_locus].add((alignment_result, variant_id))
                    continue

                result_stats = alignment_result.alignment_stats
                if result_stats.gaps == 0 and result_stats.mismatches == 0:
                    # I.e., 100% exactly the same
                    yield Allele(result_locus, variant_id, None)
                    continue

                alignment_rankings[result_locus].add((alignment_result, variant_id))
                # Cut the aligned region out of the query. The indices are
                # inclusive positions, hence the + 1 on the slice end.
                query_sequence = additional_information["query_sequence"]
                query_indices = alignment_result.query_indices
                interest_sequence = query_sequence[query_indices[0]:query_indices[-1] + 1]
                async for allele_variant in read_fasta(self.get_locus_cache_path(result_locus)):
                    if result_variant_name == allele_variant.name:
                        continue # Skip if we just finished aligning this
                    aligner_engine.align(
                        allele_variant.sequence,
                        interest_sequence,
                        variant_name=allele_variant.name,
                        full=False,
                    )

            for final_locus, alignments in alignment_rankings.items():
                # The alignment score is maximised by the aligner, so the
                # closest variant is the one with the highest match metric.
                closest_alignment, closest_variant_id = max(
                    alignments, key=lambda ranked: ranked[0].alignment_stats.match_metric)
                yield Allele(final_locus, closest_variant_id, closest_alignment.alignment_stats)

    async def determine_mlst_st(self, alleles):
        """Resolve alleles to a profile using the cached profile table.

        When the allele combination is not in the table, ST and clonal complex
        are reported as ``"unknown"``, matching the remote profiler's defaults.

        :param alleles: sync or async iterable of alleles; the last allele
            seen for a locus wins.
        :raises KeyError: when no allele was supplied for one of the scheme's loci.
        """
        allele_variants: dict[str, Allele] = {}
        if isinstance(alleles, AsyncIterable):
            async for allele in alleles:
                allele_variants[allele.allele_locus] = allele
        else:
            for allele in alleles:
                allele_variants[allele.allele_locus] = allele

        # Order the allele ids like ``self._loci`` so they match the map's keys.
        ordered_profile = tuple(allele_variants[locus].allele_variant for locus in self._loci)
        st, clonal_complex = self._profiles_st_map.get(ordered_profile, ("unknown", "unknown"))
        return MLSTProfile(set(allele_variants.values()), st, clonal_complex)

    async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
        """Determine the alleles of the query strings and resolve them to a profile."""
        alleles = self.determine_mlst_allele_variants(query_sequence_strings)
        return await self.determine_mlst_st(alleles)

    async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
        """Yield a named profile for every named string in every group.

        When no match is found for a string, a named profile holding ``None``
        is yielded instead, unless ``stop_on_fail`` is set, in which case the
        exception propagates.
        """
        async for named_strings in query_named_string_groups:
            for named_string in named_strings:
                try:
                    yield NamedMLSTProfile(named_string.name, await self.profile_string([named_string.sequence]))
                except NoBIGSdbMatchesException:
                    if stop_on_fail:
                        raise
                    yield NamedMLSTProfile(named_string.name, None)

    async def close(self):
        """Close the HTTP session and delete the cache if it was a temporary one."""
        await self._http_client.close()
        if self._cleanup_required:
            shutil.rmtree(self._cache_path)

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
 | 
			
		||||
 | 
			
		||||
class BIGSdbIndex(AbstractAsyncContextManager):
    """Directory of the known BIGSdb APIs, their seqdef databases and schemes.

    Lookups are cached on the instance; pass ``force=True`` to refresh them.
    """

    KNOWN_BIGSDB_APIS = {
        "https://bigsdb.pasteur.fr/api",
        "https://rest.pubmlst.org"
    }

    def __init__(self):
        self._http_client = ClientSession()
        # Database name -> API root it was found on (None until first lookup).
        self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
        # Database name -> (scheme description -> scheme id).
        self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
        super().__init__()

    async def __aenter__(self):
        return self

    async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
        """Return a mapping of seqdef database names to the API root hosting them.

        Every known API is queried for its databases; those whose name ends in
        "seqdef" are kept. The result is cached unless ``force`` is set.
        """
        if not (self._known_seqdef_dbs_origin is None or force):
            return self._known_seqdef_dbs_origin

        origins = dict()
        for api_root in BIGSdbIndex.KNOWN_BIGSDB_APIS:
            async with self._http_client.get(f"{api_root}/db") as response:
                database_groups = await response.json()
                for group in database_groups:
                    for info in group["databases"]:
                        if not str(info["name"]).endswith("seqdef"):
                            continue
                        origins[info["name"]] = api_root
        self._known_seqdef_dbs_origin = dict(origins)
        return self._known_seqdef_dbs_origin

    async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
        """Return the API root hosting the named seqdef database.

        :raises NoSuchBIGSdbDatabaseException: when the database is not known.
        """
        origins = await self.get_known_seqdef_dbs()
        if seqdef_db_name in origins:
            return origins[seqdef_db_name]
        raise NoSuchBIGSdbDatabaseException(seqdef_db_name)

    async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
        """Return a mapping of scheme descriptions to scheme ids for a database.

        The scheme id is taken from the last path component of each scheme's
        URL. The result is cached per database unless ``force`` is set.
        """
        if seqdef_db_name in self._seqdefdb_schemas and not force:
            return self._seqdefdb_schemas[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional

        api_root = await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)
        uri_path = f"{api_root}/db/{seqdef_db_name}/schemes"
        async with self._http_client.get(uri_path) as response:
            response_json = await response.json()
            descriptions_to_ids: Mapping[str, int] = dict()
            for scheme in response_json["schemes"]:
                scheme_url_parts = str(scheme["scheme"]).split("/")
                scheme_id: int = int(scheme_url_parts[-1])
                scheme_desc: str = scheme["description"]
                descriptions_to_ids[scheme_desc] = scheme_id
            self._seqdefdb_schemas[seqdef_db_name] = descriptions_to_ids
            return self._seqdefdb_schemas[seqdef_db_name] # type: ignore

    async def build_profiler_from_seqdefdb(self, local: bool, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
        """Create a profiler for the named database, resolving its API root first."""
        api_root = await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name)
        return get_BIGSdb_MLST_profiler(local, api_root, dbseqdef_name, schema_id)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
 | 
			
		||||
 | 
			
		||||
def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, schema_id: int):
    """Create a local profiler when ``local`` is truthy, otherwise a remote one.

    The remaining arguments are forwarded unchanged to the chosen profiler.
    """
    profiler_type = LocalBIGSdbMLSTProfiler if local else RemoteBIGSdbMLSTProfiler
    return profiler_type(database_api=database_api, database_name=database_name, schema_id=schema_id)
 | 
			
		||||
							
								
								
									
										26
									
								
								src/autobigs/engine/analysis/genbank.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								src/autobigs/engine/analysis/genbank.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,26 @@
 | 
			
		||||
import asyncio
 | 
			
		||||
from contextlib import AbstractAsyncContextManager
 | 
			
		||||
import tempfile
 | 
			
		||||
from typing import Iterable, Union
 | 
			
		||||
from Bio import Entrez
 | 
			
		||||
from Bio import SeqIO
 | 
			
		||||
 | 
			
		||||
from autobigs.engine.structures.genomics import AnnotatedString, StringAnnotation
 | 
			
		||||
 | 
			
		||||
def _read_ncbi_genbank_record(genbank_id: str):
    """Blocking helper: download a GenBank record from NCBI and parse it."""
    with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
        return SeqIO.read(fetch_stream, "genbank")


async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    """Fetch a GenBank record from NCBI and convert it to an ``AnnotatedString``.

    Both the download and the parsing (which reads from the network stream)
    run in a worker thread so the event loop is never blocked.

    :param genbank_id: identifier passed to Entrez as the nucleotide record id.
    :return: the record's sequence, named after ``genbank_id``, with one
        annotation per feature in the record.
    """
    record = await asyncio.to_thread(_read_ncbi_genbank_record, genbank_id)
    sequence_features = []
    for feature in record.features:
        start = int(feature.location.start)
        end = int(feature.location.end)
        # Build a fresh mapping of qualifier -> set of values instead of
        # rewriting the parsed record's own qualifiers in place.
        feature_properties = {
            qualifier_key: set(qualifier_values)
            for qualifier_key, qualifier_values in feature.qualifiers.items()
        }
        sequence_features.append(StringAnnotation(
            type=feature.type,
            start=start,
            # NOTE(review): Biopython documents location.end as already being
            # an exclusive, Python-style end; confirm the + 1 is intended.
            end=end+1,  # Position is exclusive
            feature_properties=feature_properties
        ))
    return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
 | 
			
		||||
@@ -1,166 +0,0 @@
 | 
			
		||||
from collections import defaultdict
 | 
			
		||||
from contextlib import AbstractAsyncContextManager
 | 
			
		||||
from numbers import Number
 | 
			
		||||
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
 | 
			
		||||
 | 
			
		||||
from aiohttp import ClientSession, ClientTimeout
 | 
			
		||||
 | 
			
		||||
from autobigs.engine.data.structures.genomics import NamedString
 | 
			
		||||
from autobigs.engine.data.structures.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
 | 
			
		||||
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
 | 
			
		||||
 | 
			
		||||
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
    """Asynchronous client for one scheme of a BIGSdb sequence definition database.

    Requests are sent relative to
    ``{database_api}/db/{database_name}/schemes/{schema_id}/``. The instance
    owns an aiohttp ``ClientSession``; release it with ``close()`` or by using
    the profiler as an async context manager.
    """

    def __init__(self, database_api: str, database_name: str, schema_id: int):
        """Store the target database/scheme and open the HTTP session.

        Args:
            database_api: Root URL of the BIGSdb REST API.
            database_name: Name of the database used in the request path.
            schema_id: Identifier of the scheme used in the request path.
        """
        super().__init__()
        self._database_name = database_name
        self._schema_id = schema_id
        self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))

    async def __aenter__(self):
        """Return the profiler itself; the session already exists."""
        return self

    async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
        """Query the scheme's ``sequence`` endpoint and yield the matched alleles.

        Args:
            sequence_string: Sequence posted to the server.
            exact: When true, partial matching is not requested and a response
                containing only partial matches raises an exception.

        Yields:
            One Allele per reported match. Exact matches carry no partial
            match profile; partial matches carry a PartialAllelicMatchProfile.

        Raises:
            NoBIGSdbExactMatchesException: Only partial matches were returned
                although ``exact`` was requested.
            NoBIGSdbMatchesException: The response held neither exact nor
                partial matches.
        """
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        request_json = {
            "sequence": sequence_string,
            "partial_matches": not exact
        }
        # Read the whole body inside the context manager so the response is
        # released before any allele is handed to the consumer.
        async with self._http_client.post("sequence", json=request_json) as response:
            sequence_response: dict = await response.json()

        if "exact_matches" in sequence_response:
            # loci -> list of alleles with id and loci
            exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
            for allele_loci, alleles in exact_matches.items():
                for allele in alleles:
                    yield Allele(allele_loci=allele_loci, allele_variant=allele["allele_id"], partial_match_profile=None)
        elif "partial_matches" in sequence_response:
            if exact:
                raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
            partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
            for allele_loci, partial_match in partial_matches.items():
                if not partial_match:
                    # Empty entries carry no match information.
                    continue
                partial_match_profile = PartialAllelicMatchProfile(
                    percent_identity=float(partial_match["identity"]),
                    mismatches=int(partial_match["mismatches"]),
                    bitscore=float(partial_match["bitscore"]),
                    gaps=int(partial_match["gaps"])
                )
                yield Allele(
                    allele_loci=allele_loci,
                    allele_variant=str(partial_match["allele"]),
                    partial_match_profile=partial_match_profile
                )
        else:
            raise NoBIGSdbMatchesException(self._database_name, self._schema_id)

    async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
        """Post allele designations and build the resulting MLST profile.

        Args:
            alleles: Alleles to submit, as a regular or asynchronous iterable.

        Returns:
            An MLSTProfile built from the exact matches in the response. The
            sequence type and clonal complex fall back to ``"unknown"`` when
            the response does not provide them.

        Raises:
            ValueError: The response contained no exact matches.
        """
        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)

        def register(allele: Allele) -> None:
            # Group the designations by locus, as the request body expects.
            allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})

        if isinstance(alleles, AsyncIterable):
            async for allele in alleles:
                register(allele)
        else:
            for allele in alleles:
                register(allele)

        request_json = {
            "designations": allele_request_dict
        }
        async with self._http_client.post("designations", json=request_json) as response:
            response_json: dict = await response.json()
            allele_map: dict[str, list[Allele]] = defaultdict(list)
            response_json.setdefault("fields", dict())
            schema_fields_returned: dict[str, str] = response_json["fields"]
            schema_fields_returned.setdefault("ST", "unknown")
            schema_fields_returned.setdefault("clonal_complex", "unknown")
            schema_exact_matches: dict = response_json["exact_matches"]
            for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
                for exact_match_allele in exact_match_alleles:
                    allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"], None))
            if not allele_map:
                raise ValueError("Passed in no alleles.")
            return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])

    async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
        """Determine the MLST profile of a single sequence string."""
        alleles = self.fetch_mlst_allele_variants(string, exact)
        return await self.fetch_mlst_st(alleles)

    async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
        """Profile each named string and yield ``(name, profile)`` pairs.

        When profiling raises NoBIGSdbMatchesException, the pair
        ``(name, None)`` is yielded instead, unless ``stop_on_fail`` is set,
        in which case the exception propagates.
        """
        async for named_string in namedStrings:
            try:
                yield (named_string.name, await self.profile_string(named_string.sequence, exact))
            except NoBIGSdbMatchesException:
                if stop_on_fail:
                    # Bare raise keeps the original traceback intact.
                    raise
                yield (named_string.name, None)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Close the profiler when leaving the ``async with`` block."""
        await self.close()
 | 
			
		||||
 | 
			
		||||
class BIGSdbIndex(AbstractAsyncContextManager):
    """Lookup of sequence definition databases across the known BIGSdb APIs.

    Results of the database listing and of per-database scheme listings are
    cached on the instance; pass ``force=True`` to query the servers again.
    """

    KNOWN_BIGSDB_APIS = {
        "https://bigsdb.pasteur.fr/api",
        "https://rest.pubmlst.org"
    }

    def __init__(self):
        """Open the HTTP session and start with empty caches."""
        self._http_client = ClientSession()
        # Database name -> API root it was listed under (None until fetched).
        self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
        # Database name -> (scheme description -> scheme id).
        self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
        super().__init__()

    async def __aenter__(self):
        """Return the index itself."""
        return self

    async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
        """Map every database whose name ends in ``seqdef`` to its API root.

        The cached mapping is returned when available unless ``force`` is set.
        """
        cached = self._known_seqdef_dbs_origin
        if not force and cached is not None:
            return cached

        origins = dict()
        for api_root in BIGSdbIndex.KNOWN_BIGSDB_APIS:
            async with self._http_client.get(f"{api_root}/db") as response:
                listing = await response.json()
                for group in listing:
                    for entry in group["databases"]:
                        if not str(entry["name"]).endswith("seqdef"):
                            continue
                        origins[entry["name"]] = api_root
        self._known_seqdef_dbs_origin = dict(origins)
        return self._known_seqdef_dbs_origin

    async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
        """Return the API root that lists ``seqdef_db_name``.

        Raises:
            NoSuchBIGSdbDatabaseException: The database is not in the listing.
        """
        origins = await self.get_known_seqdef_dbs()
        if seqdef_db_name in origins:
            return origins[seqdef_db_name]
        raise NoSuchBIGSdbDatabaseException(seqdef_db_name)

    async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
        """Return the scheme description -> scheme id mapping of a database.

        The scheme id is the last path segment of each scheme's URL. Cached
        results are reused unless ``force`` is set.
        """
        if not force and seqdef_db_name in self._seqdefdb_schemas:
            return self._seqdefdb_schemas[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional

        api_root = await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)
        uri_path = f"{api_root}/db/{seqdef_db_name}/schemes"
        async with self._http_client.get(uri_path) as response:
            payload = await response.json()
            descriptions: Mapping[str, int] = dict()
            for definition in payload["schemes"]:
                identifier: int = int(str(definition["scheme"]).rsplit("/", 1)[-1])
                description: str = definition["description"]
                descriptions[description] = identifier
            self._seqdefdb_schemas[seqdef_db_name] = descriptions
            return self._seqdefdb_schemas[seqdef_db_name] # type: ignore

    async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
        """Create a profiler bound to the API that hosts ``dbseqdef_name``."""
        api_root = await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name)
        return BIGSdbMLSTProfiler(api_root, dbseqdef_name, schema_id)

    async def close(self):
        """Close the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        """Close the index when leaving the ``async with`` block."""
        await self.close()
 | 
			
		||||
    
 | 
			
		||||
@@ -1,21 +0,0 @@
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
from typing import Mapping, Sequence, Union
 | 
			
		||||
 | 
			
		||||
@dataclass(frozen=True)
class PartialAllelicMatchProfile:
    """Immutable statistics describing a non-exact allele match."""

    # Filled from the "identity" entry of a partial match.
    percent_identity: float
    # Filled from the "mismatches" entry of a partial match.
    mismatches: int
    # Filled from the "bitscore" entry of a partial match.
    bitscore: float
    # Filled from the "gaps" entry of a partial match.
    gaps: int
 | 
			
		||||
 | 
			
		||||
@dataclass(frozen=True)
class Allele:
    """Immutable record of one allele assignment for a locus."""

    # Name of the locus the allele belongs to.
    allele_loci: str
    # Identifier of the matched allele variant.
    allele_variant: str
    # Match statistics for a partial match; None when no profile is attached.
    partial_match_profile: Union[PartialAllelicMatchProfile, None]
 | 
			
		||||
 | 
			
		||||
@dataclass(frozen=True)
class MLSTProfile:
    """Immutable MLST result: alleles per locus plus the typing fields."""

    # Locus name -> alleles recorded for that locus.
    alleles: Mapping[str, Sequence[Allele]]
    # Sequence type reported for the allele combination.
    sequence_type: str
    # Clonal complex reported for the allele combination.
    clonal_complex: str
 | 
			
		||||
@@ -1,9 +1,9 @@
 | 
			
		||||
import asyncio
 | 
			
		||||
from io import TextIOWrapper
 | 
			
		||||
from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
 | 
			
		||||
from typing import Any, AsyncGenerator, Iterable, Union
 | 
			
		||||
from Bio import SeqIO
 | 
			
		||||
 | 
			
		||||
from autobigs.engine.data.structures.genomics import NamedString
 | 
			
		||||
from autobigs.engine.structures.genomics import NamedString
 | 
			
		||||
 | 
			
		||||
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
 | 
			
		||||
    fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
 | 
			
		||||
							
								
								
									
										18
									
								
								src/autobigs/engine/structures/alignment.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								src/autobigs/engine/structures/alignment.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,18 @@
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
from numbers import Number
 | 
			
		||||
from typing import Sequence
 | 
			
		||||
 | 
			
		||||
@dataclass(frozen=True)
class AlignmentStats:
    """Immutable summary figures for one alignment."""

    # Identity of the alignment; presumably a percentage - confirm with the producer.
    percent_identity: float
    # Count of mismatches in the alignment.
    mismatches: int
    # Count of gaps in the alignment.
    gaps: int
    # Integer score of the match; NOTE(review): exact metric depends on the aligner - confirm.
    match_metric: int
 | 
			
		||||
 | 
			
		||||
@dataclass(frozen=True)
class PairwiseAlignment:
    """Immutable result of aligning a query string against a reference string."""

    # Reference side of the alignment.
    reference: str
    # Query side of the alignment.
    query: str
    # Indices associated with the reference string.
    reference_indices: Sequence[Number]
    # Indices associated with the query string.
    query_indices: Sequence[Number]
    # Summary statistics computed for this alignment.
    alignment_stats: AlignmentStats
 | 
			
		||||
							
								
								
									
										33
									
								
								src/autobigs/engine/structures/mlst.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										33
									
								
								src/autobigs/engine/structures/mlst.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,33 @@
 | 
			
		||||
from collections import defaultdict
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
from typing import Collection, Iterable, Mapping, Sequence, Union
 | 
			
		||||
 | 
			
		||||
from autobigs.engine.structures.alignment import AlignmentStats
 | 
			
		||||
 | 
			
		||||
@dataclass(frozen=True)
class Allele:
    """Immutable record of one allele assignment for a locus."""

    # Name of the locus the allele belongs to.
    allele_locus: str
    # Identifier of the matched allele variant.
    allele_variant: str
    # Alignment statistics for a partial match; None when no profile is attached.
    partial_match_profile: Union[AlignmentStats, None]
 | 
			
		||||
 | 
			
		||||
@dataclass(frozen=True)
class MLSTProfile:
    """Immutable MLST result: the alleles plus the typing fields."""

    # Alleles that make up the profile.
    alleles: Collection[Allele]
    # Sequence type reported for the allele combination.
    sequence_type: str
    # Clonal complex reported for the allele combination.
    clonal_complex: str
 | 
			
		||||
 | 
			
		||||
@dataclass(frozen=True)
class NamedMLSTProfile:
    """Immutable pairing of a name with an optional MLST profile."""

    # Name the profile is associated with.
    name: str
    # The profile itself; None is allowed by the type.
    mlst_profile: Union[MLSTProfile, None]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def alleles_to_mapping(alleles: Iterable[Allele]):
    """Group allele variants by locus.

    Returns a dict keyed by locus. A locus with exactly one variant maps
    directly to that variant; a locus with several maps to the list of its
    variants in the order they were encountered.
    """
    grouped = defaultdict(list)
    for allele in alleles:
        grouped[allele.allele_locus].append(allele.allele_variant)
    return {
        locus: variants[0] if len(variants) == 1 else variants
        for locus, variants in grouped.items()
    }
 | 
			
		||||
@@ -1,22 +1,19 @@
 | 
			
		||||
from collections import defaultdict
 | 
			
		||||
import csv
 | 
			
		||||
from os import PathLike
 | 
			
		||||
from typing import AsyncIterable, Mapping, Sequence, Union
 | 
			
		||||
from typing import AsyncIterable, Collection, Mapping, Sequence, Union
 | 
			
		||||
 | 
			
		||||
from autobigs.engine.data.structures.mlst import Allele, MLSTProfile
 | 
			
		||||
from autobigs.engine.structures.mlst import Allele, MLSTProfile
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
    """Flatten a locus -> alleles mapping into locus -> variant(s).

    A locus with a single allele maps to that allele's variant; a locus with
    several alleles maps to the list of their variants. A locus whose allele
    sequence is empty is left out of the result.
    """
    result_dict: dict[str, Union[list[str], str]] = {}
    for loci, alleles in alleles_map.items():
        variants = [allele.allele_variant for allele in alleles]
        if len(alleles) == 1:
            result_dict[loci] = variants[0]
        elif variants:
            result_dict[loci] = variants
    return result_dict
 | 
			
		||||
 | 
			
		||||
def alleles_to_map(alleles: Collection[Allele]) -> Mapping[str, Union[list[str], str]]:
    """Group allele variants by locus for tabular output.

    A locus that received exactly one variant maps to that variant itself;
    otherwise it maps to the list of variants in encounter order.
    """
    variants_by_locus = defaultdict(list)
    for allele in alleles:
        variants_by_locus[allele.allele_locus].append(allele.allele_variant)

    flattened = {}
    for locus, variants in variants_by_locus.items():
        # Take the only one when there is no ambiguity.
        flattened[locus] = variants[0] if len(variants) == 1 else variants
    return flattened
 | 
			
		||||
 | 
			
		||||
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
 | 
			
		||||
    failed = list()
 | 
			
		||||
@@ -27,15 +24,16 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
 | 
			
		||||
            if mlst_profile is None:
 | 
			
		||||
                failed.append(name)
 | 
			
		||||
                continue
 | 
			
		||||
            allele_mapping = alleles_to_map(mlst_profile.alleles)
 | 
			
		||||
            if writer is None:
 | 
			
		||||
                header = ["id", "st", "clonal-complex", *mlst_profile.alleles.keys()]
 | 
			
		||||
                header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
 | 
			
		||||
                writer = csv.DictWriter(filehandle, fieldnames=header)
 | 
			
		||||
                writer.writeheader()
 | 
			
		||||
            row_dictionary = {
 | 
			
		||||
                "st": mlst_profile.sequence_type,
 | 
			
		||||
                "clonal-complex": mlst_profile.clonal_complex,
 | 
			
		||||
                "id": name,
 | 
			
		||||
                **dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
 | 
			
		||||
                **allele_mapping
 | 
			
		||||
            }
 | 
			
		||||
            writer.writerow(rowdict=row_dictionary)
 | 
			
		||||
    return failed
 | 
			
		||||
							
								
								
									
										42
									
								
								tests/autobigs/engine/analysis/test_aligners.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								tests/autobigs/engine/analysis/test_aligners.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,42 @@
 | 
			
		||||
from Bio import SeqIO
 | 
			
		||||
from Bio.Align import PairwiseAligner
 | 
			
		||||
from pytest import mark
 | 
			
		||||
from pytest import fixture
 | 
			
		||||
from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
 | 
			
		||||
from autobigs.engine.structures.alignment import PairwiseAlignment
 | 
			
		||||
 | 
			
		||||
@fixture
def tohamaI_bpertussis_adk():
    """Provide the sequence of the ``tohama_I_bpertussis_adk.fasta`` resource as a string."""
    record = SeqIO.read("tests/resources/tohama_I_bpertussis_adk.fasta", format="fasta")
    return str(record.seq)
 | 
			
		||||
 | 
			
		||||
@fixture
def tohamaI_bpertussis_genome():
    """Provide the sequence of the ``tohama_I_bpertussis.fasta`` resource as a string."""
    record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", format="fasta")
    return str(record.seq)
 | 
			
		||||
 | 
			
		||||
@fixture
def fdaargos_1560_hinfluenza_adk():
    """Provide the sequence of the ``fdaargos_1560_hinfluenza_adk.fasta`` resource as a string."""
    record = SeqIO.read("tests/resources/fdaargos_1560_hinfluenza_adk.fasta", format="fasta")
    return str(record.seq)
 | 
			
		||||
 | 
			
		||||
@fixture
def fdaargos_1560_hinfluenza_genome():
    """Provide the sequence of the ``fdaargos_1560_hinfluenza.fasta`` resource as a string."""
    record = SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", format="fasta")
    return str(record.seq)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@fixture(params=[1, 2])
def dummy_engine(request):
    """Yield an alignment engine wrapping a local-mode "blastn" PairwiseAligner.

    The fixture is parametrized with 1 and 2; the value is forwarded as the
    engine's second constructor argument.
    NOTE(review): presumably a worker count - confirm against the engine.
    """
    local_aligner = PairwiseAligner("blastn")
    local_aligner.mode = "local"
    engine_context = AsyncBiopythonPairwiseAlignmentEngine(local_aligner, request.param)
    with engine_context as engine:
        yield engine
 | 
			
		||||
 | 
			
		||||
class TestAsyncPairwiseAlignmentEngine:
    """Smoke tests: queued alignments come back as PairwiseAlignment objects."""

    async def test_single_alignment_no_errors_single_alignment(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk: str, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine):
        """Queue one alignment and check the type of every result produced."""
        dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk)
        async for produced in dummy_engine:
            # Each item is a pair: the alignment and accompanying information.
            alignment, _additional_information = produced
            assert isinstance(alignment, PairwiseAlignment)

    async def test_single_alignment_no_errors_multiple(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk, fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine):
        """Queue two alignments and check the type of every result produced."""
        dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk)
        dummy_engine.align(fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk)
        async for produced in dummy_engine:
            alignment, _additional_information = produced
            assert isinstance(alignment, PairwiseAlignment)
 | 
			
		||||
							
								
								
									
										215
									
								
								tests/autobigs/engine/analysis/test_bigsdb.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										215
									
								
								tests/autobigs/engine/analysis/test_bigsdb.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,215 @@
 | 
			
		||||
from os import path
 | 
			
		||||
import random
 | 
			
		||||
import re
 | 
			
		||||
from typing import Callable, Collection, Sequence, Union
 | 
			
		||||
from Bio import SeqIO
 | 
			
		||||
import pytest
 | 
			
		||||
from autobigs.engine.analysis import bigsdb
 | 
			
		||||
from autobigs.engine.structures import mlst
 | 
			
		||||
from autobigs.engine.structures.genomics import NamedString
 | 
			
		||||
from autobigs.engine.structures.mlst import Allele, MLSTProfile
 | 
			
		||||
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
 | 
			
		||||
from autobigs.engine.analysis.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler, LocalBIGSdbMLSTProfiler, RemoteBIGSdbMLSTProfiler
 | 
			
		||||
 | 
			
		||||
async def generate_async_iterable(normal_iterable):
    """Expose a regular iterable as an asynchronous generator, item by item."""
    for item in normal_iterable:
        yield item
 | 
			
		||||
 | 
			
		||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ("A", "T", "C", "G")):
    """Return a reproducibly mutated copy of ``gene``.

    The random generator is seeded with ``gene`` itself, so the same inputs
    always give the same output. Positions are drawn with replacement, and a
    replacement symbol may equal the original one, so fewer than
    ``mutation_site_count`` characters may actually change.

    Args:
        gene: Sequence to mutate.
        mutation_site_count: Number of positions to draw. A float is treated
            as a fraction of ``len(gene)`` and truncated to an int.
        alphabet: Symbols a drawn position may be replaced with.

    Returns:
        The mutated sequence; an empty ``gene`` is returned unchanged.
    """
    if not gene:
        # There is no position to draw from; sampling would fail.
        return gene
    rand = random.Random(gene)
    if isinstance(mutation_site_count, float):
        mutation_site_count = int(mutation_site_count * len(gene))
    scrambled = list(gene)
    # All positions are drawn first, then one symbol per position, which keeps
    # the random stream consumed in a fixed order.
    for random_location in rand.choices(range(len(gene)), k=mutation_site_count):
        scrambled[random_location] = rand.choice(alphabet)
    return "".join(scrambled)
 | 
			
		||||
 | 
			
		||||
def get_first_sequence_from_fasta(resource: str):
    """Read the single record of a FASTA file under ``tests/resources/`` and return its sequence as a string."""
    resource_path = path.join("tests/resources/", resource)
    record = SeqIO.read(resource_path, "fasta")
    return str(record.seq)
 | 
			
		||||
 | 
			
		||||
def get_multiple_sequences_from_fasta(resource: str):
    """Parse a FASTA file under ``tests/resources/`` and return all of its records as a tuple."""
    resource_path = path.join("tests/resources/", resource)
    records = SeqIO.parse(resource_path, "fasta")
    return tuple(records)
 | 
			
		||||
 | 
			
		||||
# Profile used as the expected result for the Tohama I resources.
bpertussis_tohamaI_profile = MLSTProfile(
    alleles=(
        Allele("adk", "1", None),
        Allele("fumC", "1", None),
        Allele("glyA", "1", None),
        Allele("tyrB", "1", None),
        Allele("icd", "1", None),
        Allele("pepA", "1", None),
        Allele("pgm", "1", None),
    ),
    sequence_type="1",
    clonal_complex="ST-2 complex",
)

# Allele combination whose sequence type and clonal complex are "unknown".
bpertussis_tohamaI_bad_profile = MLSTProfile(
    alleles=(
        Allele("adk", "1", None),
        Allele("fumC", "2", None),
        Allele("glyA", "36", None),
        Allele("tyrB", "4", None),
        Allele("icd", "4", None),
        Allele("pepA", "1", None),
        Allele("pgm", "5", None),
    ),
    sequence_type="unknown",
    clonal_complex="unknown",
)

hinfluenzae_fdaargos_profile = MLSTProfile(
    alleles=(
        Allele("adk", "1", None),
        Allele("atpG", "1", None),
        Allele("frdB", "1", None),
        Allele("fucK", "1", None),
        Allele("mdh", "1", None),
        Allele("pgi", "1", None),
        Allele("recA", "5", None),
    ),
    sequence_type="3",
    clonal_complex="ST-3 complex",
)

# NOTE(review): this "bad" profile has exactly the same contents as
# hinfluenzae_fdaargos_profile - confirm whether that is intended.
hinfluenzae_fdaargos_bad_profile = MLSTProfile(
    alleles=(
        Allele("adk", "1", None),
        Allele("atpG", "1", None),
        Allele("frdB", "1", None),
        Allele("fucK", "1", None),
        Allele("mdh", "1", None),
        Allele("pgi", "1", None),
        Allele("recA", "5", None),
    ),
    sequence_type="3",
    clonal_complex="ST-3 complex",
)

hinfluenzae_fdaargos_sequence = str(
    SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", "fasta").seq
)

# NOTE(review): despite its name, this is loaded from the Tohama I features
# file - confirm the intended resource.
hinfluenzae_fdaargos_fragmented_sequence = tuple(
    SeqIO.parse("tests/resources/tohama_I_bpertussis_features.fasta", "fasta")
)
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
    (False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
    (True, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
])
class TestBIGSdbMLSTProfiler:
    """Profiler tests, run once with ``local_db`` False and once with it True."""

    async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """Every expected locus is reported exactly once with the expected variant."""
        sequence = get_first_sequence_from_fasta(seq_path)
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            expected_alleles = mlst.alleles_to_mapping(expected_profile.alleles)
            # Loci still waiting to be reported; removal fails on a repeat.
            targets_left = set(mlst.alleles_to_mapping(expected_profile.alleles).keys())
            async for exact_match in dummy_profiler.determine_mlst_allele_variants(query_sequence_strings=[sequence]):
                assert isinstance(exact_match, Allele)
                locus = exact_match.allele_locus
                assert locus in expected_alleles
                assert exact_match.allele_variant == expected_alleles[locus]
                targets_left.remove(locus)

            assert len(targets_left) == 0

    async def test_sequence_profiling_non_exact_returns_non_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """Scrambled copies of the scheme's genes produce partial matches."""
        target_sequences = get_multiple_sequences_from_fasta(feature_seqs_path)
        mlst_targets = {locus.lower() for locus in mlst.alleles_to_mapping(expected_profile.alleles).keys()}
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as profiler:
            for target_sequence in target_sequences:
                # The gene name is read from a "[gene=...]" tag in the description.
                match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", target_sequence.description)
                if match is None:
                    continue
                gene = match.group(1).lower()
                if gene not in mlst_targets:
                    continue
                scrambled = gene_scrambler(str(target_sequence.seq), 0.125)
                async for partial_match in profiler.determine_mlst_allele_variants([scrambled]):
                    assert partial_match.partial_match_profile is not None
                    mlst_targets.remove(gene)

            assert len(mlst_targets) == 0

    async def test_profiling_results_in_correct_mlst_st(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """The expected alleles resolve to the expected ST and clonal complex."""
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            mlst_st_data = await dummy_profiler.determine_mlst_st(expected_profile.alleles)
            assert mlst_st_data is not None
            assert isinstance(mlst_st_data, MLSTProfile)
            assert mlst_st_data.clonal_complex == expected_profile.clonal_complex
            assert mlst_st_data.sequence_type == expected_profile.sequence_type

    async def test_profiling_non_exact_results_in_list_of_mlsts(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """The alleles of the bad profile resolve to an unknown ST and clonal complex."""
        dummy_alleles = bad_profile.alleles
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            mlst_profile = await dummy_profiler.determine_mlst_st(dummy_alleles)
            assert mlst_profile.clonal_complex == "unknown"
            assert mlst_profile.sequence_type == "unknown"

    async def test_bigsdb_profile_multiple_strings_same_string_twice(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """Profiling the same sequence under two names yields the expected profile each time."""
        sequence = get_first_sequence_from_fasta(seq_path)
        dummy_sequences = [[NamedString("seq1", sequence)], [NamedString("seq2", sequence)]]
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences)):
                name, profile = named_profile.name, named_profile.mlst_profile
                assert profile is not None
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == expected_profile.clonal_complex
                assert profile.sequence_type == expected_profile.sequence_type

    async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """With the second argument True, a scrambled entry yields an unknown profile while the others still match."""
        valid_seq = get_first_sequence_from_fasta(seq_path)
        dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]]
        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), True):
                name, profile = name_profile.name, name_profile.mlst_profile

                assert profile is not None
                if name == "should_fail":
                    assert profile.clonal_complex == "unknown"
                    assert profile.sequence_type == "unknown"
                    assert len(profile.alleles) > 0
                else:
                    assert isinstance(profile, MLSTProfile)
                    assert profile.clonal_complex == expected_profile.clonal_complex
                    assert profile.sequence_type == expected_profile.sequence_type

    async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
        """With the second argument False, a scrambled entry yields an unknown profile while the others still match."""
        valid_seq = get_first_sequence_from_fasta(seq_path)
        dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]]

        async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
            async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), False):
                name, profile = named_profile.name, named_profile.mlst_profile

                assert profile is not None
                if name == "should_fail":
                    assert profile.clonal_complex == "unknown"
                    assert profile.sequence_type == "unknown"
                    assert len(profile.alleles) > 0
                else:
                    assert isinstance(profile, MLSTProfile)
                    assert profile.clonal_complex == expected_profile.clonal_complex
                    assert profile.sequence_type == expected_profile.sequence_type
 | 
			
		||||
 | 
			
		||||
class TestBIGSdbIndex:
    """Tests for ``BIGSdbIndex``: database listing, API lookup, schemas and profiler construction."""

    async def test_bigsdb_index_all_databases_is_not_empty(self):
        """The index reports at least one known seqdef database."""
        async with BIGSdbIndex() as index:
            known_databases = await index.get_known_seqdef_dbs()
            assert len(known_databases) > 0

    async def test_bigsdb_index_references_pubmlst_correctly(self):
        """The H. influenzae seqdef database resolves to the PubMLST REST URL."""
        async with BIGSdbIndex() as index:
            api_url = await index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")
            assert api_url == "https://rest.pubmlst.org"

    async def test_bigsdb_index_references_institutpasteur_correctly(self):
        """The Bordetella seqdef database resolves to the Institut Pasteur API URL."""
        async with BIGSdbIndex() as index:
            api_url = await index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")
            assert api_url == "https://bigsdb.pasteur.fr/api"

    async def test_bigsdb_index_get_schemas_for_bordetella(self):
        """The Bordetella database lists schemas, including an integer-valued ``"MLST"`` entry."""
        async with BIGSdbIndex() as index:
            schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
            assert len(schemas.keys()) > 0
            assert "MLST" in schemas
            mlst_schema_id = schemas["MLST"]
            assert isinstance(mlst_schema_id, int)

    async def test_bigsdb_index_get_databases_has_only_seqdef(self):
        """Every known database name ends in ``seqdef`` and Bordetella maps to the Pasteur API."""
        async with BIGSdbIndex() as index:
            databases = await index.get_known_seqdef_dbs()
            database_names = databases.keys()
            assert len(database_names) > 0
            assert all(database_name.endswith("seqdef") for database_name in database_names)
            assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"

    @pytest.mark.parametrize("local", [True, False])
    async def test_bigsdb_index_instantiates_correct_profiler(self, local):
        """A profiler built from the index is a ``BIGSdbMLSTProfiler`` and types Tohama I as ST 1."""
        genome = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
        async with BIGSdbIndex() as index:
            profiler_context = await index.build_profiler_from_seqdefdb(local, "pubmlst_bordetella_seqdef", 3)
            async with profiler_context as profiler:
                assert isinstance(profiler, BIGSdbMLSTProfiler)
                profile = await profiler.profile_string(genome)
                assert profile.clonal_complex == "ST-2 complex"
                assert profile.sequence_type == "1"
 | 
			
		||||
@@ -1,21 +0,0 @@
 | 
			
		||||
from autobigs.engine.data.local.csv import dict_loci_alleles_variants_from_loci
 | 
			
		||||
from autobigs.engine.data.structures.mlst import Allele
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_dict_loci_alleles_variants_from_loci_single_loci_not_list():
    """A locus with a single allele maps to a plain string variant."""
    single_allele_map = {"adk": [Allele("adk", "1", None)]}
    variants_by_locus = dict_loci_alleles_variants_from_loci(single_allele_map)
    for variant in variants_by_locus.values():
        assert isinstance(variant, str)
        assert variant == "1"
 | 
			
		||||
 | 
			
		||||
def test_dict_loci_alleles_variants_from_loci_multi_loci_is_list():
    """A locus with two alleles maps to a list holding both variants."""
    adk_alleles = [Allele("adk", variant_id, None) for variant_id in ("1", "2")]
    variants_by_locus = dict_loci_alleles_variants_from_loci({"adk": adk_alleles})
    for variants in variants_by_locus.values():
        assert isinstance(variants, list)
        assert len(variants) == 2
 | 
			
		||||
@@ -1,244 +0,0 @@
 | 
			
		||||
import random
 | 
			
		||||
import re
 | 
			
		||||
from typing import Collection, Sequence, Union
 | 
			
		||||
from Bio import SeqIO
 | 
			
		||||
import pytest
 | 
			
		||||
from autobigs.engine.data.structures.genomics import NamedString
 | 
			
		||||
from autobigs.engine.data.structures.mlst import Allele, MLSTProfile
 | 
			
		||||
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
 | 
			
		||||
from autobigs.engine.data.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
 | 
			
		||||
 | 
			
		||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ("A", "T", "C", "G")) -> str:
    """Return a deterministically mutated copy of ``gene``.

    The random generator is seeded with ``gene`` itself, so the same input
    always produces the same output.

    Args:
        gene: Sequence to mutate.
        mutation_site_count: Number of sites to draw when an ``int``; when a
            ``float`` it is a fraction of ``len(gene)``, truncated to an int.
        alphabet: Symbols a drawn site may be replaced with.

    Returns:
        A string of the same length as ``gene``. Sites are drawn with
        replacement and the replacement symbol may equal the original one, so
        the number of positions that actually differ is at most the site count.
    """
    # Nothing to mutate; also avoids indexing into an empty range below.
    if not gene:
        return gene
    rand = random.Random(gene)
    if isinstance(mutation_site_count, float):
        mutation_site_count = int(mutation_site_count * len(gene))
    random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
    scrambled = list(gene)
    for random_location in random_locations:
        scrambled[random_location] = rand.choice(alphabet)
    return "".join(scrambled)
 | 
			
		||||
 | 
			
		||||
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
    """Exact allele lookup on Tohama I yields variant "1" once for each of the seven loci."""
    genome = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as profiler:
        pending_loci = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
        allele_stream = profiler.fetch_mlst_allele_variants(sequence_string=genome, exact=True)
        async for allele in allele_stream:
            assert isinstance(allele, Allele)
            # Every locus of Tohama I is expected to carry allele variant 1.
            assert allele.allele_variant == "1"
            # remove() raises if a locus is unexpected or reported twice.
            pending_loci.remove(allele.allele_loci)

        assert len(pending_loci) == 0
 | 
			
		||||
 | 
			
		||||
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
    """Scrambled coding sequences of the MLST genes still yield partial-match alleles.

    Each record whose description carries a ``[gene=...]`` tag naming one of
    the seven targets is scrambled at a 0.125 rate and looked up with ``False``
    as the second argument (presumably the exact flag -- confirm against the
    profiler API). All targets must have been matched by the end.
    """
    records = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta"))
    remaining_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as profiler:
        for record in records:
            tag = re.fullmatch(r".*\[gene=([\w\d]+)\].*", record.description)
            if tag is not None:
                gene_key = tag.group(1).lower()
                if gene_key in remaining_targets:
                    mutated = gene_scrambler(str(record.seq), 0.125)
                    async for allele in profiler.fetch_mlst_allele_variants(mutated, False):
                        assert allele.partial_match_profile is not None
                        remaining_targets.remove(gene_key)

        assert len(remaining_targets) == 0
 | 
			
		||||
 | 
			
		||||
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
    """Seven loci all at variant "1" resolve to ST 1 in the ST-2 complex."""
    loci = ("adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm")

    async def stream_alleles():
        # Async source of the alleles, one per locus, all at variant "1".
        for locus in loci:
            yield Allele(locus, "1", None)

    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as profiler:
        st_profile = await profiler.fetch_mlst_st(stream_alleles())
        assert st_profile is not None
        assert isinstance(st_profile, MLSTProfile)
        assert st_profile.clonal_complex == "ST-2 complex"
        assert st_profile.sequence_type == "1"
 | 
			
		||||
 | 
			
		||||
async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
    """This allele combination is reported with "unknown" clonal complex and sequence type."""
    locus_variants = (
        ("adk", "1"),
        ("fumC", "2"),
        ("glyA", "36"),
        ("tyrB", "4"),
        ("icd", "4"),
        ("pepA", "1"),
        ("pgm", "5"),
    )
    # Passed as a plain list, not an async generator.
    alleles = [Allele(locus, variant, None) for locus, variant in locus_variants]
    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as profiler:
        st_profile = await profiler.fetch_mlst_st(alleles)
        assert st_profile.clonal_complex == "unknown"
        assert st_profile.sequence_type == "unknown"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
async def test_institutpasteur_sequence_profiling_is_correct():
    """Profiling the Tohama I FASTA gives ST 1 in the ST-2 complex."""
    genome = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as profiler:
        result = await profiler.profile_string(genome)
        assert result is not None
        assert isinstance(result, MLSTProfile)
        assert result.clonal_complex == "ST-2 complex"
        assert result.sequence_type == "1"
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
    """Exact lookup on FDAARGOS_1560 yields exactly the seven expected alleles."""
    expected_variants = {
        "adk": "1",
        "atpG": "1",
        "frdB": "1",
        "fucK": "1",
        "mdh": "1",
        "pgi": "1",
        "recA": "5",
    }
    outstanding = {Allele(locus, variant, None) for locus, variant in expected_variants.items()}
    genome = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with BIGSdbMLSTProfiler(
        database_api="https://rest.pubmlst.org/",
        database_name="pubmlst_hinfluenzae_seqdef",
        schema_id=1,
    ) as profiler:
        async for allele in profiler.fetch_mlst_allele_variants(sequence_string=genome, exact=True):
            assert isinstance(allele, Allele)
            # remove() raises if an allele is unexpected or reported twice.
            outstanding.remove(allele)

        assert len(outstanding) == 0
 | 
			
		||||
 | 
			
		||||
async def test_pubmlst_profiling_results_in_correct_st():
    """The given H. influenzae allele set resolves to ST 3 in the ST-3 complex."""
    locus_variants = (
        ("adk", "1"),
        ("atpG", "1"),
        ("frdB", "1"),
        ("fucK", "1"),
        ("mdh", "1"),
        ("pgi", "1"),
        ("recA", "5"),
    )

    async def stream_alleles():
        # Async source of the alleles in the order listed above.
        for locus, variant in locus_variants:
            yield Allele(locus, variant, None)

    async with BIGSdbMLSTProfiler(
        database_api="https://rest.pubmlst.org/",
        database_name="pubmlst_hinfluenzae_seqdef",
        schema_id=1,
    ) as profiler:
        st_profile = await profiler.fetch_mlst_st(stream_alleles())
        assert st_profile is not None
        assert isinstance(st_profile, MLSTProfile)
        assert st_profile.clonal_complex == "ST-3 complex"
        assert st_profile.sequence_type == "3"
 | 
			
		||||
 | 
			
		||||
async def test_pubmlst_sequence_profiling_is_correct():
    """Profiling the FDAARGOS_1560 FASTA gives ST 3 in the ST-3 complex."""
    genome = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with BIGSdbMLSTProfiler(
        database_api="https://rest.pubmlst.org/",
        database_name="pubmlst_hinfluenzae_seqdef",
        schema_id=1,
    ) as profiler:
        result = await profiler.profile_string(genome)
        assert result is not None
        assert isinstance(result, MLSTProfile)
        assert result.clonal_complex == "ST-3 complex"
        assert result.sequence_type == "3"
 | 
			
		||||
 | 
			
		||||
async def test_bigsdb_index_all_databases_is_not_empty():
    """The index reports at least one known seqdef database."""
    async with BIGSdbIndex() as index:
        known_databases = await index.get_known_seqdef_dbs()
        assert len(known_databases) > 0
 | 
			
		||||
 | 
			
		||||
async def test_bigsdb_index_references_pubmlst_correctly():
    """The H. influenzae seqdef database resolves to the PubMLST REST URL."""
    async with BIGSdbIndex() as index:
        api_url = await index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")
        assert api_url == "https://rest.pubmlst.org"
 | 
			
		||||
 | 
			
		||||
async def test_bigsdb_index_references_institutpasteur_correctly():
    """The Bordetella seqdef database resolves to the Institut Pasteur API URL."""
    async with BIGSdbIndex() as index:
        api_url = await index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")
        assert api_url == "https://bigsdb.pasteur.fr/api"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
async def test_bigsdb_index_instantiates_correct_profiler():
    """A profiler built from the index types Tohama I as ST 1 in the ST-2 complex."""
    genome = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    # The second item is evaluated only after the index context has been entered,
    # exactly as with two nested `async with` statements.
    async with (
        BIGSdbIndex() as index,
        await index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler,
    ):
        result = await profiler.profile_string(genome)
        assert result.clonal_complex == "ST-2 complex"
        assert result.sequence_type == "1"
 | 
			
		||||
 | 
			
		||||
async def test_bigsdb_profile_multiple_strings_same_string_twice():
    """Profiling the same Tohama I sequence twice gives ST 1 / ST-2 complex both times."""
    genome = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    named_inputs = [NamedString(label, genome) for label in ("seq1", "seq2")]

    async def stream_inputs():
        # Async source of the named sequences.
        for named_input in named_inputs:
            yield named_input

    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as profiler:
        async for _name, profile in profiler.profile_multiple_strings(stream_inputs()):
            assert profile is not None
            assert isinstance(profile, MLSTProfile)
            assert profile.clonal_complex == "ST-2 complex"
            assert profile.sequence_type == "1"
 | 
			
		||||
 | 
			
		||||
async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop():
    """With ``True`` as the second argument, the scrambled entry yields ``None`` and the others still profile.

    The second argument is presumably the exact-match flag -- confirm against
    the profiler API.
    """
    genome = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    named_inputs = [
        NamedString("seq1", genome),
        NamedString("should_fail", gene_scrambler(genome, 0.3)),
        NamedString("seq3", genome),
    ]

    async def stream_inputs():
        # Async source of the named sequences.
        for named_input in named_inputs:
            yield named_input

    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as profiler:
        async for name, profile in profiler.profile_multiple_strings(stream_inputs(), True):
            if name != "should_fail":
                assert profile is not None
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == "ST-2 complex"
                assert profile.sequence_type == "1"
                continue

            assert profile is None
 | 
			
		||||
 | 
			
		||||
async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop():
    """With ``False`` as the second argument, the scrambled entry gets an "unknown" profile that still lists alleles.

    The second argument is presumably the exact-match flag -- confirm against
    the profiler API. The unscrambled entries must profile as ST 1 in the
    ST-2 complex.
    """
    genome = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    named_inputs = [
        NamedString("seq1", genome),
        NamedString("should_fail", gene_scrambler(genome, 0.3)),
        NamedString("seq3", genome),
    ]

    async def stream_inputs():
        # Async source of the named sequences.
        for named_input in named_inputs:
            yield named_input

    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as profiler:
        async for name, profile in profiler.profile_multiple_strings(stream_inputs(), False):
            # Both branches of the original start with this same check.
            assert profile is not None
            if name == "should_fail":
                assert profile.clonal_complex == "unknown"
                assert profile.sequence_type == "unknown"
                assert len(profile.alleles) > 0
                continue

            assert isinstance(profile, MLSTProfile)
            assert profile.clonal_complex == "ST-2 complex"
            assert profile.sequence_type == "1"
 | 
			
		||||
 | 
			
		||||
async def test_bigsdb_profile_multiple_strings_fail_second_stop():
    """With ``stop_on_fail=True``, the FDAARGOS_1560 entry makes iteration raise ``NoBIGSdbMatchesException``.

    The Tohama I entry before it must profile as ST 1 in the ST-2 complex;
    a result named ``"should_fail"`` must never be yielded.
    """
    matching_genome = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    foreign_genome = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    named_inputs = [
        NamedString("seq1", matching_genome),
        NamedString("should_fail", foreign_genome),
        NamedString("seq3", matching_genome),
    ]

    async def stream_inputs():
        # Async source of the named sequences.
        for named_input in named_inputs:
            yield named_input

    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as profiler:
        with pytest.raises(NoBIGSdbMatchesException):
            results = profiler.profile_multiple_strings(stream_inputs(), exact=True, stop_on_fail=True)
            async for name, profile in results:
                if name == "should_fail":
                    pytest.fail("Exception should have been thrown, no exception was thrown.")
                assert profile is not None
                assert isinstance(profile, MLSTProfile)
                assert profile.clonal_complex == "ST-2 complex"
                assert profile.sequence_type == "1"
 | 
			
		||||
 | 
			
		||||
async def test_bigsdb_index_get_schemas_for_bordetella():
    """The Bordetella database lists schemas, including an integer-valued ``"MLST"`` entry."""
    async with BIGSdbIndex() as bigsdb_index:
        available_schemas = await bigsdb_index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
        assert len(available_schemas.keys()) > 0
        assert "MLST" in available_schemas
        mlst_schema_id = available_schemas["MLST"]
        assert isinstance(mlst_schema_id, int)
 | 
			
		||||
 | 
			
		||||
async def test_bigsdb_index_get_databases_has_only_seqdef():
    """Every known database name ends in ``seqdef`` and Bordetella maps to the Pasteur API."""
    async with BIGSdbIndex() as bigsdb_index:
        known_databases = await bigsdb_index.get_known_seqdef_dbs()
        names = known_databases.keys()
        assert len(names) > 0
        assert all(name.endswith("seqdef") for name in names)
        assert known_databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
from autobigs.engine.data.local.fasta import read_fasta
 | 
			
		||||
from autobigs.engine.reading import read_fasta
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
async def test_fasta_reader_not_none():
 | 
			
		||||
							
								
								
									
										11
									
								
								tests/resources/fdaargos_1560_hinfluenza_adk.fasta
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								tests/resources/fdaargos_1560_hinfluenza_adk.fasta
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,11 @@
 | 
			
		||||
>lcl|CP085952.1_gene_371 [gene=adk] [locus_tag=LK401_01855] [location=complement(365128..365772)] [gbkey=Gene]
 | 
			
		||||
ATGAAAATTATTCTTTTAGGTGCACCGGGTGCAGGTAAAGGCACTCAAGCACAATTTATTATGAACAAAT
 | 
			
		||||
TTGGTATCCCGCAAATTTCAACTGGTGATATGTTCCGTGCTGCAATCAAAGCGGGGACTGAACTTGGCAA
 | 
			
		||||
ACAAGCTAAAGCATTAATGGATGAAGGTAAATTAGTGCCAGATGAATTAACCGTTGCCCTTGTAAAAGAT
 | 
			
		||||
CGTATTGCTCAAGCTGACTGCACAAATGGTTTCTTGTTAGATGGTTTCCCTCGTACTATTCCACAAGCGG
 | 
			
		||||
ATGCACTGAAAGATTCAGGTGTTAAAATTGACTTTGTTTTAGAATTTGATGTGCCAGACGAAGTGATTGT
 | 
			
		||||
TGAACGTATGAGTGGCCGTCGCGTACACCAAGCGTCTGGCCGTTCTTACCACATCGTTTATAATCCACCA
 | 
			
		||||
AAAGTGGAAGGTAAAGATGATGTAACAGGCGAAGATTTAATTATTCGTGCAGACGATAAACCAGAAACTG
 | 
			
		||||
TATTAGATCGTTTAGCCGTATATCATAAACAAACTAGCCCATTAATTGATTATTACCAAGCAGAAGCGAA
 | 
			
		||||
AGCGGGGAATACTCAATATTTCCGTTTAGACGGTACACAAAAAGTAGAAGAAGTTAGCCAAGAGTTAGAT
 | 
			
		||||
AAAATCTTAGGCTAA
 | 
			
		||||
							
								
								
									
										27246
									
								
								tests/resources/fdaargos_1560_hinfluenza_features.fasta
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27246
									
								
								tests/resources/fdaargos_1560_hinfluenza_features.fasta
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										11
									
								
								tests/resources/tohama_I_bpertussis_adk.fasta
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								tests/resources/tohama_I_bpertussis_adk.fasta
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,11 @@
 | 
			
		||||
>lcl|BX640419.1_cds_CAE43044.1_2724 [gene=adK] [locus_tag=BP2769] [db_xref=GOA:P0DKX8,InterPro:IPR000850,InterPro:IPR006259,InterPro:IPR007862,InterPro:IPR027417] [protein=adenylate kinase] [protein_id=CAE43044.1] [location=164032..164688] [gbkey=CDS]
 | 
			
		||||
ATGCGTCTCATTCTGCTCGGACCGCCCGGAGCCGGCAAAGGCACCCAAGCCGCCTTTCTCACCCAACACT
 | 
			
		||||
ACGGCATCCCGCAGATATCCACCGGTGACATGCTGCGCGCCGCCGTCAAGGCCGGCACGCCGCTGGGCCT
 | 
			
		||||
GGAAGCCAAGAAGGTCATGGACGCGGGCGGCCTGGTCTCGGACGACCTGATCATCGGCCTGGTGCGCGAT
 | 
			
		||||
CGCCTGACCCAGCCCGATTGCGCCAACGGCTACCTGTTCGACGGTTTCCCGCGCACCATCCCGCAGGCCG
 | 
			
		||||
ACGCGCTCAAGAGCGCCGGCATCGCGCTGGATTACGTGGTCGAGATCGAAGTGCCGGAAAGCGACATCAT
 | 
			
		||||
CGAACGCATGAGCGAACGCCGCGTGCACCCGGCCAGCGGCCGCAGCTACCACGTACGCTTCAATCCGCCC
 | 
			
		||||
AAGGCCGAAGGCGTGGACGACGTCACGGGCGAACCGCTGGTGCAGCGCGACGACGACCGCGAGGAAACCG
 | 
			
		||||
TGCGCCATCGTCTCAACGTCTACCAGAACCAGACCCGCCCGCTGGTCGACTACTACTCGTCCTGGGCCCA
 | 
			
		||||
GTCCGATGCCGCCGCGGCGCCCAAGTACCGCAAGATCTCCGGCGTCGGCTCGGTCGACGAAATCAAGAGC
 | 
			
		||||
CGCCTGTCGCAGGCTCTGCAGAGCTAA
 | 
			
		||||
		Reference in New Issue
	
	Block a user