Refactored code layout

2025-01-16 21:54:52 +00:00
parent bad7dfc3a8
commit a3c864b565
12 changed files with 9 additions and 30 deletions
--- a/src/automlst/engine/data/local/init.py
+++ b/src/automlst/engine/data/local/init.py
--- a/src/automlst/engine/data/local/abif.py
+++ b/src/automlst/engine/data/local/abif.py
@@ -0,0 +1,114 @@
+import asyncio
+from numbers import Number
+from os import path
+from typing import Any, AsyncGenerator, Collection, Iterable, Sequence, Union
+from automlst.engine.data.structures.genomics import NamedString, SangerTraceData
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO, Align
+
+
+def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
+    """Read the single record stored in an ABIF trace file with Biopython.
+
+    This is a blocking helper; ``read_abif`` runs it in a worker thread.
+
+    Args:
+        seq_path: Path of the trace file, opened in binary mode.
+
+    Returns:
+        The record produced by ``SeqIO.read`` using the ``"abi"`` format.
+    """
+    with open(seq_path, "rb") as abif_file:
+        record = SeqIO.read(abif_file, "abi")
+    return record
+
+
+async def read_abif(seq_path: str) -> SangerTraceData:
+    """Load an ABIF Sanger trace file into a ``SangerTraceData`` record.
+
+    The blocking Biopython read is delegated to a worker thread through
+    ``asyncio.to_thread``.
+
+    Args:
+        seq_path: Path to the trace file. The extension must be ``.ab1`` or
+            ``.abi`` (compared case-insensitively).
+
+    Returns:
+        A ``SangerTraceData`` built from the file's base name, the called
+        sequence and the raw ABIF tag values.
+
+    Raises:
+        ValueError: If ``seq_path`` has any other extension.
+    """
+    # ``splitext`` keeps the leading dot, so both accepted suffixes need it.
+    ext = path.splitext(seq_path)[1].lower()
+    if ext not in (".ab1", ".abi"):
+        raise ValueError(
+            'seq_path must have file extension of "ab1", or "abi".')
+    biopython_seq = await asyncio.to_thread(_biopython_read_abif_sequence, seq_path)
+
+    # Type ignoring since Biopython did not define their typing.
+    abif_raw = biopython_seq.annotations["abif_raw"]  # type: ignore
+
+    # Raw ABIF tags, listed in the positional order in which they are passed
+    # to SangerTraceData. A tag missing from the file is passed as None.
+    # NOTE(review): "Satd", "SMLt", "TUBE" and "User" carry no numeric suffix,
+    # unlike every other key here -- confirm they match Biopython's key names.
+    abif_keys = (
+        "APFN2", "APrN1", "APrV1", "APrX1", "APXV1",
+        "CMNT1", "CpEP1", "CTID1", "CTNM1", "CTTL1",
+        "DATA1", "DATA2", "DATA3", "DATA4",
+        "DATA5", "DATA6", "DATA7", "DATA8",
+        "DSam1",
+        "DyeN1", "DyeN2", "DyeN3", "DyeN4",
+        "DyeW1", "DyeW2", "DyeW3", "DyeW4",
+        "DySN1", "EPVt1",
+        "EVNT1", "EVNT2", "EVNT3", "EVNT4",
+        "FWO_1", "GTyp1", "InSc1", "InVt1",
+        "LANE1", "LIMS1", "LNTD1", "LsrP1",
+        "MCHN1", "MODF1", "MODL1",
+        "NAVG1", "NLNE1", "OfSc1", "PDMF1", "PXLB1",
+        "RGCm1", "RGNm1", "RMdV1", "RMdX1", "RMXV1",
+        "RPrN1", "RPrV1",
+        "RUND1", "RUND2", "RUND3", "RUND4",
+        "RunN1",
+        "RUNT1", "RUNT2", "RUNT3", "RUNT4",
+        "Satd", "Scal1", "SCAN1", "SMED1", "SMLt", "SMPL1",
+        "SVER1", "SVER3", "Tmpr1", "TUBE", "User",
+    )
+    return SangerTraceData(
+        path.basename(seq_path),
+        biopython_seq.seq,
+        *(abif_raw.get(key) for key in abif_keys),  # type: ignore
+    )
+
+
+def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedString) -> tuple[NamedString, NamedString]:
+    """Locally align ``query`` against ``reference`` (unfinished).
+
+    A ``PairwiseAligner`` with "blastn" scoring is switched to local mode,
+    its alignments are sorted and the first one is selected. Consensus
+    assembly is not written yet, so the function never returns a value.
+
+    Args:
+        reference: Named sequence to align against.
+        query: Named sequence being aligned.
+
+    Raises:
+        NotImplementedError: Once the alignment step has completed.
+    """
+    local_aligner = Align.PairwiseAligner(scoring="blastn")
+    local_aligner.mode = "local"
+    candidate_alignments = sorted(
+        local_aligner.align(reference.sequence, query.sequence))
+    # take the best alignment
+    best_alignment = candidate_alignments[0]
+    # TODO actually assemble the consensus sequence here
+    raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
--- a/src/automlst/engine/data/local/csv.py
+++ b/src/automlst/engine/data/local/csv.py
@@ -0,0 +1,41 @@
+import csv
+from io import TextIOWrapper
+from os import PathLike
+from typing import AsyncIterable, Iterable, Mapping, Sequence, Union
+
+from automlst.engine.data.structures.mlst import Allele, MLSTProfile
+
+
+def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
+    """Map each locus name to its allele variant(s).
+
+    Args:
+        alleles_map: Locus name mapped to the alleles found for that locus.
+
+    Returns:
+        A dict keyed by locus name. A locus with exactly one allele maps to
+        that allele's variant; a locus with several alleles maps to the list
+        of all their variants, in input order. A locus with no alleles is
+        left out.
+    """
+    result_dict: dict[str, Union[list[str], str]] = {}
+    for loci, alleles in alleles_map.items():
+        variants = [allele.allele_variant for allele in alleles]
+        if len(variants) == 1:
+            # A lone variant stays a bare value instead of a one-item list.
+            result_dict[loci] = variants[0]
+        elif variants:
+            result_dict[loci] = variants
+    return result_dict
+
+
+async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
+    """Write MLST profiles to a CSV file, one row per profiled sample.
+
+    The columns are ``id``, ``st``, ``clonal-complex`` followed by the loci
+    of the first non-``None`` profile; the header is written when that
+    profile arrives.
+
+    Args:
+        mlst_profiles_iterable: Async stream of ``(name, profile)`` pairs; a
+            ``None`` profile marks a sample that could not be profiled.
+        handle: Path of the file to (over)write.
+
+    Returns:
+        The names whose profile was ``None``, in the order encountered.
+    """
+    unprofiled_names = list()
+    csv_writer: Union[csv.DictWriter, None] = None
+    with open(handle, "w", newline='') as csv_file:
+        async for name, mlst_profile in mlst_profiles_iterable:
+            if mlst_profile is None:
+                unprofiled_names.append(name)
+                continue
+            if csv_writer is None:
+                # The first successful profile fixes the column layout.
+                column_names = ["id", "st", "clonal-complex"]
+                column_names.extend(mlst_profile.alleles.keys())
+                csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
+                csv_writer.writeheader()
+            row = {
+                "st": mlst_profile.sequence_type,
+                "clonal-complex": mlst_profile.clonal_complex,
+                "id": name,
+            }
+            # Locus columns are merged last, as in a trailing ** expansion.
+            row.update(dict_loci_alleles_variants_from_loci(mlst_profile.alleles))
+            csv_writer.writerow(row)
+    return unprofiled_names
--- a/src/automlst/engine/data/local/fasta.py
+++ b/src/automlst/engine/data/local/fasta.py
@@ -0,0 +1,16 @@
+import asyncio
+from io import TextIOWrapper
+from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
+from Bio import SeqIO
+
+from automlst.engine.data.structures.genomics import NamedString
+
+async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
+    """Asynchronously yield every record of a FASTA source as a ``NamedString``.
+
+    ``SeqIO.parse`` returns an iterator that reads records lazily, so both
+    creating the iterator and advancing it are done in a worker thread. This
+    keeps file reading and parsing off the event loop while still streaming
+    one record at a time.
+
+    Args:
+        handle: Path or open text handle of the FASTA data.
+
+    Yields:
+        ``NamedString(record.id, str(record.seq))`` for each record, in file
+        order.
+    """
+    records = await asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
+    # A private sentinel marks exhaustion, so StopIteration never has to
+    # cross the thread/future boundary.
+    exhausted = object()
+    while True:
+        record = await asyncio.to_thread(next, records, exhausted)
+        if record is exhausted:
+            return
+        yield NamedString(record.id, str(record.seq))
+
+async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
+    """Yield the records of several FASTA sources, one source after another.
+
+    Args:
+        handles: Paths or open text handles, each passed to ``read_fasta``.
+
+    Yields:
+        Every ``NamedString`` produced by ``read_fasta`` for each handle, in
+        the order the handles are given.
+    """
+    for fasta_source in handles:
+        source_records = read_fasta(fasta_source)
+        async for record in source_records:
+            yield record
--- a/src/automlst/engine/data/remote/init.py
+++ b/src/automlst/engine/data/remote/init.py
--- a/src/automlst/engine/data/remote/databases/bigsdb.py
+++ b/src/automlst/engine/data/remote/databases/bigsdb.py
@@ -0,0 +1,166 @@
+from collections import defaultdict
+from contextlib import AbstractAsyncContextManager
+from numbers import Number
+from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
+
+from aiohttp import ClientSession, ClientTimeout
+
+from automlst.engine.data.structures.genomics import NamedString
+from automlst.engine.data.structures.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
+from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
+
+class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
+    """Client for one MLST scheme of a BIGSdb sequence definition database.
+
+    The instance owns an aiohttp session whose base URL points at the scheme.
+    Use it as an async context manager, or call ``close()`` when done, so the
+    session is closed.
+    """
+
+    def __init__(self, database_api: str, database_name: str, schema_id: int):
+        """Create the HTTP session for the given scheme.
+
+        Args:
+            database_api: Root URL of the BIGSdb REST API.
+            database_name: Name of the database on that API.
+            schema_id: Numeric identifier of the scheme to query.
+        """
+        super().__init__()
+        self._database_name = database_name
+        self._schema_id = schema_id
+        self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
+        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
+
+    async def __aenter__(self):
+        return self
+
+    async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
+        """Query the scheme for the alleles present in a sequence.
+
+        Args:
+            sequence_string: Sequence to search for alleles.
+            exact: When true, partial matches are not requested and receiving
+                only partial matches is an error.
+
+        Yields:
+            One ``Allele`` per exact match; when the response holds no exact
+            matches, one ``Allele`` carrying a ``PartialAllelicMatchProfile``
+            per locus with a non-empty partial match.
+
+        Raises:
+            NoBIGSdbExactMatchesException: ``exact`` is true but the response
+                only contains partial matches.
+            NoBIGSdbMatchesException: The response contains neither kind of
+                match.
+        """
+        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
+        uri_path = "sequence"
+        request_json = {
+            "sequence": sequence_string,
+            "partial_matches": not exact
+        }
+        # Read the body inside the context manager so the response is
+        # released before any allele is yielded to the consumer.
+        async with self._http_client.post(uri_path, json=request_json) as response:
+            sequence_response: dict = await response.json()
+
+        if "exact_matches" in sequence_response:
+            # loci -> list of alleles with id and loci
+            exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
+            for allele_loci, alleles in exact_matches.items():
+                for allele in alleles:
+                    yield Allele(
+                        allele_loci=allele_loci,
+                        allele_variant=allele["allele_id"],
+                        partial_match_profile=None
+                    )
+        elif "partial_matches" in sequence_response:
+            if exact:
+                raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
+            partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
+            for allele_loci, partial_match in partial_matches.items():
+                if not partial_match:
+                    # Nothing was reported for this locus.
+                    continue
+                partial_match_profile = PartialAllelicMatchProfile(
+                    percent_identity=float(partial_match["identity"]),
+                    mismatches=int(partial_match["mismatches"]),
+                    bitscore=float(partial_match["bitscore"]),
+                    gaps=int(partial_match["gaps"])
+                )
+                yield Allele(
+                    allele_loci=allele_loci,
+                    allele_variant=str(partial_match["allele"]),
+                    partial_match_profile=partial_match_profile
+                )
+        else:
+            raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
+
+    async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
+        """Resolve a set of alleles to an MLST profile.
+
+        Args:
+            alleles: Sync or async iterable of the alleles to designate.
+
+        Returns:
+            An ``MLSTProfile`` holding the exact-match alleles grouped by
+            locus, plus the ``ST`` and ``clonal_complex`` fields of the
+            response (``"unknown"`` when a field is absent).
+
+        Raises:
+            ValueError: The response contains no exact-match alleles.
+        """
+        uri_path = "designations"
+        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
+        if isinstance(alleles, AsyncIterable):
+            async for allele in alleles:
+                allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
+        else:
+            for allele in alleles:
+                allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
+        request_json = {
+            "designations": allele_request_dict
+        }
+        async with self._http_client.post(uri_path, json=request_json) as response:
+            response_json: dict = await response.json()
+
+        schema_fields_returned: dict[str, str] = response_json.get("fields", {})
+        sequence_type = schema_fields_returned.get("ST", "unknown")
+        clonal_complex = schema_fields_returned.get("clonal_complex", "unknown")
+        # A response without "exact_matches" is handled like an empty one, so
+        # the ValueError below is raised rather than a bare KeyError.
+        schema_exact_matches: dict = response_json.get("exact_matches", {})
+        allele_map: dict[str, list[Allele]] = defaultdict(list)
+        for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
+            for exact_match_allele in exact_match_alleles:
+                allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"], None))
+        if not allele_map:
+            raise ValueError("Passed in no alleles.")
+        return MLSTProfile(dict(allele_map), sequence_type, clonal_complex)
+
+    async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
+        """Profile one sequence: find its alleles, then resolve them to a profile.
+
+        Args:
+            string: Sequence to profile.
+            exact: Passed to ``fetch_mlst_allele_variants``.
+
+        Returns:
+            The ``MLSTProfile`` produced by ``fetch_mlst_st``.
+        """
+        alleles = self.fetch_mlst_allele_variants(string, exact)
+        return await self.fetch_mlst_st(alleles)
+
+    async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
+        """Profile a stream of named sequences.
+
+        Args:
+            namedStrings: Async stream of sequences to profile.
+            exact: Passed to ``profile_string``.
+            stop_on_fail: When true, a ``NoBIGSdbMatchesException`` is
+                re-raised instead of being reported as ``None``.
+
+        Yields:
+            ``(name, profile)`` for each input; ``profile`` is ``None`` when
+            no match was found and ``stop_on_fail`` is false.
+        """
+        async for named_string in namedStrings:
+            # Only the profiling call is guarded; the yield stays outside so
+            # exceptions from the consumer are never mistaken for a failure.
+            try:
+                profile = await self.profile_string(named_string.sequence, exact)
+            except NoBIGSdbMatchesException:
+                if stop_on_fail:
+                    raise
+                profile = None
+            yield (named_string.name, profile)
+
+    async def close(self):
+        """Close the underlying HTTP session."""
+        await self._http_client.close()
+
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        await self.close()
+
+class BIGSdbIndex(AbstractAsyncContextManager):
+    """Directory of sequence definition databases on the known BIGSdb APIs.
+
+    Database origins and per-database scheme listings are fetched on demand
+    and cached on the instance. Use it as an async context manager, or call
+    ``close()``, so the HTTP session is closed.
+    """
+    KNOWN_BIGSDB_APIS = {
+        "https://bigsdb.pasteur.fr/api",
+        "https://rest.pubmlst.org"
+    }
+
+    def __init__(self):
+        super().__init__()
+        self._http_client = ClientSession()
+        # Database name -> API root it was listed on; None until first fetched.
+        self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
+        # Database name -> (scheme description -> scheme id).
+        self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
+
+    async def __aenter__(self):
+        return self
+
+    async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
+        """Return the API root for every database whose name ends in "seqdef".
+
+        Args:
+            force: Query the APIs again even if a cached result exists.
+
+        Returns:
+            Mapping of database name to the API root it was listed on.
+        """
+        cached_origins = self._known_seqdef_dbs_origin
+        if not force and cached_origins is not None:
+            return cached_origins
+        origin_by_db: dict[str, str] = {}
+        for api_root in BIGSdbIndex.KNOWN_BIGSDB_APIS:
+            async with self._http_client.get(f"{api_root}/db") as response:
+                database_groups = await response.json()
+                for database_group in database_groups:
+                    for database_info in database_group["databases"]:
+                        db_name = database_info["name"]
+                        if str(db_name).endswith("seqdef"):
+                            origin_by_db[db_name] = api_root
+        self._known_seqdef_dbs_origin = origin_by_db
+        return origin_by_db
+
+    async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
+        """Return the API root that lists ``seqdef_db_name``.
+
+        Raises:
+            NoSuchBIGSdbDatabaseException: The name is not among the known
+                databases.
+        """
+        origins = await self.get_known_seqdef_dbs()
+        if seqdef_db_name in origins:
+            return origins[seqdef_db_name]
+        raise NoSuchBIGSdbDatabaseException(seqdef_db_name)
+
+    async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
+        """Return the schemes of a database as description -> scheme id.
+
+        Args:
+            seqdef_db_name: Database whose schemes are wanted.
+            force: Query the API again even if a cached listing exists.
+
+        Returns:
+            Mapping of scheme description to the integer taken from the last
+            path segment of the scheme's URL.
+        """
+        if not force and seqdef_db_name in self._seqdefdb_schemas:
+            return self._seqdefdb_schemas[seqdef_db_name]  # type: ignore
+        api_root = await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)
+        uri_path = f"{api_root}/db/{seqdef_db_name}/schemes"
+        async with self._http_client.get(uri_path) as response:
+            response_json = await response.json()
+            ids_by_description: dict[str, int] = {}
+            for scheme_definition in response_json["schemes"]:
+                # The id is the trailing segment of the scheme URL.
+                scheme_url = str(scheme_definition["scheme"])
+                scheme_id = int(scheme_url.split("/")[-1])
+                ids_by_description[scheme_definition["description"]] = scheme_id
+            self._seqdefdb_schemas[seqdef_db_name] = ids_by_description
+            return ids_by_description
+
+    async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
+        """Create a ``BIGSdbMLSTProfiler`` for a scheme of a known database."""
+        api_root = await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name)
+        return BIGSdbMLSTProfiler(api_root, dbseqdef_name, schema_id)
+
+    async def close(self):
+        """Close the underlying HTTP session."""
+        await self._http_client.close()
+
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        await self.close()
+    
--- a/src/automlst/engine/data/structures/init.py
+++ b/src/automlst/engine/data/structures/init.py
--- a/src/automlst/engine/data/structures/genomics.py
+++ b/src/automlst/engine/data/structures/genomics.py
--- a/src/automlst/engine/data/structures/mlst.py
+++ b/src/automlst/engine/data/structures/mlst.py