Completed NCBI lookup, FASTA reading, ABIF reading, and Institut Pasteur querying

2025-01-03 16:41:51 +00:00 · 2025-01-03 16:41:51 +00:00 · 362e0867e5
commit 362e0867e5
parent b834aa93b0
22 changed files with 339 additions and 56 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,5 @@
 aiohttp[speedups]
 biopython
 pytest
-pytest-asyncio
+pytest-asyncio
 dataclasses-json
--- a/src/mlstmyfasta/engine/annotations.py
+++ b/src/mlstmyfasta/engine/annotations.py
@ -6,30 +6,8 @@ from Bio import Entrez
 from Bio import SeqIO
 import numpy as np
-
+from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation
-# TODO Change this out for a more professional approach
+from mlstmyfasta.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
 Entrez.email = "yunyangdeng@outlook.com"
 from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding
 async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
        record = SeqIO.read(fetch_stream, "genbank")
        sequence_features = list()
        for feature in record.features:
            start = int(feature.location.start)
            end = int(feature.location.end)
            qualifiers = feature.qualifiers
            for qualifier_key in qualifiers:
                qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
            sequence_features.append(StringAnnotation(
                type=feature.type,
                start=start,
                end=end+1,  # Position is exclusive
                feature_properties=qualifiers
            ))
        return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
 async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length:int = 512, gene_targets:Set = set()):
@ -60,4 +38,7 @@ async def annotate_from_genbank(genbank_id: str, query_name: str, query_string:
            end=np.max(top_alignment.aligned[0]), # and the end of the last chunk
            feature_properties=dict(annotation.feature_properties) # same as original
        ))
-    return AnnotatedString(name=query_name, sequence=query_string, annotations=query_annotations)
+    return AnnotatedString(name=query_name, sequence=query_string, annotations=query_annotations)
 def get_feature_coding(annotated_string: AnnotatedString, string_annotation: StringAnnotation) -> str:
    return annotated_string.sequence[string_annotation.start:string_annotation.end]
--- a/src/mlstmyfasta/engine/data/MLST.py
+++ b/src/mlstmyfasta/engine/data/MLST.py
@ -2,5 +2,5 @@ from dataclasses import dataclass
@dataclass
 class Allele:
-    allele_name: str
+    allele_loci: str
    allele_variant: str
--- a/src/mlstmyfasta/engine/data/genomics.py
+++ b/src/mlstmyfasta/engine/data/genomics.py
@ -1,5 +1,6 @@
 from dataclasses import dataclass
-from typing import Mapping, Sequence, Set
+from numbers import Number
 from typing import Mapping, Sequence, Set, Union
@dataclass
@ -8,12 +9,97 @@ class StringAnnotation:
    start: int
    end: int
    feature_properties: Mapping[str, Set[str]]
-    
+
@dataclass
-class AnnotatedString:
+class NamedString:
    name: str
    sequence: str
@dataclass
 class AnnotatedString(NamedString):
    annotations: Sequence[StringAnnotation]
-def get_feature_coding(annotated_string: AnnotatedString, string_annotation: StringAnnotation) -> str:
+@dataclass
-    return annotated_string.sequence[string_annotation.start:string_annotation.end]
+class SangerTraceData:
    sequence: Sequence[str]
    seq_param_file_name: str
    analysis_proto_settings_name: str
    analysis_rpto_settings_ver: str
    analysis_proto_xml_data: str
    analysis_proto_xml_schema_ver: str
    sample_comment: Union[None, str]
    capillary_machine: bool
    container_identifier: str
    container_name: str
    comment_title: str
    channel_1: Sequence[Number]
    channel_2: Sequence[Number]
    channel_3: Sequence[Number]
    channel_4: Sequence[Number]
    measured_voltage_dv: Sequence[Number]
    measured_current_ma: Sequence[Number]
    measured_power_mw: Sequence[Number]
    measured_temperature_celsius: Sequence[Number]
    down_sample_factor: Number
    dye_1: str
    dye_2: str
    dye_3: str
    dye_4: str
    dye_wavelength_1: str
    dye_wavelength_2: str
    dye_wavelength_3: str
    dye_wavelength_4: str
    dye_set_name: str
    electrophoresis_voltage_setting_v: Number
    start_run_event: str
    stop_run_event: str
    start_collection_event: str
    stop_collection_event: str
    base_order: Sequence[str]
    gel_type_desc: str
    injection_time_sec: Number
    inection_voltage_v: Number
    lane_or_capillary: Number
    sample_tracking_id: str
    length_to_detector_cm: Number
    laser_power_mw: Number
    instrument_name_and_serial: str
    data_collection_module_file: str
    model_number: str
    pixels_avg_per_lane: Number
    number_of_capillaries: Number
    marked_off_scale_scans: Union[None, Sequence[Number]]
    # Skipped Ovrl, OvrV
    mobility_file: str
    # Skipped PRJT, PROJ
    pixel_bin_size: Number
    # Skipped scan rate
    results_group_comment: Union[None, str]
    results_group_name: str
    run_module_ver: str
    run_module_xml: str
    run_module_xml_ver: str
    run_proto_name: str
    run_proto_ver: str
    run_start_date: str  # Date time object
    run_stop_date: str  # Date time object
    data_collection_start_date: str
    data_collection_stop_date: str
    run_name: str
    run_start_time: str  # time object
    run_stop_time: str  # time object
    collection_start_time: str  # time object
    collection_stop_time: str  # time object
    saturated_data_points: Union[None, Sequence[Number]]
    color_rescaling_divisor: Number
    scan_count: Number
    polymer_lot_expiration: str  # date time object
    polymer_lot_number: Number
    sample_name: str
    # Skipped genescan data
    # Skipped size standard file name
    data_collection_software_ver: str
    data_collection_firmware_ver: str
    run_temperature_setting_celcius: Number
    well_id: str
    plate_user_name: str
--- a/src/mlstmyfasta/engine/local/abif.py
+++ b/src/mlstmyfasta/engine/local/abif.py
@ -0,0 +1,104 @@
 import asyncio
 from numbers import Number
 from os import path
 from typing import Sequence, Union
 from mlstmyfasta.engine.data.genomics import SangerTraceData
 from Bio.SeqRecord import SeqRecord
 from Bio import SeqIO
 def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
    with open(seq_path, "rb") as seq_handle:
        return SeqIO.read(seq_handle, "abi")
 async def load_sanger_sequence(seq_path: str) -> SangerTraceData:
    ext = path.splitext(seq_path)[1]
    if ext.lower() != ".ab1" and ext.lower() != "abi":
        raise ValueError(
            'seq_path must have file extension of "ab1", or "abi".')
    biopython_seq = await asyncio.to_thread(_biopython_read_abif_sequence, seq_path)
    biopython_annotations = biopython_seq.annotations
    # Lot of type ignoring since Biopython did not define their typing.
    biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore
    trace_data = SangerTraceData(
        biopython_seq.seq,
        biopython_abif_raw.get("APFN2"), # type: ignore
        biopython_abif_raw.get("APrN1"), # type: ignore
        biopython_abif_raw.get("APrV1"), # type: ignore
        biopython_abif_raw.get("APrX1"), # type: ignore
        biopython_abif_raw.get("APXV1"), # type: ignore
        biopython_abif_raw.get("CMNT1"), # type: ignore
        biopython_abif_raw.get("CpEP1"), # type: ignore
        biopython_abif_raw.get("CTID1"), # type: ignore
        biopython_abif_raw.get("CTNM1"), # type: ignore
        biopython_abif_raw.get("CTTL1"), # type: ignore
        biopython_abif_raw.get("DATA1"), # type: ignore
        biopython_abif_raw.get("DATA2"), # type: ignore
        biopython_abif_raw.get("DATA3"), # type: ignore
        biopython_abif_raw.get("DATA4"), # type: ignore
        biopython_abif_raw.get("DATA5"), # type: ignore
        biopython_abif_raw.get("DATA6"), # type: ignore
        biopython_abif_raw.get("DATA7"), # type: ignore
        biopython_abif_raw.get("DATA8"), # type: ignore
        biopython_abif_raw.get("DSam1"), # type: ignore
        biopython_abif_raw.get("DyeN1"), # type: ignore
        biopython_abif_raw.get("DyeN2"), # type: ignore
        biopython_abif_raw.get("DyeN3"), # type: ignore
        biopython_abif_raw.get("DyeN4"), # type: ignore
        biopython_abif_raw.get("DyeW1"), # type: ignore
        biopython_abif_raw.get("DyeW2"), # type: ignore
        biopython_abif_raw.get("DyeW3"), # type: ignore
        biopython_abif_raw.get("DyeW4"), # type: ignore
        biopython_abif_raw.get("DySN1"), # type: ignore
        biopython_abif_raw.get("EPVt1"), # type: ignore
        biopython_abif_raw.get("EVNT1"), # type: ignore
        biopython_abif_raw.get("EVNT2"), # type: ignore
        biopython_abif_raw.get("EVNT3"), # type: ignore
        biopython_abif_raw.get("EVNT4"), # type: ignore
        biopython_abif_raw.get("FWO_1"), # type: ignore
        biopython_abif_raw.get("GTyp1"), # type: ignore
        biopython_abif_raw.get("InSc1"), # type: ignore
        biopython_abif_raw.get("InVt1"), # type: ignore
        biopython_abif_raw.get("LANE1"), # type: ignore
        biopython_abif_raw.get("LIMS1"), # type: ignore
        biopython_abif_raw.get("LNTD1"), # type: ignore
        biopython_abif_raw.get("LsrP1"), # type: ignore
        biopython_abif_raw.get("MCHN1"), # type: ignore
        biopython_abif_raw.get("MODF1"), # type: ignore
        biopython_abif_raw.get("MODL1"), # type: ignore
        biopython_abif_raw.get("NAVG1"), # type: ignore
        biopython_abif_raw.get("NLNE1"), # type: ignore
        biopython_abif_raw.get("OfSc1"), # type: ignore
        biopython_abif_raw.get("PDMF1"), # type: ignore
        biopython_abif_raw.get("PXLB1"), # type: ignore
        biopython_abif_raw.get("RGCm1"), # type: ignore
        biopython_abif_raw.get("RGNm1"), # type: ignore
        biopython_abif_raw.get("RMdV1"), # type: ignore
        biopython_abif_raw.get("RMdX1"), # type: ignore
        biopython_abif_raw.get("RMXV1"), # type: ignore
        biopython_abif_raw.get("RPrN1"), # type: ignore
        biopython_abif_raw.get("RPrV1"), # type: ignore
        biopython_abif_raw.get("RUND1"), # type: ignore
        biopython_abif_raw.get("RUND2"), # type: ignore
        biopython_abif_raw.get("RUND3"), # type: ignore
        biopython_abif_raw.get("RUND4"), # type: ignore
        biopython_abif_raw.get("RunN1"), # type: ignore
        biopython_abif_raw.get("RUNT1"), # type: ignore
        biopython_abif_raw.get("RUNT2"), # type: ignore
        biopython_abif_raw.get("RUNT3"), # type: ignore
        biopython_abif_raw.get("RUNT4"), # type: ignore
        biopython_abif_raw.get("Satd"), # type: ignore
        biopython_abif_raw.get("Scal1"), # type: ignore
        biopython_abif_raw.get("SCAN1"), # type: ignore
        biopython_abif_raw.get("SMED1"), # type: ignore
        biopython_abif_raw.get("SMLt"), # type: ignore
        biopython_abif_raw.get("SMPL1"), # type: ignore
        biopython_abif_raw.get("SVER1"), # type: ignore
        biopython_abif_raw.get("SVER3"), # type: ignore
        biopython_abif_raw.get("Tmpr1"), # type: ignore
        biopython_abif_raw.get("TUBE"), # type: ignore
        biopython_abif_raw.get("User") # type: ignore
    )
    return trace_data
--- a/src/mlstmyfasta/engine/local/fasta.py
+++ b/src/mlstmyfasta/engine/local/fasta.py
@ -0,0 +1,11 @@
 import asyncio
 from io import TextIOWrapper
 from typing import Any, AsyncGenerator, Generator, Sequence, Union
 from Bio import SeqIO
 from mlstmyfasta.engine.data.genomics import NamedString
 async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
    fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
    for fasta_sequence in await fasta_sequences:
        yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
--- a/src/mlstmyfasta/engine/remote/databases/institutpasteur/init.py
+++ b/src/mlstmyfasta/engine/remote/databases/institutpasteur/init.py
--- a/src/mlstmyfasta/engine/remote/databases/institutpasteur/profiling.py
+++ b/src/mlstmyfasta/engine/remote/databases/institutpasteur/profiling.py
@ -0,0 +1,36 @@
 from contextlib import AbstractAsyncContextManager
 import re
 from typing import Sequence
 from aiohttp import ClientSession, ClientTimeout
 from mlstmyfasta.engine.data.MLST import Allele
 from mlstmyfasta.engine.data.genomics import NamedString
 from mlstmyfasta.engine.remote.databases.institutpasteur.structures import InstitutPasteurSequenceRequest, InstitutPasteurSequenceResponse
 class InstitutPasteurProfiler(AbstractAsyncContextManager):
    async def __aenter__(self):
        return self
    def __init__(self, database_name: str):
        self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
    async def fetch_mlst_profile(self, sequence_string: str):
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        uri_path = f"schemes/3/sequence"
        response = await self._http_client.post(uri_path, json={
            "sequence": sequence_string
        })
        sequence_response: dict = await response.json()
        exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]  
        for allele_loci, alleles in exact_matches.items():
            for allele in alleles:
                alelle_id = allele["allele_id"]
                yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
    async def close(self):
        await self._http_client.close()
    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
--- a/src/mlstmyfasta/engine/remote/databases/institutpasteur/structures.py
+++ b/src/mlstmyfasta/engine/remote/databases/institutpasteur/structures.py
@ -0,0 +1,22 @@
 from dataclasses import dataclass
 from typing import Sequence, Union
 from dataclasses_json import dataclass_json
@dataclass_json
@dataclass
 class InstitutPasteurSequenceRequest:
    sequence: str
    details: Union[bool, None]
    partial_matches: Union[bool, None]
    base64: Union[bool, None]
@dataclass_json
@dataclass
 class InstitutPasteurSequenceResponse:
    exact_matches: Sequence[tuple[int, str]]
    start: Union[int, None]
    end: Union[int, None]
    orientation: Union[str, None]
    length: Union[int, None]
    contig: Union[str, None]
--- a/tests/mlstmyfasta/engine/external/databases/institutpasteur/test_access.py
+++ b/tests/mlstmyfasta/engine/external/databases/institutpasteur/test_access.py
--- a/src/mlstmyfasta/engine/remote/databases/ncbi/genbank.py
+++ b/src/mlstmyfasta/engine/remote/databases/ncbi/genbank.py
@ -0,0 +1,27 @@
 import asyncio
 from Bio import Entrez
 from Bio import SeqIO
 # TODO Change this out for a more professional approach
 Entrez.email = "yunyangdeng@outlook.com"
 from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation
 async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
        record = SeqIO.read(fetch_stream, "genbank")
        sequence_features = list()
        for feature in record.features:
            start = int(feature.location.start)
            end = int(feature.location.end)
            qualifiers = feature.qualifiers
            for qualifier_key in qualifiers:
                qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
            sequence_features.append(StringAnnotation(
                type=feature.type,
                start=start,
                end=end+1,  # Position is exclusive
                feature_properties=qualifiers
            ))
        return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
--- a/src/mlstmyfasta/engine/web/databases/institutpasteur/structures.py
+++ b/src/mlstmyfasta/engine/web/databases/institutpasteur/structures.py
@ -1,18 +0,0 @@
 from dataclasses import dataclass
 from typing import Sequence, Union
@dataclass
 class SequenceRequestBody:
    sequence: str
    details: Union[None, str]
    partial_matches: Union[None, bool]
    base64: Union[None, bool]
@dataclass
 class SequenceResponse:
    exact_matches: Sequence[tuple[str, str]]
    start: Union[None, int]
    end: Union[None, int]
    orientation: Union[None, str]
    length: Union[None, int]
    contig: Union[None, str]
--- a/tests/mlstmyfasta/engine/local/test_abif.py
+++ b/tests/mlstmyfasta/engine/local/test_abif.py
@ -0,0 +1,8 @@
 import os
 from mlstmyfasta.engine.local.abif import load_sanger_sequence
 async def test_load_sanger_sequence_has_data():
    assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1")
    result_data = await load_sanger_sequence("tests/resources/1I1_F_P1815443_047.ab1")
    assert result_data is not None
--- a/tests/mlstmyfasta/engine/local/test_fasta.py
+++ b/tests/mlstmyfasta/engine/local/test_fasta.py
@ -0,0 +1,7 @@
 from mlstmyfasta.engine.local.fasta import read_fasta
 async def test_fasta_reader_not_none():
    named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
    async for named_string in named_strings:
        assert named_string.name == "BX470248.1"
--- a/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_access.py
+++ b/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_access.py
--- a/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiles.py
+++ b/tests/mlstmyfasta/engine/remote/databases/institutpasteur/test_profiles.py
@ -0,0 +1,16 @@
 from Bio import SeqIO
 from mlstmyfasta.engine.data.MLST import Allele
 from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
 async def test_profiling_results_in_exact_matches_when_exact():
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
        exact_matches = dummy_profiler.fetch_mlst_profile(sequence_string=sequence)
        targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
        async for exact_match in exact_matches:
            assert isinstance(exact_match, Allele)
            assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
            targets_left.remove(exact_match.allele_loci)
        assert len(targets_left) == 0
--- a/tests/mlstmyfasta/engine/remote/databases/ncbi/test_genbank.py
+++ b/tests/mlstmyfasta/engine/remote/databases/ncbi/test_genbank.py
@ -0,0 +1,5 @@
 from mlstmyfasta.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
 async def test_fetch_ncbi_genbank_with_id_works():
    assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0 
--- a/tests/mlstmyfasta/engine/test_annotations.py
+++ b/tests/mlstmyfasta/engine/test_annotations.py
@ -1,14 +1,11 @@
-from mlstmyfasta.engine.annotations import annotate_from_genbank, fetch_ncbi_genbank
+from mlstmyfasta.engine.annotate import annotate_from_genbank, fetch_ncbi_genbank
 from Bio import SeqIO
 from mlstmyfasta.engine.data.genomics import AnnotatedString
 async def test_fetch_ncbi_genbank_with_id_works():
    assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0 
 async def test_annotate_from_genbank_for_adk_annotation():
-    sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq
+    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
-    annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence), max_annotation_length=750, gene_targets=set(["adk"]))
+    annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", sequence, max_annotation_length=750, gene_targets=set(["adk"]))
    assert isinstance(annotated_sequence, AnnotatedString)
    assert len(annotated_sequence.annotations) >= 1
    assert annotated_sequence.annotations[0].type == "gene"
--- a/tests/resources/1I1_F_P1815443_047.ab1
+++ b/tests/resources/1I1_F_P1815443_047.ab1
--- a/tests/resources/1I1_R_P1815443_094.ab1
+++ b/tests/resources/1I1_R_P1815443_094.ab1
--- a/tests/resources/1I2_F_P1815443_048.ab1
+++ b/tests/resources/1I2_F_P1815443_048.ab1
--- a/tests/resources/1I2_R_P1815443_011.ab1
+++ b/tests/resources/1I2_R_P1815443_011.ab1