Tested annotation alignment system

2025-01-03 14:39:18 +00:00 · 2025-01-03 14:39:18 +00:00 · b834aa93b0
commit b834aa93b0
parent 28733337d2
9 changed files with 28 additions and 25 deletions
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@ -12,8 +12,7 @@
 			"extensions": [
 				"ms-python.isort",
 				"njpwerner.autodocstring",
-				"ms-python.black-formatter",
+				"ms-python.black-formatter"
 				"ms-python.pylint"
 			]
 		}
 	},
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,4 @@
-requests
+aiohttp[speedups]
 biopython
 pytest
 pytest-asyncio
--- a/src/mlstmyfasta/engine/annotations.py
+++ b/src/mlstmyfasta/engine/annotations.py
@ -1,10 +1,15 @@
 import asyncio
 from collections.abc import Set
 from typing import Any, Generator, List, Sequence
 from Bio.Align import PairwiseAligner
 from Bio import Entrez
 from Bio import SeqIO
 import numpy as np
 # TODO Change this out for a more professional approach
 Entrez.email = "yunyangdeng@outlook.com"
 from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding
@ -15,22 +20,31 @@ async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
        for feature in record.features:
            start = int(feature.location.start)
            end = int(feature.location.end)
            qualifiers = feature.qualifiers
            for qualifier_key in qualifiers:
                qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
            sequence_features.append(StringAnnotation(
                type=feature.type,
                start=start,
                end=end+1,  # Position is exclusive
-                feature_properties=feature.qualifiers
+                feature_properties=qualifiers
            ))
        return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
-async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length=750):
+async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length:int = 512, gene_targets:Set = set()):
    # TODO implement asynchronous alignment algorithm
    reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
    query_annotations = list()
    aligner = PairwiseAligner("blastn")
    aligner.mode = "local"
    for annotation in reference_annotations.annotations:
-        if annotation.end - annotation.start > max_annotation_length:
+        if annotation.type != "gene" or "gene" not in annotation.feature_properties:
            continue
        if len(gene_targets) > 0 and "gene" in annotation.feature_properties:
            if not annotation.feature_properties["gene"].intersection(gene_targets):
                continue
        if max_annotation_length > 0 and annotation.end - annotation.start > max_annotation_length:
            # TODO implement a failsafe
            continue
        feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation)
@ -38,7 +52,7 @@ async def annotate_from_genbank(genbank_id: str, query_name: str, query_string:
        if len(alignments) < 1:
            # TODO implement a failsafe
            continue
-        top_alignment = sorted(aligner.align(query_string, annotation))[0]
+        top_alignment = sorted(alignments)[0]
        # TODO Check if alternatives are better
        query_annotations.append(StringAnnotation(
            type=annotation.type, # same as original
--- a/src/mlstmyfasta/engine/data/genomics.py
+++ b/src/mlstmyfasta/engine/data/genomics.py
@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Mapping, Sequence
+from typing import Mapping, Sequence, Set
@dataclass
@ -7,7 +7,7 @@ class StringAnnotation:
    type: str
    start: int
    end: int
-    feature_properties: Mapping[str, Sequence[str]]
+    feature_properties: Mapping[str, Set[str]]
@dataclass
 class AnnotatedString:
--- a/src/mlstmyfasta/engine/external/databases/institutpasteur/query.py
+++ b/src/mlstmyfasta/engine/external/databases/institutpasteur/query.py
@ -1,13 +0,0 @@
 import json
 import requests
 from mlstmyfasta.engine.data import MLST
 from mlstmyfasta.engine.external.databases.institutpasteur.structures import SequenceRequestBody
 from mlstmyfasta.engine.external.databases.institutpasteur.structures import SequenceResponse
 async def query_fasta_mlst_profile(database: str, request_body: SequenceRequestBody):
    url: str = f"https://bigsdb.pasteur.fr/api/db/{database}/mlst/sequence"
    request_result = requests.post(url, json=json.dumps(request_body), timeout=10000)
    seq_result: SequenceResponse = request_result.json()
    result = list()
    for exact_match in seq_result.exact_matches:
        result.append(MLST.Allele())
--- a/src/mlstmyfasta/engine/external/databases/institutpasteur/init.py
+++ b/src/mlstmyfasta/engine/external/databases/institutpasteur/init.py
--- a/src/mlstmyfasta/engine/external/databases/institutpasteur/structures.py
+++ b/src/mlstmyfasta/engine/external/databases/institutpasteur/structures.py
--- a/tests/mlstmyfasta/engine/external/databases/institutpasteur/test_access.py
+++ b/tests/mlstmyfasta/engine/external/databases/institutpasteur/test_access.py
--- a/tests/mlstmyfasta/engine/test_annotations.py
+++ b/tests/mlstmyfasta/engine/test_annotations.py
@ -6,7 +6,10 @@ from mlstmyfasta.engine.data.genomics import AnnotatedString
 async def test_fetch_ncbi_genbank_with_id_works():
    assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0 
-async def test_annotate_from_genbank_results_in_annotations():
+async def test_annotate_from_genbank_for_adk_annotation():
    sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq
-    annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence))
+    annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence), max_annotation_length=750, gene_targets=set(["adk"]))
-    assert annotated_sequence is AnnotatedString
+    assert isinstance(annotated_sequence, AnnotatedString)
    assert len(annotated_sequence.annotations) >= 1
    assert annotated_sequence.annotations[0].type == "gene"
    assert "adk" in annotated_sequence.annotations[0].feature_properties["gene"]