Wrote simple annotation test

2025-01-02 21:28:45 +00:00 · 2025-01-02 21:28:45 +00:00 · aa2d494bee
commit aa2d494bee
parent a9ae193cdc
4 changed files with 58410 additions and 20 deletions
--- a/src/mlstmyfasta/engine/annotations.py
+++ b/src/mlstmyfasta/engine/annotations.py
@ -1,34 +1,42 @@
+import asyncio
 from typing import Any, Generator, List, Sequence
 from Bio.Align import PairwiseAligner
 from Bio import Entrez
 from Bio import SeqIO
+import numpy as np

-from mlstmyfasta.engine.data.genomics import Strand, StrandFeature, get_feature_coding
+from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding


-async def fetch_ncbi_genbank(genbank_id: str) -> Strand:
-    with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
+async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
+    with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
        record = SeqIO.read(fetch_stream, "genbank")
        sequence_features = list()
        for feature in record.features:
            start = int(feature.location.start)
            end = int(feature.location.end)
-            sequence_features.append(StrandFeature(
+            sequence_features.append(StringAnnotation(
                type=feature.type,
                start=start,
                end=end+1,  # Position is exclusive
                feature_properties=feature.qualifiers
            ))
-        return Strand(name=genbank_id, coding=str(record.seq), features=sequence_features)
+        return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)


-async def annotate_from_genbank(genbank_id: str, query_coding: str):
-    strand = await fetch_ncbi_genbank(genbank_id=genbank_id)
+async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotation_length=1000):
+    reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
    aligner = PairwiseAligner("blastn")
    aligner.mode = "local"
-    for feature in strand.features:
-        feature_coding = get_feature_coding(strand=strand, feature=feature)
-        if len(aligner.align(query_coding, feature)) < 1:
+    for annotation in reference_annotations.annotations:
+        if annotation.end - annotation.start > max_annotation_length:
            # TODO implement a failsafe
            continue
-        sequence = sorted(aligner.align(query_coding, feature))[0]
+        feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation)
+        alignments = aligner.align(query_coding, feature_string_sequence)
+        if len(alignments) < 1:
+            # TODO implement a failsafe
+            continue
+        top_alignment = sorted(aligner.align(query_coding, annotation))[0]
+        # TODO Check if alternatives are better
+        
--- a/src/mlstmyfasta/engine/data/genomics.py
+++ b/src/mlstmyfasta/engine/data/genomics.py
@ -3,17 +3,17 @@ from typing import Mapping, Sequence


@dataclass
-class StrandFeature:
+class StringAnnotation:
    type: str
    start: int
    end: int
    feature_properties: Mapping[str, Sequence[str]]
    
@dataclass
-class Strand:
+class AnnotatedString:
    name: str
-    coding: str
-    features: Sequence[StrandFeature]
+    sequence: str
+    annotations: Sequence[StringAnnotation]

-def get_feature_coding(strand: Strand, feature: StrandFeature) -> str:
-    return strand.coding[feature.start:feature.end]
+def get_feature_coding(annotated_string: AnnotatedString, string_annotation: StringAnnotation) -> str:
+    return annotated_string.sequence[string_annotation.start:string_annotation.end]
--- a/tests/mlstmyfasta/engine/test_annotations.py
+++ b/tests/mlstmyfasta/engine/test_annotations.py
@ -1,6 +1,11 @@
-from mlstmyfasta.engine.annotations import fetch_ncbi_genbank
+from mlstmyfasta.engine.annotations import annotate_from_genbank, fetch_ncbi_genbank
+from Bio import SeqIO

+from mlstmyfasta.engine.data.genomics import AnnotatedString

 async def test_fetch_ncbi_genbank_with_id_works():
-    assert len((await fetch_ncbi_genbank("CP011448.1")).coding) > 0 
-    
+    assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0 
+
+async def test_annotate_from_genbank_results_in_annotations():
+    sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq
+    assert (await annotate_from_genbank("CP011448.1", str(sequence))) is AnnotatedString
--- a/tests/resources/tohama_I_bpertussis.fasta
+++ b/tests/resources/tohama_I_bpertussis.fasta