From 28733337d2c9fe980e045635c80c26b25df82be0 Mon Sep 17 00:00:00 2001
From: Harrison Deng <yunyangdeng@outlook.com>
Date: Thu, 2 Jan 2025 21:53:55 +0000
Subject: [PATCH] Completed untested annotation function

---
 src/mlstmyfasta/engine/annotations.py        | 15 +++++++++++----
 tests/mlstmyfasta/engine/test_annotations.py |  3 ++-
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/mlstmyfasta/engine/annotations.py b/src/mlstmyfasta/engine/annotations.py
index 5598a05..43c8f89 100644
--- a/src/mlstmyfasta/engine/annotations.py
+++ b/src/mlstmyfasta/engine/annotations.py
@@ -24,8 +24,9 @@ async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
         return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
 
 
-async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotation_length=1000):
+async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length=750):
     reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
+    query_annotations = list()
     aligner = PairwiseAligner("blastn")
     aligner.mode = "local"
     for annotation in reference_annotations.annotations:
@@ -33,10 +34,16 @@ async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotati
             # TODO implement a failsafe
             continue
         feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation)
-        alignments = aligner.align(query_coding, feature_string_sequence)
+        alignments = aligner.align(query_string, feature_string_sequence)
         if len(alignments) < 1:
             # TODO implement a failsafe
             continue
-        top_alignment = sorted(aligner.align(query_coding, annotation))[0]
+        top_alignment = sorted(aligner.align(query_string, annotation))[0]
         # TODO Check if alternatives are better
-        
\ No newline at end of file
+        query_annotations.append(StringAnnotation(
+            type=annotation.type, # same as original
+            start=np.min(top_alignment.aligned[0]), # We only care about the start of first chunk
+            end=np.max(top_alignment.aligned[0]), # and the end of the last chunk
+            feature_properties=dict(annotation.feature_properties) # same as original
+        ))
+    return AnnotatedString(name=query_name, sequence=query_string, annotations=query_annotations)
\ No newline at end of file
diff --git a/tests/mlstmyfasta/engine/test_annotations.py b/tests/mlstmyfasta/engine/test_annotations.py
index c20afda..8b7d2cc 100644
--- a/tests/mlstmyfasta/engine/test_annotations.py
+++ b/tests/mlstmyfasta/engine/test_annotations.py
@@ -8,4 +8,5 @@ async def test_fetch_ncbi_genbank_with_id_works():
 
 async def test_annotate_from_genbank_results_in_annotations():
     sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq
-    assert (await annotate_from_genbank("CP011448.1", str(sequence))) is AnnotatedString
\ No newline at end of file
+    annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence))
+    assert isinstance(annotated_sequence, AnnotatedString)  # type check on the returned instance; an identity test against the class object is always False
\ No newline at end of file