From 28733337d2c9fe980e045635c80c26b25df82be0 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Thu, 2 Jan 2025 21:53:55 +0000 Subject: [PATCH] Completed untested annotation function --- src/mlstmyfasta/engine/annotations.py | 15 +++++++++++---- tests/mlstmyfasta/engine/test_annotations.py | 3 ++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/mlstmyfasta/engine/annotations.py b/src/mlstmyfasta/engine/annotations.py index 5598a05..43c8f89 100644 --- a/src/mlstmyfasta/engine/annotations.py +++ b/src/mlstmyfasta/engine/annotations.py @@ -24,8 +24,9 @@ async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString: return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features) -async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotation_length=1000): +async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length=750): reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id) + query_annotations = list() aligner = PairwiseAligner("blastn") aligner.mode = "local" for annotation in reference_annotations.annotations: @@ -33,10 +34,16 @@ async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotati # TODO implement a failsafe continue feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation) - alignments = aligner.align(query_coding, feature_string_sequence) + alignments = aligner.align(query_string, feature_string_sequence) if len(alignments) < 1: # TODO implement a failsafe continue - top_alignment = sorted(aligner.align(query_coding, annotation))[0] + top_alignment = sorted(aligner.align(query_string, annotation))[0] # TODO Check if alternatives are better - \ No newline at end of file + query_annotations.append(StringAnnotation( + type=annotation.type, # same as original + start=np.min(top_alignment.aligned[0]), # We only care about the start of first chunk + end=np.max(top_alignment.aligned[0]), # and the end of the last chunk + feature_properties=dict(annotation.feature_properties) # same as original + )) + return AnnotatedString(name=query_name, sequence=query_string, annotations=query_annotations) \ No newline at end of file diff --git a/tests/mlstmyfasta/engine/test_annotations.py b/tests/mlstmyfasta/engine/test_annotations.py index c20afda..8b7d2cc 100644 --- a/tests/mlstmyfasta/engine/test_annotations.py +++ b/tests/mlstmyfasta/engine/test_annotations.py @@ -8,4 +8,5 @@ async def test_fetch_ncbi_genbank_with_id_works(): async def test_annotate_from_genbank_results_in_annotations(): sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq - assert (await annotate_from_genbank("CP011448.1", str(sequence))) is AnnotatedString \ No newline at end of file + annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence)) + assert annotated_sequence is AnnotatedString \ No newline at end of file