# sangeranalysis.engine/src/mlstmyfasta/engine/annotations.py

import asyncio
from typing import Any, Generator, List, Sequence
from Bio.Align import PairwiseAligner
from Bio import Entrez
from Bio import SeqIO
import numpy as np

from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding


async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    """Download a GenBank record from NCBI and convert it to an AnnotatedString.

    The record is requested from the Entrez ``nucleotide`` database in
    GenBank text format and parsed with ``SeqIO.read``. Every feature of the
    record becomes one ``StringAnnotation`` carrying the feature type, its
    overall start/end span and its qualifiers.

    Args:
        genbank_id: Identifier passed to ``Entrez.efetch`` as ``id``; also
            used as the name of the returned ``AnnotatedString``.

    Returns:
        An ``AnnotatedString`` holding the record sequence as ``str`` and one
        annotation per GenBank feature.
    """

    def _download_record():
        # Executed in a worker thread. The parse pulls data from the stream
        # returned by efetch, so request and parse are kept together here to
        # keep blocking reads off the event loop.
        with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
            return SeqIO.read(fetch_stream, "genbank")

    record = await asyncio.to_thread(_download_record)
    sequence_features = [
        StringAnnotation(
            type=feature.type,
            start=int(feature.location.start),
            # Biopython locations are already 0-based with an exclusive end,
            # so the value is copied as-is (adding 1 would include one extra
            # base past the feature).
            end=int(feature.location.end),
            feature_properties=feature.qualifiers,
        )
        for feature in record.features
    ]
    return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)


async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotation_length: int = 1000):
    """Align each feature of a GenBank reference against a query sequence.

    The reference record is fetched with ``fetch_ncbi_genbank``. For every
    annotation whose span does not exceed ``max_annotation_length``, the
    feature's sequence is extracted with ``get_feature_coding`` and locally
    aligned to ``query_coding`` using a ``PairwiseAligner`` configured with
    the ``"blastn"`` scoring preset.

    NOTE(review): as far as this section shows, the selected alignment is not
    yet stored or returned; the remaining steps are marked as TODOs.

    Args:
        genbank_id: Identifier of the GenBank reference record.
        query_coding: Query sequence to align the reference features against.
        max_annotation_length: Annotations spanning more than this many
            positions (``end - start``) are skipped.
    """
    reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
    aligner = PairwiseAligner("blastn")
    aligner.mode = "local"
    for annotation in reference_annotations.annotations:
        if annotation.end - annotation.start > max_annotation_length:
            # TODO implement a failsafe
            continue
        feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation)
        alignments = aligner.align(query_coding, feature_string_sequence)
        if len(alignments) < 1:
            # TODO implement a failsafe
            continue
        # Reuse the alignments computed above instead of aligning a second
        # time (the previous call also passed the annotation object rather
        # than its sequence). All alignments returned by the aligner share the
        # optimal score, so the first one is taken without materialising and
        # sorting the whole set.
        top_alignment = alignments[0]
        # TODO Check if alternatives are better