import asyncio
from typing import Any, Generator, List, Sequence

import numpy as np
from Bio import Entrez
from Bio import SeqIO
from Bio.Align import PairwiseAligner

from mlstmyfasta.engine.data.genomics import (
    AnnotatedString,
    StringAnnotation,
    get_feature_coding,
)


def _read_genbank_record(genbank_id: str):
    """Download and parse a single GenBank record from NCBI (blocking).

    Both the HTTP request and the parsing of the response stream are blocking
    operations, so this helper is meant to be run in a worker thread.

    Raises:
        ValueError: Propagated from ``SeqIO.read`` when the response does not
            contain exactly one record.
    """
    with Entrez.efetch(
        db="nucleotide", id=genbank_id, rettype="gb", retmode="text"
    ) as fetch_stream:
        return SeqIO.read(fetch_stream, "genbank")


async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    """Fetch a GenBank record from NCBI and convert it to an AnnotatedString.

    Args:
        genbank_id: Identifier passed to Entrez ``efetch`` against the
            ``nucleotide`` database; also used as the name of the result.

    Returns:
        An ``AnnotatedString`` holding the record's sequence and one
        ``StringAnnotation`` per feature of the record.
    """
    # Fetch *and* parse off the event loop: SeqIO.read pulls the data from the
    # network stream, so running only efetch in a thread would still block.
    record = await asyncio.to_thread(_read_genbank_record, genbank_id)

    # Biopython locations are zero-based and half-open (start inclusive, end
    # exclusive), so ``end`` is already exclusive and must not be incremented.
    # NOTE(review): for compound (joined) locations start/end span the whole
    # feature — confirm this is what downstream consumers expect.
    sequence_features = [
        StringAnnotation(
            type=feature.type,
            start=int(feature.location.start),
            end=int(feature.location.end),
            feature_properties=feature.qualifiers,
        )
        for feature in record.features
    ]
    return AnnotatedString(
        name=genbank_id,
        sequence=str(record.seq),
        annotations=sequence_features,
    )


async def annotate_from_genbank(
    genbank_id: str,
    query_coding: str,
    max_annotation_length: int = 1000,
) -> None:
    """Locally align a query against every feature of a GenBank reference.

    Work in progress: the best alignment per feature is computed but not yet
    used, and the function currently returns ``None``.

    Args:
        genbank_id: NCBI identifier of the reference record to fetch.
        query_coding: Query sequence aligned against each reference feature.
        max_annotation_length: Features spanning more than this many positions
            (``end - start``) are skipped.
    """
    reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)

    aligner = PairwiseAligner("blastn")
    aligner.mode = "local"

    for annotation in reference_annotations.annotations:
        if annotation.end - annotation.start > max_annotation_length:
            # TODO implement a failsafe
            continue

        # presumably the sequence covered by the annotation; verify against
        # get_feature_coding's implementation.
        feature_string_sequence = get_feature_coding(
            annotated_string=reference_annotations,
            string_annotation=annotation,
        )
        alignments = aligner.align(query_coding, feature_string_sequence)

        # Take the first alignment from the result that was already computed.
        # Every alignment the aligner yields shares the optimal score, so there
        # is no need to align a second time or to materialise and sort all of
        # them; it also avoids len(), which can raise OverflowError when the
        # number of co-optimal alignments is very large.
        top_alignment = next(iter(alignments), None)
        if top_alignment is None:
            # TODO implement a failsafe
            continue

        # TODO Check if alternatives are better
        # TODO top_alignment is not used yet