42 lines
1.8 KiB
Python

import asyncio
from typing import Any, Generator, List, Sequence
from Bio.Align import PairwiseAligner
from Bio import Entrez
from Bio import SeqIO
import numpy as np
from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    """Fetch a GenBank record from NCBI and convert it to an AnnotatedString.

    The blocking ``Entrez.efetch`` call is executed in a worker thread via
    ``asyncio.to_thread``; the returned stream is parsed as a single GenBank
    record with ``SeqIO.read``.

    Args:
        genbank_id: Identifier requested from the NCBI "nucleotide" database.

    Returns:
        An AnnotatedString named ``genbank_id`` carrying the record's sequence
        as a string and one StringAnnotation per feature of the record.
    """
    fetch_handle = await asyncio.to_thread(
        Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text"
    )
    with fetch_handle as fetch_stream:
        record = SeqIO.read(fetch_stream, "genbank")

    sequence_features = [
        StringAnnotation(
            type=feature.type,
            start=int(feature.location.start),
            # Biopython locations use Python-style coordinates: ``end`` is
            # already the exclusive end of the feature, so it is stored as-is
            # (adding 1 would include one position past the feature).
            end=int(feature.location.end),
            feature_properties=feature.qualifiers,
        )
        for feature in record.features
    ]
    return AnnotatedString(
        name=genbank_id,
        sequence=str(record.seq),
        annotations=sequence_features,
    )
async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotation_length=1000):
    """Locally align a query sequence against each feature of a GenBank record.

    The reference record is fetched with ``fetch_ncbi_genbank``; every
    annotation no longer than ``max_annotation_length`` is extracted with
    ``get_feature_coding`` and aligned against ``query_coding`` using a
    local-mode ``PairwiseAligner`` with "blastn" scoring.

    NOTE: work in progress - the selected alignment is not yet used or
    returned (see the TODO markers below).

    Args:
        genbank_id: Identifier of the reference GenBank record.
        query_coding: Query sequence to align against each reference feature.
        max_annotation_length: Annotations spanning more positions than this
            (``end - start``) are skipped.
    """
    reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
    aligner = PairwiseAligner("blastn")
    aligner.mode = "local"
    for annotation in reference_annotations.annotations:
        if annotation.end - annotation.start > max_annotation_length:
            # TODO implement a failsafe
            continue
        feature_string_sequence = get_feature_coding(
            annotated_string=reference_annotations,
            string_annotation=annotation,
        )
        alignments = aligner.align(query_coding, feature_string_sequence)
        if len(alignments) < 1:
            # TODO implement a failsafe
            continue
        # Reuse the alignments computed above. Previously the aligner was run
        # a second time against the annotation object itself (not its
        # sequence), and sorted(...)[0] selected the smallest alignment under
        # the ordering rather than the first optimal one.
        top_alignment = alignments[0]
        # TODO Check if alternatives are better