42 lines
1.8 KiB
Python
42 lines
1.8 KiB
Python
import asyncio
|
|
from typing import Any, Generator, List, Sequence
|
|
from Bio.Align import PairwiseAligner
|
|
from Bio import Entrez
|
|
from Bio import SeqIO
|
|
import numpy as np
|
|
|
|
from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding
|
|
|
|
|
|
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    """Fetch a GenBank record from NCBI and convert it to an AnnotatedString.

    The record is requested from the Entrez ``nucleotide`` database in
    GenBank text format and parsed with ``SeqIO``. Every feature of the
    record that carries a location becomes one ``StringAnnotation``.

    Args:
        genbank_id: Identifier passed to ``Entrez.efetch`` as ``id``; it is
            also used as the name of the returned ``AnnotatedString``.

    Returns:
        An ``AnnotatedString`` holding the record's sequence as ``str`` and
        one annotation per located feature. Annotation coordinates are
        0-based with an exclusive ``end``.
    """
    fetch_stream = await asyncio.to_thread(
        Entrez.efetch,
        db="nucleotide",
        id=genbank_id,
        rettype="gb",
        retmode="text",
    )
    with fetch_stream:
        # Parsing pulls the response body off the network, so it blocks just
        # like the request itself; keep it off the event loop as well.
        record = await asyncio.to_thread(SeqIO.read, fetch_stream, "genbank")

    sequence_features = []
    for feature in record.features:
        location = feature.location
        if location is None:
            # Biopython leaves the location unset when it cannot parse it;
            # without coordinates there is nothing to annotate.
            continue
        sequence_features.append(StringAnnotation(
            type=feature.type,
            start=int(location.start),
            # Biopython locations follow Python slice semantics, so ``end`` is
            # already exclusive; adding 1 would include one base too many.
            end=int(location.end),
            feature_properties=feature.qualifiers,
        ))
    return AnnotatedString(
        name=genbank_id,
        sequence=str(record.seq),
        annotations=sequence_features,
    )
|
|
|
|
|
|
async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotation_length=1000):
    """Locally align a query against every short feature of a GenBank record.

    The reference record is fetched with ``fetch_ncbi_genbank``. For each of
    its annotations no longer than ``max_annotation_length``, the feature's
    sequence is extracted with ``get_feature_coding`` and aligned against
    ``query_coding`` using a local ``PairwiseAligner`` with "blastn" scoring.

    NOTE: this function is still work in progress. The best alignment per
    feature is computed but not yet collected, and the function returns None.

    Args:
        genbank_id: Identifier of the reference GenBank record.
        query_coding: Query sequence to align against each feature.
        max_annotation_length: Annotations whose ``end - start`` exceeds this
            value are skipped.
    """
    reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
    aligner = PairwiseAligner("blastn")
    aligner.mode = "local"
    for annotation in reference_annotations.annotations:
        if annotation.end - annotation.start > max_annotation_length:
            # TODO implement a failsafe
            continue
        feature_string_sequence = get_feature_coding(
            annotated_string=reference_annotations,
            string_annotation=annotation,
        )
        alignments = aligner.align(query_coding, feature_string_sequence)
        # The aligner only yields optimal (equal-score) alignments, so the
        # first one is a best alignment. Taking it lazily avoids both a second
        # alignment run and len(), which can overflow for repetitive input.
        top_alignment = next(iter(alignments), None)
        if top_alignment is None:
            # TODO implement a failsafe
            continue
        # TODO Check if alternatives are better
|
|
|