diff --git a/src/autobigs/engine/analysis/genbank.py b/src/autobigs/engine/analysis/genbank.py new file mode 100644 index 0000000..8d771ad --- /dev/null +++ b/src/autobigs/engine/analysis/genbank.py @@ -0,0 +1,26 @@ +import asyncio +from contextlib import AbstractAsyncContextManager +import tempfile +from typing import Iterable, Union +from Bio import Entrez +from Bio import SeqIO + +from autobigs.engine.structures.genomics import AnnotatedString, StringAnnotation + +async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString: + with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream: + record = SeqIO.read(fetch_stream, "genbank") + sequence_features = list() + for feature in record.features: + start = int(feature.location.start) + end = int(feature.location.end) + qualifiers = feature.qualifiers + for qualifier_key in qualifiers: + qualifiers[qualifier_key] = set(qualifiers[qualifier_key]) + sequence_features.append(StringAnnotation( + type=feature.type, + start=start, + end=end+1, # Position is exclusive + feature_properties=qualifiers + )) + return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)