From a27e09da315f26c1b6d3357da033bc6c51c0a8b6 Mon Sep 17 00:00:00 2001
From: Harrison Deng
Date: Thu, 6 Feb 2025 17:11:20 +0000
Subject: [PATCH] Added code to retrieve sequences and annotations from NCBI GenBank

---
Review note: the file content below was revised during review (the GenBank
parse now runs off the event loop, and the already-exclusive end offset is no
longer incremented), so the blob id on the "index" line predates this content.

 src/autobigs/engine/analysis/genbank.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 src/autobigs/engine/analysis/genbank.py

diff --git a/src/autobigs/engine/analysis/genbank.py b/src/autobigs/engine/analysis/genbank.py
new file mode 100644
index 0000000..8d771ad
--- /dev/null
+++ b/src/autobigs/engine/analysis/genbank.py
@@ -0,0 +1,46 @@
+import asyncio
+from contextlib import AbstractAsyncContextManager
+import tempfile
+from typing import Iterable, Union
+from Bio import Entrez
+from Bio import SeqIO
+
+from autobigs.engine.structures.genomics import AnnotatedString, StringAnnotation
+
+
+def _fetch_genbank_record(genbank_id: str):
+    """Download and parse a single GenBank record (blocking).
+
+    Both the HTTP request and the parsing read from the network stream, so
+    this is meant to be run in a worker thread rather than on the event loop.
+    """
+    with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
+        return SeqIO.read(fetch_stream, "genbank")
+
+
+async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
+    """Fetch a nucleotide record from NCBI GenBank as an annotated sequence.
+
+    Args:
+        genbank_id: Identifier passed to Entrez ``efetch`` as ``id``.
+
+    Returns:
+        An ``AnnotatedString`` named after ``genbank_id`` holding the record's
+        sequence and one ``StringAnnotation`` per feature in the record.
+    """
+    # Run the download *and* the parse in the worker thread: SeqIO.read pulls
+    # the response body off the socket, which would otherwise block the loop.
+    record = await asyncio.to_thread(_fetch_genbank_record, genbank_id)
+    sequence_features = [
+        StringAnnotation(
+            type=feature.type,
+            start=int(feature.location.start),
+            # Biopython locations are already 0-based with an exclusive end,
+            # so the end is used as-is (adding 1 would overshoot by one base).
+            end=int(feature.location.end),
+            # Fresh dict so the parsed record's qualifiers are not mutated.
+            feature_properties={key: set(values) for key, values in feature.qualifiers.items()},
+        )
+        for feature in record.features
+    ]
+    return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)