diff --git a/src/mlstmyfasta/engine/annotations/wgs.py b/src/mlstmyfasta/engine/annotations/wgs.py
new file mode 100644
index 0000000..efb5f80
--- /dev/null
+++ b/src/mlstmyfasta/engine/annotations/wgs.py
@@ -0,0 +1,64 @@
+import asyncio
+from typing import Any, Generator, List, Sequence
+from Bio.Align import PairwiseAligner
+from Bio import Entrez
+from Bio import SeqIO
+
+from mlstmyfasta.engine.data.genomics import Strand, StrandFeature
+
+
+def _read_genbank_record(genbank_id: str):
+    """Download and parse one GenBank record from NCBI (blocking call)."""
+    with Entrez.efetch(
+        db="nucleotide", id=genbank_id, rettype="gb", retmode="text"
+    ) as fetch_stream:
+        return SeqIO.read(fetch_stream, "genbank")
+
+
+def _to_strand_feature(feature) -> StrandFeature:
+    """Convert a Biopython SeqFeature into a StrandFeature."""
+    # Biopython stores qualifiers as a dict whose values are usually lists of
+    # strings; flatten them to honour StrandFeature's Mapping[str, str] type.
+    feature_properties = {
+        key: value if isinstance(value, str) else "; ".join(map(str, value))
+        for key, value in feature.qualifiers.items()
+    }
+    return StrandFeature(
+        name=feature.type,
+        # FeatureLocation is an object, not a "start..end" string; Biopython
+        # reports 0-based, end-exclusive coordinates (Python slice style).
+        start=int(feature.location.start),
+        end=int(feature.location.end),
+        feature_properties=feature_properties,
+    )
+
+
+async def fetch_ncbi_genbank(genbank_id: str) -> Strand:
+    """Fetch a GenBank nucleotide record from NCBI and return it as a Strand.
+
+    Args:
+        genbank_id: Accession/identifier passed to Entrez ``efetch``.
+
+    Returns:
+        A Strand named after ``genbank_id`` holding the record's sequence as a
+        string and one StrandFeature per located feature in the record.
+
+    Raises:
+        ValueError: If the response does not contain exactly one record
+            (raised by ``SeqIO.read``).
+    """
+    # Entrez/SeqIO are blocking; run them in the default executor so the
+    # event loop is not stalled while the download is in progress.
+    loop = asyncio.get_running_loop()
+    record = await loop.run_in_executor(None, _read_genbank_record, genbank_id)
+    sequence_features = [
+        _to_strand_feature(feature)
+        for feature in record.features
+        # Biopython may leave location as None when it cannot parse it.
+        if feature.location is not None
+    ]
+    return Strand(
+        name=genbank_id,
+        coding=str(record.seq),
+        features=sequence_features,
+    )
diff --git a/src/mlstmyfasta/engine/data/genomics.py b/src/mlstmyfasta/engine/data/genomics.py
index aba72eb..4abf716 100644
--- a/src/mlstmyfasta/engine/data/genomics.py
+++ b/src/mlstmyfasta/engine/data/genomics.py
@@ -1,12 +1,21 @@
 from dataclasses import dataclass
+from typing import Mapping, Sequence
+
 
 @dataclass
-class Sequence:
+class StrandFeature:
+    """A named feature on a strand: its span and free-form properties."""
+
     name: str
-    sequence: str
-
+    start: int
+    end: int
+    feature_properties: Mapping[str, str]
+
 
 @dataclass
-class SequenceFeature:
-    type: str
+class Strand:
+    """A named sequence (``coding``) together with its features."""
+
     name: str
+    coding: str
+    features: Sequence[StrandFeature]