diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..e137fad --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 629f81a..b5c9f69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ addopts = [ asyncio_mode = "auto" [tool.pylint.main] -source-roots = src +source-roots = "src" [tool.pylint.format] # Maximum number of characters on a single line. diff --git a/requirements.txt b/requirements.txt index 4198645..791625e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ requests -biopython \ No newline at end of file +biopython +pytest +pytest-asyncio \ No newline at end of file diff --git a/src/mlstmyfasta/engine/annotations/__init__.py b/src/mlstmyfasta/engine/__init__.py similarity index 100% rename from src/mlstmyfasta/engine/annotations/__init__.py rename to src/mlstmyfasta/engine/__init__.py diff --git a/src/mlstmyfasta/engine/annotations.py b/src/mlstmyfasta/engine/annotations.py new file mode 100644 index 0000000..4c4d817 --- /dev/null +++ b/src/mlstmyfasta/engine/annotations.py @@ -0,0 +1,34 @@ +from typing import Any, Generator, List, Sequence +from Bio.Align import PairwiseAligner +from Bio import Entrez +from Bio import SeqIO + +from mlstmyfasta.engine.data.genomics import Strand, StrandFeature, get_feature_coding + + +async def fetch_ncbi_genbank(genbank_id: str) -> Strand: + with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream: + record = SeqIO.read(fetch_stream, "genbank") + sequence_features = list() + for feature in record.features: + start = int(feature.location.start) + end = int(feature.location.end) + sequence_features.append(StrandFeature( + type=feature.type, + start=start, + end=end+1, # Position is exclusive + feature_properties=feature.qualifiers + )) + return Strand(name=genbank_id, coding=str(record.seq), features=sequence_features) + + +async def annotate_from_genbank(genbank_id: str, query_coding: str): + strand = await fetch_ncbi_genbank(genbank_id=genbank_id) + aligner = PairwiseAligner("blastn") + aligner.mode = "local" + for feature in strand.features: + feature_coding = get_feature_coding(strand=strand, feature=feature) + if len(aligner.align(query_coding, feature)) < 1: + # TODO implement a failsafe + continue + sequence = sorted(aligner.align(query_coding, feature))[0] diff --git a/src/mlstmyfasta/engine/annotations/wgs.py b/src/mlstmyfasta/engine/annotations/wgs.py deleted file mode 100644 index efb5f80..0000000 --- a/src/mlstmyfasta/engine/annotations/wgs.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Any, Generator, List, Sequence -from Bio.Align import PairwiseAligner -from Bio import Entrez -from Bio import SeqIO - -from mlstmyfasta.engine.data.genomics import Strand, StrandFeature - -async def fetch_ncbi_genbank(genbank_id: str) -> Strand: - with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream: - record = SeqIO.read(fetch_stream, "genbank") - sequence_features = list() - for feature in record.features: - start, end = feature.location.split("..") - start = int(start) - end = int(end) - feature_properties = dict() - for qualifier in feature.qualifiers: - feature_properties[qualifier.key] = qualifier.value - sequence_features.append(StrandFeature(name=feature.key, - start=start, - end=end, - feature_properties=feature_properties - )) - return Strand(name=genbank_id, coding=record.sequence, features=sequence_features) diff --git a/src/mlstmyfasta/engine/data/genomics.py b/src/mlstmyfasta/engine/data/genomics.py index 4abf716..e24698a 100644 --- a/src/mlstmyfasta/engine/data/genomics.py +++ b/src/mlstmyfasta/engine/data/genomics.py @@ -4,10 +4,10 @@ from typing import Mapping, Sequence @dataclass class StrandFeature: - name: str + type: str start: int end: int - feature_properties: Mapping[str, str] + feature_properties: Mapping[str, Sequence[str]] @dataclass class Strand: @@ -15,4 +15,5 @@ class Strand: coding: str features: Sequence[StrandFeature] - \ No newline at end of file +def get_feature_coding(strand: Strand, feature: StrandFeature): + strand.coding[feature.start:feature.end] \ No newline at end of file diff --git a/tests/mlstmyfasta/engine/test_annotations.py b/tests/mlstmyfasta/engine/test_annotations.py new file mode 100644 index 0000000..33ac743 --- /dev/null +++ b/tests/mlstmyfasta/engine/test_annotations.py @@ -0,0 +1,6 @@ +from mlstmyfasta.engine.annotations import fetch_ncbi_genbank + + +async def test_fetch_ncbi_genbank_with_id_works(): + assert len((await fetch_ncbi_genbank("CP011448.1")).coding) > 0 + \ No newline at end of file