diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b3b453e..251ee70 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -12,8 +12,7 @@ "extensions": [ "ms-python.isort", "njpwerner.autodocstring", - "ms-python.black-formatter", - "ms-python.pylint" + "ms-python.black-formatter" ] } }, diff --git a/requirements.txt b/requirements.txt index 791625e..e114e01 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -requests +aiohttp[speedups] biopython pytest pytest-asyncio \ No newline at end of file diff --git a/src/mlstmyfasta/engine/annotations.py b/src/mlstmyfasta/engine/annotations.py index 43c8f89..72b3483 100644 --- a/src/mlstmyfasta/engine/annotations.py +++ b/src/mlstmyfasta/engine/annotations.py @@ -1,10 +1,15 @@ import asyncio +from collections.abc import Set from typing import Any, Generator, List, Sequence from Bio.Align import PairwiseAligner from Bio import Entrez from Bio import SeqIO import numpy as np + +# TODO Change this out for a more professional approach +Entrez.email = "yunyangdeng@outlook.com" + from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding @@ -15,22 +20,31 @@ async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString: for feature in record.features: start = int(feature.location.start) end = int(feature.location.end) + qualifiers = feature.qualifiers + for qualifier_key in qualifiers: + qualifiers[qualifier_key] = set(qualifiers[qualifier_key]) sequence_features.append(StringAnnotation( type=feature.type, start=start, end=end+1, # Position is exclusive - feature_properties=feature.qualifiers + feature_properties=qualifiers )) return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features) -async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length=750): +async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length:int = 512, gene_targets:Set = set()): + # TODO implement asynchronous alignment algorithm reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id) query_annotations = list() aligner = PairwiseAligner("blastn") aligner.mode = "local" for annotation in reference_annotations.annotations: - if annotation.end - annotation.start > max_annotation_length: + if annotation.type != "gene" or "gene" not in annotation.feature_properties: + continue + if len(gene_targets) > 0 and "gene" in annotation.feature_properties: + if not annotation.feature_properties["gene"].intersection(gene_targets): + continue + if max_annotation_length > 0 and annotation.end - annotation.start > max_annotation_length: # TODO implement a failsafe continue feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation) @@ -38,7 +52,7 @@ async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: if len(alignments) < 1: # TODO implement a failsafe continue - top_alignment = sorted(aligner.align(query_string, annotation))[0] + top_alignment = sorted(alignments)[0] # TODO Check if alternatives are better query_annotations.append(StringAnnotation( type=annotation.type, # same as original diff --git a/src/mlstmyfasta/engine/data/genomics.py b/src/mlstmyfasta/engine/data/genomics.py index bc29444..85361f6 100644 --- a/src/mlstmyfasta/engine/data/genomics.py +++ b/src/mlstmyfasta/engine/data/genomics.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Mapping, Sequence +from typing import Mapping, Sequence, Set @dataclass @@ -7,7 +7,7 @@ class StringAnnotation: type: str start: int end: int - feature_properties: Mapping[str, Sequence[str]] + feature_properties: Mapping[str, Set[str]] @dataclass class AnnotatedString: diff --git a/src/mlstmyfasta/engine/external/databases/institutpasteur/query.py b/src/mlstmyfasta/engine/external/databases/institutpasteur/query.py deleted file mode 100644 index df4e677..0000000 --- a/src/mlstmyfasta/engine/external/databases/institutpasteur/query.py +++ /dev/null @@ -1,13 +0,0 @@ -import json -import requests -from mlstmyfasta.engine.data import MLST -from mlstmyfasta.engine.external.databases.institutpasteur.structures import SequenceRequestBody -from mlstmyfasta.engine.external.databases.institutpasteur.structures import SequenceResponse - -async def query_fasta_mlst_profile(database: str, request_body: SequenceRequestBody): - url: str = f"https://bigsdb.pasteur.fr/api/db/{database}/mlst/sequence" - request_result = requests.post(url, json=json.dumps(request_body), timeout=10000) - seq_result: SequenceResponse = request_result.json() - result = list() - for exact_match in seq_result.exact_matches: - result.append(MLST.Allele()) \ No newline at end of file diff --git a/src/mlstmyfasta/engine/external/databases/institutpasteur/__init__.py b/src/mlstmyfasta/engine/web/databases/institutpasteur/__init__.py similarity index 100% rename from src/mlstmyfasta/engine/external/databases/institutpasteur/__init__.py rename to src/mlstmyfasta/engine/web/databases/institutpasteur/__init__.py diff --git a/src/mlstmyfasta/engine/external/databases/institutpasteur/structures.py b/src/mlstmyfasta/engine/web/databases/institutpasteur/structures.py similarity index 100% rename from src/mlstmyfasta/engine/external/databases/institutpasteur/structures.py rename to src/mlstmyfasta/engine/web/databases/institutpasteur/structures.py diff --git a/tests/mlstmyfasta/engine/external/databases/institutpasteur/test_access.py b/tests/mlstmyfasta/engine/external/databases/institutpasteur/test_access.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/mlstmyfasta/engine/test_annotations.py b/tests/mlstmyfasta/engine/test_annotations.py index 8b7d2cc..33907cd 100644 --- a/tests/mlstmyfasta/engine/test_annotations.py +++ b/tests/mlstmyfasta/engine/test_annotations.py @@ -6,7 +6,10 @@ from mlstmyfasta.engine.data.genomics import AnnotatedString async def test_fetch_ncbi_genbank_with_id_works(): assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0 -async def test_annotate_from_genbank_results_in_annotations(): +async def test_annotate_from_genbank_for_adk_annotation(): sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq - annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence)) - assert annotated_sequence is AnnotatedString \ No newline at end of file + annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence), max_annotation_length=750, gene_targets=set(["adk"])) + assert isinstance(annotated_sequence, AnnotatedString) + assert len(annotated_sequence.annotations) >= 1 + assert annotated_sequence.annotations[0].type == "gene" + assert "adk" in annotated_sequence.annotations[0].feature_properties["gene"]