Tested annotation alignment system

This commit is contained in:
Harrison Deng 2025-01-03 14:39:18 +00:00
parent 28733337d2
commit b834aa93b0
9 changed files with 28 additions and 25 deletions

View File

@ -12,8 +12,7 @@
"extensions": [ "extensions": [
"ms-python.isort", "ms-python.isort",
"njpwerner.autodocstring", "njpwerner.autodocstring",
"ms-python.black-formatter", "ms-python.black-formatter"
"ms-python.pylint"
] ]
} }
}, },

View File

@ -1,4 +1,4 @@
requests aiohttp[speedups]
biopython biopython
pytest pytest
pytest-asyncio pytest-asyncio

View File

@ -1,10 +1,15 @@
import asyncio import asyncio
from collections.abc import Set
from typing import Any, Generator, List, Sequence from typing import Any, Generator, List, Sequence
from Bio.Align import PairwiseAligner from Bio.Align import PairwiseAligner
from Bio import Entrez from Bio import Entrez
from Bio import SeqIO from Bio import SeqIO
import numpy as np import numpy as np
# TODO Change this out for a more professional approach
Entrez.email = "yunyangdeng@outlook.com"
from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding
@ -15,22 +20,31 @@ async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
for feature in record.features: for feature in record.features:
start = int(feature.location.start) start = int(feature.location.start)
end = int(feature.location.end) end = int(feature.location.end)
qualifiers = feature.qualifiers
for qualifier_key in qualifiers:
qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
sequence_features.append(StringAnnotation( sequence_features.append(StringAnnotation(
type=feature.type, type=feature.type,
start=start, start=start,
end=end+1, # Position is exclusive end=end+1, # Position is exclusive
feature_properties=feature.qualifiers feature_properties=qualifiers
)) ))
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features) return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length=750): async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length:int = 512, gene_targets:Set = set()):
# TODO implement asynchronous alignment algorithm
reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id) reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
query_annotations = list() query_annotations = list()
aligner = PairwiseAligner("blastn") aligner = PairwiseAligner("blastn")
aligner.mode = "local" aligner.mode = "local"
for annotation in reference_annotations.annotations: for annotation in reference_annotations.annotations:
if annotation.end - annotation.start > max_annotation_length: if annotation.type != "gene" or "gene" not in annotation.feature_properties:
continue
if len(gene_targets) > 0 and "gene" in annotation.feature_properties:
if not annotation.feature_properties["gene"].intersection(gene_targets):
continue
if max_annotation_length > 0 and annotation.end - annotation.start > max_annotation_length:
# TODO implement a failsafe # TODO implement a failsafe
continue continue
feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation) feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation)
@ -38,7 +52,7 @@ async def annotate_from_genbank(genbank_id: str, query_name: str, query_string:
if len(alignments) < 1: if len(alignments) < 1:
# TODO implement a failsafe # TODO implement a failsafe
continue continue
top_alignment = sorted(aligner.align(query_string, annotation))[0] top_alignment = sorted(alignments)[0]
# TODO Check if alternatives are better # TODO Check if alternatives are better
query_annotations.append(StringAnnotation( query_annotations.append(StringAnnotation(
type=annotation.type, # same as original type=annotation.type, # same as original

View File

@ -1,5 +1,5 @@
from dataclasses import dataclass from dataclasses import dataclass
from typing import Mapping, Sequence from typing import Mapping, Sequence, Set
@dataclass @dataclass
@ -7,7 +7,7 @@ class StringAnnotation:
type: str type: str
start: int start: int
end: int end: int
feature_properties: Mapping[str, Sequence[str]] feature_properties: Mapping[str, Set[str]]
@dataclass @dataclass
class AnnotatedString: class AnnotatedString:

View File

@ -1,13 +0,0 @@
import json
import requests
from mlstmyfasta.engine.data import MLST
from mlstmyfasta.engine.external.databases.institutpasteur.structures import SequenceRequestBody
from mlstmyfasta.engine.external.databases.institutpasteur.structures import SequenceResponse
async def query_fasta_mlst_profile(database: str, request_body: SequenceRequestBody):
url: str = f"https://bigsdb.pasteur.fr/api/db/{database}/mlst/sequence"
request_result = requests.post(url, json=json.dumps(request_body), timeout=10000)
seq_result: SequenceResponse = request_result.json()
result = list()
for exact_match in seq_result.exact_matches:
result.append(MLST.Allele())

View File

@ -6,7 +6,10 @@ from mlstmyfasta.engine.data.genomics import AnnotatedString
async def test_fetch_ncbi_genbank_with_id_works(): async def test_fetch_ncbi_genbank_with_id_works():
assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0 assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0
async def test_annotate_from_genbank_results_in_annotations(): async def test_annotate_from_genbank_for_adk_annotation():
sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq
annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence)) annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence), max_annotation_length=750, gene_targets=set(["adk"]))
assert annotated_sequence is AnnotatedString assert isinstance(annotated_sequence, AnnotatedString)
assert len(annotated_sequence.annotations) >= 1
assert annotated_sequence.annotations[0].type == "gene"
assert "adk" in annotated_sequence.annotations[0].feature_properties["gene"]