Tested annotation alignment system
This commit is contained in:
parent
28733337d2
commit
b834aa93b0
@ -12,8 +12,7 @@
|
||||
"extensions": [
|
||||
"ms-python.isort",
|
||||
"njpwerner.autodocstring",
|
||||
"ms-python.black-formatter",
|
||||
"ms-python.pylint"
|
||||
"ms-python.black-formatter"
|
||||
]
|
||||
}
|
||||
},
|
||||
|
@ -1,4 +1,4 @@
|
||||
requests
|
||||
aiohttp[speedups]
|
||||
biopython
|
||||
pytest
|
||||
pytest-asyncio
|
@ -1,10 +1,15 @@
|
||||
import asyncio
|
||||
from collections.abc import Set
|
||||
from typing import Any, Generator, List, Sequence
|
||||
from Bio.Align import PairwiseAligner
|
||||
from Bio import Entrez
|
||||
from Bio import SeqIO
|
||||
import numpy as np
|
||||
|
||||
|
||||
# TODO Change this out for a more professional approach
|
||||
Entrez.email = "yunyangdeng@outlook.com"
|
||||
|
||||
from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding
|
||||
|
||||
|
||||
@ -15,22 +20,31 @@ async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
|
||||
for feature in record.features:
|
||||
start = int(feature.location.start)
|
||||
end = int(feature.location.end)
|
||||
qualifiers = feature.qualifiers
|
||||
for qualifier_key in qualifiers:
|
||||
qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
|
||||
sequence_features.append(StringAnnotation(
|
||||
type=feature.type,
|
||||
start=start,
|
||||
end=end+1, # Position is exclusive
|
||||
feature_properties=feature.qualifiers
|
||||
feature_properties=qualifiers
|
||||
))
|
||||
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
|
||||
|
||||
|
||||
async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length=750):
|
||||
async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length:int = 512, gene_targets:Set = set()):
|
||||
# TODO implement asynchronous alignment algorithm
|
||||
reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
|
||||
query_annotations = list()
|
||||
aligner = PairwiseAligner("blastn")
|
||||
aligner.mode = "local"
|
||||
for annotation in reference_annotations.annotations:
|
||||
if annotation.end - annotation.start > max_annotation_length:
|
||||
if annotation.type != "gene" or "gene" not in annotation.feature_properties:
|
||||
continue
|
||||
if len(gene_targets) > 0 and "gene" in annotation.feature_properties:
|
||||
if not annotation.feature_properties["gene"].intersection(gene_targets):
|
||||
continue
|
||||
if max_annotation_length > 0 and annotation.end - annotation.start > max_annotation_length:
|
||||
# TODO implement a failsafe
|
||||
continue
|
||||
feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation)
|
||||
@ -38,7 +52,7 @@ async def annotate_from_genbank(genbank_id: str, query_name: str, query_string:
|
||||
if len(alignments) < 1:
|
||||
# TODO implement a failsafe
|
||||
continue
|
||||
top_alignment = sorted(aligner.align(query_string, annotation))[0]
|
||||
top_alignment = sorted(alignments)[0]
|
||||
# TODO Check if alternatives are better
|
||||
query_annotations.append(StringAnnotation(
|
||||
type=annotation.type, # same as original
|
||||
|
@ -1,5 +1,5 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Mapping, Sequence
|
||||
from typing import Mapping, Sequence, Set
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -7,7 +7,7 @@ class StringAnnotation:
|
||||
type: str
|
||||
start: int
|
||||
end: int
|
||||
feature_properties: Mapping[str, Sequence[str]]
|
||||
feature_properties: Mapping[str, Set[str]]
|
||||
|
||||
@dataclass
|
||||
class AnnotatedString:
|
||||
|
@ -1,13 +0,0 @@
|
||||
import json
|
||||
import requests
|
||||
from mlstmyfasta.engine.data import MLST
|
||||
from mlstmyfasta.engine.external.databases.institutpasteur.structures import SequenceRequestBody
|
||||
from mlstmyfasta.engine.external.databases.institutpasteur.structures import SequenceResponse
|
||||
|
||||
async def query_fasta_mlst_profile(database: str, request_body: SequenceRequestBody):
|
||||
url: str = f"https://bigsdb.pasteur.fr/api/db/{database}/mlst/sequence"
|
||||
request_result = requests.post(url, json=json.dumps(request_body), timeout=10000)
|
||||
seq_result: SequenceResponse = request_result.json()
|
||||
result = list()
|
||||
for exact_match in seq_result.exact_matches:
|
||||
result.append(MLST.Allele())
|
0
tests/mlstmyfasta/engine/external/databases/institutpasteur/test_access.py
vendored
Normal file
0
tests/mlstmyfasta/engine/external/databases/institutpasteur/test_access.py
vendored
Normal file
@ -6,7 +6,10 @@ from mlstmyfasta.engine.data.genomics import AnnotatedString
|
||||
async def test_fetch_ncbi_genbank_with_id_works():
|
||||
assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0
|
||||
|
||||
async def test_annotate_from_genbank_results_in_annotations():
|
||||
async def test_annotate_from_genbank_for_adk_annotation():
|
||||
sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq
|
||||
annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence))
|
||||
assert annotated_sequence is AnnotatedString
|
||||
annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence), max_annotation_length=750, gene_targets=set(["adk"]))
|
||||
assert isinstance(annotated_sequence, AnnotatedString)
|
||||
assert len(annotated_sequence.annotations) >= 1
|
||||
assert annotated_sequence.annotations[0].type == "gene"
|
||||
assert "adk" in annotated_sequence.annotations[0].feature_properties["gene"]
|
||||
|
Loading…
x
Reference in New Issue
Block a user