Wrote simple annotation test
This commit is contained in:
parent
a9ae193cdc
commit
aa2d494bee
@ -1,34 +1,42 @@
|
|||||||
|
import asyncio
|
||||||
from typing import Any, Generator, List, Sequence
|
from typing import Any, Generator, List, Sequence
|
||||||
from Bio.Align import PairwiseAligner
|
from Bio.Align import PairwiseAligner
|
||||||
from Bio import Entrez
|
from Bio import Entrez
|
||||||
from Bio import SeqIO
|
from Bio import SeqIO
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
from mlstmyfasta.engine.data.genomics import Strand, StrandFeature, get_feature_coding
|
from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding
|
||||||
|
|
||||||
|
|
||||||
async def fetch_ncbi_genbank(genbank_id: str) -> Strand:
|
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
|
||||||
with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
|
with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
|
||||||
record = SeqIO.read(fetch_stream, "genbank")
|
record = SeqIO.read(fetch_stream, "genbank")
|
||||||
sequence_features = list()
|
sequence_features = list()
|
||||||
for feature in record.features:
|
for feature in record.features:
|
||||||
start = int(feature.location.start)
|
start = int(feature.location.start)
|
||||||
end = int(feature.location.end)
|
end = int(feature.location.end)
|
||||||
sequence_features.append(StrandFeature(
|
sequence_features.append(StringAnnotation(
|
||||||
type=feature.type,
|
type=feature.type,
|
||||||
start=start,
|
start=start,
|
||||||
end=end+1, # Position is exclusive
|
end=end+1, # Position is exclusive
|
||||||
feature_properties=feature.qualifiers
|
feature_properties=feature.qualifiers
|
||||||
))
|
))
|
||||||
return Strand(name=genbank_id, coding=str(record.seq), features=sequence_features)
|
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
|
||||||
|
|
||||||
|
|
||||||
async def annotate_from_genbank(genbank_id: str, query_coding: str):
|
async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotation_length=1000):
|
||||||
strand = await fetch_ncbi_genbank(genbank_id=genbank_id)
|
reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
|
||||||
aligner = PairwiseAligner("blastn")
|
aligner = PairwiseAligner("blastn")
|
||||||
aligner.mode = "local"
|
aligner.mode = "local"
|
||||||
for feature in strand.features:
|
for annotation in reference_annotations.annotations:
|
||||||
feature_coding = get_feature_coding(strand=strand, feature=feature)
|
if annotation.end - annotation.start > max_annotation_length:
|
||||||
if len(aligner.align(query_coding, feature)) < 1:
|
|
||||||
# TODO implement a failsafe
|
# TODO implement a failsafe
|
||||||
continue
|
continue
|
||||||
sequence = sorted(aligner.align(query_coding, feature))[0]
|
feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation)
|
||||||
|
alignments = aligner.align(query_coding, feature_string_sequence)
|
||||||
|
if len(alignments) < 1:
|
||||||
|
# TODO implement a failsafe
|
||||||
|
continue
|
||||||
|
top_alignment = sorted(aligner.align(query_coding, annotation))[0]
|
||||||
|
# TODO Check if alternatives are better
|
||||||
|
|
@ -3,17 +3,17 @@ from typing import Mapping, Sequence
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class StrandFeature:
|
class StringAnnotation:
|
||||||
type: str
|
type: str
|
||||||
start: int
|
start: int
|
||||||
end: int
|
end: int
|
||||||
feature_properties: Mapping[str, Sequence[str]]
|
feature_properties: Mapping[str, Sequence[str]]
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Strand:
|
class AnnotatedString:
|
||||||
name: str
|
name: str
|
||||||
coding: str
|
sequence: str
|
||||||
features: Sequence[StrandFeature]
|
annotations: Sequence[StringAnnotation]
|
||||||
|
|
||||||
def get_feature_coding(strand: Strand, feature: StrandFeature) -> str:
|
def get_feature_coding(annotated_string: AnnotatedString, string_annotation: StringAnnotation) -> str:
|
||||||
return strand.coding[feature.start:feature.end]
|
return annotated_string.sequence[string_annotation.start:string_annotation.end]
|
@ -1,6 +1,11 @@
|
|||||||
from mlstmyfasta.engine.annotations import fetch_ncbi_genbank
|
from mlstmyfasta.engine.annotations import annotate_from_genbank, fetch_ncbi_genbank
|
||||||
|
from Bio import SeqIO
|
||||||
|
|
||||||
|
from mlstmyfasta.engine.data.genomics import AnnotatedString
|
||||||
|
|
||||||
async def test_fetch_ncbi_genbank_with_id_works():
|
async def test_fetch_ncbi_genbank_with_id_works():
|
||||||
assert len((await fetch_ncbi_genbank("CP011448.1")).coding) > 0
|
assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0
|
||||||
|
|
||||||
|
async def test_annotate_from_genbank_results_in_annotations():
|
||||||
|
sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq
|
||||||
|
assert (await annotate_from_genbank("CP011448.1", str(sequence))) is AnnotatedString
|
58377
tests/resources/tohama_I_bpertussis.fasta
Normal file
58377
tests/resources/tohama_I_bpertussis.fasta
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user