Wrote simple annotation test

This commit is contained in:
Harrison Deng 2025-01-02 21:28:45 +00:00
parent a9ae193cdc
commit aa2d494bee
4 changed files with 58410 additions and 20 deletions

View File

@ -1,34 +1,42 @@
import asyncio
from typing import Any, Generator, List, Sequence
from Bio.Align import PairwiseAligner
from Bio import Entrez
from Bio import SeqIO
import numpy as np
from mlstmyfasta.engine.data.genomics import Strand, StrandFeature, get_feature_coding
from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding
async def fetch_ncbi_genbank(genbank_id: str) -> Strand:
with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
record = SeqIO.read(fetch_stream, "genbank")
sequence_features = list()
for feature in record.features:
start = int(feature.location.start)
end = int(feature.location.end)
sequence_features.append(StrandFeature(
sequence_features.append(StringAnnotation(
type=feature.type,
start=start,
end=end+1, # Position is exclusive
feature_properties=feature.qualifiers
))
return Strand(name=genbank_id, coding=str(record.seq), features=sequence_features)
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
async def annotate_from_genbank(genbank_id: str, query_coding: str):
strand = await fetch_ncbi_genbank(genbank_id=genbank_id)
async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotation_length=1000):
reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
aligner = PairwiseAligner("blastn")
aligner.mode = "local"
for feature in strand.features:
feature_coding = get_feature_coding(strand=strand, feature=feature)
if len(aligner.align(query_coding, feature)) < 1:
for annotation in reference_annotations.annotations:
if annotation.end - annotation.start > max_annotation_length:
# TODO implement a failsafe
continue
sequence = sorted(aligner.align(query_coding, feature))[0]
feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation)
alignments = aligner.align(query_coding, feature_string_sequence)
if len(alignments) < 1:
# TODO implement a failsafe
continue
top_alignment = sorted(aligner.align(query_coding, annotation))[0]
# TODO Check if alternatives are better

View File

@ -3,17 +3,17 @@ from typing import Mapping, Sequence
@dataclass
class StrandFeature:
class StringAnnotation:
type: str
start: int
end: int
feature_properties: Mapping[str, Sequence[str]]
@dataclass
class Strand:
class AnnotatedString:
name: str
coding: str
features: Sequence[StrandFeature]
sequence: str
annotations: Sequence[StringAnnotation]
def get_feature_coding(strand: Strand, feature: StrandFeature) -> str:
return strand.coding[feature.start:feature.end]
def get_feature_coding(annotated_string: AnnotatedString, string_annotation: StringAnnotation) -> str:
return annotated_string.sequence[string_annotation.start:string_annotation.end]

View File

@ -1,6 +1,11 @@
from mlstmyfasta.engine.annotations import fetch_ncbi_genbank
from mlstmyfasta.engine.annotations import annotate_from_genbank, fetch_ncbi_genbank
from Bio import SeqIO
from mlstmyfasta.engine.data.genomics import AnnotatedString
async def test_fetch_ncbi_genbank_with_id_works():
assert len((await fetch_ncbi_genbank("CP011448.1")).coding) > 0
assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0
async def test_annotate_from_genbank_results_in_annotations():
sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq
assert (await annotate_from_genbank("CP011448.1", str(sequence))) is AnnotatedString

File diff suppressed because it is too large Load Diff