Completed untested annotation function

This commit is contained in:
Harrison Deng 2025-01-02 21:53:55 +00:00
parent aa2d494bee
commit 28733337d2
2 changed files with 13 additions and 5 deletions

View File

@ -24,8 +24,9 @@ async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotation_length=1000):
async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length=750):
reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
query_annotations = list()
aligner = PairwiseAligner("blastn")
aligner.mode = "local"
for annotation in reference_annotations.annotations:
@ -33,10 +34,16 @@ async def annotate_from_genbank(genbank_id: str, query_coding: str, max_annotati
# TODO implement a failsafe
continue
feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation)
alignments = aligner.align(query_coding, feature_string_sequence)
alignments = aligner.align(query_string, feature_string_sequence)
if len(alignments) < 1:
# TODO implement a failsafe
continue
top_alignment = sorted(aligner.align(query_coding, annotation))[0]
top_alignment = sorted(aligner.align(query_string, annotation))[0]
# TODO Check if alternatives are better
query_annotations.append(StringAnnotation(
type=annotation.type, # same as original
start=np.min(top_alignment.aligned[0]), # We only care about the start of first chunk
end=np.max(top_alignment.aligned[0]), # and the end of the last chunk
feature_properties=dict(annotation.feature_properties) # same as original
))
return AnnotatedString(name=query_name, sequence=query_string, annotations=query_annotations)

View File

@ -8,4 +8,5 @@ async def test_fetch_ncbi_genbank_with_id_works():
async def test_annotate_from_genbank_results_in_annotations():
sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq
assert (await annotate_from_genbank("CP011448.1", str(sequence))) is AnnotatedString
annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence))
assert annotated_sequence is AnnotatedString