126 lines
6.4 KiB
Python
126 lines
6.4 KiB
Python
import asyncio
|
|
from numbers import Number
|
|
from os import path
|
|
from typing import Any, AsyncGenerator, Collection, Iterable, Sequence, Union
|
|
from automlst.engine.data.genomics import NamedString, SangerTraceData
|
|
from Bio.SeqRecord import SeqRecord
|
|
from Bio import SeqIO, Align
|
|
|
|
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
|
|
|
|
|
|
def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
    """Parse the single record stored in an ABIF trace file with Biopython.

    This helper is synchronous and blocks on file I/O.

    Args:
        seq_path: Path of the trace file to open (read in binary mode).

    Returns:
        The record produced by ``SeqIO.read`` using the ``"abi"`` format.
    """
    with open(seq_path, mode="rb") as abif_file:
        record = SeqIO.read(abif_file, "abi")
    return record
|
|
|
|
|
|
# Keys looked up in Biopython's "abif_raw" annotation, listed in the exact
# positional order in which read_abif forwards them to SangerTraceData (after
# the file name and the sequence). Do not reorder without also changing
# SangerTraceData.
_ABIF_RAW_TAG_KEYS = (
    "APFN2",
    "APrN1",
    "APrV1",
    "APrX1",
    "APXV1",
    "CMNT1",
    "CpEP1",
    "CTID1",
    "CTNM1",
    "CTTL1",
    "DATA1",
    "DATA2",
    "DATA3",
    "DATA4",
    "DATA5",
    "DATA6",
    "DATA7",
    "DATA8",
    "DSam1",
    "DyeN1",
    "DyeN2",
    "DyeN3",
    "DyeN4",
    "DyeW1",
    "DyeW2",
    "DyeW3",
    "DyeW4",
    "DySN1",
    "EPVt1",
    "EVNT1",
    "EVNT2",
    "EVNT3",
    "EVNT4",
    "FWO_1",
    "GTyp1",
    "InSc1",
    "InVt1",
    "LANE1",
    "LIMS1",
    "LNTD1",
    "LsrP1",
    "MCHN1",
    "MODF1",
    "MODL1",
    "NAVG1",
    "NLNE1",
    "OfSc1",
    "PDMF1",
    "PXLB1",
    "RGCm1",
    "RGNm1",
    "RMdV1",
    "RMdX1",
    "RMXV1",
    "RPrN1",
    "RPrV1",
    "RUND1",
    "RUND2",
    "RUND3",
    "RUND4",
    "RunN1",
    "RUNT1",
    "RUNT2",
    "RUNT3",
    "RUNT4",
    # The next tag and "SMLt1", "TUBE1", "User1" below were previously looked
    # up without their tag number ("Satd", "SMLt", "TUBE", "User"). Biopython
    # keys abif_raw by tag name + tag number, so the bare names never matched
    # and the corresponding fields were always None.
    "Satd1",
    "Scal1",
    "SCAN1",
    "SMED1",
    "SMLt1",
    "SMPL1",
    "SVER1",
    "SVER3",
    "Tmpr1",
    "TUBE1",
    "User1",
)


async def read_abif(seq_path: str) -> SangerTraceData:
    """Read an ABIF Sanger trace file into a ``SangerTraceData`` record.

    The blocking Biopython parse is run through ``asyncio.to_thread`` so the
    calling coroutine can be awaited without performing the file read itself.

    Args:
        seq_path: Path to the trace file. Its extension must be ``.ab1`` or
            ``.abi`` (compared case-insensitively).

    Returns:
        A ``SangerTraceData`` built from the file's base name, the parsed
        sequence, and the raw ABIF tag values. A tag that is absent from the
        file is passed as ``None``.

    Raises:
        ValueError: If ``seq_path`` does not end in ``.ab1`` or ``.abi``.
        KeyError: If the parsed record has no ``"abif_raw"`` annotation.
    """
    # splitext keeps the leading dot, so both accepted suffixes need one. The
    # previous check compared against "abi" and therefore rejected every
    # ".abi" file.
    ext = path.splitext(seq_path)[1].lower()
    if ext not in (".ab1", ".abi"):
        raise ValueError(
            'seq_path must have file extension of "ab1", or "abi".')

    biopython_seq = await asyncio.to_thread(_biopython_read_abif_sequence, seq_path)

    # Lot of type ignoring since Biopython did not define their typing.
    biopython_abif_raw = biopython_seq.annotations["abif_raw"]  # type: ignore

    return SangerTraceData(
        path.basename(seq_path),
        biopython_seq.seq,
        *(biopython_abif_raw.get(tag_key) for tag_key in _ABIF_RAW_TAG_KEYS),  # type: ignore
    )
|
|
|
|
|
|
def _biopython_local_pairwise_alignment(
    reference: NamedString,
    query: NamedString,
) -> tuple[NamedString, NamedString]:
    """Locally align ``query`` against ``reference`` (unfinished stub).

    Runs a Biopython ``PairwiseAligner`` configured with ``"blastn"`` scoring
    in local mode, selects one alignment, and then always raises because the
    consensus assembly step has not been written yet.

    Args:
        reference: Named sequence used as the alignment target.
        query: Named sequence aligned onto the reference.

    Raises:
        NotImplementedError: Always, after the alignment has been computed.
    """
    local_aligner = Align.PairwiseAligner(scoring="blastn")
    local_aligner.mode = "local"

    # take the best alignment
    # NOTE(review): this is the first element after an ascending sort of the
    # aligner's results; confirm that this ordering really selects the
    # intended alignment.
    ordered_alignments = sorted(
        local_aligner.align(reference.sequence, query.sequence)
    )
    alignment_result = ordered_alignments[0]

    # TODO actually assemble the consensus sequence here
    raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
|
|
|
|
|
|
async def reference_consensus_assembly(
    reference: Union[NamedString, str],
    sanger_traces: Iterable[SangerTraceData],
) -> AsyncGenerator[NamedString, Any]:
    """Assemble consensus sequences against a reference (unfinished stub).

    Currently yields a placeholder ``NamedString("NA", "NA")`` for every
    trace and then raises, because the real assembly is not implemented.

    Args:
        reference: Either a ready ``NamedString`` or a string that is passed
            to ``fetch_ncbi_genbank`` to obtain the reference sequence.
        sanger_traces: Traces to assemble; one placeholder is yielded per
            element.

    Yields:
        ``NamedString("NA", "NA")`` once per trace.

    Raises:
        NotImplementedError: After all traces have been iterated.
    """
    reference_seq: NamedString
    if not isinstance(reference, str):
        reference_seq = reference
    else:
        # A plain string is resolved through GenBank and keeps that string as
        # the resulting sequence's name.
        genbank_record = await fetch_ncbi_genbank(reference)
        reference_seq = NamedString(name=reference, sequence=genbank_record.sequence)

    for _ in sanger_traces:
        yield NamedString("NA", "NA")

    raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")