Completed NCBI lookup, FASTA reading, ABIF reading, and Institut Pasteur querying

This commit is contained in:
Harrison Deng 2025-01-03 16:41:51 +00:00
parent b834aa93b0
commit 362e0867e5
22 changed files with 339 additions and 56 deletions

View File

@ -1,4 +1,5 @@
aiohttp[speedups] aiohttp[speedups]
biopython biopython
pytest pytest
pytest-asyncio pytest-asyncio
dataclasses-json

View File

@ -6,30 +6,8 @@ from Bio import Entrez
from Bio import SeqIO from Bio import SeqIO
import numpy as np import numpy as np
from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation
# TODO Change this out for a more professional approach from mlstmyfasta.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
Entrez.email = "yunyangdeng@outlook.com"
from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation, get_feature_coding
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
record = SeqIO.read(fetch_stream, "genbank")
sequence_features = list()
for feature in record.features:
start = int(feature.location.start)
end = int(feature.location.end)
qualifiers = feature.qualifiers
for qualifier_key in qualifiers:
qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
sequence_features.append(StringAnnotation(
type=feature.type,
start=start,
end=end+1, # Position is exclusive
feature_properties=qualifiers
))
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length:int = 512, gene_targets:Set = set()): async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length:int = 512, gene_targets:Set = set()):
@ -60,4 +38,7 @@ async def annotate_from_genbank(genbank_id: str, query_name: str, query_string:
end=np.max(top_alignment.aligned[0]), # and the end of the last chunk end=np.max(top_alignment.aligned[0]), # and the end of the last chunk
feature_properties=dict(annotation.feature_properties) # same as original feature_properties=dict(annotation.feature_properties) # same as original
)) ))
return AnnotatedString(name=query_name, sequence=query_string, annotations=query_annotations) return AnnotatedString(name=query_name, sequence=query_string, annotations=query_annotations)
def get_feature_coding(annotated_string: AnnotatedString, string_annotation: StringAnnotation) -> str:
return annotated_string.sequence[string_annotation.start:string_annotation.end]

View File

@ -2,5 +2,5 @@ from dataclasses import dataclass
@dataclass @dataclass
class Allele: class Allele:
allele_name: str allele_loci: str
allele_variant: str allele_variant: str

View File

@ -1,5 +1,6 @@
from dataclasses import dataclass from dataclasses import dataclass
from typing import Mapping, Sequence, Set from numbers import Number
from typing import Mapping, Sequence, Set, Union
@dataclass @dataclass
@ -8,12 +9,97 @@ class StringAnnotation:
start: int start: int
end: int end: int
feature_properties: Mapping[str, Set[str]] feature_properties: Mapping[str, Set[str]]
@dataclass @dataclass
class AnnotatedString: class NamedString:
name: str name: str
sequence: str sequence: str
@dataclass
class AnnotatedString(NamedString):
annotations: Sequence[StringAnnotation] annotations: Sequence[StringAnnotation]
def get_feature_coding(annotated_string: AnnotatedString, string_annotation: StringAnnotation) -> str: @dataclass
return annotated_string.sequence[string_annotation.start:string_annotation.end] class SangerTraceData:
sequence: Sequence[str]
seq_param_file_name: str
analysis_proto_settings_name: str
analysis_rpto_settings_ver: str
analysis_proto_xml_data: str
analysis_proto_xml_schema_ver: str
sample_comment: Union[None, str]
capillary_machine: bool
container_identifier: str
container_name: str
comment_title: str
channel_1: Sequence[Number]
channel_2: Sequence[Number]
channel_3: Sequence[Number]
channel_4: Sequence[Number]
measured_voltage_dv: Sequence[Number]
measured_current_ma: Sequence[Number]
measured_power_mw: Sequence[Number]
measured_temperature_celsius: Sequence[Number]
down_sample_factor: Number
dye_1: str
dye_2: str
dye_3: str
dye_4: str
dye_wavelength_1: str
dye_wavelength_2: str
dye_wavelength_3: str
dye_wavelength_4: str
dye_set_name: str
electrophoresis_voltage_setting_v: Number
start_run_event: str
stop_run_event: str
start_collection_event: str
stop_collection_event: str
base_order: Sequence[str]
gel_type_desc: str
injection_time_sec: Number
inection_voltage_v: Number
lane_or_capillary: Number
sample_tracking_id: str
length_to_detector_cm: Number
laser_power_mw: Number
instrument_name_and_serial: str
data_collection_module_file: str
model_number: str
pixels_avg_per_lane: Number
number_of_capillaries: Number
marked_off_scale_scans: Union[None, Sequence[Number]]
# Skipped Ovrl, OvrV
mobility_file: str
# Skipped PRJT, PROJ
pixel_bin_size: Number
# Skipped scan rate
results_group_comment: Union[None, str]
results_group_name: str
run_module_ver: str
run_module_xml: str
run_module_xml_ver: str
run_proto_name: str
run_proto_ver: str
run_start_date: str # Date time object
run_stop_date: str # Date time object
data_collection_start_date: str
data_collection_stop_date: str
run_name: str
run_start_time: str # time object
run_stop_time: str # time object
collection_start_time: str # time object
collection_stop_time: str # time object
saturated_data_points: Union[None, Sequence[Number]]
color_rescaling_divisor: Number
scan_count: Number
polymer_lot_expiration: str # date time object
polymer_lot_number: Number
sample_name: str
# Skipped genescan data
# Skipped size standard file name
data_collection_software_ver: str
data_collection_firmware_ver: str
run_temperature_setting_celcius: Number
well_id: str
plate_user_name: str

View File

@ -0,0 +1,104 @@
import asyncio
from numbers import Number
from os import path
from typing import Sequence, Union
from mlstmyfasta.engine.data.genomics import SangerTraceData
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
with open(seq_path, "rb") as seq_handle:
return SeqIO.read(seq_handle, "abi")
async def load_sanger_sequence(seq_path: str) -> SangerTraceData:
ext = path.splitext(seq_path)[1]
if ext.lower() != ".ab1" and ext.lower() != "abi":
raise ValueError(
'seq_path must have file extension of "ab1", or "abi".')
biopython_seq = await asyncio.to_thread(_biopython_read_abif_sequence, seq_path)
biopython_annotations = biopython_seq.annotations
# Lot of type ignoring since Biopython did not define their typing.
biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore
trace_data = SangerTraceData(
biopython_seq.seq,
biopython_abif_raw.get("APFN2"), # type: ignore
biopython_abif_raw.get("APrN1"), # type: ignore
biopython_abif_raw.get("APrV1"), # type: ignore
biopython_abif_raw.get("APrX1"), # type: ignore
biopython_abif_raw.get("APXV1"), # type: ignore
biopython_abif_raw.get("CMNT1"), # type: ignore
biopython_abif_raw.get("CpEP1"), # type: ignore
biopython_abif_raw.get("CTID1"), # type: ignore
biopython_abif_raw.get("CTNM1"), # type: ignore
biopython_abif_raw.get("CTTL1"), # type: ignore
biopython_abif_raw.get("DATA1"), # type: ignore
biopython_abif_raw.get("DATA2"), # type: ignore
biopython_abif_raw.get("DATA3"), # type: ignore
biopython_abif_raw.get("DATA4"), # type: ignore
biopython_abif_raw.get("DATA5"), # type: ignore
biopython_abif_raw.get("DATA6"), # type: ignore
biopython_abif_raw.get("DATA7"), # type: ignore
biopython_abif_raw.get("DATA8"), # type: ignore
biopython_abif_raw.get("DSam1"), # type: ignore
biopython_abif_raw.get("DyeN1"), # type: ignore
biopython_abif_raw.get("DyeN2"), # type: ignore
biopython_abif_raw.get("DyeN3"), # type: ignore
biopython_abif_raw.get("DyeN4"), # type: ignore
biopython_abif_raw.get("DyeW1"), # type: ignore
biopython_abif_raw.get("DyeW2"), # type: ignore
biopython_abif_raw.get("DyeW3"), # type: ignore
biopython_abif_raw.get("DyeW4"), # type: ignore
biopython_abif_raw.get("DySN1"), # type: ignore
biopython_abif_raw.get("EPVt1"), # type: ignore
biopython_abif_raw.get("EVNT1"), # type: ignore
biopython_abif_raw.get("EVNT2"), # type: ignore
biopython_abif_raw.get("EVNT3"), # type: ignore
biopython_abif_raw.get("EVNT4"), # type: ignore
biopython_abif_raw.get("FWO_1"), # type: ignore
biopython_abif_raw.get("GTyp1"), # type: ignore
biopython_abif_raw.get("InSc1"), # type: ignore
biopython_abif_raw.get("InVt1"), # type: ignore
biopython_abif_raw.get("LANE1"), # type: ignore
biopython_abif_raw.get("LIMS1"), # type: ignore
biopython_abif_raw.get("LNTD1"), # type: ignore
biopython_abif_raw.get("LsrP1"), # type: ignore
biopython_abif_raw.get("MCHN1"), # type: ignore
biopython_abif_raw.get("MODF1"), # type: ignore
biopython_abif_raw.get("MODL1"), # type: ignore
biopython_abif_raw.get("NAVG1"), # type: ignore
biopython_abif_raw.get("NLNE1"), # type: ignore
biopython_abif_raw.get("OfSc1"), # type: ignore
biopython_abif_raw.get("PDMF1"), # type: ignore
biopython_abif_raw.get("PXLB1"), # type: ignore
biopython_abif_raw.get("RGCm1"), # type: ignore
biopython_abif_raw.get("RGNm1"), # type: ignore
biopython_abif_raw.get("RMdV1"), # type: ignore
biopython_abif_raw.get("RMdX1"), # type: ignore
biopython_abif_raw.get("RMXV1"), # type: ignore
biopython_abif_raw.get("RPrN1"), # type: ignore
biopython_abif_raw.get("RPrV1"), # type: ignore
biopython_abif_raw.get("RUND1"), # type: ignore
biopython_abif_raw.get("RUND2"), # type: ignore
biopython_abif_raw.get("RUND3"), # type: ignore
biopython_abif_raw.get("RUND4"), # type: ignore
biopython_abif_raw.get("RunN1"), # type: ignore
biopython_abif_raw.get("RUNT1"), # type: ignore
biopython_abif_raw.get("RUNT2"), # type: ignore
biopython_abif_raw.get("RUNT3"), # type: ignore
biopython_abif_raw.get("RUNT4"), # type: ignore
biopython_abif_raw.get("Satd"), # type: ignore
biopython_abif_raw.get("Scal1"), # type: ignore
biopython_abif_raw.get("SCAN1"), # type: ignore
biopython_abif_raw.get("SMED1"), # type: ignore
biopython_abif_raw.get("SMLt"), # type: ignore
biopython_abif_raw.get("SMPL1"), # type: ignore
biopython_abif_raw.get("SVER1"), # type: ignore
biopython_abif_raw.get("SVER3"), # type: ignore
biopython_abif_raw.get("Tmpr1"), # type: ignore
biopython_abif_raw.get("TUBE"), # type: ignore
biopython_abif_raw.get("User") # type: ignore
)
return trace_data

View File

@ -0,0 +1,11 @@
import asyncio
from io import TextIOWrapper
from typing import Any, AsyncGenerator, Generator, Sequence, Union
from Bio import SeqIO
from mlstmyfasta.engine.data.genomics import NamedString
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
for fasta_sequence in await fasta_sequences:
yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))

View File

@ -0,0 +1,36 @@
from contextlib import AbstractAsyncContextManager
import re
from typing import Sequence
from aiohttp import ClientSession, ClientTimeout
from mlstmyfasta.engine.data.MLST import Allele
from mlstmyfasta.engine.data.genomics import NamedString
from mlstmyfasta.engine.remote.databases.institutpasteur.structures import InstitutPasteurSequenceRequest, InstitutPasteurSequenceResponse
class InstitutPasteurProfiler(AbstractAsyncContextManager):
async def __aenter__(self):
return self
def __init__(self, database_name: str):
self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
async def fetch_mlst_profile(self, sequence_string: str):
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = f"schemes/3/sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
})
sequence_response: dict = await response.json()
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
async def close(self):
await self._http_client.close()
async def __aexit__(self, exc_type, exc_value, traceback):
await self.close()

View File

@ -0,0 +1,22 @@
from dataclasses import dataclass
from typing import Sequence, Union
from dataclasses_json import dataclass_json
@dataclass_json
@dataclass
class InstitutPasteurSequenceRequest:
sequence: str
details: Union[bool, None]
partial_matches: Union[bool, None]
base64: Union[bool, None]
@dataclass_json
@dataclass
class InstitutPasteurSequenceResponse:
exact_matches: Sequence[tuple[int, str]]
start: Union[int, None]
end: Union[int, None]
orientation: Union[str, None]
length: Union[int, None]
contig: Union[str, None]

View File

@ -0,0 +1,27 @@
import asyncio
from Bio import Entrez
from Bio import SeqIO
# TODO Change this out for a more professional approach
Entrez.email = "yunyangdeng@outlook.com"
from mlstmyfasta.engine.data.genomics import AnnotatedString, StringAnnotation
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
record = SeqIO.read(fetch_stream, "genbank")
sequence_features = list()
for feature in record.features:
start = int(feature.location.start)
end = int(feature.location.end)
qualifiers = feature.qualifiers
for qualifier_key in qualifiers:
qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
sequence_features.append(StringAnnotation(
type=feature.type,
start=start,
end=end+1, # Position is exclusive
feature_properties=qualifiers
))
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)

View File

@ -1,18 +0,0 @@
from dataclasses import dataclass
from typing import Sequence, Union
@dataclass
class SequenceRequestBody:
sequence: str
details: Union[None, str]
partial_matches: Union[None, bool]
base64: Union[None, bool]
@dataclass
class SequenceResponse:
exact_matches: Sequence[tuple[str, str]]
start: Union[None, int]
end: Union[None, int]
orientation: Union[None, str]
length: Union[None, int]
contig: Union[None, str]

View File

@ -0,0 +1,8 @@
import os
from mlstmyfasta.engine.local.abif import load_sanger_sequence
async def test_load_sanger_sequence_has_data():
assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1")
result_data = await load_sanger_sequence("tests/resources/1I1_F_P1815443_047.ab1")
assert result_data is not None

View File

@ -0,0 +1,7 @@
from mlstmyfasta.engine.local.fasta import read_fasta
async def test_fasta_reader_not_none():
named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
async for named_string in named_strings:
assert named_string.name == "BX470248.1"

View File

@ -0,0 +1,16 @@
from Bio import SeqIO
from mlstmyfasta.engine.data.MLST import Allele
from mlstmyfasta.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
async def test_profiling_results_in_exact_matches_when_exact():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_profile(sequence_string=sequence)
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
async for exact_match in exact_matches:
assert isinstance(exact_match, Allele)
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
targets_left.remove(exact_match.allele_loci)
assert len(targets_left) == 0

View File

@ -0,0 +1,5 @@
from mlstmyfasta.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
async def test_fetch_ncbi_genbank_with_id_works():
assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0

View File

@ -1,14 +1,11 @@
from mlstmyfasta.engine.annotations import annotate_from_genbank, fetch_ncbi_genbank from mlstmyfasta.engine.annotate import annotate_from_genbank, fetch_ncbi_genbank
from Bio import SeqIO from Bio import SeqIO
from mlstmyfasta.engine.data.genomics import AnnotatedString from mlstmyfasta.engine.data.genomics import AnnotatedString
async def test_fetch_ncbi_genbank_with_id_works():
assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0
async def test_annotate_from_genbank_for_adk_annotation(): async def test_annotate_from_genbank_for_adk_annotation():
sequence = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", str(sequence), max_annotation_length=750, gene_targets=set(["adk"])) annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", sequence, max_annotation_length=750, gene_targets=set(["adk"]))
assert isinstance(annotated_sequence, AnnotatedString) assert isinstance(annotated_sequence, AnnotatedString)
assert len(annotated_sequence.annotations) >= 1 assert len(annotated_sequence.annotations) >= 1
assert annotated_sequence.annotations[0].type == "gene" assert annotated_sequence.annotations[0].type == "gene"

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.