Moved to a general BIGSdb implementation
Updated tests. Removed ABIF UI for the time being. Began updating CLI.
This commit is contained in:
@@ -1,11 +1,13 @@
|
||||
import asyncio
|
||||
from numbers import Number
|
||||
from os import path
|
||||
from typing import Any, AsyncGenerator, Collection, Sequence, Union
|
||||
from typing import Any, AsyncGenerator, Collection, Iterable, Sequence, Union
|
||||
from automlst.engine.data.genomics import NamedString, SangerTraceData
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
from Bio import SeqIO, Align
|
||||
|
||||
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
|
||||
|
||||
|
||||
def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
|
||||
with open(seq_path, "rb") as seq_handle:
|
||||
@@ -110,9 +112,15 @@ def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedStri
|
||||
aligner.mode = "local"
|
||||
alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[
|
||||
0] # take the best alignment
|
||||
return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq)
|
||||
# TODO actually assemble the consensus sequence here
|
||||
raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
|
||||
|
||||
|
||||
async def reference_consensus_assembly(reference: Union[NamedString, str], sanger_traces: Iterable[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
    """Yield the reference-aligned portion of each Sanger trace.

    :param reference: Either a named reference sequence, or a plain string
        treated as an NCBI GenBank accession to fetch the reference from.
    :param sanger_traces: Sanger trace data to align against the reference.
    :yields: For each trace, the query side of the best local pairwise
        alignment against the reference.
    :raises NotImplementedError: consensus assembly itself is not written yet.
    """
    if isinstance(reference, str):
        # A bare string is an accession; resolve it to a sequence remotely.
        reference_seq = NamedString(name=reference, sequence=(await fetch_ncbi_genbank(reference)).sequence)
    else:
        reference_seq: NamedString = reference
    for sanger_trace in sanger_traces:
        # BUG FIX: align against the resolved reference_seq, not the raw
        # `reference` argument (which may still be the accession string).
        # Alignment is CPU-bound, so push it off the event loop.
        yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference_seq, sanger_trace))[1]
    # BUG FIX: dropped the placeholder `yield NamedString("NA", "NA")`,
    # which emitted a bogus record to every consumer.
    # TODO actually assemble the consensus sequence here
    raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
|
@@ -6,7 +6,7 @@ from typing import AsyncIterable, Iterable, Mapping, Sequence, Union
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
|
||||
|
||||
def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
|
||||
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
|
||||
result_dict: dict[str, list[str]] = {}
|
||||
for loci, alleles in alleles_map.items():
|
||||
result_dict[loci] = list()
|
||||
@@ -15,17 +15,19 @@ def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]])
|
||||
return result_dict
|
||||
|
||||
|
||||
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, MLSTProfile]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
    """Write streamed (name, MLST profile) pairs to a CSV file.

    The header is derived lazily from the first profile's loci, so an empty
    stream produces an empty file (no header row).

    :param mlst_profiles_iterable: Async stream of (sample name, profile) pairs.
    :param handle: Path of the CSV file to (over)write.
    """
    with open(handle, "w", newline='') as filehandle:
        header = None
        writer: Union[csv.DictWriter, None] = None
        async for name, mlst_profile in mlst_profiles_iterable:
            if writer is None:
                # First row seen: fix the column order from this profile's loci.
                header = ["st", "clonal-complex", "id", *mlst_profile.alleles.keys()]
                writer = csv.DictWriter(filehandle, fieldnames=header)
                writer.writeheader()
            row_dictionary = {
                "st": mlst_profile.sequence_type,
                "clonal-complex": mlst_profile.clonal_complex,
                "id": name,
                **dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
            }
            # BUG FIX: pass the row positionally; `rowdict` is an internal
            # parameter name of csv.DictWriter.writerow, not a stable keyword.
            writer.writerow(row_dictionary)
|
@@ -1,6 +1,6 @@
|
||||
import asyncio
|
||||
from io import TextIOWrapper
|
||||
from typing import Any, AsyncGenerator, Generator, Sequence, Union
|
||||
from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
|
||||
from Bio import SeqIO
|
||||
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
@@ -8,4 +8,9 @@ from automlst.engine.data.genomics import NamedString
|
||||
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
    """Parse a FASTA file off the event loop and yield each record.

    :param handle: Path or open text handle of the FASTA file.
    :yields: One NamedString (record id, sequence string) per record.
    """
    # SeqIO.parse does blocking I/O, so run it in a worker thread.
    records = await asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
    for record in records:
        yield NamedString(record.id, str(record.seq))
|
||||
|
||||
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
    """Yield every record from several FASTA handles, in handle order.

    :param handles: Paths or open text handles of FASTA files.
    :yields: NamedStrings from each file, concatenated.
    """
    for single_handle in handles:
        async for named_sequence in read_fasta(single_handle):
            yield named_sequence
|
127
src/automlst/engine/remote/databases/bigsdb.py
Normal file
127
src/automlst/engine/remote/databases/bigsdb.py
Normal file
@@ -0,0 +1,127 @@
|
||||
from collections import defaultdict
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
|
||||
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
|
||||
class BigSDBMLSTProfiler(AbstractAsyncContextManager):
    """Asynchronous MLST profiler backed by one scheme of a BIGSdb REST API.

    Use as an async context manager so the HTTP session is always closed:
    ``async with BigSDBMLSTProfiler(api, db, scheme_id) as profiler: ...``
    """

    def __init__(self, database_api: str, database_name: str, schema_id: int):
        """
        :param database_api: Base BIGSdb API URL, e.g. "https://bigsdb.pasteur.fr/api".
        :param database_name: Sequence-definition database name within that API.
        :param schema_id: Numeric MLST scheme ID within the database.
        """
        self._base_url = f"{database_api}/db/{database_name}/schemes/{schema_id}/"
        # NOTE(review): ClientTimeout(10000) is a 10000-second total timeout —
        # confirm 10 seconds was not intended.
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))

    async def __aenter__(self):
        return self

    async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
        """Query the scheme's sequence endpoint and yield every exact allele match.

        :param sequence_string: Nucleotide sequence to identify.
        :yields: One Allele per exact match, per locus.
        :raises ValueError: if the API response contains no exact matches.
        """
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        uri_path = "sequence"
        # BUG FIX: use `async with` so the connection is released (the old
        # bare `await post(...)` leaked it; fetch_mlst_st already did this).
        async with self._http_client.post(uri_path, json={
            "sequence": sequence_string
        }) as response:
            sequence_response: dict = await response.json()
        if "exact_matches" not in sequence_response:
            # BUG FIX: previously `pass`ed and crashed with a bare KeyError below.
            raise ValueError("No exact allele matches returned for the given sequence.")
        exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
        for allele_loci, alleles in exact_matches.items():
            for allele in alleles:
                allele_id = allele["allele_id"]  # fixed `alelle_id` typo
                yield Allele(allele_loci=allele_loci, allele_variant=allele_id)

    async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
        """Submit allele designations and return the matching MLST profile.

        :param alleles: Async stream of alleles (e.g. from fetch_mlst_allele_variants).
        :returns: The resolved MLSTProfile (alleles, ST, clonal complex).
        :raises ValueError: if the API reports no matching fields.
        """
        uri_path = "designations"
        allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
        async for allele in alleles:
            allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
        request_json = {
            "designations": allele_request_dict
        }
        async with self._http_client.post(uri_path, json=request_json) as response:
            response_json = await response.json()
        if "fields" not in response_json:
            # BUG FIX: previously `pass`ed and crashed with a bare KeyError below.
            raise ValueError("Designation query returned no fields (invalid parameters or no exact match).")
        schema_fields_returned = response_json["fields"]
        schema_exact_matches: dict = response_json["exact_matches"]
        allele_map: dict[str, list[Allele]] = defaultdict(list)
        for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
            for exact_match_allele in exact_match_alleles:
                allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
        return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])

    async def profile_string(self, string: str) -> MLSTProfile:
        """Profile a single sequence string end-to-end (alleles, then ST)."""
        alleles = self.fetch_mlst_allele_variants(string)
        return await self.fetch_mlst_st(alleles)

    async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString]) -> AsyncGenerator[tuple[str, MLSTProfile], Any]:
        """Yield (name, profile) for each named sequence in the stream."""
        async for named_string in namedStrings:
            yield (named_string.name, await self.profile_string(named_string.sequence))

    async def close(self):
        """Release the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
|
||||
|
||||
class BIGSdbIndex(AbstractAsyncContextManager):
    """Index of known BIGSdb installations: discovers sequence-definition
    databases and their MLST schemes, and builds profilers for them.

    Results are cached per instance; pass ``force=True`` to refresh.
    """

    KNOWN_BIGSDB_APIS = {
        "https://bigsdb.pasteur.fr/api",
        "https://rest.pubmlst.org"
    }

    def __init__(self):
        self._http_client = ClientSession()
        # Cache: seqdef database name -> API base URL it was found under.
        self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
        # Cache: seqdef database name -> {scheme description: scheme id}.
        self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
        super().__init__()

    async def __aenter__(self):
        return self

    async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
        """Map every known seqdef database name to its API base URL.

        :param force: Refresh the cache even if already populated.
        """
        if self._known_seqdef_dbs_origin is not None and not force:
            return self._known_seqdef_dbs_origin
        known_seqdef_dbs = dict()
        for known_bigsdb in BIGSdbIndex.KNOWN_BIGSDB_APIS:
            async with self._http_client.get(f"{known_bigsdb}/db") as response:
                response_json_databases = await response.json()
                for database_group in response_json_databases:
                    for database_info in database_group["databases"]:
                        # Only sequence-definition databases are profilable.
                        if str(database_info["name"]).endswith("seqdef"):
                            known_seqdef_dbs[database_info["name"]] = known_bigsdb
        self._known_seqdef_dbs_origin = dict(known_seqdef_dbs)
        return self._known_seqdef_dbs_origin

    async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
        """Return the API base URL hosting the given seqdef database.

        :raises KeyError: if the database is not known to any indexed API.
        """
        return (await self.get_known_seqdef_dbs())[seqdef_db_name]

    async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
        """Map scheme descriptions to scheme IDs for a seqdef database.

        :param seqdef_db_name: Sequence-definition database to inspect.
        :param force: Refresh the cache even if already populated.
        """
        # BUG FIX: the original indexed self._seqdefdb_schemas[seqdef_db_name]
        # directly, raising KeyError on the first call for any database.
        cached = self._seqdefdb_schemas.get(seqdef_db_name)
        if cached is not None and not force:
            return cached
        # BUG FIX: the schemes route lives under /db/ on BIGSdb APIs (the same
        # pattern get_known_seqdef_dbs and BigSDBMLSTProfiler use).
        uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)}/db/{seqdef_db_name}/schemes"
        async with self._http_client.get(uri_path) as response:
            response_json = await response.json()
            schema_descriptions: dict[str, int] = dict()
            for scheme_definition in response_json["schemes"]:
                # The scheme URL ends in the numeric scheme ID.
                scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1])
                scheme_desc: str = scheme_definition["description"]
                schema_descriptions[scheme_desc] = scheme_id
            self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
            return schema_descriptions

    async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BigSDBMLSTProfiler:
        """Construct a profiler for one scheme of the given seqdef database."""
        return BigSDBMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)

    async def close(self):
        """Release the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
|
||||
|
@@ -1,69 +0,0 @@
|
||||
from collections import defaultdict
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
import re
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
from automlst.engine.remote.databases.mlst import MLSTProfiler
|
||||
|
||||
class InstitutPasteurProfiler(MLSTProfiler):
    """MLST profiler for the Institut Pasteur BIGSdb instance."""

    def __init__(self, database_name: str):
        """:param database_name: Sequence-definition database to query."""
        self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))

    async def __aenter__(self):
        return self

    async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
        """Yield each exact allele match for the sequence under one scheme."""
        # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
        endpoint = f"schemes/{schema_id}/sequence"
        query_response = await self._http_client.post(endpoint, json={
            "sequence": sequence_string
        })
        payload: dict = await query_response.json()
        matches: dict[str, Sequence[dict[str, str]]] = payload["exact_matches"]
        for locus, locus_matches in matches.items():
            for match in locus_matches:
                yield Allele(allele_loci=locus, allele_variant=match["allele_id"])

    async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
        """Resolve streamed alleles to an MLST profile via the designations endpoint."""
        endpoint = f"schemes/{schema_id}/designations"
        designations: dict[str, list[dict[str, str]]] = defaultdict(list)
        async for allele in alleles:
            designations[allele.allele_loci].append({"allele": str(allele.allele_variant)})
        query_response = await self._http_client.post(endpoint, json={
            "designations": designations
        })
        payload = await query_response.json()
        fields = payload["fields"]
        matches = payload["exact_matches"]
        profile_alleles: dict[str, list[Allele]] = defaultdict(list)
        for locus, locus_matches in matches.items():
            for match in locus_matches:
                profile_alleles[locus].append(Allele(locus, match["allele_id"]))
        return MLSTProfile(profile_alleles, fields["ST"], fields["clonal_complex"])

    async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
        """Profile a sequence string end-to-end under the given scheme."""
        return await self.fetch_mlst_st(schema_id, self.fetch_mlst_allele_variants(schema_id, string))

    async def get_scheme_ids(self) -> Mapping[str, int]:
        """Map scheme descriptions to their numeric scheme IDs."""
        listing_response = await self._http_client.get("schemes")
        listing = await listing_response.json()
        descriptions: Mapping[str, int] = dict()
        for scheme in listing["schemes"]:
            identifier: int = int(str(scheme["scheme"]).split("/")[-1])
            description: str = scheme["description"]
            descriptions[description] = identifier
        return descriptions

    async def close(self):
        """Release the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
|
@@ -1,33 +0,0 @@
|
||||
from abc import abstractmethod
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Union
|
||||
|
||||
from aiohttp import ClientSession
|
||||
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
|
||||
MLST_DATABASES = [
|
||||
"https://bigsdb.pasteur.fr/api/db",
|
||||
"https://rest.pubmlst.org/db"
|
||||
]
|
||||
|
||||
class MLSTProfiler(AbstractAsyncContextManager):
    """Abstract asynchronous MLST profiling interface over a typing database."""

    @abstractmethod
    def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
        """Yield every allele variant exactly matching the given sequence."""
        pass

    @abstractmethod
    async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
        """Resolve a stream of alleles to a full MLST profile."""
        pass

    @abstractmethod
    async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
        """Profile a raw sequence string end-to-end."""
        pass

    @abstractmethod
    async def close(self):
        """Release any held resources (e.g. HTTP sessions)."""
        pass

    @abstractmethod
    async def get_scheme_ids(self) -> Mapping[str, int]:
        """Map scheme descriptions to their numeric scheme IDs."""
        pass
|
@@ -1,68 +0,0 @@
|
||||
from collections import defaultdict
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
import re
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
from automlst.engine.remote.databases.mlst import MLSTProfiler
|
||||
|
||||
class PubMLSTProfiler(MLSTProfiler):
    """MLST profiler for the PubMLST BIGSdb instance."""

    def __init__(self, database_name: str):
        """:param database_name: Sequence-definition database to query."""
        self._base_url = f"https://rest.pubmlst.org/db/{database_name}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))

    async def __aenter__(self):
        return self

    async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
        """Yield each exact allele match for the sequence under one scheme."""
        sequence_endpoint = f"schemes/{schema_id}/sequence"
        api_response = await self._http_client.post(sequence_endpoint, json={
            "sequence": sequence_string
        })
        response_body: dict = await api_response.json()
        exact_hits: dict[str, Sequence[dict[str, str]]] = response_body["exact_matches"]
        for hit_locus, hits in exact_hits.items():
            for hit in hits:
                yield Allele(allele_loci=hit_locus, allele_variant=hit["allele_id"])

    async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
        """Resolve streamed alleles to an MLST profile via the designations endpoint."""
        designation_endpoint = f"schemes/{schema_id}/designations"
        designation_request: dict[str, list[dict[str, str]]] = defaultdict(list)
        async for allele in alleles:
            designation_request[allele.allele_loci].append({"allele": str(allele.allele_variant)})
        api_response = await self._http_client.post(designation_endpoint, json={
            "designations": designation_request
        })
        response_body = await api_response.json()
        returned_fields = response_body["fields"]
        exact_hits = response_body["exact_matches"]
        resolved_alleles: dict[str, list[Allele]] = defaultdict(list)
        for hit_locus, hits in exact_hits.items():
            for hit in hits:
                resolved_alleles[hit_locus].append(Allele(hit_locus, hit["allele_id"]))
        return MLSTProfile(resolved_alleles, returned_fields["ST"], returned_fields["clonal_complex"])

    async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
        """Profile a sequence string end-to-end under the given scheme."""
        return await self.fetch_mlst_st(schema_id, self.fetch_mlst_allele_variants(schema_id, string))

    async def get_scheme_ids(self) -> Mapping[str, int]:
        """Map scheme descriptions to their numeric scheme IDs."""
        api_response = await self._http_client.get("schemes")
        response_body = await api_response.json()
        scheme_index: Mapping[str, int] = dict()
        for scheme in response_body["schemes"]:
            numeric_id: int = int(str(scheme["scheme"]).split("/")[-1])
            label: str = scheme["description"]
            scheme_index[label] = numeric_id
        return scheme_index

    async def close(self):
        """Release the underlying HTTP session."""
        await self._http_client.close()

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.close()
|
Reference in New Issue
Block a user