Moved to a general BIGSdb implementation

Updated tests

Removed ABIF UI for the time being

Began updating CLI
This commit is contained in:
Harrison Deng 2025-01-08 21:32:10 +00:00
parent 645357ac58
commit 42d0f56b18
20 changed files with 403 additions and 414 deletions

12
.vscode/launch.json vendored
View File

@ -6,18 +6,12 @@
"configurations": [
{
"name": "CLI ipdbmlst",
"name": "CLI blank",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/src/automlst/cli/root.py",
"program": "${workspaceFolder}/src/automlst/cli/program.py",
"console": "integratedTerminal",
"args": [
"-fa",
"${workspaceFolder}/tests/resources/tohama_I_bpertussis.fasta",
"-ipdbmlst",
"pubmlst_bordetella_seqdef",
"${workspaceFolder}/output"
],
"args": [],
"cwd": "${workspaceFolder}/src",
"env": {
"PYTHONPATH": "${workspaceFolder}/src"

View File

@ -13,7 +13,7 @@ requires-python = ">=3.11"
description = "A tool to rapidly fetch fetch MLST profiles given sequences for various diseases."
[project.scripts]
automlst = "automlst.cli.root:run"
automlst = "automlst.cli.program:run"
[tool.pyright]
extraPaths = ["src"]

View File

@ -1,23 +0,0 @@
from os import path
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence
from automlst.engine.data.mlst import MLSTProfile
from automlst.engine.data.genomics import NamedString
from automlst.engine.local.abif import read_abif
from automlst.engine.local.fasta import read_fasta
from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler
async def aggregate_sequences(fastas: Iterable[str], abifs: Iterable[str]) -> AsyncGenerator[str, Any]:
for fasta_path in fastas:
async for fasta in read_fasta(fasta_path):
yield fasta.sequence
for abif_path in abifs:
abif_data = await read_abif(abif_path)
yield "".join(abif_data.sequence)
async def profile_all_genetic_strings(strings: AsyncIterable[str], database_name: str) -> Sequence[MLSTProfile]:
profiles = list()
async with InstitutPasteurProfiler(database_name=database_name) as profiler:
async for string in strings:
profiles.append(await profiler.profile_string(string))
return profiles

43
src/automlst/cli/info.py Normal file
View File

@ -0,0 +1,43 @@
import asyncio
from automlst.cli import program
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
parser = program.subparsers.add_parser(__name__)
parser.add_argument(
"--retrieve-bigsdbs", "-l",
action="store_true",
dest="list_dbs",
required=False,
default=False,
type=bool,
help="Lists all known BIGSdb MLST databases (fetched from known APIs and cached)."
)
parser.add_argument(
"--retrieve-bigsdb-schemas", "-lschemas",
nargs="+",
action="extend",
dest="list_bigsdb_schemas",
required=False,
default=[],
type=str,
help="Lists the known schema IDs for a given BIGSdb sequence definition database name"
)
async def run(args):
async with BIGSdbIndex() as bigsdb_index:
if args.list_dbs:
known_seqdef_dbs = await bigsdb_index.get_known_seqdef_dbs(force=False)
print(", ".join(known_seqdef_dbs.keys()))
for bigsdb_schema_name in args.list_bigsdb_schemas:
schemas = await bigsdb_index.get_schemas_for_seqdefdb(bigsdb_schema_name)
for schema_desc, schema_id in schemas.items():
print(f"{schema_desc}: {schema_id}")
def run_asynchronously(args):
asyncio.run(run(args))
parser.set_defaults(func=run_asynchronously)

View File

@ -0,0 +1,55 @@
import asyncio
import datetime
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence, Union
from automlst.cli import program
from automlst.engine.data.genomics import NamedString
from automlst.engine.data.mlst import MLSTProfile
from automlst.engine.local.abif import read_abif, reference_consensus_assembly
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_fasta, read_multiple_fastas
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler
parser = program.subparsers.add_parser(__name__)
parser.add_argument(
"--fasta", "-fa", "-fst",
nargs="+",
action='extend',
dest="fastas",
required=False,
default=[],
type=str,
help="The FASTA files to process. Multiple can be listed."
)
parser.add_argument(
"seqdefdb",
help="The BIGSdb seqdef database to use for typing."
)
parser.add_argument(
"schema",
type=int,
help="The BIGSdb seqdef database schema ID (integer) to use for typing."
)
parser.add_argument(
"out",
default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
help="The output CSV name (.csv will be appended)."
)
async def run(args):
async with BIGSdbIndex() as bigsdb_index:
gen_strings = read_multiple_fastas(args.fastas)
async with await bigsdb_index.build_profiler_from_seqdefdb(args.seqdefdb, args.schema) as mlst_profiler:
mlst_profiles = mlst_profiler.profile_multiple_strings(gen_strings)
await write_mlst_profiles_as_csv(mlst_profiles, args.out)
def run_asynchronously(args):
asyncio.run(run(args))
parser.set_defaults(func=run_asynchronously)

View File

@ -0,0 +1,22 @@
import argparse
import asyncio
import datetime
from os import path
import os
from automlst.engine.data.genomics import NamedString
from automlst.engine.local.abif import read_abif
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_fasta
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
root_parser = argparse.ArgumentParser()
subparsers = root_parser.add_subparsers(required=True)
def run():
args = root_parser.parse_args()
args.func(args)
if __name__ == "__main__":
run()

View File

@ -1,86 +0,0 @@
import argparse
import asyncio
import datetime
from os import path
import os
from automlst.cli import aggregated
from automlst.engine.data.genomics import NamedString
from automlst.engine.local.abif import read_abif
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_fasta
parser = argparse.ArgumentParser()
parser.add_argument(
"--run-name", "-name",
dest="run_name",
required=False,
default=datetime.datetime.now().strftime(r"%Y%m%d%H%M%S"),
type=str,
help="The name of the run. Will use a date and time string if not provided."
)
parser.add_argument(
"--fasta", "-fa", "-fst",
nargs="+",
action='extend',
dest="fastas",
required=False,
default=[],
type=str,
help="The FASTA files to process. Multiple can be listed."
)
parser.add_argument(
"--abif", "-abi", "-ab1",
action='extend',
dest="abifs",
required=False,
default=[],
type=str,
help="The ABIF files to process. Multiple can be listed."
)
parser.add_argument(
"--ncbi-assembly-reference", "-refncbi",
dest="ncbi_assembly_reference",
required=False,
default=None,
type=str,
help="The NCBI GenBank accession ID for the consensus assembly. Either this argument, or the path equivalent must be given if ABIF files are used."
)
parser.add_argument(
"--assembly-reference", "-ref",
dest="assembly_reference",
required=False,
default=None,
type=str,
help="The path to the FASTA sequence to be used as a reference for consensus building."
)
parser.add_argument(
"--institut-pasteur-mlst",
"-ipdbmlst",
dest="institut_pasteur_db",
required=False,
default=None,
type=str,
help="The Institut Pasteur MLST database to use."
)
parser.add_argument(
"out",
default="./.",
help="The output folder. Files will be named by the provided (or default) run name. Outputs will be automatically generated depending on which arguments are used."
)
def run():
args = parser.parse_args()
gen_strings = aggregated.aggregate_sequences(args.fastas, args.abifs)
os.makedirs(args.out, exist_ok=True)
if args.institut_pasteur_db is not None:
mlst_profiles = aggregated.profile_all_genetic_strings(
gen_strings, args.institut_pasteur_db)
asyncio.run(write_mlst_profiles_as_csv(
asyncio.run(mlst_profiles), str(path.join(args.out, "MLST_" + args.run_name + ".csv"))))
if __name__ == "__main__":
run()

View File

@ -1,11 +1,13 @@
import asyncio
from numbers import Number
from os import path
from typing import Any, AsyncGenerator, Collection, Sequence, Union
from typing import Any, AsyncGenerator, Collection, Iterable, Sequence, Union
from automlst.engine.data.genomics import NamedString, SangerTraceData
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, Align
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
with open(seq_path, "rb") as seq_handle:
@ -110,9 +112,15 @@ def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedStri
aligner.mode = "local"
alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[
0] # take the best alignment
return NamedString(alignment_result.sequences[0].id, alignment_result.sequences[0].seq), NamedString(alignment_result.sequences[1].id, alignment_result.sequences[1].seq)
# TODO actually assemble the consensus sequence here
raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
async def reference_consensus_assembly(reference: NamedString, sanger_traces: Collection[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
async def reference_consensus_assembly(reference: Union[NamedString, str], sanger_traces: Iterable[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
if isinstance(reference, str):
reference_seq = NamedString(name=reference, sequence=(await fetch_ncbi_genbank(reference)).sequence)
else:
reference_seq: NamedString = reference
for sanger_trace in sanger_traces:
yield (await asyncio.to_thread(_biopython_local_pairwise_alignment, reference, sanger_trace))[1]
yield NamedString("NA", "NA")
raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")

View File

@ -6,7 +6,7 @@ from typing import AsyncIterable, Iterable, Mapping, Sequence, Union
from automlst.engine.data.mlst import Allele, MLSTProfile
def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
result_dict: dict[str, list[str]] = {}
for loci, alleles in alleles_map.items():
result_dict[loci] = list()
@ -15,17 +15,19 @@ def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]])
return result_dict
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: Iterable[MLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
mlst_profiles = list(mlst_profiles_iterable)
header = ["st", "clonal-complex", *mlst_profiles[0].alleles.keys()]
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, MLSTProfile]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
with open(handle, "w", newline='') as filehandle:
writer = csv.DictWriter(filehandle, fieldnames=header)
writer.writeheader()
for mlst_profile in mlst_profiles:
header = None
writer: Union[csv.DictWriter, None] = None
async for name, mlst_profile in mlst_profiles_iterable:
if writer is None:
header = ["st", "clonal-complex", "id", *mlst_profile.alleles.keys()]
writer = csv.DictWriter(filehandle, fieldnames=header)
writer.writeheader()
row_dictionary = {
"st": mlst_profile.sequence_type,
"clonal-complex": mlst_profile.clonal_complex,
**loci_alleles_variants_from_loci(mlst_profile.alleles)
"id": name,
**dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
}
writer.writerow(rowdict=row_dictionary)
writer.writerow(rowdict=row_dictionary)

View File

@ -1,6 +1,6 @@
import asyncio
from io import TextIOWrapper
from typing import Any, AsyncGenerator, Generator, Sequence, Union
from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
from Bio import SeqIO
from automlst.engine.data.genomics import NamedString
@ -8,4 +8,9 @@ from automlst.engine.data.genomics import NamedString
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
for fasta_sequence in await fasta_sequences:
yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
for handle in handles:
async for named_seq in read_fasta(handle):
yield named_seq

View File

@ -0,0 +1,127 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.genomics import NamedString
from automlst.engine.data.mlst import Allele, MLSTProfile
class BigSDBMLSTProfiler(AbstractAsyncContextManager):
def __init__(self, database_api: str, database_name: str, schema_id: int):
self._base_url = f"{database_api}/db/{database_name}/schemes/{schema_id}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
async def __aenter__(self):
return self
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
})
sequence_response: dict = await response.json()
if "exact_matches" not in sequence_response:
# TODO throw exception for not finding matches.
pass
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
uri_path = "designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
request_json = {
"designations": allele_request_dict
}
async with self._http_client.post(uri_path, json=request_json) as response:
response_json = await response.json()
if "fields" not in response_json:
# TODO raise exception about invalid parameters or no exact parameterization found
pass
schema_fields_returned = response_json["fields"]
schema_exact_matches: dict = response_json["exact_matches"]
allele_map: dict[str, list[Allele]] = defaultdict(list)
for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
for exact_match_allele in exact_match_alleles:
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, string: str) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string)
return await self.fetch_mlst_st(alleles)
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString]) -> AsyncGenerator[tuple[str, MLSTProfile], Any]:
async for named_string in namedStrings:
yield (named_string.name, await self.profile_string(named_string.sequence))
async def close(self):
await self._http_client.close()
async def __aexit__(self, exc_type, exc_value, traceback):
await self.close()
class BIGSdbIndex(AbstractAsyncContextManager):
KNOWN_BIGSDB_APIS = {
"https://bigsdb.pasteur.fr/api",
"https://rest.pubmlst.org"
}
def __init__(self):
self._http_client = ClientSession()
self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
super().__init__()
async def __aenter__(self):
return self
async def get_known_seqdef_dbs(self, force: bool = False) -> Mapping[str, str]:
if self._known_seqdef_dbs_origin is not None and not force:
return self._known_seqdef_dbs_origin
known_seqdef_dbs = dict()
for known_bigsdb in BIGSdbIndex.KNOWN_BIGSDB_APIS:
async with self._http_client.get(f"{known_bigsdb}/db") as response:
response_json_databases = await response.json()
for database_group in response_json_databases:
for database_info in database_group["databases"]:
if str(database_info["name"]).endswith("seqdef"):
known_seqdef_dbs[database_info["name"]] = known_bigsdb
self._known_seqdef_dbs_origin = dict(known_seqdef_dbs)
return self._known_seqdef_dbs_origin
async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
return (await self.get_known_seqdef_dbs())[seqdef_db_name]
async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
if self._seqdefdb_schemas[seqdef_db_name] is not None and not force:
return self._seqdefdb_schemas[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional
uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)}/{seqdef_db_name}/schemes"
async with self._http_client.get(uri_path) as response:
response_json = await response.json()
schema_descriptions: Mapping[str, int] = dict()
for scheme_definition in response_json["schemes"]:
scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1])
scheme_desc: str = scheme_definition["description"]
schema_descriptions[scheme_desc] = scheme_id
self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
return self._seqdefdb_schemas[seqdef_db_name] # type: ignore
async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BigSDBMLSTProfiler:
return BigSDBMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
async def close(self):
await self._http_client.close()
async def __aexit__(self, exc_type, exc_value, traceback):
await self.close()

View File

@ -1,69 +0,0 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
import re
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.genomics import NamedString
from automlst.engine.remote.databases.mlst import MLSTProfiler
class InstitutPasteurProfiler(MLSTProfiler):
async def __aenter__(self):
return self
def __init__(self, database_name: str):
self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = f"schemes/{schema_id}/sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
})
sequence_response: dict = await response.json()
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
uri_path = f"schemes/{schema_id}/designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
response = await self._http_client.post(uri_path, json={
"designations": allele_request_dict
})
response_json = await response.json()
schema_fields_returned = response_json["fields"]
schema_exact_matches = response_json["exact_matches"]
allele_map: dict[str, list[Allele]] = defaultdict(list)
for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
for exact_match_allele in exact_match_alleles:
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(schema_id, string)
return await self.fetch_mlst_st(schema_id, alleles)
async def get_scheme_ids(self) -> Mapping[str, int]:
uri_path = "schemes"
response = await self._http_client.get(uri_path)
response_json = await response.json()
schema_descriptions: Mapping[str, int] = dict()
for scheme_definition in response_json["schemes"]:
scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1])
scheme_desc: str = scheme_definition["description"]
schema_descriptions[scheme_desc] = scheme_id
return schema_descriptions
async def close(self):
await self._http_client.close()
async def __aexit__(self, exc_type, exc_value, traceback):
await self.close()

View File

@ -1,33 +0,0 @@
from abc import abstractmethod
from contextlib import AbstractAsyncContextManager
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Union
from aiohttp import ClientSession
from automlst.engine.data.mlst import Allele, MLSTProfile
MLST_DATABASES = [
"https://bigsdb.pasteur.fr/api/db",
"https://rest.pubmlst.org/db"
]
class MLSTProfiler(AbstractAsyncContextManager):
@abstractmethod
def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
pass
@abstractmethod
async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
pass
@abstractmethod
async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
pass
@abstractmethod
async def close(self):
pass
@abstractmethod
async def get_scheme_ids(self) -> Mapping[str, int]:
pass

View File

@ -1,68 +0,0 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
import re
from typing import Any, AsyncGenerator, AsyncIterable, Generator, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.genomics import NamedString
from automlst.engine.remote.databases.mlst import MLSTProfiler
class PubMLSTProfiler(MLSTProfiler):
async def __aenter__(self):
return self
def __init__(self, database_name: str):
self._base_url = f"https://rest.pubmlst.org/db/{database_name}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
async def fetch_mlst_allele_variants(self, schema_id: int, sequence_string: str) -> AsyncGenerator[Allele, Any]:
uri_path = f"schemes/{schema_id}/sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
})
sequence_response: dict = await response.json()
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
async def fetch_mlst_st(self, schema_id: int, alleles: AsyncIterable[Allele]) -> MLSTProfile:
uri_path = f"schemes/{schema_id}/designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
response = await self._http_client.post(uri_path, json={
"designations": allele_request_dict
})
response_json = await response.json()
schema_fields_returned = response_json["fields"]
schema_exact_matches = response_json["exact_matches"]
allele_map: dict[str, list[Allele]] = defaultdict(list)
for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
for exact_match_allele in exact_match_alleles:
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, schema_id: int, string: str) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(schema_id, string)
return await self.fetch_mlst_st(schema_id, alleles)
async def get_scheme_ids(self) -> Mapping[str, int]:
uri_path = "schemes"
response = await self._http_client.get(uri_path)
response_json = await response.json()
schema_descriptions: Mapping[str, int] = dict()
for scheme_definition in response_json["schemes"]:
scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1])
scheme_desc: str = scheme_definition["description"]
schema_descriptions[scheme_desc] = scheme_id
return schema_descriptions
async def close(self):
await self._http_client.close()
async def __aexit__(self, exc_type, exc_value, traceback):
await self.close()

View File

@ -1,8 +1,12 @@
import os
from automlst.engine.local.abif import read_abif
from automlst.engine.local.abif import read_abif, reference_consensus_assembly
async def test_load_sanger_sequence_has_data():
assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1")
result_data = await read_abif("tests/resources/1I1_F_P1815443_047.ab1")
assert result_data is not None
assert result_data is not None
async def test_consensus_assembly_with_ncbi():
consensus = reference_consensus_assembly("ON685494.1", [await read_abif("tests/resources/1I1_F_P1815443_047.ab1"), await read_abif("tests/resources/1I1_R_P1815443_094.ab1")])
# TODO complete implementing this

View File

@ -1,54 +0,0 @@
from Bio import SeqIO
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.remote.databases.institutpasteur.mlst import InstitutPasteurProfiler
async def test_profiling_results_in_exact_matches_when_exact():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(schema_id=3, sequence_string=sequence)
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
async for exact_match in exact_matches:
assert isinstance(exact_match, Allele)
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
targets_left.remove(exact_match.allele_loci)
assert len(targets_left) == 0
async def test_profiling_results_in_correct_st():
async def dummy_allele_generator():
dummy_alleles = [
Allele("adk", "1"),
Allele("fumC", "1"),
Allele("glyA", "1"),
Allele("tyrB", "1"),
Allele("icd", "1"),
Allele("pepA", "1"),
Allele("pgm", "1"),
]
for dummy_allele in dummy_alleles:
yield dummy_allele
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(3, dummy_allele_generator())
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == "ST-2 complex"
assert mlst_st_data.sequence_type == "1"
async def test_sequence_profiling_is_correct():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
dummy_alleles = [
Allele("adk", "1"),
Allele("fumC", "1"),
Allele("glyA", "1"),
Allele("tyrB", "1"),
Allele("icd", "1"),
Allele("pepA", "1"),
Allele("pgm", "1"),
]
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
profile = await dummy_profiler.profile_string(3, sequence)
assert profile is not None
assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex"
assert profile.sequence_type == "1"

View File

@ -1,53 +0,0 @@
import asyncio
from Bio import SeqIO
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.remote.databases.pubmlst.mlst import PubMLSTProfiler
async def test_profiling_results_in_exact_matches_when_exact():
dummy_alleles = {
Allele("adk", "1"),
Allele("atpG", "1"),
Allele("frdB", "1"),
Allele("fucK", "1"),
Allele("mdh", "1"),
Allele("pgi", "1"),
Allele("recA", "5"),
}
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(schema_id=1, sequence_string=sequence)
async for exact_match in exact_matches:
assert isinstance(exact_match, Allele)
dummy_alleles.remove(exact_match)
assert len(dummy_alleles) == 0
async def test_profiling_results_in_correct_st():
async def generate_dummy_targets():
dummy_alleles = [
Allele("adk", "1"),
Allele("atpG", "1"),
Allele("frdB", "1"),
Allele("fucK", "1"),
Allele("mdh", "1"),
Allele("pgi", "1"),
Allele("recA", "5"),
]
for dummy_allele in dummy_alleles:
yield dummy_allele
async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(1, generate_dummy_targets())
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == "ST-3 complex"
assert mlst_st_data.sequence_type == "3"
async def test_sequence_profiling_is_correct():
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
async with PubMLSTProfiler(database_name="pubmlst_hinfluenzae_seqdef") as dummy_profiler:
profile = await dummy_profiler.profile_string(1, sequence)
assert profile is not None
assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-3 complex"
assert profile.sequence_type == "3"

View File

@ -0,0 +1,115 @@
from Bio import SeqIO
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
async for exact_match in exact_matches:
assert isinstance(exact_match, Allele)
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
targets_left.remove(exact_match.allele_loci)
assert len(targets_left) == 0
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
async def dummy_allele_generator():
dummy_alleles = [
Allele("adk", "1"),
Allele("fumC", "1"),
Allele("glyA", "1"),
Allele("tyrB", "1"),
Allele("icd", "1"),
Allele("pepA", "1"),
Allele("pgm", "1"),
]
for dummy_allele in dummy_alleles:
yield dummy_allele
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == "ST-2 complex"
assert mlst_st_data.sequence_type == "1"
async def test_institutpasteur_sequence_profiling_is_correct():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
profile = await dummy_profiler.profile_string(sequence)
assert profile is not None
assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex"
assert profile.sequence_type == "1"
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
dummy_alleles = {
Allele("adk", "1"),
Allele("atpG", "1"),
Allele("frdB", "1"),
Allele("fucK", "1"),
Allele("mdh", "1"),
Allele("pgi", "1"),
Allele("recA", "5"),
}
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
async for exact_match in exact_matches:
assert isinstance(exact_match, Allele)
dummy_alleles.remove(exact_match)
assert len(dummy_alleles) == 0
async def test_pubmlst_profiling_results_in_correct_st():
async def generate_dummy_targets():
dummy_alleles = [
Allele("adk", "1"),
Allele("atpG", "1"),
Allele("frdB", "1"),
Allele("fucK", "1"),
Allele("mdh", "1"),
Allele("pgi", "1"),
Allele("recA", "5"),
]
for dummy_allele in dummy_alleles:
yield dummy_allele
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == "ST-3 complex"
assert mlst_st_data.sequence_type == "3"
async def test_pubmlst_sequence_profiling_is_correct():
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
profile = await dummy_profiler.profile_string(sequence)
assert profile is not None
assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-3 complex"
assert profile.sequence_type == "3"
async def test_bigsdb_index_all_databases_is_not_empty():
async with BIGSdbIndex() as bigsdb_index:
assert len(await bigsdb_index.get_known_seqdef_dbs()) > 0
async def test_bigsdb_index_references_pubmlst_correctly():
async with BIGSdbIndex() as bigsdb_index:
assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")) == "https://rest.pubmlst.org"
async def test_bigsdb_index_references_institutpasteur_correctly():
async with BIGSdbIndex() as bigsdb_index:
assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api"
async def test_bigsdb_index_instantiates_correct_profiler():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with BIGSdbIndex() as bigsdb_index:
async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
profile = await profiler.profile_string(sequence)
assert profile.clonal_complex == "ST-2 complex"
assert profile.sequence_type == "1"