22 Commits
0.1.0 ... 0.2.0

SHA1 Message Date
3fd3ef9f20 Updated package description 2025-01-10 21:13:26 +00:00
206a105bf9 Removed publishing branch and tag conditions; publish errors are no longer considered failures 2025-01-10 17:59:04 +00:00
ad082b8af6 Revert "Always publish on build success"
This reverts commit cc6a7563ca.
2025-01-10 17:43:14 +00:00
5118a25b6a publish step will now fail if twine upload fails 2025-01-10 17:41:42 +00:00
cc6a7563ca Always publish on build success 2025-01-10 17:35:58 +00:00
484e31879e Removed test installation step from CI 2025-01-10 17:28:25 +00:00
5449ae0c68 Moved CLI to automlst.cli repo 2025-01-10 17:22:12 +00:00
e634647774 Added untested partial matching 2025-01-10 16:12:56 +00:00
f20a656f45 Fixed multiple string typing failure handling 2025-01-10 16:00:27 +00:00
0c0a2c9d4c Finished adding partial allele matching 2025-01-10 15:25:31 +00:00
03fbbe542e allele profiling partial matching works 2025-01-09 21:44:28 +00:00
e60dba936c Added a tag name check for publishing 2025-01-09 18:32:51 +00:00
9589761ddd Now tracks failed profilings 2025-01-09 17:27:15 +00:00
2843d0d592 Added tests to test continuous MLST despite failure 2025-01-09 17:04:53 +00:00
7bd28db6d4 Updated code coverage to only report on source code 2025-01-09 16:51:08 +00:00
463e320386 Updated BIGSdb API to be more tolerant towards failures 2025-01-09 16:49:12 +00:00
d4f890a150 Added specific tests for BIGSdbIndex 2025-01-09 16:23:42 +00:00
022200f197 Refactored class name 2025-01-09 16:22:50 +00:00
e66525d341 Updated README.md 2025-01-09 16:08:44 +00:00
1d531aff42 Updated CI to only report on automlst and fixed report publishing 2025-01-09 16:08:30 +00:00
8febfad282 Added publishing of coverage results 2025-01-09 15:55:39 +00:00
42bcfcf61d Added coverage tracking 2025-01-09 15:54:22 +00:00
17 changed files with 127606 additions and 214 deletions

1
.gitignore vendored
View File

@@ -357,3 +357,4 @@ package
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)
output
*.private.*

12
Jenkinsfile vendored
View File

@@ -14,8 +14,9 @@ pipeline {
}
stage("unit tests") {
steps {
sh returnStatus: true, script: "python -m pytest --junitxml=test_results.xml"
sh returnStatus: true, script: "python -m pytest --junitxml=test_results.xml --cov=src --cov-report xml:coverage.xml"
xunit checksName: '', tools: [JUnit(excludesPattern: '', pattern: 'test_results.xml', stopProcessingIfError: true)]
recordCoverage(tools: [[parser: 'COBERTURA', pattern: 'coverage.xml']])
}
}
stage("build") {
@@ -23,12 +24,6 @@ pipeline {
sh "python -m build"
}
}
stage("test installation") {
steps {
sh "python -m pip install dist/*.whl --force-reinstall"
sh "automlst -h"
}
}
stage("archive") {
steps {
archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true
@@ -38,9 +33,6 @@ pipeline {
environment {
CREDS = credentials('4d6f64be-d26d-4f95-8de3-b6a9b0beb311')
}
when {
branch '**/main'
}
steps {
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/${CREDS_USR}/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
}

View File

@@ -1,3 +1,12 @@
# FASTA-MLST
# autoMLST
A CLI tool for rapidly performing MLST typing by querying the pubMLST and Institut Pasteur MLST databases.
A CLI/library for rapidly performing MLST typing by querying the pubMLST and Institut Pasteur MLST databases.
# Components
## automlst.cli
The command-line interface; it is intentionally minimal and mostly delegates to the library. It uses argparse and is split into two subcommands:
- `automlst info`: Lists the available databases that can be pulled from, and the schemas each provides.
- `automlst exactmatch`: Requests exact-match typing results from a given database and schema.
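Since the CLI was moved to the separate automlst.cli repository (see the commit list above) and mostly delegates to this library, here is a minimal, untested sketch of the equivalent library calls, mirroring the removed `exactmatch` command shown later in this diff; the FASTA path and output filename are placeholders:

```python
import asyncio

from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_multiple_fastas
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex


async def exact_match_to_csv() -> None:
    async with BIGSdbIndex() as bigsdb_index:
        # Placeholder inputs: one FASTA file, the Institut Pasteur Bordetella
        # seqdef database, and MLST schema ID 3 (as used in the tests below).
        sequences = read_multiple_fastas(["sample.fasta"])
        async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
            profiles = profiler.profile_multiple_strings(sequences)
            # As of 0.2.0 the CSV writer returns the names of sequences that
            # could not be profiled instead of aborting the whole run.
            failed = await write_mlst_profiles_as_csv(profiles, "results.csv")
            print("Failed to profile:", failed)


asyncio.run(exact_match_to_csv())
```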

View File

@@ -3,7 +3,7 @@ requires = ["setuptools>=64", "setuptools_scm>=8"]
build-backend = "setuptools.build_meta"
[project]
name = "automlst"
name = "automlst.engine"
dynamic = ["version"]
dependencies = [
@@ -11,10 +11,7 @@ dependencies = [
"aiohttp[speedups]",
]
requires-python = ">=3.11"
description = "A tool to rapidly fetch fetch MLST profiles given sequences for various diseases."
[project.scripts]
automlst = "automlst.cli.program:run"
description = "A library to rapidly fetch fetch MLST profiles given sequences for various diseases."
[tool.setuptools_scm]

View File

@@ -4,4 +4,5 @@ pytest
pytest-asyncio
build
twine
setuptools_scm
setuptools_scm
pytest-cov

View File

@@ -1,48 +0,0 @@
from argparse import ArgumentParser
import asyncio
import datetime
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_multiple_fastas
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
def setup_parser(parser: ArgumentParser):
parser.description = "Returns MLST exact profile matches."
parser.add_argument(
"fastas",
nargs="+",
action='extend',
default=[],
type=str,
help="The FASTA files to process. Multiple can be listed."
)
parser.add_argument(
"seqdefdb",
help="The BIGSdb seqdef database to use for typing."
)
parser.add_argument(
"schema",
type=int,
help="The BIGSdb seqdef database schema ID (integer) to use for typing."
)
parser.add_argument(
"out",
default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
help="The output CSV name (.csv will be appended)."
)
parser.set_defaults(func=run_asynchronously)
async def run(args):
async with BIGSdbIndex() as bigsdb_index:
gen_strings = read_multiple_fastas(args.fastas)
async with await bigsdb_index.build_profiler_from_seqdefdb(args.seqdefdb, args.schema) as mlst_profiler:
mlst_profiles = mlst_profiler.profile_multiple_strings(gen_strings)
await write_mlst_profiles_as_csv(mlst_profiles, args.out)
def run_asynchronously(args):
asyncio.run(run(args))

View File

@@ -1,44 +0,0 @@
from argparse import ArgumentParser
import asyncio
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
def setup_parser(parser: ArgumentParser):
parser.description = "Fetches the latest BIGSdb MLST database definitions."
parser.usage = "test"
parser.add_argument(
"--retrieve-bigsdbs", "-l",
action="store_true",
dest="list_dbs",
required=False,
default=False,
help="Lists all known BIGSdb MLST databases (fetched from known APIs and cached)."
)
parser.add_argument(
"--retrieve-bigsdb-schemas", "-lschemas",
nargs="+",
action="extend",
dest="list_bigsdb_schemas",
required=False,
default=[],
type=str,
help="Lists the known schema IDs for a given BIGSdb sequence definition database name. The name, and then the ID of the schema is given."
)
parser.set_defaults(func=run_asynchronously)
async def run(args):
async with BIGSdbIndex() as bigsdb_index:
if args.list_dbs:
known_seqdef_dbs = await bigsdb_index.get_known_seqdef_dbs(force=False)
print("\n".join(known_seqdef_dbs.keys()))
for bigsdb_schema_name in args.list_bigsdb_schemas:
schemas = await bigsdb_index.get_schemas_for_seqdefdb(bigsdb_schema_name)
for schema_desc, schema_id in schemas.items():
print(f"{schema_desc}: {schema_id}")
def run_asynchronously(args):
asyncio.run(run(args))

View File

@@ -1,2 +0,0 @@
def get_module_base_name(name):
return name.split(".")[-1]

View File

@@ -1,27 +0,0 @@
import argparse
import asyncio
import datetime
from os import path
import os
from automlst.cli import exactmatch, info
from automlst.cli.meta import get_module_base_name
from automlst.engine.data.genomics import NamedString
from automlst.engine.local.abif import read_abif
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_fasta
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
root_parser = argparse.ArgumentParser()
subparsers = root_parser.add_subparsers(required=True)
info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
exactmatch.setup_parser(subparsers.add_parser(get_module_base_name(exactmatch.__name__)))
def run():
args = root_parser.parse_args()
args.func(args)
if __name__ == "__main__":
run()

View File

@@ -1,13 +1,21 @@
from dataclasses import dataclass
from typing import Mapping, Sequence
from typing import Mapping, Sequence, Union
@dataclass(frozen=True)
class PartialAllelicMatchProfile:
percent_identity: float
mismatches: int
bitscore: float
gaps: int
@dataclass(frozen=True)
class Allele:
allele_loci: str
allele_variant: str
partial_match_profile: Union[None, PartialAllelicMatchProfile]
@dataclass(frozen=True)
class MLSTProfile:
alleles: Mapping[str, Sequence[Allele]]
sequence_type: int
clonal_complex: str
sequence_type: str
clonal_complex: str

View File

@@ -0,0 +1,21 @@
from typing import Union
class BIGSDbDatabaseAPIException(Exception):
pass
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_schema_id: int, *args):
super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException):
def __init__(self, database_name: str, database_schema_id: int, *args):
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
class NoSuchBIGSdbDatabaseException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, *args):
super().__init__(f"No database \"{database_name}\" found.", *args)
class NoSuchBigSdbSchemaException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_schema_id: int, *args):
super().__init__(f"No schema with ID {database_schema_id} in \"{database_name}\" found.", *args)

View File

@@ -7,19 +7,26 @@ from automlst.engine.data.mlst import Allele, MLSTProfile
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
result_dict: dict[str, list[str]] = {}
result_dict: dict[str, Union[list[str], str]] = {}
for loci, alleles in alleles_map.items():
result_dict[loci] = list()
if len(alleles) == 1:
result_dict[loci] = alleles[0].allele_variant
for allele in alleles:
result_dict[loci].append(allele.allele_variant)
result_locis = list()
result_locis.append(allele.allele_variant)
result_dict[loci] = result_locis
return result_dict
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, MLSTProfile]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
failed = list()
with open(handle, "w", newline='') as filehandle:
header = None
writer: Union[csv.DictWriter, None] = None
async for name, mlst_profile in mlst_profiles_iterable:
if mlst_profile is None:
failed.append(name)
continue
if writer is None:
header = ["id", "st", "clonal-complex", *mlst_profile.alleles.keys()]
writer = csv.DictWriter(filehandle, fieldnames=header)
@@ -30,4 +37,5 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
"id": name,
**dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
}
writer.writerow(rowdict=row_dictionary)
writer.writerow(rowdict=row_dictionary)
return failed

View File

@@ -1,13 +1,15 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
from numbers import Number
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.genomics import NamedString
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
class BigSDBMLSTProfiler(AbstractAsyncContextManager):
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
def __init__(self, database_api: str, database_name: str, schema_id: int):
self._database_name = database_name
@@ -18,56 +20,85 @@ class BigSDBMLSTProfiler(AbstractAsyncContextManager):
async def __aenter__(self):
return self
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
"sequence": sequence_string,
"partial_matches": not exact
})
sequence_response: dict = await response.json()
if "exact_matches" not in sequence_response:
# TODO throw exception for not finding matches.
pass
if "exact_matches" not in sequence_response:
raise ValueError(f"Unable to find exact matches in \"{self._database_name}\" under schema ID \"{self._schema_id}\".")
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
if "exact_matches" in sequence_response:
# loci -> list of alleles with id and loci
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
elif "partial_matches" in sequence_response:
if exact:
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
for allele_loci, partial_match in partial_matches.items():
if len(partial_match) <= 0:
continue
partial_match_profile = PartialAllelicMatchProfile(
percent_identity=float(partial_match["identity"]),
mismatches=int(partial_match["mismatches"]),
bitscore=float(partial_match["bitscore"]),
gaps=int(partial_match["gaps"])
)
yield Allele(
allele_loci=allele_loci,
allele_variant=str(partial_match["allele"]),
partial_match_profile=partial_match_profile
)
else:
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
uri_path = "designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
if isinstance(alleles, AsyncIterable):
async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
else:
for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
request_json = {
"designations": allele_request_dict
}
async with self._http_client.post(uri_path, json=request_json) as response:
response_json = await response.json()
if "fields" not in response_json:
# TODO raise exception about invalid parameters or no exact parameterization found
pass
schema_fields_returned = response_json["fields"]
schema_exact_matches: dict = response_json["exact_matches"]
response_json: dict = await response.json()
allele_map: dict[str, list[Allele]] = defaultdict(list)
response_json.setdefault("fields", dict())
schema_fields_returned: dict[str, str] = response_json["fields"]
schema_fields_returned.setdefault("ST", "unknown")
schema_fields_returned.setdefault("clonal_complex", "unknown")
schema_exact_matches: dict = response_json["exact_matches"]
for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
for exact_match_allele in exact_match_alleles:
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"], None))
if len(allele_map) == 0:
raise ValueError("Passed in no alleles.")
return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, string: str) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string)
async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string, exact)
return await self.fetch_mlst_st(alleles)
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString]) -> AsyncGenerator[tuple[str, MLSTProfile], Any]:
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
async for named_string in namedStrings:
yield (named_string.name, await self.profile_string(named_string.sequence))
try:
yield (named_string.name, await self.profile_string(named_string.sequence, exact))
except NoBIGSdbMatchesException as e:
if stop_on_fail:
raise e
yield (named_string.name, None)
async def close(self):
await self._http_client.close()
@@ -107,7 +138,7 @@ class BIGSdbIndex(AbstractAsyncContextManager):
async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
known_databases = await self.get_known_seqdef_dbs()
if seqdef_db_name not in known_databases:
raise ValueError(f"The database \"{seqdef_db_name}\" could not be found.")
raise NoSuchBIGSdbDatabaseException(seqdef_db_name)
return known_databases[seqdef_db_name]
async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
@@ -124,8 +155,8 @@ class BIGSdbIndex(AbstractAsyncContextManager):
self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
return self._seqdefdb_schemas[seqdef_db_name] # type: ignore
async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BigSDBMLSTProfiler:
return BigSDBMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
return BIGSdbMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
async def close(self):
await self._http_client.close()
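For context (not part of the diff itself), a minimal sketch of how the reworked profiler API above might be called; the database name and schema ID are the ones used in the tests that follow, and the FASTA path is a placeholder:

```python
import asyncio

from Bio import SeqIO

from automlst.engine.remote.databases.bigsdb import BIGSdbIndex


async def show_partial_matches(fasta_path: str) -> None:
    sequence = str(SeqIO.read(fasta_path, "fasta").seq)
    async with BIGSdbIndex() as index:
        async with await index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
            # exact=False permits partial allele matches; each partial match carries
            # the identity/mismatch/bitscore/gap statistics returned by BIGSdb.
            async for allele in profiler.fetch_mlst_allele_variants(sequence, exact=False):
                pm = allele.partial_match_profile
                if pm is None:
                    print(f"{allele.allele_loci}: exact allele {allele.allele_variant}")
                else:
                    print(f"{allele.allele_loci}: ~allele {allele.allele_variant} "
                          f"({pm.percent_identity}% identity, {pm.mismatches} mismatches, {pm.gaps} gaps)")


# asyncio.run(show_partial_matches("sample.fasta"))  # placeholder path
```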

View File

@@ -1,63 +1,111 @@
import random
import re
from typing import Collection, Sequence, Union
from Bio import SeqIO
import pytest
from automlst.engine.data.genomics import NamedString
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
rand = random.Random(gene)
if isinstance(mutation_site_count, float):
mutation_site_count = int(mutation_site_count * len(gene))
random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
scrambled = list(gene)
for random_location in random_locations:
scrambled[random_location] = rand.choice(alphabet)
return "".join(scrambled)
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
async for exact_match in exact_matches:
async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True):
assert isinstance(exact_match, Allele)
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
targets_left.remove(exact_match.allele_loci)
assert len(targets_left) == 0
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
sequences = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta"))
mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
for sequence in sequences:
match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description)
if match is None:
continue
gene = match.group(1)
if gene.lower() not in mlst_targets:
continue
scrambled = gene_scrambler(str(sequence.seq), 0.125)
async for partial_match in profiler.fetch_mlst_allele_variants(scrambled, False):
assert partial_match.partial_match_profile is not None
mlst_targets.remove(gene.lower())
assert len(mlst_targets) == 0
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
async def dummy_allele_generator():
dummy_alleles = [
Allele("adk", "1"),
Allele("fumC", "1"),
Allele("glyA", "1"),
Allele("tyrB", "1"),
Allele("icd", "1"),
Allele("pepA", "1"),
Allele("pgm", "1"),
Allele("adk", "1", None),
Allele("fumC", "1", None),
Allele("glyA", "1", None),
Allele("tyrB", "1", None),
Allele("icd", "1", None),
Allele("pepA", "1", None),
Allele("pgm", "1", None),
]
for dummy_allele in dummy_alleles:
yield dummy_allele
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == "ST-2 complex"
assert mlst_st_data.sequence_type == "1"
async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
dummy_alleles = [
Allele("adk", "1", None),
Allele("fumC", "2", None),
Allele("glyA", "36", None),
Allele("tyrB", "4", None),
Allele("icd", "4", None),
Allele("pepA", "1", None),
Allele("pgm", "5", None),
]
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
mlst_profile = await dummy_profiler.fetch_mlst_st(dummy_alleles)
assert mlst_profile.clonal_complex == "unknown"
assert mlst_profile.sequence_type == "unknown"
async def test_institutpasteur_sequence_profiling_is_correct():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
profile = await dummy_profiler.profile_string(sequence)
assert profile is not None
assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex"
assert profile.sequence_type == "1"
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
dummy_alleles = {
Allele("adk", "1"),
Allele("atpG", "1"),
Allele("frdB", "1"),
Allele("fucK", "1"),
Allele("mdh", "1"),
Allele("pgi", "1"),
Allele("recA", "5"),
Allele("adk", "1", None),
Allele("atpG", "1", None),
Allele("frdB", "1", None),
Allele("fucK", "1", None),
Allele("mdh", "1", None),
Allele("pgi", "1", None),
Allele("recA", "5", None),
}
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True)
async for exact_match in exact_matches:
assert isinstance(exact_match, Allele)
dummy_alleles.remove(exact_match)
@@ -67,17 +115,17 @@ async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
async def test_pubmlst_profiling_results_in_correct_st():
async def generate_dummy_targets():
dummy_alleles = [
Allele("adk", "1"),
Allele("atpG", "1"),
Allele("frdB", "1"),
Allele("fucK", "1"),
Allele("mdh", "1"),
Allele("pgi", "1"),
Allele("recA", "5"),
Allele("adk", "1", None),
Allele("atpG", "1", None),
Allele("frdB", "1", None),
Allele("fucK", "1", None),
Allele("mdh", "1", None),
Allele("pgi", "1", None),
Allele("recA", "5", None),
]
for dummy_allele in dummy_alleles:
yield dummy_allele
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
@@ -86,7 +134,7 @@ async def test_pubmlst_profiling_results_in_correct_st():
async def test_pubmlst_sequence_profiling_is_correct():
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
profile = await dummy_profiler.profile_string(sequence)
assert profile is not None
assert isinstance(profile, MLSTProfile)
@@ -112,4 +160,85 @@ async def test_bigsdb_index_instantiates_correct_profiler():
async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
profile = await profiler.profile_string(sequence)
assert profile.clonal_complex == "ST-2 complex"
assert profile.sequence_type == "1"
assert profile.sequence_type == "1"
async def test_bigsdb_profile_multiple_strings_same_string_twice():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
dummy_sequences = [NamedString("seq1", sequence), NamedString("seq2", sequence)]
async def generate_async_iterable_sequences():
for dummy_sequence in dummy_sequences:
yield dummy_sequence
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences()):
assert profile is not None
assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex"
assert profile.sequence_type == "1"
async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop():
valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
async def generate_async_iterable_sequences():
for dummy_sequence in dummy_sequences:
yield dummy_sequence
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), True):
if name == "should_fail":
assert profile is None
else:
assert profile is not None
assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex"
assert profile.sequence_type == "1"
async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop():
valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
async def generate_async_iterable_sequences():
for dummy_sequence in dummy_sequences:
yield dummy_sequence
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), False):
if name == "should_fail":
assert profile is not None
assert profile.clonal_complex == "unknown"
assert profile.sequence_type == "unknown"
assert len(profile.alleles) > 0
else:
assert profile is not None
assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex"
assert profile.sequence_type == "1"
async def test_bigsdb_profile_multiple_strings_fail_second_stop():
valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
invalid_seq = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", invalid_seq), NamedString("seq3", valid_seq)]
async def generate_async_iterable_sequences():
for dummy_sequence in dummy_sequences:
yield dummy_sequence
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
with pytest.raises(NoBIGSdbMatchesException):
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), exact=True, stop_on_fail=True):
if name == "should_fail":
pytest.fail("Exception should have been thrown, no exception was thrown.")
else:
assert profile is not None
assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex"
assert profile.sequence_type == "1"
async def test_bigsdb_index_get_schemas_for_bordetella():
async with BIGSdbIndex() as index:
schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
assert len(schemas.keys()) > 0
assert "MLST" in schemas
assert isinstance(schemas["MLST"], int)
async def test_bigsdb_index_get_databases_has_only_seqdef():
async with BIGSdbIndex() as index:
databases = await index.get_known_seqdef_dbs()
assert len(databases.keys()) > 0
for database_name in databases.keys():
assert database_name.endswith("seqdef")
assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"

68196
tests/resources/12822.fasta Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large