Compare commits
27 Commits
Author | SHA1 | Date | |
---|---|---|---|
cb22dfac9b | |||
7ea7ead46a | |||
a3c864b565 | |||
bad7dfc3a8 | |||
4fe0f0f287 | |||
3fd3ef9f20 | |||
206a105bf9 | |||
ad082b8af6 | |||
5118a25b6a | |||
cc6a7563ca | |||
484e31879e | |||
5449ae0c68 | |||
e634647774 | |||
f20a656f45 | |||
0c0a2c9d4c | |||
03fbbe542e | |||
e60dba936c | |||
9589761ddd | |||
2843d0d592 | |||
7bd28db6d4 | |||
463e320386 | |||
d4f890a150 | |||
022200f197 | |||
e66525d341 | |||
1d531aff42 | |||
8febfad282 | |||
42bcfcf61d |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -357,3 +357,4 @@ package
|
|||||||
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)
|
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)
|
||||||
|
|
||||||
output
|
output
|
||||||
|
*.private.*
|
37
Jenkinsfile
vendored
37
Jenkinsfile
vendored
@@ -14,8 +14,9 @@ pipeline {
|
|||||||
}
|
}
|
||||||
stage("unit tests") {
|
stage("unit tests") {
|
||||||
steps {
|
steps {
|
||||||
sh returnStatus: true, script: "python -m pytest --junitxml=test_results.xml"
|
sh returnStatus: true, script: "python -m pytest --junitxml=test_results.xml --cov=src --cov-report xml:coverage.xml"
|
||||||
xunit checksName: '', tools: [JUnit(excludesPattern: '', pattern: 'test_results.xml', stopProcessingIfError: true)]
|
xunit checksName: '', tools: [JUnit(excludesPattern: '', pattern: 'test_results.xml', stopProcessingIfError: true)]
|
||||||
|
recordCoverage(tools: [[parser: 'COBERTURA', pattern: 'coverage.xml']])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stage("build") {
|
stage("build") {
|
||||||
@@ -23,26 +24,32 @@ pipeline {
|
|||||||
sh "python -m build"
|
sh "python -m build"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stage("test installation") {
|
|
||||||
steps {
|
|
||||||
sh "python -m pip install dist/*.whl --force-reinstall"
|
|
||||||
sh "automlst -h"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
stage("archive") {
|
stage("archive") {
|
||||||
steps {
|
steps {
|
||||||
archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true
|
archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stage("publish") {
|
stage("publish") {
|
||||||
environment {
|
parallel {
|
||||||
CREDS = credentials('4d6f64be-d26d-4f95-8de3-b6a9b0beb311')
|
stage ("git.reslate.systems") {
|
||||||
}
|
environment {
|
||||||
when {
|
TOKEN = credentials('git.reslate.systems')
|
||||||
branch '**/main'
|
}
|
||||||
}
|
steps {
|
||||||
steps {
|
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*'
|
||||||
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/${CREDS_USR}/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
|
}
|
||||||
|
}
|
||||||
|
stage ("test.pypi.org") {
|
||||||
|
when {
|
||||||
|
tag '*.*'
|
||||||
|
}
|
||||||
|
environment {
|
||||||
|
TOKEN = credentials('test.pypi.org')
|
||||||
|
}
|
||||||
|
steps {
|
||||||
|
sh returnStatus: true, script: 'python -m twine upload -r testpypi -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*'
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
13
README.md
13
README.md
@@ -1,3 +1,12 @@
|
|||||||
# FASTA-MLST
|
# autoMLST
|
||||||
|
|
||||||
A CLI tool for rapidly performing MLST typing via accessing pubMLST and InstitutPasteur MSLT databases.
|
A CLI/library for rapidly performing MLST typing by accessing the pubMLST and Institut Pasteur MLST databases.
|
||||||
|
|
||||||
|
# Components
|
||||||
|
|
||||||
|
## automlst.cli
|
||||||
|
|
||||||
|
The command line interface. It performs very minimal setup and mostly makes calls to the library. It uses argparse and is split into two parts:
|
||||||
|
|
||||||
|
- `automlst info`: Provides user information on available databases to pull from, and the schemas available.
|
||||||
|
- `automlst exactmatch`: Provides users the ability to request exact match results from a given database and schema.
|
||||||
|
@@ -3,7 +3,7 @@ requires = ["setuptools>=64", "setuptools_scm>=8"]
|
|||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "automlst"
|
name = "automlst.engine"
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
@@ -11,10 +11,7 @@ dependencies = [
|
|||||||
"aiohttp[speedups]",
|
"aiohttp[speedups]",
|
||||||
]
|
]
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
description = "A tool to rapidly fetch fetch MLST profiles given sequences for various diseases."
|
description = "A library to rapidly fetch fetch MLST profiles given sequences for various diseases."
|
||||||
|
|
||||||
[project.scripts]
|
|
||||||
automlst = "automlst.cli.program:run"
|
|
||||||
|
|
||||||
[tool.setuptools_scm]
|
[tool.setuptools_scm]
|
||||||
|
|
||||||
|
@@ -5,3 +5,4 @@ pytest-asyncio
|
|||||||
build
|
build
|
||||||
twine
|
twine
|
||||||
setuptools_scm
|
setuptools_scm
|
||||||
|
pytest-cov
|
@@ -1,48 +0,0 @@
|
|||||||
|
|
||||||
from argparse import ArgumentParser
|
|
||||||
import asyncio
|
|
||||||
import datetime
|
|
||||||
from automlst.engine.local.csv import write_mlst_profiles_as_csv
|
|
||||||
from automlst.engine.local.fasta import read_multiple_fastas
|
|
||||||
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
|
|
||||||
|
|
||||||
|
|
||||||
def setup_parser(parser: ArgumentParser):
|
|
||||||
parser.description = "Returns MLST exact profile matches."
|
|
||||||
parser.add_argument(
|
|
||||||
"fastas",
|
|
||||||
nargs="+",
|
|
||||||
action='extend',
|
|
||||||
default=[],
|
|
||||||
type=str,
|
|
||||||
help="The FASTA files to process. Multiple can be listed."
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"seqdefdb",
|
|
||||||
help="The BIGSdb seqdef database to use for typing."
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"schema",
|
|
||||||
type=int,
|
|
||||||
help="The BIGSdb seqdef database schema ID (integer) to use for typing."
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"out",
|
|
||||||
default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
|
|
||||||
help="The output CSV name (.csv will be appended)."
|
|
||||||
)
|
|
||||||
parser.set_defaults(func=run_asynchronously)
|
|
||||||
|
|
||||||
async def run(args):
|
|
||||||
async with BIGSdbIndex() as bigsdb_index:
|
|
||||||
gen_strings = read_multiple_fastas(args.fastas)
|
|
||||||
async with await bigsdb_index.build_profiler_from_seqdefdb(args.seqdefdb, args.schema) as mlst_profiler:
|
|
||||||
mlst_profiles = mlst_profiler.profile_multiple_strings(gen_strings)
|
|
||||||
await write_mlst_profiles_as_csv(mlst_profiles, args.out)
|
|
||||||
|
|
||||||
def run_asynchronously(args):
|
|
||||||
asyncio.run(run(args))
|
|
||||||
|
|
@@ -1,44 +0,0 @@
|
|||||||
from argparse import ArgumentParser
|
|
||||||
import asyncio
|
|
||||||
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
|
|
||||||
|
|
||||||
|
|
||||||
def setup_parser(parser: ArgumentParser):
|
|
||||||
parser.description = "Fetches the latest BIGSdb MLST database definitions."
|
|
||||||
parser.usage = "test"
|
|
||||||
parser.add_argument(
|
|
||||||
"--retrieve-bigsdbs", "-l",
|
|
||||||
action="store_true",
|
|
||||||
dest="list_dbs",
|
|
||||||
required=False,
|
|
||||||
default=False,
|
|
||||||
help="Lists all known BIGSdb MLST databases (fetched from known APIs and cached)."
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--retrieve-bigsdb-schemas", "-lschemas",
|
|
||||||
nargs="+",
|
|
||||||
action="extend",
|
|
||||||
dest="list_bigsdb_schemas",
|
|
||||||
required=False,
|
|
||||||
default=[],
|
|
||||||
type=str,
|
|
||||||
help="Lists the known schema IDs for a given BIGSdb sequence definition database name. The name, and then the ID of the schema is given."
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.set_defaults(func=run_asynchronously)
|
|
||||||
|
|
||||||
async def run(args):
|
|
||||||
async with BIGSdbIndex() as bigsdb_index:
|
|
||||||
if args.list_dbs:
|
|
||||||
known_seqdef_dbs = await bigsdb_index.get_known_seqdef_dbs(force=False)
|
|
||||||
print("\n".join(known_seqdef_dbs.keys()))
|
|
||||||
|
|
||||||
for bigsdb_schema_name in args.list_bigsdb_schemas:
|
|
||||||
schemas = await bigsdb_index.get_schemas_for_seqdefdb(bigsdb_schema_name)
|
|
||||||
for schema_desc, schema_id in schemas.items():
|
|
||||||
print(f"{schema_desc}: {schema_id}")
|
|
||||||
|
|
||||||
def run_asynchronously(args):
|
|
||||||
asyncio.run(run(args))
|
|
||||||
|
|
@@ -1,2 +0,0 @@
|
|||||||
def get_module_base_name(name):
|
|
||||||
return name.split(".")[-1]
|
|
@@ -1,27 +0,0 @@
|
|||||||
import argparse
|
|
||||||
import asyncio
|
|
||||||
import datetime
|
|
||||||
from os import path
|
|
||||||
import os
|
|
||||||
|
|
||||||
from automlst.cli import exactmatch, info
|
|
||||||
from automlst.cli.meta import get_module_base_name
|
|
||||||
from automlst.engine.data.genomics import NamedString
|
|
||||||
from automlst.engine.local.abif import read_abif
|
|
||||||
from automlst.engine.local.csv import write_mlst_profiles_as_csv
|
|
||||||
from automlst.engine.local.fasta import read_fasta
|
|
||||||
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
|
|
||||||
|
|
||||||
root_parser = argparse.ArgumentParser()
|
|
||||||
subparsers = root_parser.add_subparsers(required=True)
|
|
||||||
|
|
||||||
info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
|
|
||||||
exactmatch.setup_parser(subparsers.add_parser(get_module_base_name(exactmatch.__name__)))
|
|
||||||
|
|
||||||
|
|
||||||
def run():
|
|
||||||
args = root_parser.parse_args()
|
|
||||||
args.func(args)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
run()
|
|
@@ -3,23 +3,30 @@ from io import TextIOWrapper
|
|||||||
from os import PathLike
|
from os import PathLike
|
||||||
from typing import AsyncIterable, Iterable, Mapping, Sequence, Union
|
from typing import AsyncIterable, Iterable, Mapping, Sequence, Union
|
||||||
|
|
||||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
from automlst.engine.data.structures.mlst import Allele, MLSTProfile
|
||||||
|
|
||||||
|
|
||||||
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
|
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
|
||||||
result_dict: dict[str, list[str]] = {}
|
result_dict: dict[str, Union[list[str], str]] = {}
|
||||||
for loci, alleles in alleles_map.items():
|
for loci, alleles in alleles_map.items():
|
||||||
result_dict[loci] = list()
|
if len(alleles) == 1:
|
||||||
|
result_dict[loci] = alleles[0].allele_variant
|
||||||
for allele in alleles:
|
for allele in alleles:
|
||||||
result_dict[loci].append(allele.allele_variant)
|
result_locis = list()
|
||||||
|
result_locis.append(allele.allele_variant)
|
||||||
|
result_dict[loci] = result_locis
|
||||||
return result_dict
|
return result_dict
|
||||||
|
|
||||||
|
|
||||||
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, MLSTProfile]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
|
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
|
||||||
|
failed = list()
|
||||||
with open(handle, "w", newline='') as filehandle:
|
with open(handle, "w", newline='') as filehandle:
|
||||||
header = None
|
header = None
|
||||||
writer: Union[csv.DictWriter, None] = None
|
writer: Union[csv.DictWriter, None] = None
|
||||||
async for name, mlst_profile in mlst_profiles_iterable:
|
async for name, mlst_profile in mlst_profiles_iterable:
|
||||||
|
if mlst_profile is None:
|
||||||
|
failed.append(name)
|
||||||
|
continue
|
||||||
if writer is None:
|
if writer is None:
|
||||||
header = ["id", "st", "clonal-complex", *mlst_profile.alleles.keys()]
|
header = ["id", "st", "clonal-complex", *mlst_profile.alleles.keys()]
|
||||||
writer = csv.DictWriter(filehandle, fieldnames=header)
|
writer = csv.DictWriter(filehandle, fieldnames=header)
|
||||||
@@ -31,3 +38,4 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
|
|||||||
**dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
|
**dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
|
||||||
}
|
}
|
||||||
writer.writerow(rowdict=row_dictionary)
|
writer.writerow(rowdict=row_dictionary)
|
||||||
|
return failed
|
@@ -3,7 +3,7 @@ from io import TextIOWrapper
|
|||||||
from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
|
from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
|
||||||
from Bio import SeqIO
|
from Bio import SeqIO
|
||||||
|
|
||||||
from automlst.engine.data.genomics import NamedString
|
from automlst.engine.data.structures.genomics import NamedString
|
||||||
|
|
||||||
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
|
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
|
||||||
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
|
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
|
@@ -1,13 +0,0 @@
|
|||||||
from dataclasses import dataclass
|
|
||||||
from typing import Mapping, Sequence
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
|
||||||
class Allele:
|
|
||||||
allele_loci: str
|
|
||||||
allele_variant: str
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
|
||||||
class MLSTProfile:
|
|
||||||
alleles: Mapping[str, Sequence[Allele]]
|
|
||||||
sequence_type: int
|
|
||||||
clonal_complex: str
|
|
@@ -1,13 +1,15 @@
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from contextlib import AbstractAsyncContextManager
|
from contextlib import AbstractAsyncContextManager
|
||||||
|
from numbers import Number
|
||||||
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
|
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
|
||||||
|
|
||||||
from aiohttp import ClientSession, ClientTimeout
|
from aiohttp import ClientSession, ClientTimeout
|
||||||
|
|
||||||
from automlst.engine.data.genomics import NamedString
|
from automlst.engine.data.structures.genomics import NamedString
|
||||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
from automlst.engine.data.structures.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
|
||||||
|
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
|
||||||
|
|
||||||
class BigSDBMLSTProfiler(AbstractAsyncContextManager):
|
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
||||||
|
|
||||||
def __init__(self, database_api: str, database_name: str, schema_id: int):
|
def __init__(self, database_api: str, database_name: str, schema_id: int):
|
||||||
self._database_name = database_name
|
self._database_name = database_name
|
||||||
@@ -18,56 +20,85 @@ class BigSDBMLSTProfiler(AbstractAsyncContextManager):
|
|||||||
async def __aenter__(self):
|
async def __aenter__(self):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
|
async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
|
||||||
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
|
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
|
||||||
uri_path = "sequence"
|
uri_path = "sequence"
|
||||||
response = await self._http_client.post(uri_path, json={
|
response = await self._http_client.post(uri_path, json={
|
||||||
"sequence": sequence_string
|
"sequence": sequence_string,
|
||||||
|
"partial_matches": not exact
|
||||||
})
|
})
|
||||||
sequence_response: dict = await response.json()
|
sequence_response: dict = await response.json()
|
||||||
if "exact_matches" not in sequence_response:
|
|
||||||
# TODO throw exception for not finding matches.
|
|
||||||
pass
|
|
||||||
|
|
||||||
if "exact_matches" not in sequence_response:
|
if "exact_matches" in sequence_response:
|
||||||
raise ValueError(f"Unable to find exact matches in \"{self._database_name}\" under schema ID \"{self._schema_id}\".")
|
# loci -> list of alleles with id and loci
|
||||||
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
|
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
|
||||||
for allele_loci, alleles in exact_matches.items():
|
for allele_loci, alleles in exact_matches.items():
|
||||||
for allele in alleles:
|
for allele in alleles:
|
||||||
alelle_id = allele["allele_id"]
|
alelle_id = allele["allele_id"]
|
||||||
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
|
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
|
||||||
|
elif "partial_matches" in sequence_response:
|
||||||
|
if exact:
|
||||||
|
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
|
||||||
|
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
|
||||||
|
for allele_loci, partial_match in partial_matches.items():
|
||||||
|
if len(partial_match) <= 0:
|
||||||
|
continue
|
||||||
|
partial_match_profile = PartialAllelicMatchProfile(
|
||||||
|
percent_identity=float(partial_match["identity"]),
|
||||||
|
mismatches=int(partial_match["mismatches"]),
|
||||||
|
bitscore=float(partial_match["bitscore"]),
|
||||||
|
gaps=int(partial_match["gaps"])
|
||||||
|
)
|
||||||
|
yield Allele(
|
||||||
|
allele_loci=allele_loci,
|
||||||
|
allele_variant=str(partial_match["allele"]),
|
||||||
|
partial_match_profile=partial_match_profile
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
|
||||||
|
|
||||||
async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
|
|
||||||
|
|
||||||
|
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
|
||||||
uri_path = "designations"
|
uri_path = "designations"
|
||||||
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
|
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
|
||||||
async for allele in alleles:
|
if isinstance(alleles, AsyncIterable):
|
||||||
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
|
async for allele in alleles:
|
||||||
|
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
|
||||||
|
else:
|
||||||
|
for allele in alleles:
|
||||||
|
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
|
||||||
request_json = {
|
request_json = {
|
||||||
"designations": allele_request_dict
|
"designations": allele_request_dict
|
||||||
}
|
}
|
||||||
async with self._http_client.post(uri_path, json=request_json) as response:
|
async with self._http_client.post(uri_path, json=request_json) as response:
|
||||||
response_json = await response.json()
|
response_json: dict = await response.json()
|
||||||
if "fields" not in response_json:
|
|
||||||
# TODO raise exception about invalid parameters or no exact parameterization found
|
|
||||||
pass
|
|
||||||
schema_fields_returned = response_json["fields"]
|
|
||||||
schema_exact_matches: dict = response_json["exact_matches"]
|
|
||||||
allele_map: dict[str, list[Allele]] = defaultdict(list)
|
allele_map: dict[str, list[Allele]] = defaultdict(list)
|
||||||
|
response_json.setdefault("fields", dict())
|
||||||
|
schema_fields_returned: dict[str, str] = response_json["fields"]
|
||||||
|
schema_fields_returned.setdefault("ST", "unknown")
|
||||||
|
schema_fields_returned.setdefault("clonal_complex", "unknown")
|
||||||
|
schema_exact_matches: dict = response_json["exact_matches"]
|
||||||
for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
|
for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
|
||||||
for exact_match_allele in exact_match_alleles:
|
for exact_match_allele in exact_match_alleles:
|
||||||
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
|
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"], None))
|
||||||
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
|
if len(allele_map) == 0:
|
||||||
|
raise ValueError("Passed in no alleles.")
|
||||||
|
return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
|
||||||
|
|
||||||
async def profile_string(self, string: str) -> MLSTProfile:
|
async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
|
||||||
alleles = self.fetch_mlst_allele_variants(string)
|
alleles = self.fetch_mlst_allele_variants(string, exact)
|
||||||
return await self.fetch_mlst_st(alleles)
|
return await self.fetch_mlst_st(alleles)
|
||||||
|
|
||||||
|
|
||||||
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString]) -> AsyncGenerator[tuple[str, MLSTProfile], Any]:
|
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
|
||||||
async for named_string in namedStrings:
|
async for named_string in namedStrings:
|
||||||
yield (named_string.name, await self.profile_string(named_string.sequence))
|
try:
|
||||||
|
yield (named_string.name, await self.profile_string(named_string.sequence, exact))
|
||||||
|
except NoBIGSdbMatchesException as e:
|
||||||
|
if stop_on_fail:
|
||||||
|
raise e
|
||||||
|
yield (named_string.name, None)
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
await self._http_client.close()
|
await self._http_client.close()
|
||||||
@@ -107,7 +138,7 @@ class BIGSdbIndex(AbstractAsyncContextManager):
|
|||||||
async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
|
async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
|
||||||
known_databases = await self.get_known_seqdef_dbs()
|
known_databases = await self.get_known_seqdef_dbs()
|
||||||
if seqdef_db_name not in known_databases:
|
if seqdef_db_name not in known_databases:
|
||||||
raise ValueError(f"The database \"{seqdef_db_name}\" could not be found.")
|
raise NoSuchBIGSdbDatabaseException(seqdef_db_name)
|
||||||
return known_databases[seqdef_db_name]
|
return known_databases[seqdef_db_name]
|
||||||
|
|
||||||
async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
|
async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
|
||||||
@@ -124,8 +155,8 @@ class BIGSdbIndex(AbstractAsyncContextManager):
|
|||||||
self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
|
self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
|
||||||
return self._seqdefdb_schemas[seqdef_db_name] # type: ignore
|
return self._seqdefdb_schemas[seqdef_db_name] # type: ignore
|
||||||
|
|
||||||
async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BigSDBMLSTProfiler:
|
async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
|
||||||
return BigSDBMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
|
return BIGSdbMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
await self._http_client.close()
|
await self._http_client.close()
|
0
src/automlst/engine/data/structures/__init__.py
Normal file
0
src/automlst/engine/data/structures/__init__.py
Normal file
21
src/automlst/engine/data/structures/mlst.py
Normal file
21
src/automlst/engine/data/structures/mlst.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Mapping, Sequence, Union
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class PartialAllelicMatchProfile:
|
||||||
|
percent_identity: float
|
||||||
|
mismatches: int
|
||||||
|
bitscore: float
|
||||||
|
gaps: int
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class Allele:
|
||||||
|
allele_loci: str
|
||||||
|
allele_variant: str
|
||||||
|
partial_match_profile: Union[None, PartialAllelicMatchProfile]
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class MLSTProfile:
|
||||||
|
alleles: Mapping[str, Sequence[Allele]]
|
||||||
|
sequence_type: str
|
||||||
|
clonal_complex: str
|
21
src/automlst/engine/exceptions/database.py
Normal file
21
src/automlst/engine/exceptions/database.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
from typing import Union
|
||||||
|
|
||||||
|
class BIGSDbDatabaseAPIException(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
|
||||||
|
def __init__(self, database_name: str, database_schema_id: int, *args):
|
||||||
|
super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
|
||||||
|
|
||||||
|
class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException):
|
||||||
|
def __init__(self, database_name: str, database_schema_id: int, *args):
|
||||||
|
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
|
||||||
|
|
||||||
|
class NoSuchBIGSdbDatabaseException(BIGSDbDatabaseAPIException):
|
||||||
|
def __init__(self, database_name: str, *args):
|
||||||
|
super().__init__(f"No database \"{database_name}\" found.", *args)
|
||||||
|
|
||||||
|
class NoSuchBigSdbSchemaException(BIGSDbDatabaseAPIException):
|
||||||
|
def __init__(self, database_name: str, database_schema_id: int, *args):
|
||||||
|
super().__init__(f"No schema with ID {database_schema_id} in \"{database_name}\" found.", *args)
|
@@ -1,126 +0,0 @@
|
|||||||
import asyncio
|
|
||||||
from numbers import Number
|
|
||||||
from os import path
|
|
||||||
from typing import Any, AsyncGenerator, Collection, Iterable, Sequence, Union
|
|
||||||
from automlst.engine.data.genomics import NamedString, SangerTraceData
|
|
||||||
from Bio.SeqRecord import SeqRecord
|
|
||||||
from Bio import SeqIO, Align
|
|
||||||
|
|
||||||
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
|
|
||||||
|
|
||||||
|
|
||||||
def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
|
|
||||||
with open(seq_path, "rb") as seq_handle:
|
|
||||||
return SeqIO.read(seq_handle, "abi")
|
|
||||||
|
|
||||||
|
|
||||||
async def read_abif(seq_path: str) -> SangerTraceData:
|
|
||||||
ext = path.splitext(seq_path)[1]
|
|
||||||
if ext.lower() != ".ab1" and ext.lower() != "abi":
|
|
||||||
raise ValueError(
|
|
||||||
'seq_path must have file extension of "ab1", or "abi".')
|
|
||||||
biopython_seq = await asyncio.to_thread(_biopython_read_abif_sequence, seq_path)
|
|
||||||
biopython_annotations = biopython_seq.annotations
|
|
||||||
|
|
||||||
# Lot of type ignoring since Biopython did not define their typing.
|
|
||||||
biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore
|
|
||||||
trace_data = SangerTraceData(
|
|
||||||
path.basename(seq_path),
|
|
||||||
biopython_seq.seq,
|
|
||||||
biopython_abif_raw.get("APFN2"), # type: ignore
|
|
||||||
biopython_abif_raw.get("APrN1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("APrV1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("APrX1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("APXV1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("CMNT1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("CpEP1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("CTID1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("CTNM1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("CTTL1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DATA1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DATA2"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DATA3"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DATA4"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DATA5"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DATA6"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DATA7"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DATA8"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DSam1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DyeN1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DyeN2"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DyeN3"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DyeN4"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DyeW1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DyeW2"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DyeW3"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DyeW4"), # type: ignore
|
|
||||||
biopython_abif_raw.get("DySN1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("EPVt1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("EVNT1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("EVNT2"), # type: ignore
|
|
||||||
biopython_abif_raw.get("EVNT3"), # type: ignore
|
|
||||||
biopython_abif_raw.get("EVNT4"), # type: ignore
|
|
||||||
biopython_abif_raw.get("FWO_1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("GTyp1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("InSc1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("InVt1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("LANE1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("LIMS1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("LNTD1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("LsrP1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("MCHN1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("MODF1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("MODL1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("NAVG1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("NLNE1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("OfSc1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("PDMF1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("PXLB1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RGCm1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RGNm1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RMdV1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RMdX1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RMXV1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RPrN1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RPrV1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RUND1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RUND2"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RUND3"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RUND4"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RunN1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RUNT1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RUNT2"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RUNT3"), # type: ignore
|
|
||||||
biopython_abif_raw.get("RUNT4"), # type: ignore
|
|
||||||
biopython_abif_raw.get("Satd"), # type: ignore
|
|
||||||
biopython_abif_raw.get("Scal1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("SCAN1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("SMED1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("SMLt"), # type: ignore
|
|
||||||
biopython_abif_raw.get("SMPL1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("SVER1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("SVER3"), # type: ignore
|
|
||||||
biopython_abif_raw.get("Tmpr1"), # type: ignore
|
|
||||||
biopython_abif_raw.get("TUBE"), # type: ignore
|
|
||||||
biopython_abif_raw.get("User") # type: ignore
|
|
||||||
)
|
|
||||||
return trace_data
|
|
||||||
|
|
||||||
|
|
||||||
def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedString) -> tuple[NamedString, NamedString]:
    """Locally align ``query`` against ``reference`` with Biopython (unfinished).

    A blastn-scored local alignment is computed, but consensus assembly is not
    implemented yet, so this function always ends by raising
    ``NotImplementedError``.
    """
    local_aligner = Align.PairwiseAligner(scoring="blastn")
    local_aligner.mode = "local"
    candidate_alignments = sorted(local_aligner.align(reference.sequence, query.sequence))
    # The first alignment in sorted order is taken as the best one.
    alignment_result = candidate_alignments[0]
    # TODO actually assemble the consensus sequence here
    raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
|
|
||||||
|
|
||||||
|
|
||||||
async def reference_consensus_assembly(reference: Union[NamedString, str], sanger_traces: Iterable[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
    """Placeholder for reference-guided consensus assembly of Sanger traces.

    ``reference`` is either a ready ``NamedString`` or a string identifier that
    is resolved through ``fetch_ncbi_genbank``. One ``NamedString("NA", "NA")``
    placeholder is yielded per trace, after which ``NotImplementedError`` is
    raised because the real assembly does not exist yet.
    """
    if not isinstance(reference, str):
        reference_seq: NamedString = reference
    else:
        fetched_record = await fetch_ncbi_genbank(reference)
        reference_seq = NamedString(name=reference, sequence=fetched_record.sequence)
    for _trace in sanger_traces:
        yield NamedString("NA", "NA")
    raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
|
|
@@ -1,27 +0,0 @@
|
|||||||
import asyncio
|
|
||||||
from Bio import Entrez
|
|
||||||
from Bio import SeqIO
|
|
||||||
|
|
||||||
# TODO Change this out for a more professional approach
|
|
||||||
Entrez.email = "yunyangdeng@outlook.com"
|
|
||||||
|
|
||||||
from automlst.engine.data.genomics import AnnotatedString, StringAnnotation
|
|
||||||
|
|
||||||
|
|
||||||
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    """Fetch a GenBank nucleotide record from NCBI as an ``AnnotatedString``.

    Args:
        genbank_id: Identifier passed to ``Entrez.efetch`` as ``id``; it is
            also used as the name of the returned string.

    Returns:
        An ``AnnotatedString`` holding the record's sequence and one
        ``StringAnnotation`` per feature of the record.
    """
    # Entrez.efetch blocks on the network, so it is run in a worker thread.
    with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
        record = SeqIO.read(fetch_stream, "genbank")
    sequence_features = []
    for feature in record.features:
        # Build a fresh mapping instead of rewriting the parsed record's
        # qualifiers in place; each qualifier's values become a set.
        feature_properties = {
            qualifier_key: set(qualifier_values)
            for qualifier_key, qualifier_values in feature.qualifiers.items()
        }
        sequence_features.append(StringAnnotation(
            type=feature.type,
            start=int(feature.location.start),
            # Biopython locations are already Python-style (end-exclusive),
            # so the end is used as-is; adding 1 would cover an extra base.
            end=int(feature.location.end),
            feature_properties=feature_properties,
        ))
    return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
|
|
@@ -1,4 +1,4 @@
|
|||||||
from automlst.engine.local.fasta import read_fasta
|
from automlst.engine.data.local.fasta import read_fasta
|
||||||
|
|
||||||
|
|
||||||
async def test_fasta_reader_not_none():
|
async def test_fasta_reader_not_none():
|
244
tests/automlst/engine/data/remote/databases/test_bigsdb.py
Normal file
244
tests/automlst/engine/data/remote/databases/test_bigsdb.py
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
import random
|
||||||
|
import re
|
||||||
|
from typing import Collection, Sequence, Union
|
||||||
|
from Bio import SeqIO
|
||||||
|
import pytest
|
||||||
|
from automlst.engine.data.structures.genomics import NamedString
|
||||||
|
from automlst.engine.data.structures.mlst import Allele, MLSTProfile
|
||||||
|
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
|
||||||
|
from automlst.engine.data.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
|
||||||
|
|
||||||
|
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ("A", "T", "C", "G")) -> str:
    """Return a deterministically mutated copy of ``gene``.

    The random generator is seeded with ``gene`` itself, so the same inputs
    always produce the same output.

    Args:
        gene: The sequence to mutate.
        mutation_site_count: Number of positions to draw. A ``float`` is
            interpreted as a fraction of ``len(gene)``. Positions are drawn
            with replacement, and a drawn position may receive the symbol it
            already had, so this is an upper bound on the number of changes.
        alphabet: Symbols to substitute in at the drawn positions.

    Returns:
        The mutated sequence, always the same length as ``gene``.
    """
    if not gene:
        # Nothing to mutate; also avoids drawing positions from an empty range.
        return gene
    rand = random.Random(gene)
    if isinstance(mutation_site_count, float):
        mutation_site_count = int(mutation_site_count * len(gene))
    random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
    scrambled = list(gene)
    for random_location in random_locations:
        scrambled[random_location] = rand.choice(alphabet)
    return "".join(scrambled)
|
||||||
|
|
||||||
|
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
    """Exact matching of Tohama I must report variant '1' for every expected locus."""
    genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    sequence = str(genome_record.seq)
    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as pasteur_profiler:
        # Loci are struck off as they are reported; set.remove raises on an
        # unexpected or repeated locus.
        pending_loci = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
        allele_stream = pasteur_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True)
        async for reported_allele in allele_stream:
            assert isinstance(reported_allele, Allele)
            assert reported_allele.allele_variant == '1'  # All of Tohama I has allele id 1
            pending_loci.remove(reported_allele.allele_loci)

    assert len(pending_loci) == 0
|
||||||
|
|
||||||
|
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
    """Scrambled MLST genes queried non-exactly must come back as partial matches."""
    gene_tag_pattern = re.compile(r".*\[gene=([\w\d]+)\].*")
    remaining_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
    coding_records = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta"))
    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as profiler:
        for coding_record in coding_records:
            tag_match = gene_tag_pattern.fullmatch(coding_record.description)
            if tag_match is None:
                continue
            locus = tag_match.group(1).lower()
            if locus not in remaining_targets:
                continue
            # Mutate 12.5% of the positions so an exact match is unlikely.
            mutated_gene = gene_scrambler(str(coding_record.seq), 0.125)
            async for partial_match in profiler.fetch_mlst_allele_variants(mutated_gene, False):
                assert partial_match.partial_match_profile is not None
                remaining_targets.remove(locus)

    assert len(remaining_targets) == 0
|
||||||
|
|
||||||
|
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
    """Seven '1' alleles must resolve to sequence type 1 in the ST-2 complex."""
    async def tohama_allele_stream():
        # Async source of the seven expected alleles, all variant "1".
        for locus in ("adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"):
            yield Allele(locus, "1", None)

    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as pasteur_profiler:
        resolved_profile = await pasteur_profiler.fetch_mlst_st(tohama_allele_stream())
        assert resolved_profile is not None
        assert isinstance(resolved_profile, MLSTProfile)
        assert resolved_profile.clonal_complex == "ST-2 complex"
        assert resolved_profile.sequence_type == "1"
|
||||||
|
|
||||||
|
async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
    """The allele combination below must resolve to an 'unknown' ST and clonal complex."""
    locus_variants = (
        ("adk", "1"),
        ("fumC", "2"),
        ("glyA", "36"),
        ("tyrB", "4"),
        ("icd", "4"),
        ("pepA", "1"),
        ("pgm", "5"),
    )
    dummy_alleles = [Allele(locus, variant, None) for locus, variant in locus_variants]
    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as pasteur_profiler:
        resolved_profile = await pasteur_profiler.fetch_mlst_st(dummy_alleles)
        assert resolved_profile.clonal_complex == "unknown"
        assert resolved_profile.sequence_type == "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
async def test_institutpasteur_sequence_profiling_is_correct():
    """Profiling the Tohama I genome must give sequence type 1 in the ST-2 complex."""
    genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    genome_text = str(genome_record.seq)
    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as pasteur_profiler:
        resolved_profile = await pasteur_profiler.profile_string(genome_text)
        assert resolved_profile is not None
        assert isinstance(resolved_profile, MLSTProfile)
        assert resolved_profile.clonal_complex == "ST-2 complex"
        assert resolved_profile.sequence_type == "1"
|
||||||
|
|
||||||
|
|
||||||
|
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
    """Exact matching of FDAARGOS_1560 on PubMLST must report exactly the expected alleles."""
    expected_variants = {
        "adk": "1",
        "atpG": "1",
        "frdB": "1",
        "fucK": "1",
        "mdh": "1",
        "pgi": "1",
        "recA": "5",
    }
    # Set of alleles still awaited; set.remove raises on anything unexpected.
    pending_alleles = {Allele(locus, variant, None) for locus, variant in expected_variants.items()}
    genome_text = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with BIGSdbMLSTProfiler(
        database_api="https://rest.pubmlst.org/",
        database_name="pubmlst_hinfluenzae_seqdef",
        schema_id=1,
    ) as pubmlst_profiler:
        async for reported_allele in pubmlst_profiler.fetch_mlst_allele_variants(sequence_string=genome_text, exact=True):
            assert isinstance(reported_allele, Allele)
            pending_alleles.remove(reported_allele)

    assert len(pending_alleles) == 0
|
||||||
|
|
||||||
|
async def test_pubmlst_profiling_results_in_correct_st():
    """The allele set below must resolve to sequence type 3 in the ST-3 complex."""
    async def allele_stream():
        # Async source of the seven alleles handed to the profiler.
        locus_variants = (
            ("adk", "1"),
            ("atpG", "1"),
            ("frdB", "1"),
            ("fucK", "1"),
            ("mdh", "1"),
            ("pgi", "1"),
            ("recA", "5"),
        )
        for locus, variant in locus_variants:
            yield Allele(locus, variant, None)

    async with BIGSdbMLSTProfiler(
        database_api="https://rest.pubmlst.org/",
        database_name="pubmlst_hinfluenzae_seqdef",
        schema_id=1,
    ) as pubmlst_profiler:
        resolved_profile = await pubmlst_profiler.fetch_mlst_st(allele_stream())
        assert resolved_profile is not None
        assert isinstance(resolved_profile, MLSTProfile)
        assert resolved_profile.clonal_complex == "ST-3 complex"
        assert resolved_profile.sequence_type == "3"
|
||||||
|
|
||||||
|
async def test_pubmlst_sequence_profiling_is_correct():
    """Profiling the FDAARGOS_1560 genome must give sequence type 3 in the ST-3 complex."""
    genome_record = SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta")
    genome_text = str(genome_record.seq)
    async with BIGSdbMLSTProfiler(
        database_api="https://rest.pubmlst.org/",
        database_name="pubmlst_hinfluenzae_seqdef",
        schema_id=1,
    ) as pubmlst_profiler:
        resolved_profile = await pubmlst_profiler.profile_string(genome_text)
        assert resolved_profile is not None
        assert isinstance(resolved_profile, MLSTProfile)
        assert resolved_profile.clonal_complex == "ST-3 complex"
        assert resolved_profile.sequence_type == "3"
|
||||||
|
|
||||||
|
async def test_bigsdb_index_all_databases_is_not_empty():
    """The index must report at least one known seqdef database."""
    async with BIGSdbIndex() as index:
        known_databases = await index.get_known_seqdef_dbs()
        assert len(known_databases) > 0
|
||||||
|
|
||||||
|
async def test_bigsdb_index_references_pubmlst_correctly():
    """The H. influenzae seqdef database must map to the PubMLST REST API."""
    async with BIGSdbIndex() as index:
        resolved_api = await index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")
        assert resolved_api == "https://rest.pubmlst.org"
|
||||||
|
|
||||||
|
async def test_bigsdb_index_references_institutpasteur_correctly():
    """The Bordetella seqdef database must map to the Institut Pasteur API."""
    async with BIGSdbIndex() as index:
        resolved_api = await index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")
        assert resolved_api == "https://bigsdb.pasteur.fr/api"
|
||||||
|
|
||||||
|
|
||||||
|
async def test_bigsdb_index_instantiates_correct_profiler():
    """A profiler built through the index must profile Tohama I as ST 1 / ST-2 complex."""
    genome_text = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with BIGSdbIndex() as index:
        built_profiler = await index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3)
        async with built_profiler as profiler:
            resolved_profile = await profiler.profile_string(genome_text)
            assert resolved_profile.clonal_complex == "ST-2 complex"
            assert resolved_profile.sequence_type == "1"
|
||||||
|
|
||||||
|
async def test_bigsdb_profile_multiple_strings_same_string_twice():
    """Profiling the same genome under two names must give ST 1 / ST-2 complex both times."""
    genome_text = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    named_inputs = [NamedString("seq1", genome_text), NamedString("seq2", genome_text)]

    async def named_input_stream():
        # Async source of the named sequences handed to the profiler.
        for named_input in named_inputs:
            yield named_input

    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as pasteur_profiler:
        async for _name, resolved_profile in pasteur_profiler.profile_multiple_strings(named_input_stream()):
            assert resolved_profile is not None
            assert isinstance(resolved_profile, MLSTProfile)
            assert resolved_profile.clonal_complex == "ST-2 complex"
            assert resolved_profile.sequence_type == "1"
|
||||||
|
|
||||||
|
async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop():
    """With exact matching, the scrambled entry yields ``None`` while the others still profile."""
    genome_text = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    named_inputs = [
        NamedString("seq1", genome_text),
        NamedString("should_fail", gene_scrambler(genome_text, 0.3)),
        NamedString("seq3", genome_text),
    ]

    async def named_input_stream():
        # Async source of the named sequences handed to the profiler.
        for named_input in named_inputs:
            yield named_input

    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as pasteur_profiler:
        async for name, resolved_profile in pasteur_profiler.profile_multiple_strings(named_input_stream(), True):
            if name == "should_fail":
                assert resolved_profile is None
                continue
            assert resolved_profile is not None
            assert isinstance(resolved_profile, MLSTProfile)
            assert resolved_profile.clonal_complex == "ST-2 complex"
            assert resolved_profile.sequence_type == "1"
|
||||||
|
|
||||||
|
async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop():
    """With non-exact matching, the scrambled entry yields an 'unknown' profile that still has alleles."""
    genome_text = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    named_inputs = [
        NamedString("seq1", genome_text),
        NamedString("should_fail", gene_scrambler(genome_text, 0.3)),
        NamedString("seq3", genome_text),
    ]

    async def named_input_stream():
        # Async source of the named sequences handed to the profiler.
        for named_input in named_inputs:
            yield named_input

    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as pasteur_profiler:
        async for name, resolved_profile in pasteur_profiler.profile_multiple_strings(named_input_stream(), False):
            # Every entry, scrambled or not, must produce a profile object.
            assert resolved_profile is not None
            if name == "should_fail":
                assert resolved_profile.clonal_complex == "unknown"
                assert resolved_profile.sequence_type == "unknown"
                assert len(resolved_profile.alleles) > 0
            else:
                assert isinstance(resolved_profile, MLSTProfile)
                assert resolved_profile.clonal_complex == "ST-2 complex"
                assert resolved_profile.sequence_type == "1"
|
||||||
|
|
||||||
|
async def test_bigsdb_profile_multiple_strings_fail_second_stop():
    """With ``stop_on_fail`` set, the foreign genome must raise ``NoBIGSdbMatchesException``."""
    matching_genome = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    foreign_genome = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    named_inputs = [
        NamedString("seq1", matching_genome),
        NamedString("should_fail", foreign_genome),
        NamedString("seq3", matching_genome),
    ]

    async def named_input_stream():
        # Async source of the named sequences handed to the profiler.
        for named_input in named_inputs:
            yield named_input

    async with BIGSdbMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as pasteur_profiler:
        with pytest.raises(NoBIGSdbMatchesException):
            profile_stream = pasteur_profiler.profile_multiple_strings(named_input_stream(), exact=True, stop_on_fail=True)
            async for name, resolved_profile in profile_stream:
                if name == "should_fail":
                    pytest.fail("Exception should have been thrown, no exception was thrown.")
                else:
                    assert resolved_profile is not None
                    assert isinstance(resolved_profile, MLSTProfile)
                    assert resolved_profile.clonal_complex == "ST-2 complex"
                    assert resolved_profile.sequence_type == "1"
|
||||||
|
|
||||||
|
async def test_bigsdb_index_get_schemas_for_bordetella():
    """The Bordetella seqdef database must expose an "MLST" schema with an integer id."""
    async with BIGSdbIndex() as bigsdb_index:
        bordetella_schemas = await bigsdb_index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
        schema_names = bordetella_schemas.keys()
        assert len(schema_names) > 0
        assert "MLST" in bordetella_schemas
        mlst_schema_id = bordetella_schemas["MLST"]
        assert isinstance(mlst_schema_id, int)
|
||||||
|
|
||||||
|
async def test_bigsdb_index_get_databases_has_only_seqdef():
    """Every known database name must end in "seqdef"; Bordetella must map to Institut Pasteur."""
    async with BIGSdbIndex() as bigsdb_index:
        known_databases = await bigsdb_index.get_known_seqdef_dbs()
        database_names = known_databases.keys()
        assert len(database_names) > 0
        for candidate_name in known_databases.keys():
            assert candidate_name.endswith("seqdef")
        bordetella_api = known_databases["pubmlst_bordetella_seqdef"]
        assert bordetella_api == "https://bigsdb.pasteur.fr/api"
|
@@ -1,12 +0,0 @@
|
|||||||
import os
|
|
||||||
|
|
||||||
from automlst.engine.local.abif import read_abif, reference_consensus_assembly
|
|
||||||
|
|
||||||
async def test_load_sanger_sequence_has_data():
    """Reading the bundled forward trace must return a non-None result."""
    trace_path = "tests/resources/1I1_F_P1815443_047.ab1"
    # Fail early with a clear assertion if the fixture file is missing.
    assert os.path.exists(trace_path)
    loaded_trace = await read_abif(trace_path)
    assert loaded_trace is not None
|
|
||||||
|
|
||||||
async def test_consensus_assembly_with_ncbi():
    """Build (but do not yet consume) a consensus assembly from the forward and reverse traces."""
    forward_trace = await read_abif("tests/resources/1I1_F_P1815443_047.ab1")
    reverse_trace = await read_abif("tests/resources/1I1_R_P1815443_094.ab1")
    # NOTE: the returned object is never iterated here, so no assembly is exercised yet.
    consensus = reference_consensus_assembly("ON685494.1", [forward_trace, reverse_trace])
    # TODO complete implementing this
|
|
@@ -1,5 +0,0 @@
|
|||||||
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
|
|
||||||
|
|
||||||
|
|
||||||
async def test_fetch_ncbi_genbank_with_id_works():
    """Fetching CP011448.1 must return a record with a non-empty sequence."""
    fetched_record = await fetch_ncbi_genbank("CP011448.1")
    assert len(fetched_record.sequence) > 0
|
|
@@ -1,115 +0,0 @@
|
|||||||
from Bio import SeqIO
|
|
||||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
|
||||||
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler
|
|
||||||
|
|
||||||
|
|
||||||
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
    """Matching Tohama I must report variant '1' for every expected locus."""
    genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    genome_text = str(genome_record.seq)
    async with BigSDBMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as pasteur_profiler:
        allele_stream = pasteur_profiler.fetch_mlst_allele_variants(sequence_string=genome_text)
        # Loci are struck off as they are reported; set.remove raises on an
        # unexpected or repeated locus.
        pending_loci = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
        async for reported_allele in allele_stream:
            assert isinstance(reported_allele, Allele)
            assert reported_allele.allele_variant == '1'  # All of Tohama I has allele id 1
            pending_loci.remove(reported_allele.allele_loci)

    assert len(pending_loci) == 0
|
|
||||||
|
|
||||||
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
    """Seven '1' alleles must resolve to sequence type 1 in the ST-2 complex."""
    async def tohama_allele_stream():
        # Async source of the seven expected alleles, all variant "1".
        for locus in ("adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"):
            yield Allele(locus, "1")

    async with BigSDBMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as pasteur_profiler:
        resolved_profile = await pasteur_profiler.fetch_mlst_st(tohama_allele_stream())
        assert resolved_profile is not None
        assert isinstance(resolved_profile, MLSTProfile)
        assert resolved_profile.clonal_complex == "ST-2 complex"
        assert resolved_profile.sequence_type == "1"
|
|
||||||
|
|
||||||
async def test_institutpasteur_sequence_profiling_is_correct():
    """Profiling the Tohama I genome must give sequence type 1 in the ST-2 complex."""
    genome_record = SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta")
    genome_text = str(genome_record.seq)
    async with BigSDBMLSTProfiler(
        database_api="https://bigsdb.pasteur.fr/api",
        database_name="pubmlst_bordetella_seqdef",
        schema_id=3,
    ) as pasteur_profiler:
        resolved_profile = await pasteur_profiler.profile_string(genome_text)
        assert resolved_profile is not None
        assert isinstance(resolved_profile, MLSTProfile)
        assert resolved_profile.clonal_complex == "ST-2 complex"
        assert resolved_profile.sequence_type == "1"
|
|
||||||
|
|
||||||
|
|
||||||
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
    """Matching FDAARGOS_1560 on PubMLST must report exactly the expected alleles."""
    expected_variants = {
        "adk": "1",
        "atpG": "1",
        "frdB": "1",
        "fucK": "1",
        "mdh": "1",
        "pgi": "1",
        "recA": "5",
    }
    # Set of alleles still awaited; set.remove raises on anything unexpected.
    pending_alleles = {Allele(locus, variant) for locus, variant in expected_variants.items()}
    genome_text = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
    async with BigSDBMLSTProfiler(
        database_api="https://rest.pubmlst.org/",
        database_name="pubmlst_hinfluenzae_seqdef",
        schema_id=1,
    ) as pubmlst_profiler:
        async for reported_allele in pubmlst_profiler.fetch_mlst_allele_variants(sequence_string=genome_text):
            assert isinstance(reported_allele, Allele)
            pending_alleles.remove(reported_allele)

    assert len(pending_alleles) == 0
|
|
||||||
|
|
||||||
async def test_pubmlst_profiling_results_in_correct_st():
    """The allele set below must resolve to sequence type 3 in the ST-3 complex."""
    async def allele_stream():
        # Async source of the seven alleles handed to the profiler.
        locus_variants = (
            ("adk", "1"),
            ("atpG", "1"),
            ("frdB", "1"),
            ("fucK", "1"),
            ("mdh", "1"),
            ("pgi", "1"),
            ("recA", "5"),
        )
        for locus, variant in locus_variants:
            yield Allele(locus, variant)

    async with BigSDBMLSTProfiler(
        database_api="https://rest.pubmlst.org/",
        database_name="pubmlst_hinfluenzae_seqdef",
        schema_id=1,
    ) as pubmlst_profiler:
        resolved_profile = await pubmlst_profiler.fetch_mlst_st(allele_stream())
        assert resolved_profile is not None
        assert isinstance(resolved_profile, MLSTProfile)
        assert resolved_profile.clonal_complex == "ST-3 complex"
        assert resolved_profile.sequence_type == "3"
|
|
||||||
|
|
||||||
async def test_pubmlst_sequence_profiling_is_correct():
    """Profiling the FDAARGOS_1560 genome must give sequence type 3 in the ST-3 complex."""
    genome_record = SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta")
    genome_text = str(genome_record.seq)
    async with BigSDBMLSTProfiler(
        database_api="https://rest.pubmlst.org/",
        database_name="pubmlst_hinfluenzae_seqdef",
        schema_id=1,
    ) as pubmlst_profiler:
        resolved_profile = await pubmlst_profiler.profile_string(genome_text)
        assert resolved_profile is not None
        assert isinstance(resolved_profile, MLSTProfile)
        assert resolved_profile.clonal_complex == "ST-3 complex"
        assert resolved_profile.sequence_type == "3"
|
|
||||||
|
|
||||||
async def test_bigsdb_index_all_databases_is_not_empty():
    """The index must report at least one known seqdef database."""
    async with BIGSdbIndex() as index:
        known_databases = await index.get_known_seqdef_dbs()
        assert len(known_databases) > 0
|
|
||||||
|
|
||||||
async def test_bigsdb_index_references_pubmlst_correctly():
    """The H. influenzae seqdef database must map to the PubMLST REST API."""
    async with BIGSdbIndex() as index:
        resolved_api = await index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")
        assert resolved_api == "https://rest.pubmlst.org"
|
|
||||||
|
|
||||||
async def test_bigsdb_index_references_institutpasteur_correctly():
    """The Bordetella seqdef database must map to the Institut Pasteur API."""
    async with BIGSdbIndex() as index:
        resolved_api = await index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")
        assert resolved_api == "https://bigsdb.pasteur.fr/api"
|
|
||||||
|
|
||||||
|
|
||||||
async def test_bigsdb_index_instantiates_correct_profiler():
    """A profiler built through the index must profile Tohama I as ST 1 / ST-2 complex."""
    genome_text = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with BIGSdbIndex() as index:
        built_profiler = await index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3)
        async with built_profiler as profiler:
            resolved_profile = await profiler.profile_string(genome_text)
            assert resolved_profile.clonal_complex == "ST-2 complex"
            assert resolved_profile.sequence_type == "1"
|
|
68196
tests/resources/12822.fasta
Normal file
68196
tests/resources/12822.fasta
Normal file
File diff suppressed because it is too large
Load Diff
59120
tests/resources/tohama_I_bpertussis_coding.fasta
Normal file
59120
tests/resources/tohama_I_bpertussis_coding.fasta
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user