Compare commits
29 Commits
ab44dfaa48
611b956d88
cb22dfac9b
7ea7ead46a
a3c864b565
bad7dfc3a8
4fe0f0f287
3fd3ef9f20
206a105bf9
ad082b8af6
5118a25b6a
cc6a7563ca
484e31879e
5449ae0c68
e634647774
f20a656f45
0c0a2c9d4c
03fbbe542e
e60dba936c
9589761ddd
2843d0d592
7bd28db6d4
463e320386
d4f890a150
022200f197
e66525d341
1d531aff42
8febfad282
42bcfcf61d
.gitignore (vendored): 1 line changed
@@ -357,3 +357,4 @@ package
# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option)

output
*.private.*
Jenkinsfile (vendored): 37 lines changed
@@ -14,8 +14,9 @@ pipeline {
}
stage("unit tests") {
steps {
sh returnStatus: true, script: "python -m pytest --junitxml=test_results.xml"
sh returnStatus: true, script: "python -m pytest --junitxml=test_results.xml --cov=src --cov-report xml:coverage.xml"
xunit checksName: '', tools: [JUnit(excludesPattern: '', pattern: 'test_results.xml', stopProcessingIfError: true)]
recordCoverage(tools: [[parser: 'COBERTURA', pattern: 'coverage.xml']])
}
}
stage("build") {
@@ -23,26 +24,32 @@ pipeline {
sh "python -m build"
}
}
stage("test installation") {
steps {
sh "python -m pip install dist/*.whl --force-reinstall"
sh "automlst -h"
}
}
stage("archive") {
steps {
archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true
}
}
stage("publish") {
environment {
CREDS = credentials('4d6f64be-d26d-4f95-8de3-b6a9b0beb311')
}
when {
branch '**/main'
}
steps {
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/${CREDS_USR}/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
parallel {
stage ("git.reslate.systems") {
environment {
TOKEN = credentials('git.reslate.systems')
}
steps {
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*'
}
}
stage ("pypi.org") {
when {
tag '*.*'
}
environment {
TOKEN = credentials('pypi.org')
}
steps {
sh returnStatus: true, script: 'python -m twine upload -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*'
}
}
}
}
}
README.md: 26 lines changed
@@ -1,3 +1,25 @@
# FASTA-MLST
# autoMLST.Engine

A CLI tool for rapidly performing MLST typing via accessing pubMLST and InstitutPasteur MSLT databases.
A python library implementing common BIGSdb MLST schemes and databases. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.

## Features

Briefly, this library can:
- Import multiple `FASTA` files
- Fetch the available BIGSdb databases that is currently live and available
- Fetch the available BIGSdb database schemas for a given MLST database
- Retrieve exact/non-exact MLST allele variant IDs based off a sequence
- Retrieve MLST sequence type IDs based off a sequence
- Output all results to a single CSV

Furthermore, this library is highly asynchronous where any potentially blocking operation, ranging from parsing FASTAs to performing HTTP requests are at least asynchronous, if not fully multithreaded.

## Usage

This library can be installed through pip. Learn how to [setup and install pip first](https://pip.pypa.io/en/stable/installation/).

Then, it's as easy as running `pip install automlst-engine` in any terminal that has pip in it's path (any terminal where `pip --version` returns a valid version and install path).

### CLI usage

This is a independent python library and thus does not have any form of direct user interface. One way of using it could be to create your own Python script that makes calls to this libraries functions. Alternatively, you may use `automlst-cli`, a `Python` package that implements a CLI for calling this library.
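For example, a minimal script driving the library directly might look like the sketch below. It assumes the module path and coroutine names that appear in the tests added in this comparison (`BIGSdbIndex`, `build_profiler_from_seqdefdb`, `profile_string`); the FASTA path, database name, and schema ID are placeholders.

```python
import asyncio

from Bio import SeqIO

from automlst.engine.data.remote.databases.bigsdb import BIGSdbIndex


async def main() -> None:
    # Placeholder input; any assembled FASTA sequence works here.
    sequence = str(SeqIO.read("my_assembly.fasta", "fasta").seq)
    async with BIGSdbIndex() as bigsdb_index:
        # Database name and schema ID mirror the values used in the added tests.
        async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
            profile = await profiler.profile_string(sequence)
            print(profile.sequence_type, profile.clonal_complex)


if __name__ == "__main__":
    asyncio.run(main())
```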
pyproject.toml
@@ -3,18 +3,16 @@ requires = ["setuptools>=64", "setuptools_scm>=8"]
build-backend = "setuptools.build_meta"

[project]
name = "automlst"
name = "automlst.engine"
dynamic = ["version"]
readme = "README.md"

dependencies = [
"biopython",
"aiohttp[speedups]",
]
requires-python = ">=3.11"
description = "A tool to rapidly fetch fetch MLST profiles given sequences for various diseases."

[project.scripts]
automlst = "automlst.cli.program:run"
description = "A library to rapidly fetch fetch MLST profiles given sequences for various diseases."

[tool.setuptools_scm]
@@ -4,4 +4,5 @@ pytest
pytest-asyncio
build
twine
setuptools_scm
setuptools_scm
pytest-cov
@@ -1,48 +0,0 @@
|
||||
|
||||
from argparse import ArgumentParser
|
||||
import asyncio
|
||||
import datetime
|
||||
from automlst.engine.local.csv import write_mlst_profiles_as_csv
|
||||
from automlst.engine.local.fasta import read_multiple_fastas
|
||||
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
|
||||
|
||||
|
||||
def setup_parser(parser: ArgumentParser):
|
||||
parser.description = "Returns MLST exact profile matches."
|
||||
parser.add_argument(
|
||||
"fastas",
|
||||
nargs="+",
|
||||
action='extend',
|
||||
default=[],
|
||||
type=str,
|
||||
help="The FASTA files to process. Multiple can be listed."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"seqdefdb",
|
||||
help="The BIGSdb seqdef database to use for typing."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"schema",
|
||||
type=int,
|
||||
help="The BIGSdb seqdef database schema ID (integer) to use for typing."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"out",
|
||||
default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
|
||||
help="The output CSV name (.csv will be appended)."
|
||||
)
|
||||
parser.set_defaults(func=run_asynchronously)
|
||||
|
||||
async def run(args):
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
gen_strings = read_multiple_fastas(args.fastas)
|
||||
async with await bigsdb_index.build_profiler_from_seqdefdb(args.seqdefdb, args.schema) as mlst_profiler:
|
||||
mlst_profiles = mlst_profiler.profile_multiple_strings(gen_strings)
|
||||
await write_mlst_profiles_as_csv(mlst_profiles, args.out)
|
||||
|
||||
def run_asynchronously(args):
|
||||
asyncio.run(run(args))
|
||||
|
@@ -1,44 +0,0 @@
|
||||
from argparse import ArgumentParser
|
||||
import asyncio
|
||||
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
|
||||
|
||||
|
||||
def setup_parser(parser: ArgumentParser):
|
||||
parser.description = "Fetches the latest BIGSdb MLST database definitions."
|
||||
parser.usage = "test"
|
||||
parser.add_argument(
|
||||
"--retrieve-bigsdbs", "-l",
|
||||
action="store_true",
|
||||
dest="list_dbs",
|
||||
required=False,
|
||||
default=False,
|
||||
help="Lists all known BIGSdb MLST databases (fetched from known APIs and cached)."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--retrieve-bigsdb-schemas", "-lschemas",
|
||||
nargs="+",
|
||||
action="extend",
|
||||
dest="list_bigsdb_schemas",
|
||||
required=False,
|
||||
default=[],
|
||||
type=str,
|
||||
help="Lists the known schema IDs for a given BIGSdb sequence definition database name. The name, and then the ID of the schema is given."
|
||||
)
|
||||
|
||||
parser.set_defaults(func=run_asynchronously)
|
||||
|
||||
async def run(args):
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
if args.list_dbs:
|
||||
known_seqdef_dbs = await bigsdb_index.get_known_seqdef_dbs(force=False)
|
||||
print("\n".join(known_seqdef_dbs.keys()))
|
||||
|
||||
for bigsdb_schema_name in args.list_bigsdb_schemas:
|
||||
schemas = await bigsdb_index.get_schemas_for_seqdefdb(bigsdb_schema_name)
|
||||
for schema_desc, schema_id in schemas.items():
|
||||
print(f"{schema_desc}: {schema_id}")
|
||||
|
||||
def run_asynchronously(args):
|
||||
asyncio.run(run(args))
|
||||
|
@@ -1,2 +0,0 @@
def get_module_base_name(name):
    return name.split(".")[-1]
@@ -1,27 +0,0 @@
import argparse
import asyncio
import datetime
from os import path
import os

from automlst.cli import exactmatch, info
from automlst.cli.meta import get_module_base_name
from automlst.engine.data.genomics import NamedString
from automlst.engine.local.abif import read_abif
from automlst.engine.local.csv import write_mlst_profiles_as_csv
from automlst.engine.local.fasta import read_fasta
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex

root_parser = argparse.ArgumentParser()
subparsers = root_parser.add_subparsers(required=True)

info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
exactmatch.setup_parser(subparsers.add_parser(get_module_base_name(exactmatch.__name__)))


def run():
    args = root_parser.parse_args()
    args.func(args)

if __name__ == "__main__":
    run()
@@ -3,23 +3,30 @@ from io import TextIOWrapper
from os import PathLike
from typing import AsyncIterable, Iterable, Mapping, Sequence, Union

from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.structures.mlst import Allele, MLSTProfile


def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
result_dict: dict[str, list[str]] = {}
result_dict: dict[str, Union[list[str], str]] = {}
for loci, alleles in alleles_map.items():
result_dict[loci] = list()
if len(alleles) == 1:
result_dict[loci] = alleles[0].allele_variant
for allele in alleles:
result_dict[loci].append(allele.allele_variant)
result_locis = list()
result_locis.append(allele.allele_variant)
result_dict[loci] = result_locis
return result_dict


async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, MLSTProfile]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
failed = list()
with open(handle, "w", newline='') as filehandle:
header = None
writer: Union[csv.DictWriter, None] = None
async for name, mlst_profile in mlst_profiles_iterable:
if mlst_profile is None:
failed.append(name)
continue
if writer is None:
header = ["id", "st", "clonal-complex", *mlst_profile.alleles.keys()]
writer = csv.DictWriter(filehandle, fieldnames=header)
@@ -30,4 +37,5 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
"id": name,
**dict_loci_alleles_variants_from_loci(mlst_profile.alleles)
}
writer.writerow(rowdict=row_dictionary)
writer.writerow(rowdict=row_dictionary)
return failed
@@ -3,7 +3,7 @@ from io import TextIOWrapper
from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
from Bio import SeqIO

from automlst.engine.data.genomics import NamedString
from automlst.engine.data.structures.genomics import NamedString

async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
    fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
@@ -1,13 +0,0 @@
from dataclasses import dataclass
from typing import Mapping, Sequence

@dataclass(frozen=True)
class Allele:
    allele_loci: str
    allele_variant: str

@dataclass(frozen=True)
class MLSTProfile:
    alleles: Mapping[str, Sequence[Allele]]
    sequence_type: int
    clonal_complex: str
@@ -1,13 +1,15 @@
|
||||
from collections import defaultdict
|
||||
from contextlib import AbstractAsyncContextManager
|
||||
from numbers import Number
|
||||
from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, Iterable, Mapping, Sequence, Union
|
||||
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.data.structures.genomics import NamedString
|
||||
from automlst.engine.data.structures.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
|
||||
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
|
||||
|
||||
class BigSDBMLSTProfiler(AbstractAsyncContextManager):
|
||||
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
||||
|
||||
def __init__(self, database_api: str, database_name: str, schema_id: int):
|
||||
self._database_name = database_name
|
||||
@@ -18,56 +20,85 @@ class BigSDBMLSTProfiler(AbstractAsyncContextManager):
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
|
||||
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
|
||||
async def fetch_mlst_allele_variants(self, sequence_string: str, exact: bool) -> AsyncGenerator[Allele, Any]:
|
||||
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
|
||||
uri_path = "sequence"
|
||||
response = await self._http_client.post(uri_path, json={
|
||||
"sequence": sequence_string
|
||||
"sequence": sequence_string,
|
||||
"partial_matches": not exact
|
||||
})
|
||||
sequence_response: dict = await response.json()
|
||||
if "exact_matches" not in sequence_response:
|
||||
# TODO throw exception for not finding matches.
|
||||
pass
|
||||
|
||||
if "exact_matches" not in sequence_response:
|
||||
raise ValueError(f"Unable to find exact matches in \"{self._database_name}\" under schema ID \"{self._schema_id}\".")
|
||||
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
|
||||
for allele_loci, alleles in exact_matches.items():
|
||||
for allele in alleles:
|
||||
alelle_id = allele["allele_id"]
|
||||
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id)
|
||||
if "exact_matches" in sequence_response:
|
||||
# loci -> list of alleles with id and loci
|
||||
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
|
||||
for allele_loci, alleles in exact_matches.items():
|
||||
for allele in alleles:
|
||||
alelle_id = allele["allele_id"]
|
||||
yield Allele(allele_loci=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
|
||||
elif "partial_matches" in sequence_response:
|
||||
if exact:
|
||||
raise NoBIGSdbExactMatchesException(self._database_name, self._schema_id)
|
||||
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
|
||||
for allele_loci, partial_match in partial_matches.items():
|
||||
if len(partial_match) <= 0:
|
||||
continue
|
||||
partial_match_profile = PartialAllelicMatchProfile(
|
||||
percent_identity=float(partial_match["identity"]),
|
||||
mismatches=int(partial_match["mismatches"]),
|
||||
bitscore=float(partial_match["bitscore"]),
|
||||
gaps=int(partial_match["gaps"])
|
||||
)
|
||||
yield Allele(
|
||||
allele_loci=allele_loci,
|
||||
allele_variant=str(partial_match["allele"]),
|
||||
partial_match_profile=partial_match_profile
|
||||
)
|
||||
else:
|
||||
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
|
||||
|
||||
async def fetch_mlst_st(self, alleles: AsyncIterable[Allele]) -> MLSTProfile:
|
||||
|
||||
|
||||
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
|
||||
uri_path = "designations"
|
||||
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
|
||||
async for allele in alleles:
|
||||
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
|
||||
|
||||
if isinstance(alleles, AsyncIterable):
|
||||
async for allele in alleles:
|
||||
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
|
||||
else:
|
||||
for allele in alleles:
|
||||
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
|
||||
request_json = {
|
||||
"designations": allele_request_dict
|
||||
}
|
||||
async with self._http_client.post(uri_path, json=request_json) as response:
|
||||
response_json = await response.json()
|
||||
if "fields" not in response_json:
|
||||
# TODO raise exception about invalid parameters or no exact parameterization found
|
||||
pass
|
||||
schema_fields_returned = response_json["fields"]
|
||||
schema_exact_matches: dict = response_json["exact_matches"]
|
||||
response_json: dict = await response.json()
|
||||
allele_map: dict[str, list[Allele]] = defaultdict(list)
|
||||
response_json.setdefault("fields", dict())
|
||||
schema_fields_returned: dict[str, str] = response_json["fields"]
|
||||
schema_fields_returned.setdefault("ST", "unknown")
|
||||
schema_fields_returned.setdefault("clonal_complex", "unknown")
|
||||
schema_exact_matches: dict = response_json["exact_matches"]
|
||||
for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
|
||||
for exact_match_allele in exact_match_alleles:
|
||||
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
|
||||
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
|
||||
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"], None))
|
||||
if len(allele_map) == 0:
|
||||
raise ValueError("Passed in no alleles.")
|
||||
return MLSTProfile(dict(allele_map), schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
|
||||
|
||||
async def profile_string(self, string: str) -> MLSTProfile:
|
||||
alleles = self.fetch_mlst_allele_variants(string)
|
||||
async def profile_string(self, string: str, exact: bool = False) -> MLSTProfile:
|
||||
alleles = self.fetch_mlst_allele_variants(string, exact)
|
||||
return await self.fetch_mlst_st(alleles)
|
||||
|
||||
|
||||
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString]) -> AsyncGenerator[tuple[str, MLSTProfile], Any]:
|
||||
async def profile_multiple_strings(self, namedStrings: AsyncIterable[NamedString], exact: bool = False, stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Union[MLSTProfile, None]], Any]:
|
||||
async for named_string in namedStrings:
|
||||
yield (named_string.name, await self.profile_string(named_string.sequence))
|
||||
|
||||
try:
|
||||
yield (named_string.name, await self.profile_string(named_string.sequence, exact))
|
||||
except NoBIGSdbMatchesException as e:
|
||||
if stop_on_fail:
|
||||
raise e
|
||||
yield (named_string.name, None)
|
||||
|
||||
async def close(self):
|
||||
await self._http_client.close()
|
||||
@@ -107,7 +138,7 @@ class BIGSdbIndex(AbstractAsyncContextManager):
|
||||
async def get_bigsdb_api_from_seqdefdb(self, seqdef_db_name: str) -> str:
|
||||
known_databases = await self.get_known_seqdef_dbs()
|
||||
if seqdef_db_name not in known_databases:
|
||||
raise ValueError(f"The database \"{seqdef_db_name}\" could not be found.")
|
||||
raise NoSuchBIGSdbDatabaseException(seqdef_db_name)
|
||||
return known_databases[seqdef_db_name]
|
||||
|
||||
async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
|
||||
@@ -124,8 +155,8 @@ class BIGSdbIndex(AbstractAsyncContextManager):
|
||||
self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
|
||||
return self._seqdefdb_schemas[seqdef_db_name] # type: ignore
|
||||
|
||||
async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BigSDBMLSTProfiler:
|
||||
return BigSDBMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
|
||||
async def build_profiler_from_seqdefdb(self, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
|
||||
return BIGSdbMLSTProfiler(await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
|
||||
|
||||
async def close(self):
|
||||
await self._http_client.close()
|
src/automlst/engine/data/structures/__init__.py (new file, 0 lines)
src/automlst/engine/data/structures/mlst.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from dataclasses import dataclass
from typing import Mapping, Sequence, Union

@dataclass(frozen=True)
class PartialAllelicMatchProfile:
    percent_identity: float
    mismatches: int
    bitscore: float
    gaps: int

@dataclass(frozen=True)
class Allele:
    allele_loci: str
    allele_variant: str
    partial_match_profile: Union[None, PartialAllelicMatchProfile]

@dataclass(frozen=True)
class MLSTProfile:
    alleles: Mapping[str, Sequence[Allele]]
    sequence_type: str
    clonal_complex: str
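For orientation, a minimal sketch of how these new dataclasses fit together; the field names come from the file above, the allele IDs mirror the dummy alleles in the tests added below, and the numeric partial-match statistics are placeholders.

```python
from automlst.engine.data.structures.mlst import (
    Allele,
    MLSTProfile,
    PartialAllelicMatchProfile,
)

# An exact hit carries no partial-match metadata.
adk = Allele(allele_loci="adk", allele_variant="1", partial_match_profile=None)

# A near-match carries the alignment statistics captured by the new dataclass
# (placeholder values shown here).
fum_c = Allele(
    allele_loci="fumC",
    allele_variant="2",
    partial_match_profile=PartialAllelicMatchProfile(
        percent_identity=99.5, mismatches=2, bitscore=850.0, gaps=0
    ),
)

profile = MLSTProfile(
    alleles={"adk": [adk], "fumC": [fum_c]},
    sequence_type="1",
    clonal_complex="ST-2 complex",
)
```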
src/automlst/engine/exceptions/database.py (new file, 21 lines)
@@ -0,0 +1,21 @@
from typing import Union

class BIGSDbDatabaseAPIException(Exception):
    pass


class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
    def __init__(self, database_name: str, database_schema_id: int, *args):
        super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)

class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException):
    def __init__(self, database_name: str, database_schema_id: int, *args):
        super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)

class NoSuchBIGSdbDatabaseException(BIGSDbDatabaseAPIException):
    def __init__(self, database_name: str, *args):
        super().__init__(f"No database \"{database_name}\" found.", *args)

class NoSuchBigSdbSchemaException(BIGSDbDatabaseAPIException):
    def __init__(self, database_name: str, database_schema_id: int, *args):
        super().__init__(f"No schema with ID {database_schema_id} in \"{database_name}\" found.", *args)
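A minimal sketch of how this exception hierarchy is consumed, mirroring the try/except that `profile_multiple_strings` gains in bigsdb.py in this comparison; the helper name here is illustrative only.

```python
from automlst.engine.exceptions.database import NoBIGSdbMatchesException


async def profile_or_none(profiler, named_string, exact: bool = False):
    """Return an MLST profile, or None when the database reports no matches."""
    try:
        return await profiler.profile_string(named_string.sequence, exact)
    except NoBIGSdbMatchesException:
        # NoBIGSdbExactMatchesException subclasses this, so exact-mode failures
        # are caught here as well.
        return None
```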
@@ -1,126 +0,0 @@
|
||||
import asyncio
|
||||
from numbers import Number
|
||||
from os import path
|
||||
from typing import Any, AsyncGenerator, Collection, Iterable, Sequence, Union
|
||||
from automlst.engine.data.genomics import NamedString, SangerTraceData
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
from Bio import SeqIO, Align
|
||||
|
||||
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
|
||||
|
||||
|
||||
def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
|
||||
with open(seq_path, "rb") as seq_handle:
|
||||
return SeqIO.read(seq_handle, "abi")
|
||||
|
||||
|
||||
async def read_abif(seq_path: str) -> SangerTraceData:
|
||||
ext = path.splitext(seq_path)[1]
|
||||
if ext.lower() != ".ab1" and ext.lower() != "abi":
|
||||
raise ValueError(
|
||||
'seq_path must have file extension of "ab1", or "abi".')
|
||||
biopython_seq = await asyncio.to_thread(_biopython_read_abif_sequence, seq_path)
|
||||
biopython_annotations = biopython_seq.annotations
|
||||
|
||||
# Lot of type ignoring since Biopython did not define their typing.
|
||||
biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore
|
||||
trace_data = SangerTraceData(
|
||||
path.basename(seq_path),
|
||||
biopython_seq.seq,
|
||||
biopython_abif_raw.get("APFN2"), # type: ignore
|
||||
biopython_abif_raw.get("APrN1"), # type: ignore
|
||||
biopython_abif_raw.get("APrV1"), # type: ignore
|
||||
biopython_abif_raw.get("APrX1"), # type: ignore
|
||||
biopython_abif_raw.get("APXV1"), # type: ignore
|
||||
biopython_abif_raw.get("CMNT1"), # type: ignore
|
||||
biopython_abif_raw.get("CpEP1"), # type: ignore
|
||||
biopython_abif_raw.get("CTID1"), # type: ignore
|
||||
biopython_abif_raw.get("CTNM1"), # type: ignore
|
||||
biopython_abif_raw.get("CTTL1"), # type: ignore
|
||||
biopython_abif_raw.get("DATA1"), # type: ignore
|
||||
biopython_abif_raw.get("DATA2"), # type: ignore
|
||||
biopython_abif_raw.get("DATA3"), # type: ignore
|
||||
biopython_abif_raw.get("DATA4"), # type: ignore
|
||||
biopython_abif_raw.get("DATA5"), # type: ignore
|
||||
biopython_abif_raw.get("DATA6"), # type: ignore
|
||||
biopython_abif_raw.get("DATA7"), # type: ignore
|
||||
biopython_abif_raw.get("DATA8"), # type: ignore
|
||||
biopython_abif_raw.get("DSam1"), # type: ignore
|
||||
biopython_abif_raw.get("DyeN1"), # type: ignore
|
||||
biopython_abif_raw.get("DyeN2"), # type: ignore
|
||||
biopython_abif_raw.get("DyeN3"), # type: ignore
|
||||
biopython_abif_raw.get("DyeN4"), # type: ignore
|
||||
biopython_abif_raw.get("DyeW1"), # type: ignore
|
||||
biopython_abif_raw.get("DyeW2"), # type: ignore
|
||||
biopython_abif_raw.get("DyeW3"), # type: ignore
|
||||
biopython_abif_raw.get("DyeW4"), # type: ignore
|
||||
biopython_abif_raw.get("DySN1"), # type: ignore
|
||||
biopython_abif_raw.get("EPVt1"), # type: ignore
|
||||
biopython_abif_raw.get("EVNT1"), # type: ignore
|
||||
biopython_abif_raw.get("EVNT2"), # type: ignore
|
||||
biopython_abif_raw.get("EVNT3"), # type: ignore
|
||||
biopython_abif_raw.get("EVNT4"), # type: ignore
|
||||
biopython_abif_raw.get("FWO_1"), # type: ignore
|
||||
biopython_abif_raw.get("GTyp1"), # type: ignore
|
||||
biopython_abif_raw.get("InSc1"), # type: ignore
|
||||
biopython_abif_raw.get("InVt1"), # type: ignore
|
||||
biopython_abif_raw.get("LANE1"), # type: ignore
|
||||
biopython_abif_raw.get("LIMS1"), # type: ignore
|
||||
biopython_abif_raw.get("LNTD1"), # type: ignore
|
||||
biopython_abif_raw.get("LsrP1"), # type: ignore
|
||||
biopython_abif_raw.get("MCHN1"), # type: ignore
|
||||
biopython_abif_raw.get("MODF1"), # type: ignore
|
||||
biopython_abif_raw.get("MODL1"), # type: ignore
|
||||
biopython_abif_raw.get("NAVG1"), # type: ignore
|
||||
biopython_abif_raw.get("NLNE1"), # type: ignore
|
||||
biopython_abif_raw.get("OfSc1"), # type: ignore
|
||||
biopython_abif_raw.get("PDMF1"), # type: ignore
|
||||
biopython_abif_raw.get("PXLB1"), # type: ignore
|
||||
biopython_abif_raw.get("RGCm1"), # type: ignore
|
||||
biopython_abif_raw.get("RGNm1"), # type: ignore
|
||||
biopython_abif_raw.get("RMdV1"), # type: ignore
|
||||
biopython_abif_raw.get("RMdX1"), # type: ignore
|
||||
biopython_abif_raw.get("RMXV1"), # type: ignore
|
||||
biopython_abif_raw.get("RPrN1"), # type: ignore
|
||||
biopython_abif_raw.get("RPrV1"), # type: ignore
|
||||
biopython_abif_raw.get("RUND1"), # type: ignore
|
||||
biopython_abif_raw.get("RUND2"), # type: ignore
|
||||
biopython_abif_raw.get("RUND3"), # type: ignore
|
||||
biopython_abif_raw.get("RUND4"), # type: ignore
|
||||
biopython_abif_raw.get("RunN1"), # type: ignore
|
||||
biopython_abif_raw.get("RUNT1"), # type: ignore
|
||||
biopython_abif_raw.get("RUNT2"), # type: ignore
|
||||
biopython_abif_raw.get("RUNT3"), # type: ignore
|
||||
biopython_abif_raw.get("RUNT4"), # type: ignore
|
||||
biopython_abif_raw.get("Satd"), # type: ignore
|
||||
biopython_abif_raw.get("Scal1"), # type: ignore
|
||||
biopython_abif_raw.get("SCAN1"), # type: ignore
|
||||
biopython_abif_raw.get("SMED1"), # type: ignore
|
||||
biopython_abif_raw.get("SMLt"), # type: ignore
|
||||
biopython_abif_raw.get("SMPL1"), # type: ignore
|
||||
biopython_abif_raw.get("SVER1"), # type: ignore
|
||||
biopython_abif_raw.get("SVER3"), # type: ignore
|
||||
biopython_abif_raw.get("Tmpr1"), # type: ignore
|
||||
biopython_abif_raw.get("TUBE"), # type: ignore
|
||||
biopython_abif_raw.get("User") # type: ignore
|
||||
)
|
||||
return trace_data
|
||||
|
||||
|
||||
def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedString) -> tuple[NamedString, NamedString]:
|
||||
aligner = Align.PairwiseAligner(scoring="blastn")
|
||||
aligner.mode = "local"
|
||||
alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[
|
||||
0] # take the best alignment
|
||||
# TODO actually assemble the consensus sequence here
|
||||
raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
|
||||
|
||||
|
||||
async def reference_consensus_assembly(reference: Union[NamedString, str], sanger_traces: Iterable[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
|
||||
if isinstance(reference, str):
|
||||
reference_seq = NamedString(name=reference, sequence=(await fetch_ncbi_genbank(reference)).sequence)
|
||||
else:
|
||||
reference_seq: NamedString = reference
|
||||
for sanger_trace in sanger_traces:
|
||||
yield NamedString("NA", "NA")
|
||||
raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
|
@@ -1,27 +0,0 @@
import asyncio
from Bio import Entrez
from Bio import SeqIO

# TODO Change this out for a more professional approach
Entrez.email = "yunyangdeng@outlook.com"

from automlst.engine.data.genomics import AnnotatedString, StringAnnotation


async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
record = SeqIO.read(fetch_stream, "genbank")
sequence_features = list()
for feature in record.features:
start = int(feature.location.start)
end = int(feature.location.end)
qualifiers = feature.qualifiers
for qualifier_key in qualifiers:
qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
sequence_features.append(StringAnnotation(
type=feature.type,
start=start,
end=end+1, # Position is exclusive
feature_properties=qualifiers
))
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
@@ -1,4 +1,4 @@
from automlst.engine.local.fasta import read_fasta
from automlst.engine.data.local.fasta import read_fasta


async def test_fasta_reader_not_none():
tests/automlst/engine/data/remote/databases/test_bigsdb.py (new file, 244 lines)
@@ -0,0 +1,244 @@
|
||||
import random
|
||||
import re
|
||||
from typing import Collection, Sequence, Union
|
||||
from Bio import SeqIO
|
||||
import pytest
|
||||
from automlst.engine.data.structures.genomics import NamedString
|
||||
from automlst.engine.data.structures.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
|
||||
from automlst.engine.data.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
|
||||
|
||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
|
||||
rand = random.Random(gene)
|
||||
if isinstance(mutation_site_count, float):
|
||||
mutation_site_count = int(mutation_site_count * len(gene))
|
||||
random_locations = rand.choices(range(len(gene)), k=mutation_site_count)
|
||||
scrambled = list(gene)
|
||||
for random_location in random_locations:
|
||||
scrambled[random_location] = rand.choice(alphabet)
|
||||
return "".join(scrambled)
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
|
||||
async for exact_match in dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True):
|
||||
assert isinstance(exact_match, Allele)
|
||||
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
|
||||
targets_left.remove(exact_match.allele_loci)
|
||||
|
||||
assert len(targets_left) == 0
|
||||
|
||||
async def test_institutpasteur_sequence_profiling_non_exact_returns_non_exact():
|
||||
sequences = list(SeqIO.parse("tests/resources/tohama_I_bpertussis_coding.fasta", "fasta"))
|
||||
mlst_targets = {"adk", "fumc", "glya", "tyrb", "icd", "pepa", "pgm"}
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as profiler:
|
||||
for sequence in sequences:
|
||||
match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", sequence.description)
|
||||
if match is None:
|
||||
continue
|
||||
gene = match.group(1)
|
||||
if gene.lower() not in mlst_targets:
|
||||
continue
|
||||
scrambled = gene_scrambler(str(sequence.seq), 0.125)
|
||||
async for partial_match in profiler.fetch_mlst_allele_variants(scrambled, False):
|
||||
assert partial_match.partial_match_profile is not None
|
||||
mlst_targets.remove(gene.lower())
|
||||
|
||||
assert len(mlst_targets) == 0
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
|
||||
async def dummy_allele_generator():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1", None),
|
||||
Allele("fumC", "1", None),
|
||||
Allele("glyA", "1", None),
|
||||
Allele("tyrB", "1", None),
|
||||
Allele("icd", "1", None),
|
||||
Allele("pepA", "1", None),
|
||||
Allele("pgm", "1", None),
|
||||
]
|
||||
for dummy_allele in dummy_alleles:
|
||||
yield dummy_allele
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
|
||||
assert mlst_st_data is not None
|
||||
assert isinstance(mlst_st_data, MLSTProfile)
|
||||
assert mlst_st_data.clonal_complex == "ST-2 complex"
|
||||
assert mlst_st_data.sequence_type == "1"
|
||||
|
||||
async def test_institutpasteur_profiling_non_exact_results_in_list_of_mlsts():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1", None),
|
||||
Allele("fumC", "2", None),
|
||||
Allele("glyA", "36", None),
|
||||
Allele("tyrB", "4", None),
|
||||
Allele("icd", "4", None),
|
||||
Allele("pepA", "1", None),
|
||||
Allele("pgm", "5", None),
|
||||
]
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
mlst_profile = await dummy_profiler.fetch_mlst_st(dummy_alleles)
|
||||
assert mlst_profile.clonal_complex == "unknown"
|
||||
assert mlst_profile.sequence_type == "unknown"
|
||||
|
||||
|
||||
async def test_institutpasteur_sequence_profiling_is_correct():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
profile = await dummy_profiler.profile_string(sequence)
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
|
||||
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
|
||||
dummy_alleles = {
|
||||
Allele("adk", "1", None),
|
||||
Allele("atpG", "1", None),
|
||||
Allele("frdB", "1", None),
|
||||
Allele("fucK", "1", None),
|
||||
Allele("mdh", "1", None),
|
||||
Allele("pgi", "1", None),
|
||||
Allele("recA", "5", None),
|
||||
}
|
||||
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence, exact=True)
|
||||
async for exact_match in exact_matches:
|
||||
assert isinstance(exact_match, Allele)
|
||||
dummy_alleles.remove(exact_match)
|
||||
|
||||
assert len(dummy_alleles) == 0
|
||||
|
||||
async def test_pubmlst_profiling_results_in_correct_st():
|
||||
async def generate_dummy_targets():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1", None),
|
||||
Allele("atpG", "1", None),
|
||||
Allele("frdB", "1", None),
|
||||
Allele("fucK", "1", None),
|
||||
Allele("mdh", "1", None),
|
||||
Allele("pgi", "1", None),
|
||||
Allele("recA", "5", None),
|
||||
]
|
||||
for dummy_allele in dummy_alleles:
|
||||
yield dummy_allele
|
||||
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
|
||||
assert mlst_st_data is not None
|
||||
assert isinstance(mlst_st_data, MLSTProfile)
|
||||
assert mlst_st_data.clonal_complex == "ST-3 complex"
|
||||
assert mlst_st_data.sequence_type == "3"
|
||||
|
||||
async def test_pubmlst_sequence_profiling_is_correct():
|
||||
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||
async with BIGSdbMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
profile = await dummy_profiler.profile_string(sequence)
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-3 complex"
|
||||
assert profile.sequence_type == "3"
|
||||
|
||||
async def test_bigsdb_index_all_databases_is_not_empty():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert len(await bigsdb_index.get_known_seqdef_dbs()) > 0
|
||||
|
||||
async def test_bigsdb_index_references_pubmlst_correctly():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")) == "https://rest.pubmlst.org"
|
||||
|
||||
async def test_bigsdb_index_references_institutpasteur_correctly():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api"
|
||||
|
||||
|
||||
async def test_bigsdb_index_instantiates_correct_profiler():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
|
||||
profile = await profiler.profile_string(sequence)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_same_string_twice():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
dummy_sequences = [NamedString("seq1", sequence), NamedString("seq2", sequence)]
|
||||
async def generate_async_iterable_sequences():
|
||||
for dummy_sequence in dummy_sequences:
|
||||
yield dummy_sequence
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences()):
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop():
|
||||
valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
|
||||
async def generate_async_iterable_sequences():
|
||||
for dummy_sequence in dummy_sequences:
|
||||
yield dummy_sequence
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), True):
|
||||
if name == "should_fail":
|
||||
assert profile is None
|
||||
else:
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop():
|
||||
valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", gene_scrambler(valid_seq, 0.3)), NamedString("seq3", valid_seq)]
|
||||
async def generate_async_iterable_sequences():
|
||||
for dummy_sequence in dummy_sequences:
|
||||
yield dummy_sequence
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), False):
|
||||
if name == "should_fail":
|
||||
assert profile is not None
|
||||
assert profile.clonal_complex == "unknown"
|
||||
assert profile.sequence_type == "unknown"
|
||||
assert len(profile.alleles) > 0
|
||||
else:
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_profile_multiple_strings_fail_second_stop():
|
||||
valid_seq = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
invalid_seq = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||
dummy_sequences = [NamedString("seq1", valid_seq), NamedString("should_fail", invalid_seq), NamedString("seq3", valid_seq)]
|
||||
async def generate_async_iterable_sequences():
|
||||
for dummy_sequence in dummy_sequences:
|
||||
yield dummy_sequence
|
||||
async with BIGSdbMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
with pytest.raises(NoBIGSdbMatchesException):
|
||||
async for name, profile in dummy_profiler.profile_multiple_strings(generate_async_iterable_sequences(), exact=True, stop_on_fail=True):
|
||||
if name == "should_fail":
|
||||
pytest.fail("Exception should have been thrown, no exception was thrown.")
|
||||
else:
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
async def test_bigsdb_index_get_schemas_for_bordetella():
|
||||
async with BIGSdbIndex() as index:
|
||||
schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
|
||||
assert len(schemas.keys()) > 0
|
||||
assert "MLST" in schemas
|
||||
assert isinstance(schemas["MLST"], int)
|
||||
|
||||
async def test_bigsdb_index_get_databases_has_only_seqdef():
|
||||
async with BIGSdbIndex() as index:
|
||||
databases = await index.get_known_seqdef_dbs()
|
||||
assert len(databases.keys()) > 0
|
||||
for database_name in databases.keys():
|
||||
assert database_name.endswith("seqdef")
|
||||
assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
|
@@ -1,12 +0,0 @@
import os

from automlst.engine.local.abif import read_abif, reference_consensus_assembly

async def test_load_sanger_sequence_has_data():
    assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1")
    result_data = await read_abif("tests/resources/1I1_F_P1815443_047.ab1")
    assert result_data is not None

async def test_consensus_assembly_with_ncbi():
    consensus = reference_consensus_assembly("ON685494.1", [await read_abif("tests/resources/1I1_F_P1815443_047.ab1"), await read_abif("tests/resources/1I1_R_P1815443_094.ab1")])
    # TODO complete implementing this
@@ -1,5 +0,0 @@
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank


async def test_fetch_ncbi_genbank_with_id_works():
    assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0
@@ -1,115 +0,0 @@
|
||||
from Bio import SeqIO
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BigSDBMLSTProfiler
|
||||
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_exact_matches_when_exact():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
||||
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
|
||||
async for exact_match in exact_matches:
|
||||
assert isinstance(exact_match, Allele)
|
||||
assert exact_match.allele_variant == '1' # All of Tohama I has allele id I
|
||||
targets_left.remove(exact_match.allele_loci)
|
||||
|
||||
assert len(targets_left) == 0
|
||||
|
||||
async def test_institutpasteur_profiling_results_in_correct_mlst_st():
|
||||
async def dummy_allele_generator():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1"),
|
||||
Allele("fumC", "1"),
|
||||
Allele("glyA", "1"),
|
||||
Allele("tyrB", "1"),
|
||||
Allele("icd", "1"),
|
||||
Allele("pepA", "1"),
|
||||
Allele("pgm", "1"),
|
||||
]
|
||||
for dummy_allele in dummy_alleles:
|
||||
yield dummy_allele
|
||||
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_allele_generator())
|
||||
assert mlst_st_data is not None
|
||||
assert isinstance(mlst_st_data, MLSTProfile)
|
||||
assert mlst_st_data.clonal_complex == "ST-2 complex"
|
||||
assert mlst_st_data.sequence_type == "1"
|
||||
|
||||
async def test_institutpasteur_sequence_profiling_is_correct():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BigSDBMLSTProfiler(database_api="https://bigsdb.pasteur.fr/api", database_name="pubmlst_bordetella_seqdef", schema_id=3) as dummy_profiler:
|
||||
profile = await dummy_profiler.profile_string(sequence)
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
||||
|
||||
|
||||
async def test_pubmlst_profiling_results_in_exact_matches_when_exact():
|
||||
dummy_alleles = {
|
||||
Allele("adk", "1"),
|
||||
Allele("atpG", "1"),
|
||||
Allele("frdB", "1"),
|
||||
Allele("fucK", "1"),
|
||||
Allele("mdh", "1"),
|
||||
Allele("pgi", "1"),
|
||||
Allele("recA", "5"),
|
||||
}
|
||||
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
|
||||
async for exact_match in exact_matches:
|
||||
assert isinstance(exact_match, Allele)
|
||||
dummy_alleles.remove(exact_match)
|
||||
|
||||
assert len(dummy_alleles) == 0
|
||||
|
||||
async def test_pubmlst_profiling_results_in_correct_st():
|
||||
async def generate_dummy_targets():
|
||||
dummy_alleles = [
|
||||
Allele("adk", "1"),
|
||||
Allele("atpG", "1"),
|
||||
Allele("frdB", "1"),
|
||||
Allele("fucK", "1"),
|
||||
Allele("mdh", "1"),
|
||||
Allele("pgi", "1"),
|
||||
Allele("recA", "5"),
|
||||
]
|
||||
for dummy_allele in dummy_alleles:
|
||||
yield dummy_allele
|
||||
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
mlst_st_data = await dummy_profiler.fetch_mlst_st(generate_dummy_targets())
|
||||
assert mlst_st_data is not None
|
||||
assert isinstance(mlst_st_data, MLSTProfile)
|
||||
assert mlst_st_data.clonal_complex == "ST-3 complex"
|
||||
assert mlst_st_data.sequence_type == "3"
|
||||
|
||||
async def test_pubmlst_sequence_profiling_is_correct():
|
||||
sequence = str(SeqIO.read("tests/resources/FDAARGOS_1560.fasta", "fasta").seq)
|
||||
async with BigSDBMLSTProfiler(database_api="https://rest.pubmlst.org/", database_name="pubmlst_hinfluenzae_seqdef", schema_id=1) as dummy_profiler:
|
||||
profile = await dummy_profiler.profile_string(sequence)
|
||||
assert profile is not None
|
||||
assert isinstance(profile, MLSTProfile)
|
||||
assert profile.clonal_complex == "ST-3 complex"
|
||||
assert profile.sequence_type == "3"
|
||||
|
||||
async def test_bigsdb_index_all_databases_is_not_empty():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert len(await bigsdb_index.get_known_seqdef_dbs()) > 0
|
||||
|
||||
async def test_bigsdb_index_references_pubmlst_correctly():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_hinfluenzae_seqdef")) == "https://rest.pubmlst.org"
|
||||
|
||||
async def test_bigsdb_index_references_institutpasteur_correctly():
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api"
|
||||
|
||||
|
||||
async def test_bigsdb_index_instantiates_correct_profiler():
|
||||
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
|
||||
async with BIGSdbIndex() as bigsdb_index:
|
||||
async with await bigsdb_index.build_profiler_from_seqdefdb("pubmlst_bordetella_seqdef", 3) as profiler:
|
||||
profile = await profiler.profile_string(sequence)
|
||||
assert profile.clonal_complex == "ST-2 complex"
|
||||
assert profile.sequence_type == "1"
|
tests/resources/12822.fasta (new file, 68196 lines; diff suppressed because it is too large)
tests/resources/tohama_I_bpertussis_coding.fasta (new file, 59120 lines; diff suppressed because it is too large)