commit 3b2cf916fd3767ce9b850a0cb3e9b02ddf964030 Author: Harrison Deng Date: Fri Jan 10 21:15:59 2025 +0000 Initial commit transfering files over from original automlst.engine project diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..b1ed528 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,32 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/python +{ + "name": "Python 3", + // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile + "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye", + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + "postCreateCommand": "pip3 install --user -r requirements.txt && pip3 install -e .", + "customizations": { + "vscode": { + "extensions": [ + "mechatroner.rainbow-csv" + ] + } + }, + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
+ // "remoteUser": "root" + + "containerEnv": { + "PIP_EXTRA_INDEX_URL": "https://git.reslate.systems/api/packages/ydeng/pypi/simple" + } +} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a31886c --- /dev/null +++ b/.gitignore @@ -0,0 +1,214 @@ +# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig +# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python +# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,linux,python + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python + +# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) + diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..b881eff --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.analysis.autoImportCompletions": true +} \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 
0000000..8de8402 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,41 @@ +pipeline { + agent { + kubernetes { + cloud 'rsys-devel' + defaultContainer 'pip' + inheritFrom 'pip' + } + } + stages { + stage("install") { + steps { + sh 'python -m pip install -r requirements.txt' + } + } + stage("unit tests") { + steps { + sh returnStatus: true, script: "python -m pytest --junitxml=test_results.xml --cov=src --cov-report xml:coverage.xml" + xunit checksName: '', tools: [JUnit(excludesPattern: '', pattern: 'test_results.xml', stopProcessingIfError: true)] + recordCoverage(tools: [[parser: 'COBERTURA', pattern: 'coverage.xml']]) + } + } + stage("build") { + steps { + sh "python -m build" + } + } + stage("archive") { + steps { + archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true + } + } + stage("publish") { + environment { + CREDS = credentials('4d6f64be-d26d-4f95-8de3-b6a9b0beb311') + } + steps { + sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/${CREDS_USR}/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*' + } + } + } +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ddb0cd5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,51 @@ +[build-system] +requires = ["setuptools>=64", "setuptools_scm>=8"] +build-backend = "setuptools.build_meta" + +[project] +name = "automlst.cli" +dynamic = ["version"] + +dependencies = [ + "automlst-engine" +] +requires-python = ">=3.11" +description = "A CLI tool to rapidly fetch MLST profiles given sequences for various diseases." 
+
+[project.scripts]
+automlst = "automlst.cli.program:run"
+
+[tool.setuptools_scm]
+
+[tool.pyright]
+extraPaths = ["src"]
+exclude = [
+    "**/node_modules",
+    "**/__pycache__"
+]
+executionEnvironments = [
+    {root = "src"}
+]
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.pytest.ini_options]
+testpaths = [
+    "tests"
+]
+pythonpath = [
+    "src"
+]
+addopts = [
+    "--import-mode=importlib",
+]
+asyncio_mode = "auto"
+
+[tool.pylint.main]
+source-roots = "src"
+
+[tool.pylint.format]
+# Maximum number of characters on a single line.
+max-line-length = 88
+
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9eb3cfa
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+pytest
+pytest-asyncio
+pytest-cov
+build
+twine
+setuptools_scm
+automlst-engine
\ No newline at end of file
diff --git a/src/automlst/cli/__init__.py b/src/automlst/cli/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/automlst/cli/info.py b/src/automlst/cli/info.py
new file mode 100644
index 0000000..9947f70
--- /dev/null
+++ b/src/automlst/cli/info.py
@@ -0,0 +1,45 @@
+from argparse import ArgumentParser
+import asyncio
+from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
+
+def setup_parser(parser: ArgumentParser):
+    """Configure the ``info`` subcommand parser and bind its entry point."""
+    parser.description = "Fetches the latest BIGSdb MLST database definitions."
+    parser.add_argument(
+        "--retrieve-bigsdbs", "-l",
+        action="store_true",
+        dest="list_dbs",
+        required=False,
+        default=False,
+        help="Lists all known BIGSdb MLST databases (fetched from known APIs and cached)."
+    )
+
+    parser.add_argument(
+        "--retrieve-bigsdb-schemas", "-lschemas",
+        nargs="+",
+        action="extend",
+        dest="list_bigsdb_schemas",
+        required=False,
+        default=[],
+        type=str,
+        help="Lists the known schema IDs for a given BIGSdb sequence definition database name. The name, and then the ID of the schema is given."
+    )
+
+    parser.set_defaults(func=run_asynchronously)
+
+async def run(args):
+    """Print the known seqdef database names and/or the schemas of each requested database."""
+    async with BIGSdbIndex() as bigsdb_index:
+        if args.list_dbs:
+            known_seqdef_dbs = await bigsdb_index.get_known_seqdef_dbs(force=False)
+            print("\n".join(known_seqdef_dbs.keys()))
+
+        for bigsdb_schema_name in args.list_bigsdb_schemas:
+            schemas = await bigsdb_index.get_schemas_for_seqdefdb(bigsdb_schema_name)
+            for schema_desc, schema_id in schemas.items():
+                print(f"{schema_desc}: {schema_id}")
+
+def run_asynchronously(args):
+    """Synchronous entry point for argparse: drive ``run`` with ``asyncio.run``."""
+    asyncio.run(run(args))
+
diff --git a/src/automlst/cli/meta.py b/src/automlst/cli/meta.py
new file mode 100644
index 0000000..d45096d
--- /dev/null
+++ b/src/automlst/cli/meta.py
@@ -0,0 +1,3 @@
+def get_module_base_name(name):
+    """Return the last dot-separated component of a module name."""
+    return name.split(".")[-1]
diff --git a/src/automlst/cli/program.py b/src/automlst/cli/program.py
new file mode 100644
index 0000000..02e1847
--- /dev/null
+++ b/src/automlst/cli/program.py
@@ -0,0 +1,28 @@
+import argparse
+import asyncio
+import datetime
+from os import path
+import os
+
+from automlst.cli import info, st
+from automlst.cli.meta import get_module_base_name
+from automlst.engine.data.genomics import NamedString
+from automlst.engine.local.abif import read_abif
+from automlst.engine.local.csv import write_mlst_profiles_as_csv
+from automlst.engine.local.fasta import read_fasta
+from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
+
+root_parser = argparse.ArgumentParser()
+subparsers = root_parser.add_subparsers(required=True)
+
+info.setup_parser(subparsers.add_parser(get_module_base_name(info.__name__)))
+st.setup_parser(subparsers.add_parser(get_module_base_name(st.__name__)))
+
+
+def run():
+    """Parse the command line and call the handler bound by the chosen subcommand."""
+    args = root_parser.parse_args()
+    args.func(args)
+
+if __name__ == "__main__":
+    run()
\ No newline at end of file
diff --git a/src/automlst/cli/st.py b/src/automlst/cli/st.py
new file mode 100644
index 0000000..d8670c6
--- /dev/null
+++ b/src/automlst/cli/st.py
@@ -0,0 +1,78 @@
+
+from argparse import ArgumentParser
+import asyncio
+import datetime
+from automlst.engine.local.csv import write_mlst_profiles_as_csv
+from automlst.engine.local.fasta import read_multiple_fastas
+from automlst.engine.remote.databases.bigsdb import BIGSdbIndex
+
+
+def setup_parser(parser: ArgumentParser):
+    """Configure the ``st`` subcommand parser and bind its entry point."""
+    parser.description = "Returns MLST exact profile matches."
+    parser.add_argument(
+        "fastas",
+        nargs="+",
+        action='extend',
+        default=[],
+        type=str,
+        help="The FASTA files to process. Multiple can be listed."
+    )
+
+    parser.add_argument(
+        "seqdefdb",
+        help="The BIGSdb seqdef database to use for typing."
+    )
+
+    parser.add_argument(
+        "schema",
+        type=int,
+        help="The BIGSdb seqdef database schema ID (integer) to use for typing."
+    )
+
+    parser.add_argument(
+        "out",
+        # NOTE(review): argparse only applies `default` to a positional when nargs is
+        # "?" or "*"; as declared, "out" is required and this timestamp is never used.
+        default=f'./{datetime.datetime.now().strftime(r"%Y%m%d%H%M%S")}',
+        help="The output CSV name (.csv will be appended)."
+    )
+
+    parser.add_argument(
+        "--exact", "-ex",
+        action="store_true",
+        dest="exact",
+        required=False,
+        default=False,
+        help="Should run exact matching rather than returning all similar ones"
+    )
+
+    # NOTE(review): parsed here, but run() below never reads args.stop_on_fail.
+    parser.add_argument(
+        "--stop-on-fail", "-sof",
+        action="store_true",
+        dest="stop_on_fail",
+        required=False,
+        default=False,
+        help="Should the algorithm stop in the case there are no matches (or partial matches when expecting exact matches)."
+    )
+    parser.set_defaults(func=run_asynchronously)
+
+async def run(args):
+    """Profile the given FASTA files against a BIGSdb schema and write the profiles as CSV."""
+    async with BIGSdbIndex() as bigsdb_index:
+        gen_strings = read_multiple_fastas(args.fastas)
+        async with await bigsdb_index.build_profiler_from_seqdefdb(args.seqdefdb, args.schema) as mlst_profiler:
+            mlst_profiles = mlst_profiler.profile_multiple_strings(gen_strings, exact=args.exact)
+            failed = await write_mlst_profiles_as_csv(mlst_profiles, args.out)
+            if len(failed) > 0:
+                # Join outside the f-string: nested double quotes and a backslash in a
+                # replacement field only parse on Python 3.12+, but 3.11 is supported.
+                failed_ids = "\n".join(failed)
+                print(f"A total of {len(failed)} IDs failed:\n{failed_ids}")
+            print(f"Completed fetching MLSTs for {len(args.fastas)} sequences.")
+
+def run_asynchronously(args):
+    """Synchronous entry point for argparse: drive ``run`` with ``asyncio.run``."""
+    asyncio.run(run(args))
+