From b8cebb8ba43ccab5ad5d62f9ef1c7cb5ceb2bf42 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Wed, 19 Feb 2025 15:49:46 +0000 Subject: [PATCH 01/10] Infrastructure for concurrent processing implemented --- src/autobigs/engine/analysis/bigsdb.py | 68 +++++++++++++--------- src/autobigs/engine/exceptions/database.py | 6 +- src/autobigs/engine/reading.py | 5 +- 3 files changed, 50 insertions(+), 29 deletions(-) diff --git a/src/autobigs/engine/analysis/bigsdb.py b/src/autobigs/engine/analysis/bigsdb.py index 0b52ce9..d9e11e9 100644 --- a/src/autobigs/engine/analysis/bigsdb.py +++ b/src/autobigs/engine/analysis/bigsdb.py @@ -22,15 +22,15 @@ from Bio.Align import PairwiseAligner class BIGSdbMLSTProfiler(AbstractAsyncContextManager): @abstractmethod - def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]: + def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[Union[NamedString, str]], Union[NamedString, str]]) -> AsyncGenerator[Union[Allele, tuple[str, Allele]], Any]: pass @abstractmethod - async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile: + async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]: pass @abstractmethod - async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile: + async def profile_string(self, query_sequence_strings: Iterable[Union[NamedString, str]]) -> Union[NamedMLSTProfile, MLSTProfile]: pass @abstractmethod @@ -52,14 +52,14 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): async def __aenter__(self): return self - async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[str], str]) -> AsyncGenerator[Allele, Any]: + async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[Union[NamedString, str]], Union[NamedString, str]]) -> AsyncGenerator[Union[Allele, tuple[str, Allele]], Any]: # See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes uri_path = "sequence" - if isinstance(query_sequence_strings, str): + if isinstance(query_sequence_strings, str) or isinstance(query_sequence_strings, NamedString): query_sequence_strings = [query_sequence_strings] for sequence_string in query_sequence_strings: async with self._http_client.post(uri_path, json={ - "sequence": sequence_string, + "sequence": sequence_string if isinstance(sequence_string, str) else sequence_string.sequence, "partial_matches": True }) as response: sequence_response: dict = await response.json() @@ -70,7 +70,8 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): for allele_loci, alleles in exact_matches.items(): for allele in alleles: alelle_id = allele["allele_id"] - yield Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None) + result_allele = Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None) + yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele) elif "partial_matches" in sequence_response: partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"] for allele_loci, partial_match in partial_matches.items(): @@ -82,23 +83,33 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): gaps=int(partial_match["gaps"]), match_metric=int(partial_match["bitscore"]) ) - yield Allele( + result_allele 
= Allele( allele_locus=allele_loci, allele_variant=str(partial_match["allele"]), partial_match_profile=partial_match_profile ) + yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele) else: - raise NoBIGSdbMatchesException(self._database_name, self._schema_id) + raise NoBIGSdbMatchesException(self._database_name, self._schema_id, sequence_string.name if isinstance(sequence_string, NamedString) else None) - async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile: + async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]: uri_path = "designations" allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list) + names_list = [] + def insert_allele_to_request_dict(allele: Union[Allele, tuple[str, Allele]]): + if isinstance(allele, Allele): + allele_val = allele + else: + allele_val = allele[1] + names_list.append(allele[0]) + allele_request_dict[allele_val.allele_locus].append({"allele": str(allele_val.allele_variant)}) + if isinstance(alleles, AsyncIterable): async for allele in alleles: - allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)}) + insert_allele_to_request_dict(allele) else: for allele in alleles: - allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)}) + insert_allele_to_request_dict(allele) request_json = { "designations": allele_request_dict } @@ -111,30 +122,33 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): schema_fields_returned.setdefault("clonal_complex", "unknown") schema_exact_matches: dict = response_json["exact_matches"] for exact_match_locus, exact_match_alleles in schema_exact_matches.items(): - if len(exact_match_alleles) > 1: - raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(exact_match_alleles)})") allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None)) if len(allele_set) == 0: raise ValueError("Passed in no alleles.") - return MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) + result_mlst_profile = MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) + if len(names_list) > 0: + result_mlst_profile = NamedMLSTProfile(str(tuple(names_list)), result_mlst_profile) + return result_mlst_profile - async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile: + async def profile_string(self, query_sequence_strings: Iterable[Union[NamedString, str]]) -> Union[NamedMLSTProfile, MLSTProfile]: alleles = self.determine_mlst_allele_variants(query_sequence_strings) return await self.determine_mlst_st(alleles) async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]: + tasks = [] async for named_strings in query_named_string_groups: - names: list[str] = list() - sequences: list[str] = list() - for named_string in named_strings: - names.append(named_string.name) - sequences.append(named_string.sequence) - try: - yield NamedMLSTProfile("-".join(names), (await self.profile_string(sequences))) - except NoBIGSdbMatchesException as e: - if stop_on_fail: - raise e - yield NamedMLSTProfile("-".join(names), None) + tasks.append(self.profile_string(named_strings)) + for task in 
asyncio.as_completed(tasks): + try: + yield await task + except NoBIGSdbMatchesException as e: + if stop_on_fail: + raise e + causal_name = e.get_causal_query_name() + if causal_name is None: + raise ValueError("Missing query name despite requiring names.") + else: + yield NamedMLSTProfile(causal_name, None) async def close(self): await self._http_client.close() diff --git a/src/autobigs/engine/exceptions/database.py b/src/autobigs/engine/exceptions/database.py index c60190a..10787d2 100644 --- a/src/autobigs/engine/exceptions/database.py +++ b/src/autobigs/engine/exceptions/database.py @@ -5,8 +5,12 @@ class BIGSDbDatabaseAPIException(Exception): class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException): - def __init__(self, database_name: str, database_schema_id: int, *args): + def __init__(self, database_name: str, database_schema_id: int, query_name: Union[None, str], *args): + self._query_name = query_name super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args) + + def get_causal_query_name(self) -> Union[str, None]: + return self._query_name class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException): def __init__(self, database_name: str, database_schema_id: int, *args): diff --git a/src/autobigs/engine/reading.py b/src/autobigs/engine/reading.py index 6618427..9949da4 100644 --- a/src/autobigs/engine/reading.py +++ b/src/autobigs/engine/reading.py @@ -13,5 +13,8 @@ async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString] return results async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]: + tasks = [] for handle in handles: - yield await read_fasta(handle) \ No newline at end of file + tasks.append(read_fasta(handle)) + for task in asyncio.as_completed(tasks): + yield await task \ No newline at end of file From 2822a483e30b5a258cb125c89cb6b0b34e0dabff Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Fri, 21 Feb 2025 05:37:56 +0000 Subject: [PATCH 02/10] Initial attempt at switching to a conda based build environment --- .devcontainer/Dockerfile | 11 +++ .devcontainer/devcontainer.json | 19 ++-- .devcontainer/noop.txt | 3 + .gitignore | 159 ++------------------------------ .vscode/extensions.json | 5 + Jenkinsfile | 10 +- autobigs-engine/meta.yaml | 44 +++++++++ environment.yml | 15 +++ requirements.txt | 8 -- scripts/patch_recipe.py | 103 +++++++++++++++++++++ 10 files changed, 202 insertions(+), 175 deletions(-) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/noop.txt create mode 100644 .vscode/extensions.json create mode 100644 autobigs-engine/meta.yaml create mode 100644 environment.yml delete mode 100644 requirements.txt create mode 100644 scripts/patch_recipe.py diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..2738918 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,11 @@ +FROM mcr.microsoft.com/devcontainers/anaconda:1-3 + +# Copy environment.yml (if found) to a temp location so we update the environment. Also +# copy "noop.txt" so the COPY instruction does not fail if no environment.yml exists. +COPY environment.yml* .devcontainer/noop.txt /tmp/conda-tmp/ +RUN if [ -f "/tmp/conda-tmp/environment.yml" ]; then umask 0002 && /opt/conda/bin/conda env update -n base -f /tmp/conda-tmp/environment.yml; fi \ + && rm -rf /tmp/conda-tmp + +# [Optional] Uncomment this section to install additional OS packages. 
+# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
+#     && apt-get -y install --no-install-recommends
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index c58d0e4..70e91b2 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,9 +1,11 @@
 // For format details, see https://aka.ms/devcontainer.json. For config options, see the
-// README at: https://github.com/devcontainers/templates/tree/main/src/python
+// README at: https://github.com/devcontainers/templates/tree/main/src/anaconda
 {
- "name": "Python 3",
- // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
- "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye",
+ "name": "Anaconda (Python 3)",
+ "build": {
+ "context": "..",
+ "dockerfile": "Dockerfile"
+ }

 // Features to add to the dev container. More info: https://containers.dev/features.
 // "features": {},

 // Use 'forwardPorts' to make a list of ports inside the container available locally.
 // "forwardPorts": [],

 // Use 'postCreateCommand' to run commands after the container is created.
- "postCreateCommand": "pip3 install --user -r requirements.txt",
- "customizations": {
- "vscode": {
- "extensions": [
- "mechatroner.rainbow-csv"
- ]
- }
- }
+ // "postCreateCommand": "python --version",

 // Configure tool-specific properties.
 // "customizations": {},
diff --git a/.devcontainer/noop.txt b/.devcontainer/noop.txt
new file mode 100644
index 0000000..dde8dc3
--- /dev/null
+++ b/.devcontainer/noop.txt
@@ -0,0 +1,3 @@
+This file is copied into the container along with environment.yml* from the parent
+folder. It is included to prevent the Dockerfile COPY instruction from
+failing if no environment.yml is found.
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 2a36fc4..2438221 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 # File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
-# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,svelte,python,linux,node
-# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,svelte,python,linux,node
+# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python
+# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,linux,python

 ### Linux ###
 *~

 # temporary files which can be created if a process still has a handle open of a deleted file
 .fuse_hidden*

 # KDE directory preferences
 .directory

 # Linux trash folder which might appear on any partition or disk
 .Trash-*

 # .nfs files are created when an open file is removed but is still being accessed
 .nfs*

-### Node ###
-# Logs
-logs
-*.log
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
-lerna-debug.log*
-.pnpm-debug.log*
-
-# Diagnostic reports (https://nodejs.org/api/report.html)
-report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
-
-# Runtime data
-pids
-*.pid
-*.seed
-*.pid.lock
-
-# Directory for instrumented libs generated by jscoverage/JSCover
-lib-cov
-
-# Coverage directory used by tools like istanbul
-coverage
-*.lcov
-
-# nyc test coverage
-.nyc_output
-
-# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
-.grunt
-
-# Bower dependency directory (https://bower.io/)
-bower_components
-
-# node-waf configuration
-.lock-wscript
-
-# Compiled binary addons (https://nodejs.org/api/addons.html)
-build/Release
-
-# Dependency directories
-node_modules/
-jspm_packages/
-
-# Snowpack dependency directory (https://snowpack.dev/)
-web_modules/
-
-# TypeScript cache
-*.tsbuildinfo
-
-# Optional npm cache directory
-.npm
-
-# Optional eslint cache
-.eslintcache
-
-# Optional stylelint cache
-.stylelintcache
-
-# Microbundle cache
-.rpt2_cache/ -.rts2_cache_cjs/ -.rts2_cache_es/ -.rts2_cache_umd/ - -# Optional REPL history -.node_repl_history - -# Output of 'npm pack' -*.tgz - -# Yarn Integrity file -.yarn-integrity - -# dotenv environment variable files -.env -.env.development.local -.env.test.local -.env.production.local -.env.local - -# parcel-bundler cache (https://parceljs.org/) -.cache -.parcel-cache - -# Next.js build output -.next -out - -# Nuxt.js build / generate output -.nuxt -dist - -# Gatsby files -.cache/ -# Comment in the public line in if your project uses Gatsby and not Next.js -# https://nextjs.org/blog/next-9-1#public-directory-support -# public - -# vuepress build output -.vuepress/dist - -# vuepress v2.x temp and cache directory -.temp - -# Docusaurus cache and generated files -.docusaurus - -# Serverless directories -.serverless/ - -# FuseBox cache -.fusebox/ - -# DynamoDB Local files -.dynamodb/ - -# TernJS port file -.tern-port - -# Stores VSCode versions used for testing VSCode extensions -.vscode-test - -# yarn v2 -.yarn/cache -.yarn/unplugged -.yarn/build-state.yml -.yarn/install-state.gz -.pnp.* - -### Node Patch ### -# Serverless Webpack directories -.webpack/ - -# Optional stylelint cache - -# SvelteKit build / generate output -.svelte-kit - ### Python ### # Byte-compiled / optimized / DLL files __pycache__/ @@ -202,6 +62,7 @@ htmlcov/ .nox/ .coverage .coverage.* +.cache nosetests.xml coverage.xml *.cover @@ -215,6 +76,7 @@ cover/ *.pot # Django stuff: +*.log local_settings.py db.sqlite3 db.sqlite3-journal @@ -278,6 +140,7 @@ celerybeat.pid *.sage.py # Environments +.env .venv env/ venv/ @@ -326,13 +189,6 @@ poetry.toml # LSP config files pyrightconfig.json -### Svelte ### -# gitignore template for the SvelteKit, frontend web component framework -# website: https://kit.svelte.dev/ - -.svelte-kit/ -package - ### VisualStudioCode ### .vscode/* !.vscode/settings.json @@ -352,9 +208,8 @@ package .history .ionide -# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,svelte,python,linux,node +# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python # Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) -output -*.private.* \ No newline at end of file +conda-bld \ No newline at end of file diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..1bddac4 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,5 @@ +{ + "recommendations": [ + "piotrpalarz.vscode-gitignore-generator" + ] +} \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index fb4051e..3fad1de 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,7 +9,7 @@ pipeline { stages { stage("install") { steps { - sh 'python -m pip install -r requirements.txt' + sh 'conda env update -n base -f environment.yml' } } stage("unit tests") { @@ -22,11 +22,14 @@ pipeline { stage("build") { steps { sh "python -m build" + sh "grayskull pypi dist/*.tar.gz --maintainers 'Harrison Deng'" + sh "python scripts/patch_recipe.py" + sh 'conda build autobigs-engine -c bioconda --output-folder conda-bld --verify' } } stage("archive") { steps { - archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true + archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl, conda-bld/**/*.conda', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true } } stage("publish") { @@ -36,7 +39,8 @@ pipeline { CREDS = 
credentials('username-password-rs-git')
 }
 steps {
- sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
+ sh 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
+ sh 'curl --user ${CREDS_USR}:${CRED_PSW} --upload-file conda-bld/**/*.conda https://git.reslate.systems/api/packages/${CRED_USR}/conda/$(basename conda-bld/**/*.conda)'
 }
 }
 stage ("pypi.org") {
diff --git a/autobigs-engine/meta.yaml b/autobigs-engine/meta.yaml
new file mode 100644
index 0000000..c27ddbc
--- /dev/null
+++ b/autobigs-engine/meta.yaml
@@ -0,0 +1,44 @@
+{% set name = "autoBIGS.engine" %}
+{% set version = "0.12.1.dev1+gb8cebb8.d20250221" %}
+
+package:
+  name: {{ name|lower|replace(".", "-") }}
+  version: {{ version }}
+
+source:
+  url: file:///workspaces/autoBIGS.engine/dist/autobigs_engine-0.12.1.dev1%2Bgb8cebb8.d20250221.tar.gz
+  sha256: c86441b94f935cfa414ff28ca4c026a070e0fb15988ea3bb7d1a942859a09b16
+
+build:
+  noarch: python
+  script: {{ PYTHON }} -m pip install . -vv --no-deps --no-build-isolation
+  number: 0
+  run_exports:
+    - {{ pin_subpackage( name|lower|replace(".", "-"), max_pin="x.x") }}
+requirements:
+  host:
+    - python >=3.12
+    - setuptools >=64
+    - setuptools-scm >=8
+    - pip
+  run:
+    - python >=3.12
+    - biopython ==1.85
+    - aiohttp ==3.11.*
+
+test:
+  imports:
+    - autobigs
+  commands:
+    - pip check
+  requires:
+    - pip
+
+about:
+  summary: A library to rapidly fetch MLST profiles given sequences for various diseases.
+  license: GPL-3.0-or-later
+  license_file: LICENSE
+  home: https://github.com/Syph-and-VPD-Lab/autoBIGS.engine
+extra:
+  recipe-maintainers:
+    - Harrison Deng
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..cb1fa6e
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,15 @@
+name: ci
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  - aiohttp==3.11.*
+  - biopython==1.85
+  - pytest
+  - pytest-asyncio
+  - python-build
+  - conda-build
+  - twine
+  - setuptools_scm
+  - pytest-cov
+  - grayskull
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 0b8be9b..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-aiohttp[speedups]==3.11.*
-biopython==1.85
-pytest
-pytest-asyncio
-build
-twine
-setuptools_scm
-pytest-cov
\ No newline at end of file
diff --git a/scripts/patch_recipe.py b/scripts/patch_recipe.py
new file mode 100644
index 0000000..e31403b
--- /dev/null
+++ b/scripts/patch_recipe.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+import argparse
+from os import fdopen, path
+import os
+import re
+import shutil
+from sys import argv
+import tempfile
+
+INDENTATION = "  "
+GRAYSKULL_OUTPUT_PATH = "autoBIGS.engine"
+RUN_EXPORTED_VALUE = r'{{ pin_subpackage( name|lower|replace(".", "-"), max_pin="x.x") }}'
+LICENSE_SUFFIX = "-or-later"
+HOME_PAGE = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
+
+def _calc_indentation(line: str):
+    return len(re.findall(INDENTATION, line.split(line.strip())[0])) if line != "\n" else 0
+
+def read_grayskull_output():
+    original_recipe = path.abspath(GRAYSKULL_OUTPUT_PATH)
+    original_meta = path.join(original_recipe, "meta.yaml")
+    meta_file = open(original_meta)
+    lines = meta_file.readlines()
+    meta_file.close()
+    return lines
+
+def update_naming_scheme(lines):
+ modified_lines = [] + for line in lines: + matches = re.finditer(r"\{\{\s*name\|lower()\s+\}\}", line) + modified_line = line + for match in matches: + modified_line = modified_line[:match.start(1)] + r'|replace(".", "-")' + modified_line[match.end(1):] + modified_lines.append(modified_line) + return modified_lines + +def inject_run_exports(lines: list[str]): + package_indent = False + modified_lines = [] + for line in lines: + indentation_count = _calc_indentation(line) + if line == "build:\n" and indentation_count == 0: + package_indent = True + modified_lines.append(line) + elif package_indent and indentation_count == 0: + modified_lines.append(INDENTATION*1 + "run_exports:\n") + modified_lines.append(INDENTATION*2 + "- " + RUN_EXPORTED_VALUE + "\n") + package_indent = False + else: + modified_lines.append(line) + return modified_lines + +def suffix_license(lines: list[str]): + about_indent = False + modified_lines = [] + for line in lines: + indentation_count = _calc_indentation(line) + if line == "about:\n" and indentation_count == 0: + about_indent = True + modified_lines.append(line) + elif about_indent and indentation_count == 1 and line.lstrip().startswith("license:"): + modified_lines.append(line.rstrip() + LICENSE_SUFFIX + "\n") + about_indent = False + else: + modified_lines.append(line) + return modified_lines + +def inject_home_page(lines: list[str]): + about_indent = False + modified_lines = [] + for line in lines: + indentation_count = _calc_indentation(line) + if line == "about:\n" and indentation_count == 0: + about_indent = True + modified_lines.append(line) + elif about_indent and indentation_count == 0: + modified_lines.append(INDENTATION + "home: " + HOME_PAGE + "\n") + about_indent = False + else: + modified_lines.append(line) + return modified_lines + +def write_to_original(lines: list[str]): + original_recipe = path.abspath(GRAYSKULL_OUTPUT_PATH) + original_meta = path.join(original_recipe, "meta.yaml") + with open(original_meta, "w") as file: + file.writelines(lines) + +def rename_recipe_dir(): + new_recipe_name = path.abspath(path.join(GRAYSKULL_OUTPUT_PATH.replace(".", "-").lower())) + shutil.rmtree(new_recipe_name, ignore_errors=True) + os.replace(path.abspath(GRAYSKULL_OUTPUT_PATH), new_recipe_name) + +if __name__ == "__main__": + original_grayskull_out = read_grayskull_output() + modified_recipe_meta = None + modified_recipe_meta = update_naming_scheme(original_grayskull_out) + modified_recipe_meta = inject_run_exports(modified_recipe_meta) + modified_recipe_meta = suffix_license(modified_recipe_meta) + modified_recipe_meta = inject_home_page(modified_recipe_meta) + write_to_original(modified_recipe_meta) + rename_recipe_dir() \ No newline at end of file From 576dc303f492883c21e30d9b84956e917a1e16f7 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Fri, 21 Feb 2025 05:52:34 +0000 Subject: [PATCH 03/10] Changed requested kubernetes container to be miniforge --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 3fad1de..219d79c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2,8 +2,8 @@ pipeline { agent { kubernetes { cloud 'rsys-devel' - defaultContainer 'pip' - inheritFrom 'pip' + defaultContainer 'miniforge3' + inheritFrom 'miniforge' } } stages { From fd536862e28a26de73d71676caf02aa034230ec7 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Fri, 21 Feb 2025 05:53:08 +0000 Subject: [PATCH 04/10] Twine version specified to 6.0.1 to avoid Twine issue 15611 --- environment.yml | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index cb1fa6e..a62d668 100644 --- a/environment.yml +++ b/environment.yml @@ -9,7 +9,7 @@ dependencies: - pytest-asyncio - python-build - conda-build - - twine + - twine==6.0.1 - setuptools_scm - pytest-cov - grayskull \ No newline at end of file From 276665f5fdafd9ab974a36313a613cc84f843a69 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Fri, 21 Feb 2025 06:01:39 +0000 Subject: [PATCH 05/10] Added curl to environment requirements --- environment.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index a62d668..fe2edb2 100644 --- a/environment.yml +++ b/environment.yml @@ -12,4 +12,5 @@ dependencies: - twine==6.0.1 - setuptools_scm - pytest-cov - - grayskull \ No newline at end of file + - grayskull + - curl \ No newline at end of file From f4064f087ee5f72f7bcf3ee49856616c0856b58d Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Fri, 21 Feb 2025 06:12:35 +0000 Subject: [PATCH 06/10] Fixed typos in pipeline script --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 219d79c..e3261b1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -40,7 +40,7 @@ pipeline { } steps { sh 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*' - sh 'curl --user ${CREDS_USR}:${CRED_PSW} --upload-file conda-bld/**/*.conda https://git.reslate.systems/api/packages/${CRED_USR}/conda/$(basename conda-bld/**/*.conda)' + sh 'curl --user ${CREDS_USR}:${CREDS_PSW} --upload-file conda-bld/**/*.conda https://git.reslate.systems/api/packages/${CREDS_USR}/conda/$(basename conda-bld/**/*.conda)' } } stage ("pypi.org") { From 79fcce8b8401554f69376f79787a786e5a97f31c Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Fri, 21 Feb 2025 06:22:27 +0000 Subject: [PATCH 07/10] Recipe meta.yaml also archived as artifact --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index e3261b1..adf8677 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -29,7 +29,7 @@ pipeline { } stage("archive") { steps { - archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl, conda-bld/**/*.conda', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true + archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl, conda-bld/**/*.conda, autobigs-engine/*.yaml', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true } } stage("publish") { From 06dbb56c28f1e44f8a8a0d194d15a9b5d3302ef8 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Fri, 21 Feb 2025 06:34:59 +0000 Subject: [PATCH 08/10] Revert "Recipe meta.yaml also archived as artifact" This reverts commit 79fcce8b8401554f69376f79787a786e5a97f31c. 
--- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index adf8677..e3261b1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -29,7 +29,7 @@ pipeline { } stage("archive") { steps { - archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl, conda-bld/**/*.conda, autobigs-engine/*.yaml', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true + archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl, conda-bld/**/*.conda', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true } } stage("publish") { From 27ae89fde75d4dd12a25eef9e15194257e51ab53 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Wed, 26 Feb 2025 04:50:54 +0000 Subject: [PATCH 09/10] Replaced schema with scheme --- .vscode/launch.json | 25 ----------- README.md | 2 +- src/autobigs/engine/analysis/bigsdb.py | 44 +++++++++---------- src/autobigs/engine/exceptions/database.py | 14 +++--- src/autobigs/engine/structures/genomics.py | 2 +- tests/autobigs/engine/analysis/test_bigsdb.py | 40 ++++++++--------- 6 files changed, 51 insertions(+), 76 deletions(-) delete mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index 448fd4b..0000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - - { - "name": "autobigs info -lschema pubmlst_bordetella_seqdef", - "type": "debugpy", - "request": "launch", - "program": "${workspaceFolder}/src/autobigs/cli/program.py", - "console": "integratedTerminal", - "args": [ - "info", - "-lschemas", - "pubmlst_bordetella_seqdef" - ], - "cwd": "${workspaceFolder}/src", - "env": { - "PYTHONPATH": "${workspaceFolder}/src" - } - } - ] -} \ No newline at end of file diff --git a/README.md b/README.md index 0563e02..03e7e87 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ A python library implementing common BIGSdb MLST schemes and databases accesses Briefly, this library can: - Import multiple `FASTA` files - Fetch the available BIGSdb databases that is currently live and available -- Fetch the available BIGSdb database schemas for a given MLST database +- Fetch the available BIGSdb database schemes for a given MLST database - Retrieve exact/non-exact MLST allele variant IDs based off a sequence - Retrieve MLST sequence type IDs based off a sequence - Output all results to a single CSV diff --git a/src/autobigs/engine/analysis/bigsdb.py b/src/autobigs/engine/analysis/bigsdb.py index d9e11e9..d186753 100644 --- a/src/autobigs/engine/analysis/bigsdb.py +++ b/src/autobigs/engine/analysis/bigsdb.py @@ -43,10 +43,10 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager): class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): - def __init__(self, database_api: str, database_name: str, schema_id: int): + def __init__(self, database_api: str, database_name: str, scheme_id: int): self._database_name = database_name - self._schema_id = schema_id - self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/" + self._scheme_id = scheme_id + self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._scheme_id}/" self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60)) async def __aenter__(self): @@ -90,7 +90,7 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): ) yield result_allele 
if isinstance(sequence_string, str) else (sequence_string.name, result_allele) else: - raise NoBIGSdbMatchesException(self._database_name, self._schema_id, sequence_string.name if isinstance(sequence_string, NamedString) else None) + raise NoBIGSdbMatchesException(self._database_name, self._scheme_id, sequence_string.name if isinstance(sequence_string, NamedString) else None) async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]: uri_path = "designations" @@ -117,15 +117,15 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): response_json: dict = await response.json() allele_set: Set[Allele] = set() response_json.setdefault("fields", dict()) - schema_fields_returned: dict[str, str] = response_json["fields"] - schema_fields_returned.setdefault("ST", "unknown") - schema_fields_returned.setdefault("clonal_complex", "unknown") - schema_exact_matches: dict = response_json["exact_matches"] - for exact_match_locus, exact_match_alleles in schema_exact_matches.items(): + scheme_fields_returned: dict[str, str] = response_json["fields"] + scheme_fields_returned.setdefault("ST", "unknown") + scheme_fields_returned.setdefault("clonal_complex", "unknown") + scheme_exact_matches: dict = response_json["exact_matches"] + for exact_match_locus, exact_match_alleles in scheme_exact_matches.items(): allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None)) if len(allele_set) == 0: raise ValueError("Passed in no alleles.") - result_mlst_profile = MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) + result_mlst_profile = MLSTProfile(allele_set, scheme_fields_returned["ST"], scheme_fields_returned["clonal_complex"]) if len(names_list) > 0: result_mlst_profile = NamedMLSTProfile(str(tuple(names_list)), result_mlst_profile) return result_mlst_profile @@ -165,7 +165,7 @@ class BIGSdbIndex(AbstractAsyncContextManager): def __init__(self): self._http_client = ClientSession() self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None - self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict() + self._seqdefdb_schemes: dict[str, Union[Mapping[str, int], None]] = dict() super().__init__() async def __aenter__(self): @@ -191,22 +191,22 @@ class BIGSdbIndex(AbstractAsyncContextManager): raise NoSuchBIGSdbDatabaseException(seqdef_db_name) return known_databases[seqdef_db_name] - async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]: - if seqdef_db_name in self._seqdefdb_schemas and not force: - return self._seqdefdb_schemas[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional + async def get_schemes_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]: + if seqdef_db_name in self._seqdefdb_schemes and not force: + return self._seqdefdb_schemes[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)}/db/{seqdef_db_name}/schemes" async with self._http_client.get(uri_path) as response: response_json = await response.json() - schema_descriptions: Mapping[str, int] = dict() + scheme_descriptions: Mapping[str, int] = dict() for scheme_definition in response_json["schemes"]: scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1]) scheme_desc: str = scheme_definition["description"] - 
schema_descriptions[scheme_desc] = scheme_id - self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions - return self._seqdefdb_schemas[seqdef_db_name] # type: ignore + scheme_descriptions[scheme_desc] = scheme_id + self._seqdefdb_schemes[seqdef_db_name] = scheme_descriptions + return self._seqdefdb_schemes[seqdef_db_name] # type: ignore - async def build_profiler_from_seqdefdb(self, local: bool, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler: - return get_BIGSdb_MLST_profiler(local, await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id) + async def build_profiler_from_seqdefdb(self, local: bool, dbseqdef_name: str, scheme_id: int) -> BIGSdbMLSTProfiler: + return get_BIGSdb_MLST_profiler(local, await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, scheme_id) async def close(self): await self._http_client.close() @@ -214,7 +214,7 @@ class BIGSdbIndex(AbstractAsyncContextManager): async def __aexit__(self, exc_type, exc_value, traceback): await self.close() -def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, schema_id: int): +def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, scheme_id: int): if local: raise NotImplementedError() - return RemoteBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, schema_id=schema_id) \ No newline at end of file + return RemoteBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, scheme_id=scheme_id) \ No newline at end of file diff --git a/src/autobigs/engine/exceptions/database.py b/src/autobigs/engine/exceptions/database.py index 10787d2..ab88535 100644 --- a/src/autobigs/engine/exceptions/database.py +++ b/src/autobigs/engine/exceptions/database.py @@ -5,21 +5,21 @@ class BIGSDbDatabaseAPIException(Exception): class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException): - def __init__(self, database_name: str, database_schema_id: int, query_name: Union[None, str], *args): + def __init__(self, database_name: str, database_scheme_id: int, query_name: Union[None, str], *args): self._query_name = query_name - super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args) + super().__init__(f"No matches found with scheme with ID {database_scheme_id} in the database \"{database_name}\".", *args) def get_causal_query_name(self) -> Union[str, None]: return self._query_name class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException): - def __init__(self, database_name: str, database_schema_id: int, *args): - super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args) + def __init__(self, database_name: str, database_scheme_id: int, *args): + super().__init__(f"No exact match found with scheme with ID {database_scheme_id} in the database \"{database_name}\".", *args) class NoSuchBIGSdbDatabaseException(BIGSDbDatabaseAPIException): def __init__(self, database_name: str, *args): super().__init__(f"No database \"{database_name}\" found.", *args) -class NoSuchBigSdbSchemaException(BIGSDbDatabaseAPIException): - def __init__(self, database_name: str, database_schema_id: int, *args): - super().__init__(f"No schema with ID {database_schema_id} in \"{database_name}\" found.", *args) +class NoSuchBigSdbschemeException(BIGSDbDatabaseAPIException): + def __init__(self, database_name: str, database_scheme_id: int, *args): + super().__init__(f"No scheme with ID {database_scheme_id} in 
\"{database_name}\" found.", *args) diff --git a/src/autobigs/engine/structures/genomics.py b/src/autobigs/engine/structures/genomics.py index 6dfb59b..cd76c70 100644 --- a/src/autobigs/engine/structures/genomics.py +++ b/src/autobigs/engine/structures/genomics.py @@ -25,7 +25,7 @@ class SangerTraceData(NamedString): analysis_proto_settings_name: str analysis_rpto_settings_ver: str analysis_proto_xml_data: str - analysis_proto_xml_schema_ver: str + analysis_proto_xml_scheme_ver: str sample_comment: Union[None, str] capillary_machine: bool container_identifier: str diff --git a/tests/autobigs/engine/analysis/test_bigsdb.py b/tests/autobigs/engine/analysis/test_bigsdb.py index c49f1ae..ed01fd3 100644 --- a/tests/autobigs/engine/analysis/test_bigsdb.py +++ b/tests/autobigs/engine/analysis/test_bigsdb.py @@ -71,14 +71,14 @@ hinfluenzae_2014_102_bad_profile = MLSTProfile(( ), "unknown", "unknown") -@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [ +@pytest.mark.parametrize("local_db,database_api,database_name,scheme_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [ (False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile), (False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "2014-102_hinfluenza.fasta", "2014-102_hinfluenza_features.fasta", hinfluenzae_2014_102_profile, hinfluenzae_2014_102_bad_profile), ]) class TestBIGSdbMLSTProfiler: - async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): sequence = get_first_sequence_from_fasta(seq_path) - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: expected_alleles = mlst.alleles_to_mapping(expected_profile.alleles) targets_left = set(mlst.alleles_to_mapping(expected_profile.alleles).keys()) async for exact_match in dummy_profiler.determine_mlst_allele_variants(query_sequence_strings=[sequence]): @@ -89,10 +89,10 @@ class TestBIGSdbMLSTProfiler: assert len(targets_left) == 0 - async def test_sequence_profiling_non_exact_returns_non_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_sequence_profiling_non_exact_returns_non_exact(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): target_sequences = get_multiple_sequences_from_fasta(feature_seqs_path) mlst_targets = {x.lower() for x in mlst.alleles_to_mapping(expected_profile.alleles).keys()} - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as profiler: for target_sequence in target_sequences: match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", 
target_sequence.description) if match is None: @@ -107,26 +107,26 @@ class TestBIGSdbMLSTProfiler: assert len(mlst_targets) == 0 - async def test_profiling_results_in_correct_mlst_st(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async def test_profiling_results_in_correct_mlst_st(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: mlst_st_data = await dummy_profiler.determine_mlst_st(expected_profile.alleles) assert mlst_st_data is not None assert isinstance(mlst_st_data, MLSTProfile) assert mlst_st_data.clonal_complex == expected_profile.clonal_complex assert mlst_st_data.sequence_type == expected_profile.sequence_type - async def test_profiling_non_exact_results_in_list_of_mlsts(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_profiling_non_exact_results_in_list_of_mlsts(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): dummy_alleles = bad_profile.alleles - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: mlst_profile = await dummy_profiler.determine_mlst_st(dummy_alleles) assert mlst_profile.clonal_complex == "unknown" assert mlst_profile.sequence_type == "unknown" - async def test_bigsdb_profile_multiple_strings_same_string_twice(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_bigsdb_profile_multiple_strings_same_string_twice(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): sequence = get_first_sequence_from_fasta(seq_path) dummy_sequences = [[NamedString("seq1", sequence)], [NamedString("seq2", sequence)]] - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences)): name, profile = named_profile.name, named_profile.mlst_profile assert profile is not None @@ -134,10 +134,10 @@ class TestBIGSdbMLSTProfiler: assert profile.clonal_complex == expected_profile.clonal_complex assert profile.sequence_type == expected_profile.sequence_type - async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, 
bad_profile: MLSTProfile): valid_seq = get_first_sequence_from_fasta(seq_path) dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]] - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), True): name, profile = name_profile.name, name_profile.mlst_profile @@ -151,11 +151,11 @@ class TestBIGSdbMLSTProfiler: assert profile.clonal_complex == expected_profile.clonal_complex assert profile.sequence_type == expected_profile.sequence_type - async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): valid_seq = get_first_sequence_from_fasta(seq_path) dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]] - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), False): name, profile = named_profile.name, named_profile.mlst_profile @@ -183,12 +183,12 @@ class TestBIGSdbIndex: async with BIGSdbIndex() as bigsdb_index: assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api" - async def test_bigsdb_index_get_schemas_for_bordetella(self): + async def test_bigsdb_index_get_schemes_for_bordetella(self): async with BIGSdbIndex() as index: - schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef") - assert len(schemas.keys()) > 0 - assert "MLST" in schemas - assert isinstance(schemas["MLST"], int) + schemes = await index.get_schemes_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef") + assert len(schemes.keys()) > 0 + assert "MLST" in schemes + assert isinstance(schemes["MLST"], int) async def test_bigsdb_index_get_databases_has_only_seqdef(self): async with BIGSdbIndex() as index: From 4b34036d17fec4988a28a50bc8fa7b09d883c38a Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Wed, 26 Feb 2025 05:16:24 +0000 Subject: [PATCH 10/10] Fixed concurrent profile_multiple_strings implementation --- src/autobigs/engine/analysis/bigsdb.py | 30 +++++++++++-------- tests/autobigs/engine/analysis/test_bigsdb.py | 3 ++ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/autobigs/engine/analysis/bigsdb.py b/src/autobigs/engine/analysis/bigsdb.py index d186753..1195c30 100644 --- a/src/autobigs/engine/analysis/bigsdb.py +++ b/src/autobigs/engine/analysis/bigsdb.py @@ -7,7 +7,7 @@ from os import path import os import shutil import tempfile -from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequence, Set, Union +from typing import Any, AsyncGenerator, 
AsyncIterable, Coroutine, Iterable, Mapping, Sequence, Set, Union from aiohttp import ClientSession, ClientTimeout @@ -135,20 +135,24 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): return await self.determine_mlst_st(alleles) async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]: - tasks = [] + tasks: list[Coroutine[Any, Any, Union[NamedMLSTProfile, MLSTProfile]]] = [] async for named_strings in query_named_string_groups: tasks.append(self.profile_string(named_strings)) - for task in asyncio.as_completed(tasks): - try: - yield await task - except NoBIGSdbMatchesException as e: - if stop_on_fail: - raise e - causal_name = e.get_causal_query_name() - if causal_name is None: - raise ValueError("Missing query name despite requiring names.") - else: - yield NamedMLSTProfile(causal_name, None) + for task in asyncio.as_completed(tasks): + named_mlst_profile = await task + try: + if isinstance(named_mlst_profile, NamedMLSTProfile): + yield named_mlst_profile + else: + raise TypeError("MLST profile is not named.") + except NoBIGSdbMatchesException as e: + if stop_on_fail: + raise e + causal_name = e.get_causal_query_name() + if causal_name is None: + raise ValueError("Missing query name despite requiring names.") + else: + yield NamedMLSTProfile(causal_name, None) async def close(self): await self._http_client.close() diff --git a/tests/autobigs/engine/analysis/test_bigsdb.py b/tests/autobigs/engine/analysis/test_bigsdb.py index ed01fd3..233e311 100644 --- a/tests/autobigs/engine/analysis/test_bigsdb.py +++ b/tests/autobigs/engine/analysis/test_bigsdb.py @@ -102,6 +102,7 @@ class TestBIGSdbMLSTProfiler: continue scrambled = gene_scrambler(str(target_sequence.seq), 0.125) async for partial_match in profiler.determine_mlst_allele_variants([scrambled]): + assert isinstance(partial_match, Allele) assert partial_match.partial_match_profile is not None mlst_targets.remove(gene) @@ -119,6 +120,7 @@ class TestBIGSdbMLSTProfiler: dummy_alleles = bad_profile.alleles async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: mlst_profile = await dummy_profiler.determine_mlst_st(dummy_alleles) + assert isinstance(mlst_profile, MLSTProfile) assert mlst_profile.clonal_complex == "unknown" assert mlst_profile.sequence_type == "unknown" @@ -207,5 +209,6 @@ class TestBIGSdbIndex: async with await bigsdb_index.build_profiler_from_seqdefdb(local, "pubmlst_bordetella_seqdef", 3) as profiler: assert isinstance(profiler, BIGSdbMLSTProfiler) profile = await profiler.profile_string(sequence) + assert isinstance(profile, MLSTProfile) assert profile.clonal_complex == "ST-2 complex" assert profile.sequence_type == "1"
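
Taken together, patches 01 and 10 converge on one concurrency pattern that is easier to see outside a diff: schedule one coroutine per query group up front, drain them with asyncio.as_completed so results stream back in completion order, and let a failure carry the name of the query that caused it so the caller can still emit a per-input result. The sketch below restates that pattern in a self-contained form; QueryFailure, profile_one, and the simulated latency are illustrative stand-ins for NoBIGSdbMatchesException and profile_string, not the autobigs.engine API itself:

    import asyncio
    from typing import Any, AsyncGenerator, Optional

    class QueryFailure(Exception):
        """Stand-in for NoBIGSdbMatchesException: remembers the causal query name."""
        def __init__(self, query_name: str):
            self.query_name = query_name
            super().__init__(f"No matches found for {query_name!r}.")

    async def profile_one(name: str, fail: bool) -> tuple[str, str]:
        await asyncio.sleep(0.01)  # simulates one BIGSdb REST round trip
        if fail:
            raise QueryFailure(name)  # the failure remembers which input caused it
        return name, "ST-1"

    async def profile_all(queries: list[tuple[str, bool]],
                          stop_on_fail: bool = False) -> AsyncGenerator[tuple[str, Optional[str]], Any]:
        # Schedule every query before awaiting any of them so the I/O overlaps.
        tasks = [profile_one(name, fail) for name, fail in queries]
        # Yield results in completion order, not submission order.
        for task in asyncio.as_completed(tasks):
            try:
                yield await task
            except QueryFailure as e:
                if stop_on_fail:
                    raise  # note: remaining scheduled coroutines are not cancelled here
                yield e.query_name, None  # degrade to a named empty result

    async def main() -> None:
        jobs = [("seq1", False), ("should_fail", True), ("seq3", False)]
        async for name, sequence_type in profile_all(jobs):
            print(name, sequence_type)

    asyncio.run(main())

One detail is easy to miss: a coroutine handed to asyncio.as_completed only raises when it is awaited, so the await itself has to sit inside the try block for the except clause to fire, and the exception must carry the query name because completion order no longer matches input order.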
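
Read as documentation, the updated tests also pin down the intended top-level flow: open a BIGSdbIndex, resolve a scheme ID by name, build a profiler, and profile sequences. A minimal script version of that flow follows, assuming autobigs.engine is installed and the BIGSdb REST endpoints are reachable; the sequence literal is a placeholder for a real FASTA-derived string, and the structures import path is inferred from the layout above rather than quoted from it:

    import asyncio

    from autobigs.engine.analysis.bigsdb import BIGSdbIndex
    from autobigs.engine.structures.mlst import MLSTProfile, NamedMLSTProfile  # assumed module path

    async def main() -> None:
        genome_sequence = "ATG..."  # placeholder: read this from a FASTA file instead
        async with BIGSdbIndex() as index:
            # Map the human-readable scheme name to its numeric BIGSdb scheme ID.
            schemes = await index.get_schemes_for_seqdefdb("pubmlst_bordetella_seqdef")
            scheme_id = schemes["MLST"]
            # local=False selects the remote REST-backed profiler.
            profiler = await index.build_profiler_from_seqdefdb(False, "pubmlst_bordetella_seqdef", scheme_id)
            async with profiler:
                profile = await profiler.profile_string([genome_sequence])
                if isinstance(profile, NamedMLSTProfile):
                    profile = profile.mlst_profile  # unwrap to the underlying profile
                if isinstance(profile, MLSTProfile):
                    print(profile.sequence_type, profile.clonal_complex)

    asyncio.run(main())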