Compare commits

...

8 Commits

Author SHA1 Message Date
06dbb56c28 Revert "Recipe meta.yaml also archived as artifact"
All checks were successful
autoBIGS.engine/pipeline/head This commit looks good
This reverts commit 79fcce8b8401554f69376f79787a786e5a97f31c.
2025-02-21 06:34:59 +00:00
79fcce8b84 Recipe meta.yaml also archived as artifact
Some checks reported errors
autoBIGS.engine/pipeline/head Something is wrong with the build of this commit
2025-02-21 06:22:27 +00:00
f4064f087e Fixed typos in pipeline script
All checks were successful
autoBIGS.engine/pipeline/head This commit looks good
2025-02-21 06:12:35 +00:00
276665f5fd Added curl to environment requirements
All checks were successful
autoBIGS.engine/pipeline/head This commit looks good
2025-02-21 06:01:39 +00:00
fd536862e2 Twine version specified to 6.0.1 to avoid Twine issue 15611
Some checks failed
autoBIGS.engine/pipeline/head There was a failure building this commit
2025-02-21 05:53:08 +00:00
576dc303f4 Changed requested kubernetes container to be miniforge 2025-02-21 05:52:34 +00:00
2822a483e3 Initial attempt at switching to a conda based build environment
Some checks failed
autoBIGS.engine/pipeline/head There was a failure building this commit
2025-02-21 05:37:56 +00:00
b8cebb8ba4 Infrastructure for concurrent processing implemented
All checks were successful
autoBIGS.engine/pipeline/head This commit looks good
2025-02-19 15:49:46 +00:00
13 changed files with 255 additions and 206 deletions

11
.devcontainer/Dockerfile Normal file
View File

@ -0,0 +1,11 @@
FROM mcr.microsoft.com/devcontainers/anaconda:1-3
# Copy environment.yml (if found) to a temp location so we update the environment. Also
# copy "noop.txt" so the COPY instruction does not fail if no environment.yml exists.
COPY environment.yml* .devcontainer/noop.txt /tmp/conda-tmp/
RUN if [ -f "/tmp/conda-tmp/environment.yml" ]; then umask 0002 && /opt/conda/bin/conda env update -n base -f /tmp/conda-tmp/environment.yml; fi \
&& rm -rf /tmp/conda-tmp
# [Optional] Uncomment this section to install additional OS packages.
# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
# && apt-get -y install --no-install-recommends <your-package-list-here>

View File

@ -1,9 +1,11 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
// README at: https://github.com/devcontainers/templates/tree/main/src/anaconda
{
"name": "Python 3",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye",
"name": "Anaconda (Python 3)",
"build": {
"context": "..",
"dockerfile": "Dockerfile"
}
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
@ -12,14 +14,7 @@
// "forwardPorts": [],
// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "pip3 install --user -r requirements.txt",
"customizations": {
"vscode": {
"extensions": [
"mechatroner.rainbow-csv"
]
}
}
// "postCreateCommand": "python --version",
// Configure tool-specific properties.
// "customizations": {},

3
.devcontainer/noop.txt Normal file
View File

@ -0,0 +1,3 @@
This file is copied into the container along with environment.yml* from the parent
folder. This file is included to prevent the Dockerfile COPY instruction from
failing if no environment.yml is found.

159
.gitignore vendored
View File

@ -1,6 +1,6 @@
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,svelte,python,linux,node
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,svelte,python,linux,node
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,linux,python
### Linux ###
*~
@ -17,146 +17,6 @@
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### Node ###
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
### Node Patch ###
# Serverless Webpack directories
.webpack/
# Optional stylelint cache
# SvelteKit build / generate output
.svelte-kit
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
@ -202,6 +62,7 @@ htmlcov/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
@ -215,6 +76,7 @@ cover/
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
@ -278,6 +140,7 @@ celerybeat.pid
*.sage.py
# Environments
.env
.venv
env/
venv/
@ -326,13 +189,6 @@ poetry.toml
# LSP config files
pyrightconfig.json
### Svelte ###
# gitignore template for the SvelteKit, frontend web component framework
# website: https://kit.svelte.dev/
.svelte-kit/
package
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
@ -352,9 +208,8 @@ package
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,svelte,python,linux,node
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python
# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)
output
*.private.*
conda-bld

5
.vscode/extensions.json vendored Normal file
View File

@ -0,0 +1,5 @@
{
"recommendations": [
"piotrpalarz.vscode-gitignore-generator"
]
}

14
Jenkinsfile vendored
View File

@ -2,14 +2,14 @@ pipeline {
agent {
kubernetes {
cloud 'rsys-devel'
defaultContainer 'pip'
inheritFrom 'pip'
defaultContainer 'miniforge3'
inheritFrom 'miniforge'
}
}
stages {
stage("install") {
steps {
sh 'python -m pip install -r requirements.txt'
sh 'conda env update -n base -f environment.yml'
}
}
stage("unit tests") {
@ -22,11 +22,14 @@ pipeline {
stage("build") {
steps {
sh "python -m build"
sh "grayskull pypi dist/*.tar.gz --maintainers 'Harrison Deng'"
sh "python scripts/patch_recipe.py"
sh 'conda build autobigs-engine -c bioconda --output-folder conda-bld --verify'
}
}
stage("archive") {
steps {
archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true
archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl, conda-bld/**/*.conda', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true
}
}
stage("publish") {
@ -36,7 +39,8 @@ pipeline {
CREDS = credentials('username-password-rs-git')
}
steps {
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
sh 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
sh 'curl --user ${CREDS_USR}:${CREDS_PSW} --upload-file conda-bld/**/*.conda https://git.reslate.systems/api/packages/${CREDS_USR}/conda/$(basename conda-bld/**/*.conda)'
}
}
stage ("pypi.org") {

44
autobigs-engine/meta.yaml Normal file
View File

@ -0,0 +1,44 @@
{% set name = "autoBIGS.engine" %}
{% set version = "0.12.1.dev1+gb8cebb8.d20250221" %}
package:
name: {{ name|lower|replace(".", "-") }}
version: {{ version }}
source:
url: file:///workspaces/autoBIGS.engine/dist/autobigs_engine-0.12.1.dev1%2Bgb8cebb8.d20250221.tar.gz
sha256: c86441b94f935cfa414ff28ca4c026a070e0fb15988ea3bb7d1a942859a09b16
build:
noarch: python
script: {{ PYTHON }} -m pip install . -vv --no-deps --no-build-isolation
number: 0
run_exports:
- {{ pin_subpackage( name|lower|replace(".", "-"), max_pin="x.x") }}
requirements:
host:
- python >=3.12
- setuptools >=64
- setuptools-scm >=8
- pip
run:
- python >=3.12
- biopython ==1.85
- aiohttp ==3.11.*
test:
imports:
- autobigs
commands:
- pip check
requires:
- pip
about:
summary: A library to rapidly fetch MLST profiles given sequences for various diseases.
license: GPL-3.0-or-later
license_file: LICENSE
home: https://github.com/Syph-and-VPD-Lab/autoBIGS.engine
extra:
recipe-maintainers:
- Harrison Deng

16
environment.yml Normal file
View File

@ -0,0 +1,16 @@
name: ci
channels:
- bioconda
- conda-forge
dependencies:
- aiohttp==3.11.*
- biopython==1.85
- pytest
- pytest-asyncio
- python-build
- conda-build
- twine==6.0.1
- setuptools_scm
- pytest-cov
- grayskull
- curl

View File

@ -1,8 +0,0 @@
aiohttp[speedups]==3.11.*
biopython==1.85
pytest
pytest-asyncio
build
twine
setuptools_scm
pytest-cov

103
scripts/patch_recipe.py Normal file
View File

@ -0,0 +1,103 @@
#!/usr/bin/env python3
import argparse
from os import fdopen, path
import os
import re
import shutil
from sys import argv
import tempfile
# One unit of indentation; _calc_indentation counts occurrences of this string
# in a line's leading whitespace, and the inject_* helpers prefix new lines with it.
# NOTE(review): rendered here as a single space — confirm the real width against
# the repository, since this view may have collapsed whitespace.
INDENTATION = " "
# Directory (resolved against the working directory) whose meta.yaml is read,
# rewritten in place, and finally renamed by rename_recipe_dir.
GRAYSKULL_OUTPUT_PATH = "autoBIGS.engine"
# List entry written beneath the injected "run_exports:" key.
RUN_EXPORTED_VALUE = r'{{ pin_subpackage( name|lower|replace(".", "-"), max_pin="x.x") }}'
# Text appended to the "license:" line of the "about:" section.
LICENSE_SUFFIX = "-or-later"
# URL written as the "home:" entry of the "about:" section.
HOME_PAGE = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
def _calc_indentation(line: str):
    """Return how many INDENTATION units start ``line``.

    Blank lines (empty, a lone newline, or whitespace only) report 0 so that
    callers treat them as top level. Previously only a lone newline was
    special-cased; any other blank line ended up calling ``str.split`` with an
    empty separator and raised ``ValueError``.

    Args:
        line: A single recipe line, usually including its trailing newline.

    Returns:
        int: Non-overlapping occurrences of INDENTATION in the leading
        whitespace of ``line``.
    """
    content = line.lstrip()
    if not content:
        return 0
    leading_whitespace = line[: len(line) - len(content)]
    # INDENTATION is literal text, so count substrings rather than treating it
    # as a regular expression.
    return leading_whitespace.count(INDENTATION)
def read_grayskull_output():
    """Read the recipe's meta.yaml and return its lines.

    The file is ``meta.yaml`` inside GRAYSKULL_OUTPUT_PATH, resolved against
    the current working directory.

    Returns:
        list[str]: Every line of the file, line endings preserved.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    meta_path = path.join(path.abspath(GRAYSKULL_OUTPUT_PATH), "meta.yaml")
    # The context manager closes the handle even if reading fails part-way.
    with open(meta_path) as meta_file:
        return meta_file.readlines()
def update_naming_scheme(lines):
    """Add a dot-to-dash ``replace`` filter to every ``{{ name|lower }}``.

    Each occurrence of ``{{ name|lower }}`` becomes
    ``{{ name|lower|replace(".", "-") }}``. Expressions that already carry a
    further filter are not matched, so the function is idempotent.

    The substitution is done with ``re.sub`` so that several occurrences on
    the same line are all rewritten correctly; the earlier slice-based
    approach reused offsets from the unmodified line and misplaced every
    insertion after the first.

    Args:
        lines: Iterable of recipe lines.

    Returns:
        list[str]: The rewritten lines, in the original order.
    """
    name_expression = re.compile(r"(\{\{\s*name\|lower)(\s+\}\})")
    dash_filter = '|replace(".", "-")'

    def add_filter(match):
        # A callable replacement keeps the filter text free from
        # backslash/group-reference interpretation.
        return match.group(1) + dash_filter + match.group(2)

    return [name_expression.sub(add_filter, line) for line in lines]
def inject_run_exports(lines: list[str]):
    """Append a ``run_exports`` entry to the top-level ``build:`` section.

    The entry is inserted just before the first top-level line (including a
    blank line) that follows ``build:``. That terminating line is kept;
    previously it was replaced by the injected lines and therefore lost. If
    ``build:`` is the last section of the file, the entry is appended at the
    end instead of being skipped.

    Args:
        lines: Recipe lines, each normally ending in a newline.

    Returns:
        list[str]: A new list containing the injected ``run_exports`` lines.
    """
    injected = [
        INDENTATION + "run_exports:\n",
        INDENTATION * 2 + "- " + RUN_EXPORTED_VALUE + "\n",
    ]
    in_build_section = False
    modified_lines = []
    for line in lines:
        at_top_level = _calc_indentation(line) == 0
        if at_top_level and line == "build:\n":
            in_build_section = True
        elif in_build_section and at_top_level:
            # First top-level line after the section: inject ahead of it.
            modified_lines.extend(injected)
            in_build_section = False
        modified_lines.append(line)
    if in_build_section:
        # The file ended while still inside the build section.
        if modified_lines and not modified_lines[-1].endswith("\n"):
            modified_lines[-1] += "\n"
        modified_lines.extend(injected)
    return modified_lines
def suffix_license(lines: list[str]):
    """Append LICENSE_SUFFIX to the ``license:`` entry of the ``about:`` section.

    Only the first ``license:`` line found one indentation level inside the
    top-level ``about:`` section is changed. Two defects of the earlier
    version are addressed:

    * a value that already ends with the suffix is left alone, so running the
      script twice no longer yields a doubled suffix;
    * the search stops at the next non-blank top-level line, so a
      ``license:`` key in a later section is never touched.

    Args:
        lines: Recipe lines, each normally ending in a newline.

    Returns:
        list[str]: A new list with the license line updated where applicable.
    """
    in_about_section = False
    modified_lines = []
    for line in lines:
        indentation_count = _calc_indentation(line)
        if indentation_count == 0 and line == "about:\n":
            in_about_section = True
        elif in_about_section and indentation_count == 0 and line.strip():
            # A new top-level key closes the about section; blank lines do not.
            in_about_section = False
        elif in_about_section and indentation_count == 1 and line.lstrip().startswith("license:"):
            license_entry = line.rstrip()
            if not license_entry.endswith(LICENSE_SUFFIX):
                line = license_entry + LICENSE_SUFFIX + "\n"
            in_about_section = False
        modified_lines.append(line)
    return modified_lines
def inject_home_page(lines: list[str]):
    """Append a ``home:`` entry to the top-level ``about:`` section.

    The entry is inserted just before the first top-level line (including a
    blank line) that follows ``about:``. That terminating line is kept;
    previously it was replaced by the injected line and therefore lost. If
    ``about:`` is the last section of the file, the entry is appended at the
    end instead of being skipped.

    Args:
        lines: Recipe lines, each normally ending in a newline.

    Returns:
        list[str]: A new list containing the injected ``home:`` line.
    """
    home_entry = INDENTATION + "home: " + HOME_PAGE + "\n"
    in_about_section = False
    modified_lines = []
    for line in lines:
        at_top_level = _calc_indentation(line) == 0
        if at_top_level and line == "about:\n":
            in_about_section = True
        elif in_about_section and at_top_level:
            # First top-level line after the section: inject ahead of it.
            modified_lines.append(home_entry)
            in_about_section = False
        modified_lines.append(line)
    if in_about_section:
        # The file ended while still inside the about section.
        if modified_lines and not modified_lines[-1].endswith("\n"):
            modified_lines[-1] += "\n"
        modified_lines.append(home_entry)
    return modified_lines
def write_to_original(lines: list[str]):
    """Overwrite meta.yaml inside GRAYSKULL_OUTPUT_PATH with ``lines``.

    Args:
        lines: Complete recipe content, one string per line with its own
            line ending.
    """
    recipe_dir = path.abspath(GRAYSKULL_OUTPUT_PATH)
    with open(path.join(recipe_dir, "meta.yaml"), "w") as meta_handle:
        meta_handle.writelines(lines)
def rename_recipe_dir():
    """Rename the recipe directory to its lower-case, dash-separated form.

    Any directory already present under the new name is removed first
    (removal errors are ignored), then GRAYSKULL_OUTPUT_PATH is moved there.
    """
    source_dir = path.abspath(GRAYSKULL_OUTPUT_PATH)
    target_dir = path.abspath(GRAYSKULL_OUTPUT_PATH.lower().replace(".", "-"))
    shutil.rmtree(target_dir, ignore_errors=True)
    os.replace(source_dir, target_dir)
if __name__ == "__main__":
    # Read the generated recipe, run each patch step in order, write the
    # result back, then rename the recipe directory.
    recipe_lines = read_grayskull_output()
    for patch_step in (
        update_naming_scheme,
        inject_run_exports,
        suffix_license,
        inject_home_page,
    ):
        recipe_lines = patch_step(recipe_lines)
    write_to_original(recipe_lines)
    rename_recipe_dir()

View File

@ -22,15 +22,15 @@ from Bio.Align import PairwiseAligner
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
@abstractmethod
def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[Union[NamedString, str]], Union[NamedString, str]]) -> AsyncGenerator[Union[Allele, tuple[str, Allele]], Any]:
pass
@abstractmethod
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]:
pass
@abstractmethod
async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
async def profile_string(self, query_sequence_strings: Iterable[Union[NamedString, str]]) -> Union[NamedMLSTProfile, MLSTProfile]:
pass
@abstractmethod
@ -52,14 +52,14 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
async def __aenter__(self):
return self
async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[str], str]) -> AsyncGenerator[Allele, Any]:
async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[Union[NamedString, str]], Union[NamedString, str]]) -> AsyncGenerator[Union[Allele, tuple[str, Allele]], Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "sequence"
if isinstance(query_sequence_strings, str):
if isinstance(query_sequence_strings, str) or isinstance(query_sequence_strings, NamedString):
query_sequence_strings = [query_sequence_strings]
for sequence_string in query_sequence_strings:
async with self._http_client.post(uri_path, json={
"sequence": sequence_string,
"sequence": sequence_string if isinstance(sequence_string, str) else sequence_string.sequence,
"partial_matches": True
}) as response:
sequence_response: dict = await response.json()
@ -70,7 +70,8 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
yield Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
result_allele = Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
elif "partial_matches" in sequence_response:
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
for allele_loci, partial_match in partial_matches.items():
@ -82,23 +83,33 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
gaps=int(partial_match["gaps"]),
match_metric=int(partial_match["bitscore"])
)
yield Allele(
result_allele = Allele(
allele_locus=allele_loci,
allele_variant=str(partial_match["allele"]),
partial_match_profile=partial_match_profile
)
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
else:
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
raise NoBIGSdbMatchesException(self._database_name, self._schema_id, sequence_string.name if isinstance(sequence_string, NamedString) else None)
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]:
uri_path = "designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
names_list = []
def insert_allele_to_request_dict(allele: Union[Allele, tuple[str, Allele]]):
if isinstance(allele, Allele):
allele_val = allele
else:
allele_val = allele[1]
names_list.append(allele[0])
allele_request_dict[allele_val.allele_locus].append({"allele": str(allele_val.allele_variant)})
if isinstance(alleles, AsyncIterable):
async for allele in alleles:
allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
insert_allele_to_request_dict(allele)
else:
for allele in alleles:
allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
insert_allele_to_request_dict(allele)
request_json = {
"designations": allele_request_dict
}
@ -111,30 +122,33 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
schema_fields_returned.setdefault("clonal_complex", "unknown")
schema_exact_matches: dict = response_json["exact_matches"]
for exact_match_locus, exact_match_alleles in schema_exact_matches.items():
if len(exact_match_alleles) > 1:
raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(exact_match_alleles)})")
allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None))
if len(allele_set) == 0:
raise ValueError("Passed in no alleles.")
return MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
result_mlst_profile = MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
if len(names_list) > 0:
result_mlst_profile = NamedMLSTProfile(str(tuple(names_list)), result_mlst_profile)
return result_mlst_profile
async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
async def profile_string(self, query_sequence_strings: Iterable[Union[NamedString, str]]) -> Union[NamedMLSTProfile, MLSTProfile]:
alleles = self.determine_mlst_allele_variants(query_sequence_strings)
return await self.determine_mlst_st(alleles)
async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
tasks = []
async for named_strings in query_named_string_groups:
names: list[str] = list()
sequences: list[str] = list()
for named_string in named_strings:
names.append(named_string.name)
sequences.append(named_string.sequence)
tasks.append(self.profile_string(named_strings))
for task in asyncio.as_completed(tasks):
try:
yield NamedMLSTProfile("-".join(names), (await self.profile_string(sequences)))
yield await task
except NoBIGSdbMatchesException as e:
if stop_on_fail:
raise e
yield NamedMLSTProfile("-".join(names), None)
causal_name = e.get_causal_query_name()
if causal_name is None:
raise ValueError("Missing query name despite requiring names.")
else:
yield NamedMLSTProfile(causal_name, None)
async def close(self):
await self._http_client.close()

View File

@ -5,9 +5,13 @@ class BIGSDbDatabaseAPIException(Exception):
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_schema_id: int, *args):
def __init__(self, database_name: str, database_schema_id: int, query_name: Union[None, str], *args):
self._query_name = query_name
super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
def get_causal_query_name(self) -> Union[str, None]:
return self._query_name
class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException):
def __init__(self, database_name: str, database_schema_id: int, *args):
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)

View File

@ -13,5 +13,8 @@ async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]
return results
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
tasks = []
for handle in handles:
yield await read_fasta(handle)
tasks.append(read_fasta(handle))
for task in asyncio.as_completed(tasks):
yield await task