Compare commits

No commits in common. "develop" and "features/local-typing" have entirely different histories.

27 changed files with 51405 additions and 56336 deletions


@@ -1,11 +0,0 @@
FROM mcr.microsoft.com/devcontainers/anaconda:1-3
# Copy environment.yml (if found) to a temp location so we update the environment. Also
# copy "noop.txt" so the COPY instruction does not fail if no environment.yml exists.
COPY environment.yml* .devcontainer/noop.txt /tmp/conda-tmp/
RUN if [ -f "/tmp/conda-tmp/environment.yml" ]; then umask 0002 && /opt/conda/bin/conda env update -n base -f /tmp/conda-tmp/environment.yml; fi \
&& rm -rf /tmp/conda-tmp
# [Optional] Uncomment this section to install additional OS packages.
# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
# && apt-get -y install --no-install-recommends <your-package-list-here>


@@ -1,11 +1,9 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/anaconda
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
"name": "Anaconda (Python 3)",
"build": {
"context": "..",
"dockerfile": "Dockerfile"
}
"name": "Python 3",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye",
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
@@ -14,7 +12,14 @@
// "forwardPorts": [],
// Use 'postCreateCommand' to run commands after the container is created.
// "postCreateCommand": "python --version",
"postCreateCommand": "pip3 install --user -r requirements.txt",
"customizations": {
"vscode": {
"extensions": [
"mechatroner.rainbow-csv"
]
}
}
// Configure tool-specific properties.
// "customizations": {},


@@ -1,3 +0,0 @@
This file is copied into the container along with environment.yml* from the parent
folder. It is included to prevent the Dockerfile COPY instruction from
failing if no environment.yml is found.

.gitignore

@@ -1,6 +1,6 @@
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,linux,python
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,svelte,python,linux,node
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,svelte,python,linux,node
### Linux ###
*~
@@ -17,6 +17,146 @@
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### Node ###
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
.cache/
# Comment in the public line if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
### Node Patch ###
# Serverless Webpack directories
.webpack/
# Optional stylelint cache
# SvelteKit build / generate output
.svelte-kit
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -62,7 +202,6 @@ htmlcov/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
@@ -76,7 +215,6 @@ cover/
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
@@ -140,7 +278,6 @@ celerybeat.pid
*.sage.py
# Environments
.env
.venv
env/
venv/
@@ -189,6 +326,13 @@ poetry.toml
# LSP config files
pyrightconfig.json
### Svelte ###
# gitignore template for the SvelteKit, frontend web component framework
# website: https://kit.svelte.dev/
.svelte-kit/
package
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
@@ -208,8 +352,9 @@ pyrightconfig.json
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,svelte,python,linux,node
# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use the 'Update' option)
conda-bld
output
*.private.*


@@ -1,5 +0,0 @@
{
"recommendations": [
"piotrpalarz.vscode-gitignore-generator"
]
}

Jenkinsfile

@@ -2,14 +2,14 @@ pipeline {
agent {
kubernetes {
cloud 'rsys-devel'
defaultContainer 'miniforge3'
inheritFrom 'miniforge'
defaultContainer 'pip'
inheritFrom 'pip'
}
}
stages {
stage("install") {
steps {
sh 'conda env update -n base -f environment.yml'
sh 'python -m pip install -r requirements.txt'
}
}
stage("unit tests") {
@@ -22,14 +22,11 @@ pipeline {
stage("build") {
steps {
sh "python -m build"
sh "grayskull pypi dist/*.tar.gz --maintainers 'Harrison Deng'"
sh "python scripts/patch_recipe.py"
sh 'conda build autobigs-engine -c bioconda --output-folder conda-bld --verify'
}
}
stage("archive") {
steps {
archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl, conda-bld/**/*.conda', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true
archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true
}
}
stage("publish") {
@@ -39,8 +36,7 @@ pipeline {
CREDS = credentials('username-password-rs-git')
}
steps {
sh 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
sh 'curl --user ${CREDS_USR}:${CREDS_PSW} --upload-file conda-bld/**/*.conda https://git.reslate.systems/api/packages/${CREDS_USR}/conda/$(basename conda-bld/**/*.conda)'
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
}
}
stage ("pypi.org") {


@@ -1,6 +1,6 @@
# autoBIGS.Engine
A python library implementing common BIGSdb MLST schemes and databases accesses for the purpose of typing sequences automatically. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
A Python library implementing common BIGSdb MLST schemes and databases. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
## Features
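
For orientation, here is a minimal sketch of profiling one assembled genome with the remote profiler (defined in the bigsdb module diff below); the database, scheme, and FASTA path mirror the tests in this diff and are illustrative only:

import asyncio
from Bio import SeqIO
from autobigs.engine.analysis.bigsdb import RemoteBIGSdbMLSTProfiler

async def main():
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with RemoteBIGSdbMLSTProfiler(
            database_api="https://bigsdb.pasteur.fr/api",
            database_name="pubmlst_bordetella_seqdef",
            schema_id=3) as profiler:
        profile = await profiler.profile_string([sequence])
        print(profile)  # an MLSTProfile carrying alleles, ST, and clonal complex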


@@ -1,44 +0,0 @@
{% set name = "autoBIGS.engine" %}
{% set version = "0.12.1.dev1+gb8cebb8.d20250221" %}
package:
name: {{ name|lower|replace(".", "-") }}
version: {{ version }}
source:
url: file:///workspaces/autoBIGS.engine/dist/autobigs_engine-0.12.1.dev1%2Bgb8cebb8.d20250221.tar.gz
sha256: c86441b94f935cfa414ff28ca4c026a070e0fb15988ea3bb7d1a942859a09b16
build:
noarch: python
script: {{ PYTHON }} -m pip install . -vv --no-deps --no-build-isolation
number: 0
run_exports:
- {{ pin_subpackage( name|lower|replace(".", "-"), max_pin="x.x") }}
requirements:
host:
- python >=3.12
- setuptools >=64
- setuptools-scm >=8
- pip
run:
- python >=3.12
- biopython ==1.85
- aiohttp ==3.11.*
test:
imports:
- autobigs
commands:
- pip check
requires:
- pip
about:
summary: A library to rapidly fetch MLST profiles given sequences for various diseases.
license: GPL-3.0-or-later
license_file: LICENSE
home: https://github.com/Syph-and-VPD-Lab/autoBIGS.engine
extra:
recipe-maintainers:
- Harrison Deng


@@ -1,16 +0,0 @@
name: ci
channels:
- bioconda
- conda-forge
dependencies:
- aiohttp==3.11.*
- biopython==1.85
- pytest
- pytest-asyncio
- python-build
- conda-build
- twine==6.0.1
- setuptools_scm
- pytest-cov
- grayskull
- curl


@@ -13,12 +13,11 @@ dependencies = [
]
requires-python = ">=3.12"
description = "A library to rapidly fetch MLST profiles given sequences for various diseases."
license = {text = "GPL-3.0-or-later"}
[project.urls]
Homepage = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
Source = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
Issues = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine/issues"
Homepage = "https://github.com/RealYHD/autoBIGS.engine"
Source = "https://github.com/RealYHD/autoBIGS.engine"
Issues = "https://github.com/RealYHD/autoBIGS.engine/issues"
[tool.setuptools_scm]

requirements.txt

@@ -0,0 +1,8 @@
aiohttp[speedups]==3.11.*
biopython==1.85
pytest
pytest-asyncio
build
twine
setuptools_scm
pytest-cov


@@ -1,103 +0,0 @@
#!/usr/bin/env python3
import argparse
from os import fdopen, path
import os
import re
import shutil
from sys import argv
import tempfile
INDENTATION = " "
GRAYSKULL_OUTPUT_PATH = "autoBIGS.engine"
RUN_EXPORTED_VALUE = r'{{ pin_subpackage( name|lower|replace(".", "-"), max_pin="x.x") }}'
LICENSE_SUFFIX = "-or-later"
HOME_PAGE = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
def _calc_indentation(line: str):
return len(re.findall(INDENTATION, line.split(line.strip())[0])) if line != "\n" else 0
def read_grayskull_output():
original_recipe = path.abspath(GRAYSKULL_OUTPUT_PATH)
original_meta = path.join(original_recipe, "meta.yaml")
meta_file = open(original_meta)
lines = meta_file.readlines()
meta_file.close()
return lines
def update_naming_scheme(lines):
modified_lines = []
for line in lines:
matches = re.finditer(r"\{\{\s*name\|lower()\s+\}\}", line)
modified_line = line
for match in matches:
modified_line = modified_line[:match.start(1)] + r'|replace(".", "-")' + modified_line[match.end(1):]
modified_lines.append(modified_line)
return modified_lines
def inject_run_exports(lines: list[str]):
package_indent = False
modified_lines = []
for line in lines:
indentation_count = _calc_indentation(line)
if line == "build:\n" and indentation_count == 0:
package_indent = True
modified_lines.append(line)
elif package_indent and indentation_count == 0:
modified_lines.append(INDENTATION*1 + "run_exports:\n")
modified_lines.append(INDENTATION*2 + "- " + RUN_EXPORTED_VALUE + "\n")
package_indent = False
else:
modified_lines.append(line)
return modified_lines
def suffix_license(lines: list[str]):
about_indent = False
modified_lines = []
for line in lines:
indentation_count = _calc_indentation(line)
if line == "about:\n" and indentation_count == 0:
about_indent = True
modified_lines.append(line)
elif about_indent and indentation_count == 1 and line.lstrip().startswith("license:"):
modified_lines.append(line.rstrip() + LICENSE_SUFFIX + "\n")
about_indent = False
else:
modified_lines.append(line)
return modified_lines
def inject_home_page(lines: list[str]):
about_indent = False
modified_lines = []
for line in lines:
indentation_count = _calc_indentation(line)
if line == "about:\n" and indentation_count == 0:
about_indent = True
modified_lines.append(line)
elif about_indent and indentation_count == 0:
modified_lines.append(INDENTATION + "home: " + HOME_PAGE + "\n")
about_indent = False
else:
modified_lines.append(line)
return modified_lines
def write_to_original(lines: list[str]):
original_recipe = path.abspath(GRAYSKULL_OUTPUT_PATH)
original_meta = path.join(original_recipe, "meta.yaml")
with open(original_meta, "w") as file:
file.writelines(lines)
def rename_recipe_dir():
new_recipe_name = path.abspath(path.join(GRAYSKULL_OUTPUT_PATH.replace(".", "-").lower()))
shutil.rmtree(new_recipe_name, ignore_errors=True)
os.replace(path.abspath(GRAYSKULL_OUTPUT_PATH), new_recipe_name)
if __name__ == "__main__":
original_grayskull_out = read_grayskull_output()
modified_recipe_meta = None
modified_recipe_meta = update_naming_scheme(original_grayskull_out)
modified_recipe_meta = inject_run_exports(modified_recipe_meta)
modified_recipe_meta = suffix_license(modified_recipe_meta)
modified_recipe_meta = inject_home_page(modified_recipe_meta)
write_to_original(modified_recipe_meta)
rename_recipe_dir()
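
The heart of this deleted helper is the Jinja rewrite in update_naming_scheme(); shown in isolation (a standalone sketch, not part of the repository), the transform behaves like this:

import re

# Rewrites "{{ name|lower }}" into '{{ name|lower|replace(".", "-") }}' so the
# conda package name uses dashes in place of dots.
line = "name: {{ name|lower }}\n"
for match in re.finditer(r"\{\{\s*name\|lower()\s+\}\}", line):
    line = line[:match.start(1)] + r'|replace(".", "-")' + line[match.end(1):]
print(line)  # name: {{ name|lower|replace(".", "-") }}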


@@ -0,0 +1,70 @@
import asyncio
from concurrent.futures import Future, ThreadPoolExecutor
from contextlib import AbstractContextManager
from typing import Any, Set, Union
from Bio.Align import PairwiseAligner
from queue import Queue
from autobigs.engine.structures.alignment import AlignmentStats, PairwiseAlignment
class AsyncBiopythonPairwiseAlignmentEngine(AbstractContextManager):
def __enter__(self):
self._thread_pool = ThreadPoolExecutor(self._max_threads, thread_name_prefix="async-pairwise-alignment")
return self
def __init__(self, aligner: PairwiseAligner, max_threads: int = 4):
self._max_threads = max_threads
self._aligner = aligner
self._work_left: Set[Future] = set()
self._work_complete: Queue[Future] = Queue()
def align(self, reference: str, query: str, **associated_data):
work = self._thread_pool.submit(
self.work, reference, query, **associated_data)
work.add_done_callback(self._on_complete)
self._work_left.add(work)
def _on_complete(self, future: Future):
self._work_left.remove(future)
self._work_complete.put(future)
def work(self, reference, query, **associated_data):
alignments = self._aligner.align(reference, query)
top_alignment = alignments[0]
top_alignment_stats = top_alignment.counts()
top_alignment_gaps = top_alignment_stats.gaps
top_alignment_identities = top_alignment_stats.identities
top_alignment_mismatches = top_alignment_stats.mismatches
top_alignment_score = top_alignment.score # type: ignore
return PairwiseAlignment(
top_alignment.sequences[0],
top_alignment.sequences[1],
tuple(top_alignment.indices[0]),
tuple(top_alignment.indices[1]),
AlignmentStats(
percent_identity=top_alignment_identities/top_alignment.length,
mismatches=top_alignment_mismatches,
gaps=top_alignment_gaps,
match_metric=top_alignment_score
)), associated_data
async def next_completed(self) -> Union[tuple[PairwiseAlignment, dict[str, Any]], None]:
if self._work_complete.empty() and not self._work_left: # nothing queued and nothing pending; iteration is done
return None
completed_alignment = await asyncio.wrap_future(self._work_complete.get())
return completed_alignment
def __exit__(self, exc_type, exc_value, traceback):
self.shutdown()
def __aiter__(self):
return self
async def __anext__(self):
result = await self.next_completed()
if result is None:
raise StopAsyncIteration
return result
def shutdown(self):
self._thread_pool.shutdown(wait=True, cancel_futures=True)
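
A minimal sketch of driving this engine, assuming two short made-up DNA strings; the pattern matches the aligner tests later in this diff:

import asyncio
from Bio.Align import PairwiseAligner
from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine

async def main():
    aligner = PairwiseAligner("blastn")
    aligner.mode = "local"
    with AsyncBiopythonPairwiseAlignmentEngine(aligner, max_threads=2) as engine:
        # Extra keyword arguments come back verbatim as associated data.
        engine.align("ACGTACGTAA", "ACGTTCGTAA", label="demo")
        async for alignment, extra in engine:
            print(extra["label"], alignment.alignment_stats)

asyncio.run(main())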


@@ -11,6 +11,7 @@ from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequen
from aiohttp import ClientSession, ClientTimeout
from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
from autobigs.engine.reading import read_fasta
from autobigs.engine.structures.alignment import PairwiseAlignment
from autobigs.engine.structures.genomics import NamedString
@@ -22,15 +23,15 @@ from Bio.Align import PairwiseAligner
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
@abstractmethod
def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[Union[NamedString, str]], Union[NamedString, str]]) -> AsyncGenerator[Union[Allele, tuple[str, Allele]], Any]:
def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
pass
@abstractmethod
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]:
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
pass
@abstractmethod
async def profile_string(self, query_sequence_strings: Iterable[Union[NamedString, str]]) -> Union[NamedMLSTProfile, MLSTProfile]:
async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
pass
@abstractmethod
@@ -52,14 +53,14 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
async def __aenter__(self):
return self
async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[Union[NamedString, str]], Union[NamedString, str]]) -> AsyncGenerator[Union[Allele, tuple[str, Allele]], Any]:
async def determine_mlst_allele_variants(self, query_sequence_strings: Union[Iterable[str], str]) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "sequence"
if isinstance(query_sequence_strings, str) or isinstance(query_sequence_strings, NamedString):
if isinstance(query_sequence_strings, str):
query_sequence_strings = [query_sequence_strings]
for sequence_string in query_sequence_strings:
async with self._http_client.post(uri_path, json={
"sequence": sequence_string if isinstance(sequence_string, str) else sequence_string.sequence,
"sequence": sequence_string,
"partial_matches": True
}) as response:
sequence_response: dict = await response.json()
@@ -70,8 +71,7 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
result_allele = Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
yield Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
elif "partial_matches" in sequence_response:
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
for allele_loci, partial_match in partial_matches.items():
@@ -83,33 +83,23 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
gaps=int(partial_match["gaps"]),
match_metric=int(partial_match["bitscore"])
)
result_allele = Allele(
yield Allele(
allele_locus=allele_loci,
allele_variant=str(partial_match["allele"]),
partial_match_profile=partial_match_profile
)
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
else:
raise NoBIGSdbMatchesException(self._database_name, self._schema_id, sequence_string.name if isinstance(sequence_string, NamedString) else None)
raise NoBIGSdbMatchesException(self._database_name, self._schema_id)
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]:
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
uri_path = "designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
names_list = []
def insert_allele_to_request_dict(allele: Union[Allele, tuple[str, Allele]]):
if isinstance(allele, Allele):
allele_val = allele
else:
allele_val = allele[1]
names_list.append(allele[0])
allele_request_dict[allele_val.allele_locus].append({"allele": str(allele_val.allele_variant)})
if isinstance(alleles, AsyncIterable):
async for allele in alleles:
insert_allele_to_request_dict(allele)
allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
else:
for allele in alleles:
insert_allele_to_request_dict(allele)
allele_request_dict[allele.allele_locus].append({"allele": str(allele.allele_variant)})
request_json = {
"designations": allele_request_dict
}
@@ -122,33 +112,26 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
schema_fields_returned.setdefault("clonal_complex", "unknown")
schema_exact_matches: dict = response_json["exact_matches"]
for exact_match_locus, exact_match_alleles in schema_exact_matches.items():
if len(exact_match_alleles) > 1:
raise ValueError(f"Unexpected number of alleles returned for exact match (Expected 1, retrieved {len(exact_match_alleles)})")
allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None))
if len(allele_set) == 0:
raise ValueError("Passed in no alleles.")
result_mlst_profile = MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
if len(names_list) > 0:
result_mlst_profile = NamedMLSTProfile(str(tuple(names_list)), result_mlst_profile)
return result_mlst_profile
return MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, query_sequence_strings: Iterable[Union[NamedString, str]]) -> Union[NamedMLSTProfile, MLSTProfile]:
async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
alleles = self.determine_mlst_allele_variants(query_sequence_strings)
return await self.determine_mlst_st(alleles)
async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
tasks = []
async for named_strings in query_named_string_groups:
tasks.append(self.profile_string(named_strings))
for task in asyncio.as_completed(tasks):
for named_string in named_strings:
try:
yield await task
yield NamedMLSTProfile(named_string.name, (await self.profile_string([named_string.sequence])))
except NoBIGSdbMatchesException as e:
if stop_on_fail:
raise e
causal_name = e.get_causal_query_name()
if causal_name is None:
raise ValueError("Missing query name despite requiring names.")
else:
yield NamedMLSTProfile(causal_name, None)
yield NamedMLSTProfile(named_string.name, None)
async def close(self):
await self._http_client.close()
@@ -156,6 +139,141 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
async def __aexit__(self, exc_type, exc_value, traceback):
await self.close()
class LocalBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
async def __aenter__(self):
if self._prepare:
await self.update_scheme_locis()
await asyncio.gather(
self.download_alleles_cache_data(),
self.download_scheme_profiles()
)
await self.load_scheme_profiles()
return self
def __init__(self, database_api: str, database_name: str, schema_id: int, cache_path: Union[str, None] = None, prepare: bool =True):
self._database_api = database_api
self._database_name = database_name
self._schema_id = schema_id
self._base_url = f"{self._database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60))
if cache_path is None:
self._cache_path = tempfile.mkdtemp("BIGSdb")
self._cleanup_required = True
else:
self._cache_path = cache_path
self._cleanup_required = False
self._loci: list[str] = []
self._profiles_st_map = {}
self._prepare = prepare
async def update_scheme_locis(self):
self._loci.clear()
async with self._http_client.get(f"/api/db/{self._database_name}/schemes/{self._schema_id}") as schema_response:
schema_json = await schema_response.json()
for locus in schema_json["loci"]:
locus_name = path.basename(locus)
self._loci.append(locus_name)
self._loci.sort()
async def load_scheme_profiles(self):
self._profiles_st_map.clear()
with open(self.get_scheme_profile_path()) as profile_cache_handle:
reader = csv.DictReader(profile_cache_handle, delimiter="\t")
for line in reader:
alleles = []
for locus in self._loci:
alleles.append(line[locus])
self._profiles_st_map[tuple(alleles)] = (line["ST"], line["clonal_complex"])
def get_locus_cache_path(self, locus) -> str:
return path.join(self._cache_path, locus + "." + "fasta")
def get_scheme_profile_path(self):
return path.join(self._cache_path, "profiles.csv")
async def download_alleles_cache_data(self):
for locus in self._loci:
with open(self.get_locus_cache_path(locus), "wb") as fasta_handle:
async with self._http_client.get(f"/api/db/{self._database_name}/loci/{locus}/alleles_fasta") as fasta_response:
async for chunk, eof in fasta_response.content.iter_chunks():
fasta_handle.write(chunk)
async def download_scheme_profiles(self):
with open(self.get_scheme_profile_path(), "wb") as profile_cache_handle:
async with self._http_client.get("profiles_csv") as profiles_response:
async for chunk, eof in profiles_response.content.iter_chunks():
profile_cache_handle.write(chunk)
await self.load_scheme_profiles()
async def determine_mlst_allele_variants(self, query_sequence_strings: Iterable[str]) -> AsyncGenerator[Allele, Any]:
aligner = PairwiseAligner("blastn")
aligner.mode = "local"
with AsyncBiopythonPairwiseAlignmentEngine(aligner, max_threads=4) as aligner_engine:
for query_sequence_string in query_sequence_strings:
for locus in self._loci:
async for allele_variant in read_fasta(self.get_locus_cache_path(locus)):
aligner_engine.align(allele_variant.sequence, query_sequence_string, variant_name=allele_variant.name, full=True)
break # start a bunch of full alignments for each variant to select segments
alignment_rankings: dict[str, set[tuple[PairwiseAlignment, str]]] = defaultdict(set)
async for alignment_result, additional_information in aligner_engine:
result_variant_name = additional_information["variant_name"]
result_locus, variant_id = result_variant_name.split("_")
full_alignment = additional_information["full"]
if full_alignment:
if alignment_result.alignment_stats.gaps == 0 and alignment_result.alignment_stats.mismatches == 0:
# I.e., 100% exactly the same
yield Allele(result_locus, variant_id, None)
continue
else:
alignment_rankings[result_locus].add((alignment_result, variant_id))
interest_sequence = full_alignment[alignment_result.query_indices[0]:alignment_result.query_indices[-1]]
async for allele_variant in read_fasta(self.get_locus_cache_path(result_locus)):
if result_variant_name == allele_variant.name:
continue # Skip if we just finished aligning this
aligner_engine.align(allele_variant.sequence, interest_sequence, variant_name=allele_variant.name, full=False) # result_variant_name is a plain string; the variant being realigned is allele_variant
else:
alignment_rankings[result_locus].add((alignment_result, variant_id))
for final_locus, alignments in alignment_rankings.items():
closest_alignment, closest_variant_id = sorted(alignments, key=lambda index: index[0].alignment_stats.match_metric)[0]
yield Allele(final_locus, closest_variant_id, closest_alignment.alignment_stats)
async def determine_mlst_st(self, alleles):
allele_variants: dict[str, Allele] = {}
if isinstance(alleles, AsyncIterable):
async for allele in alleles:
allele_variants[allele.allele_locus] = allele
else:
for allele in alleles:
allele_variants[allele.allele_locus] = allele
ordered_profile = []
for locus in self._loci:
ordered_profile.append(allele_variants[locus].allele_variant)
st, clonal_complex = self._profiles_st_map[tuple(ordered_profile)]
return MLSTProfile(set(allele_variants.values()), st, clonal_complex)
async def profile_string(self, query_sequence_strings: Iterable[str]) -> MLSTProfile:
alleles = self.determine_mlst_allele_variants(query_sequence_strings)
return await self.determine_mlst_st(alleles)
async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
async for named_strings in query_named_string_groups:
for named_string in named_strings:
try:
yield NamedMLSTProfile(named_string.name, await self.profile_string([named_string.sequence]))
except NoBIGSdbMatchesException as e:
if stop_on_fail:
raise e
yield NamedMLSTProfile(named_string.name, None)
async def close(self):
await self._http_client.close()
if self._cleanup_required:
shutil.rmtree(self._cache_path)
async def __aexit__(self, exc_type, exc_value, traceback):
await self.close()
class BIGSdbIndex(AbstractAsyncContextManager):
KNOWN_BIGSDB_APIS = {
"https://bigsdb.pasteur.fr/api",
@@ -216,5 +334,5 @@ class BIGSdbIndex(AbstractAsyncContextManager):
def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, schema_id: int):
if local:
raise NotImplementedError()
return LocalBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, schema_id=schema_id)
return RemoteBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, schema_id=schema_id)
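
With this change both modes flow through a single factory. A hedged sketch of the new local path, reusing the database, scheme, and FASTA values from the tests in this diff:

import asyncio
from Bio import SeqIO
from autobigs.engine.analysis.bigsdb import get_BIGSdb_MLST_profiler

async def main():
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    # local=True now yields a LocalBIGSdbMLSTProfiler; False keeps the remote path.
    async with get_BIGSdb_MLST_profiler(
            local=True,
            database_api="https://bigsdb.pasteur.fr/api",
            database_name="pubmlst_bordetella_seqdef",
            schema_id=3) as profiler:
        # __aenter__ first caches the scheme's loci, allele FASTAs, and profile table.
        print(await profiler.profile_string([sequence]))

asyncio.run(main())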


@@ -0,0 +1,26 @@
import asyncio
from contextlib import AbstractAsyncContextManager
import tempfile
from typing import Iterable, Union
from Bio import Entrez
from Bio import SeqIO
from autobigs.engine.structures.genomics import AnnotatedString, StringAnnotation
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
record = SeqIO.read(fetch_stream, "genbank")
sequence_features = list()
for feature in record.features:
start = int(feature.location.start)
end = int(feature.location.end)
qualifiers = feature.qualifiers
for qualifier_key in qualifiers:
qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
sequence_features.append(StringAnnotation(
type=feature.type,
start=start,
end=end+1, # Position is exclusive
feature_properties=qualifiers
))
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
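
A short usage sketch: NCBI requires a contact address on Entrez, the module path is an assumption (this diff view hides file names), and the accession is the one the tests use:

import asyncio
from Bio import Entrez
from autobigs.engine.reading.ncbi import fetch_ncbi_genbank  # module path assumed

Entrez.email = "you@example.org"  # required by NCBI's usage policy

async def main():
    annotated = await fetch_ncbi_genbank("BX470248.1")
    print(annotated.name, len(annotated.sequence), len(annotated.annotations))

asyncio.run(main())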


@@ -5,12 +5,8 @@ class BIGSDbDatabaseAPIException(Exception):
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_schema_id: int, query_name: Union[None, str], *args):
self._query_name = query_name
def __init__(self, database_name: str, database_schema_id: int, *args):
super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
def get_causal_query_name(self) -> Union[str, None]:
return self._query_name
class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException):
def __init__(self, database_name: str, database_schema_id: int, *args):


@@ -5,16 +5,12 @@ from Bio import SeqIO
from autobigs.engine.structures.genomics import NamedString
async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]:
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
results = []
for fasta_sequence in await fasta_sequences:
results.append(NamedString(fasta_sequence.id, str(fasta_sequence.seq)))
return results
yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
tasks = []
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
for handle in handles:
tasks.append(read_fasta(handle))
for task in asyncio.as_completed(tasks):
yield await task
async for named_seq in read_fasta(handle):
yield named_seq
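
Both readers are now async generators, so consumers switch from a plain for loop to async for; a sketch with illustrative file names:

import asyncio
from autobigs.engine.reading import read_multiple_fastas

async def main():
    # Streams NamedString records one by one instead of gathering lists per file.
    async for named_seq in read_multiple_fastas(["a.fasta", "b.fasta"]):
        print(named_seq.name, len(named_seq.sequence))

asyncio.run(main())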


@@ -3,32 +3,28 @@ import csv
from os import PathLike
from typing import AsyncIterable, Collection, Mapping, Sequence, Union
from autobigs.engine.structures.mlst import Allele, MLSTProfile, NamedMLSTProfile
from autobigs.engine.structures.mlst import Allele, MLSTProfile
def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Sequence[str], str]]:
def alleles_to_map(alleles: Collection[Allele]) -> Mapping[str, Union[list[str], str]]:
result = defaultdict(list)
for allele in alleles:
result[allele.allele_locus].append(allele.allele_variant + ("*" if allele.partial_match_profile is not None else ""))
result[allele.allele_locus].append(allele.allele_variant)
for locus in result.keys():
if len(result[locus]) == 1:
result[locus] = result[locus][0] # Take the only one
else:
result[locus] = tuple(result[locus]) # type: ignore
return dict(result)
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[NamedMLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
failed = list()
with open(handle, "w", newline='') as filehandle:
header = None
writer: Union[csv.DictWriter, None] = None
async for named_mlst_profile in mlst_profiles_iterable:
name = named_mlst_profile.name
mlst_profile = named_mlst_profile.mlst_profile
async for name, mlst_profile in mlst_profiles_iterable:
if mlst_profile is None:
failed.append(name)
continue
allele_mapping = alleles_to_text_map(mlst_profile.alleles)
allele_mapping = alleles_to_map(mlst_profile.alleles)
if writer is None:
header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
writer = csv.DictWriter(filehandle, fieldnames=header)
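
The reworked writer consumes (name, profile-or-None) pairs and returns the names that failed; a sketch with made-up values:

import asyncio
from autobigs.engine.structures.mlst import Allele, MLSTProfile
from autobigs.engine.writing import write_mlst_profiles_as_csv

async def profiles():
    yield "sample-1", MLSTProfile((Allele("adk", "1", None),), "3", "ST-3 complex")
    yield "sample-2", None  # a failed profile is reported back, not written

async def main():
    failed = await write_mlst_profiles_as_csv(profiles(), "profiles.csv")
    print("failed:", failed)  # ["sample-2"]

asyncio.run(main())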


@@ -0,0 +1,42 @@
from Bio import SeqIO
from Bio.Align import PairwiseAligner
from pytest import mark
from pytest import fixture
from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
from autobigs.engine.structures.alignment import PairwiseAlignment
@fixture
def tohamaI_bpertussis_adk():
return str(SeqIO.read("tests/resources/tohama_I_bpertussis_adk.fasta", format="fasta").seq)
@fixture
def tohamaI_bpertussis_genome():
return str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", format="fasta").seq)
@fixture
def fdaargos_1560_hinfluenza_adk():
return str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza_adk.fasta", format="fasta").seq)
@fixture
def fdaargos_1560_hinfluenza_genome():
return str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", format="fasta").seq)
@fixture(params=[1, 2])
def dummy_engine(request):
aligner = PairwiseAligner("blastn")
aligner.mode = "local"
with AsyncBiopythonPairwiseAlignmentEngine(aligner, request.param) as engine:
yield engine
class TestAsyncPairwiseAlignmentEngine:
async def test_single_alignment_no_errors_single_alignment(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk: str, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine):
dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk)
async for alignment, additional_information in dummy_engine:
assert isinstance(alignment, PairwiseAlignment)
async def test_single_alignment_no_errors_multiple(self, tohamaI_bpertussis_genome, tohamaI_bpertussis_adk, fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk, dummy_engine: AsyncBiopythonPairwiseAlignmentEngine):
dummy_engine.align(tohamaI_bpertussis_genome, tohamaI_bpertussis_adk)
dummy_engine.align(fdaargos_1560_hinfluenza_genome, fdaargos_1560_hinfluenza_adk)
async for alignment, additional_information in dummy_engine:
assert isinstance(alignment, PairwiseAlignment)


@@ -9,7 +9,7 @@ from autobigs.engine.structures import mlst
from autobigs.engine.structures.genomics import NamedString
from autobigs.engine.structures.mlst import Allele, MLSTProfile
from autobigs.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
from autobigs.engine.analysis.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler, RemoteBIGSdbMLSTProfiler
from autobigs.engine.analysis.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler, LocalBIGSdbMLSTProfiler, RemoteBIGSdbMLSTProfiler
async def generate_async_iterable(normal_iterable):
for dummy_sequence in normal_iterable:
@@ -50,30 +50,33 @@ bpertussis_tohamaI_bad_profile = MLSTProfile((
Allele("pgm", "5", None),
), "unknown", "unknown")
hinfluenzae_2014_102_profile = MLSTProfile((
Allele("adk", "28", None),
Allele("atpG", "33", None),
Allele("frdB", "7", None),
Allele("fucK", "18", None),
Allele("mdh", "11", None),
Allele("pgi", "125", None),
Allele("recA", "89", None)
), "478", "unknown")
hinfluenzae_2014_102_bad_profile = MLSTProfile((
Allele("adk", "3", None),
Allele("atpG", "121", None),
Allele("frdB", "6", None),
Allele("fucK", "5", None),
Allele("mdh", "12", None),
Allele("pgi", "4", None),
hinfluenzae_fdaargos_profile = MLSTProfile((
Allele("adk", "1", None),
Allele("atpG", "1", None),
Allele("frdB", "1", None),
Allele("fucK", "1", None),
Allele("mdh", "1", None),
Allele("pgi", "1", None),
Allele("recA", "5", None)
), "unknown", "unknown")
), "3", "ST-3 complex")
hinfluenzae_fdaargos_bad_profile = MLSTProfile((
Allele("adk", "1", None),
Allele("atpG", "1", None),
Allele("frdB", "1", None),
Allele("fucK", "1", None),
Allele("mdh", "1", None),
Allele("pgi", "1", None),
Allele("recA", "5", None)
), "3", "ST-3 complex")
hinfluenzae_fdaargos_sequence = str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", "fasta").seq)
hinfluenzae_fdaargos_fragmented_sequence = tuple(SeqIO.parse("tests/resources/tohama_I_bpertussis_features.fasta", "fasta"))
@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
(False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
(False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "2014-102_hinfluenza.fasta", "2014-102_hinfluenza_features.fasta", hinfluenzae_2014_102_profile, hinfluenzae_2014_102_bad_profile),
(True, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
])
class TestBIGSdbMLSTProfiler:
async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
@@ -199,6 +202,7 @@ class TestBIGSdbIndex:
assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
@pytest.mark.parametrize("local", [
(True),
(False)
])
async def test_bigsdb_index_instantiates_correct_profiler(self, local):


@@ -2,6 +2,6 @@ from autobigs.engine.reading import read_fasta
async def test_fasta_reader_not_none():
named_strings = await read_fasta("tests/resources/tohama_I_bpertussis.fasta")
for named_string in named_strings:
named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
async for named_string in named_strings:
assert named_string.name == "BX470248.1"


@@ -1,47 +0,0 @@
from typing import AsyncIterable, Iterable
import pytest
from autobigs.engine.structures.alignment import AlignmentStats
from autobigs.engine.writing import alleles_to_text_map, write_mlst_profiles_as_csv
from autobigs.engine.structures.mlst import Allele, MLSTProfile, NamedMLSTProfile
import tempfile
from csv import reader
from os import path
@pytest.fixture
def dummy_alphabet_mlst_profile():
return NamedMLSTProfile("name", MLSTProfile((
Allele("A", "1", None),
Allele("D", "1", None),
Allele("B", "1", None),
Allele("C", "1", None),
Allele("C", "2", AlignmentStats(90, 10, 0, 90))
), "mysterious", "very mysterious"))
async def iterable_to_asynciterable(iterable: Iterable):
for iterated in iterable:
yield iterated
async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile: MLSTProfile):
dummy_profiles = [dummy_alphabet_mlst_profile]
with tempfile.TemporaryDirectory() as temp_dir:
output_path = path.join(temp_dir, "out.csv")
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
with open(output_path) as csv_handle:
csv_reader = reader(csv_handle)
lines = list(csv_reader)
target_columns = lines[4:]
assert target_columns == sorted(target_columns)
async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: NamedMLSTProfile):
mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.mlst_profile.alleles) # type: ignore
expected_mapping = {
"A": "1",
"B": "1",
"C": ("1", "2*"),
"D": "1"
}
for allele_name, allele_ids in mapping.items():
assert allele_name in expected_mapping
assert allele_ids == expected_mapping[allele_name]

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,11 @@
>lcl|CP085952.1_gene_371 [gene=adk] [locus_tag=LK401_01855] [location=complement(365128..365772)] [gbkey=Gene]
ATGAAAATTATTCTTTTAGGTGCACCGGGTGCAGGTAAAGGCACTCAAGCACAATTTATTATGAACAAAT
TTGGTATCCCGCAAATTTCAACTGGTGATATGTTCCGTGCTGCAATCAAAGCGGGGACTGAACTTGGCAA
ACAAGCTAAAGCATTAATGGATGAAGGTAAATTAGTGCCAGATGAATTAACCGTTGCCCTTGTAAAAGAT
CGTATTGCTCAAGCTGACTGCACAAATGGTTTCTTGTTAGATGGTTTCCCTCGTACTATTCCACAAGCGG
ATGCACTGAAAGATTCAGGTGTTAAAATTGACTTTGTTTTAGAATTTGATGTGCCAGACGAAGTGATTGT
TGAACGTATGAGTGGCCGTCGCGTACACCAAGCGTCTGGCCGTTCTTACCACATCGTTTATAATCCACCA
AAAGTGGAAGGTAAAGATGATGTAACAGGCGAAGATTTAATTATTCGTGCAGACGATAAACCAGAAACTG
TATTAGATCGTTTAGCCGTATATCATAAACAAACTAGCCCATTAATTGATTATTACCAAGCAGAAGCGAA
AGCGGGGAATACTCAATATTTCCGTTTAGACGGTACACAAAAAGTAGAAGAAGTTAGCCAAGAGTTAGAT
AAAATCTTAGGCTAA

File diff suppressed because it is too large.