Compare commits

...

11 Commits

Author SHA1 Message Date
f8d92a4aad Added predetermined headers feature
All checks were successful
autoBIGS.engine/pipeline/head This commit looks good
2025-03-14 14:26:43 +00:00
34bf02c75a Increased timeout to 5 minutes
Some checks reported errors
autoBIGS.engine/pipeline/tag Something is wrong with the build of this commit
autoBIGS.engine/pipeline/head There was a failure building this commit
2025-03-13 18:37:33 +00:00
3cb10a4609 Added client error as acceptable exception
All checks were successful
autoBIGS.engine/pipeline/head This commit looks good
2025-03-13 18:13:38 +00:00
1776f5aa51 Fixed exception syntax
All checks were successful
autoBIGS.engine/pipeline/head This commit looks good
2025-03-13 16:51:15 +00:00
96d715fdcb Added a server disconnect error catch
All checks were successful
autoBIGS.engine/pipeline/head This commit looks good
2025-03-13 16:27:59 +00:00
e088d1080b Added functionality to retry determining ST
Some checks reported errors
autoBIGS.engine/pipeline/head Something is wrong with the build of this commit
2025-03-13 16:13:14 +00:00
8ffc7c7fb5 Added retry functionality for allele variant determination
All checks were successful
autoBIGS.engine/pipeline/head This commit looks good
2025-03-13 15:54:35 +00:00
af7edf0942 Changed development server publication condition
Some checks failed
autoBIGS.engine/pipeline/tag This commit looks good
autoBIGS.engine/pipeline/head There was a failure building this commit
2025-03-13 14:37:16 +00:00
481870db97 Updated tests to reflect new fasta read naming
All checks were successful
autoBIGS.engine/pipeline/head This commit looks good
2025-03-13 14:25:44 +00:00
62fdada9c1 Added original filename to csv output
Some checks reported errors
autoBIGS.engine/pipeline/head Something is wrong with the build of this commit
2025-03-13 14:17:08 +00:00
3074997db6 Removed unused file 2025-03-13 14:01:00 +00:00
9 changed files with 129 additions and 105 deletions

View File

@@ -1,5 +1,6 @@
{
"recommendations": [
"piotrpalarz.vscode-gitignore-generator"
"piotrpalarz.vscode-gitignore-generator",
"gruntfuggly.todo-tree"
]
}

6
Jenkinsfile vendored
View File

@@ -36,7 +36,9 @@ pipeline {
parallel {
stage ("git.reslate.systems") {
when {
branch '**/main'
not {
tag '*.*.*'
}
}
environment {
@@ -49,7 +51,7 @@ pipeline {
}
stage ("pypi.org") {
when {
tag '*.*'
tag '*.*.*'
}
environment {
TOKEN = credentials('pypi.org')

View File

@@ -1,44 +0,0 @@
{% set name = "autoBIGS.engine" %}
{% set version = "0.12.1.dev1+gb8cebb8.d20250221" %}
package:
name: {{ name|lower|replace(".", "-") }}
version: {{ version }}
source:
url: file:///workspaces/autoBIGS.engine/dist/autobigs_engine-0.12.1.dev1%2Bgb8cebb8.d20250221.tar.gz
sha256: c86441b94f935cfa414ff28ca4c026a070e0fb15988ea3bb7d1a942859a09b16
build:
noarch: python
script: {{ PYTHON }} -m pip install . -vv --no-deps --no-build-isolation
number: 0
run_exports:
- {{ pin_subpackage( name|lower|replace(".", "-"), max_pin="x.x") }}
requirements:
host:
- python >=3.12
- setuptools >=64
- setuptools-scm >=8
- pip
run:
- python >=3.12
- biopython ==1.85
- aiohttp ==3.11.*
test:
imports:
- autobigs
commands:
- pip check
requires:
- pip
about:
summary: A library to rapidly fetch MLST profiles given sequences for various diseases.
license: GPL-3.0-or-later
license_file: LICENSE
home: https://github.com/Syph-and-VPD-Lab/autoBIGS.engine
extra:
recipe-maintainers:
- Harrison Deng

View File

@@ -9,7 +9,7 @@ import shutil
import tempfile
from typing import Any, AsyncGenerator, AsyncIterable, Coroutine, Iterable, Mapping, Sequence, Set, Union
from aiohttp import ClientSession, ClientTimeout
from aiohttp import ClientOSError, ClientSession, ClientTimeout, ServerDisconnectedError
from autobigs.engine.reading import read_fasta
from autobigs.engine.structures.alignment import PairwiseAlignment
@@ -43,11 +43,12 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
def __init__(self, database_api: str, database_name: str, scheme_id: int):
def __init__(self, database_api: str, database_name: str, scheme_id: int, retry_requests: int = 5):
self._retry_limit = retry_requests
self._database_name = database_name
self._scheme_id = scheme_id
self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._scheme_id}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60))
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(300))
async def __aenter__(self):
return self
@@ -57,40 +58,59 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
uri_path = "sequence"
if isinstance(query_sequence_strings, str) or isinstance(query_sequence_strings, NamedString):
query_sequence_strings = [query_sequence_strings]
for sequence_string in query_sequence_strings:
async with self._http_client.post(uri_path, json={
"sequence": sequence_string if isinstance(sequence_string, str) else sequence_string.sequence,
"partial_matches": True
}) as response:
sequence_response: dict = await response.json()
attempts = 0
success = False
last_error = None
while not success and attempts < self._retry_limit:
attempts += 1
request = self._http_client.post(uri_path, json={
"sequence": sequence_string if isinstance(sequence_string, str) else sequence_string.sequence,
"partial_matches": True
})
try:
async with request as response:
sequence_response: dict = await response.json()
if "exact_matches" in sequence_response:
# loci -> list of alleles with id and loci
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
result_allele = Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
elif "partial_matches" in sequence_response:
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
for allele_loci, partial_match in partial_matches.items():
if len(partial_match) <= 0:
continue
partial_match_profile = AlignmentStats(
percent_identity=float(partial_match["identity"]),
mismatches=int(partial_match["mismatches"]),
gaps=int(partial_match["gaps"]),
match_metric=int(partial_match["bitscore"])
)
result_allele = Allele(
allele_locus=allele_loci,
allele_variant=str(partial_match["allele"]),
partial_match_profile=partial_match_profile
)
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
if "exact_matches" in sequence_response:
# loci -> list of alleles with id and loci
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
alelle_id = allele["allele_id"]
result_allele = Allele(allele_locus=allele_loci, allele_variant=alelle_id, partial_match_profile=None)
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
elif "partial_matches" in sequence_response:
partial_matches: dict[str, dict[str, Union[str, float, int]]] = sequence_response["partial_matches"]
for allele_loci, partial_match in partial_matches.items():
if len(partial_match) <= 0:
continue
partial_match_profile = AlignmentStats(
percent_identity=float(partial_match["identity"]),
mismatches=int(partial_match["mismatches"]),
gaps=int(partial_match["gaps"]),
match_metric=int(partial_match["bitscore"])
)
result_allele = Allele(
allele_locus=allele_loci,
allele_variant=str(partial_match["allele"]),
partial_match_profile=partial_match_profile
)
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
else:
raise NoBIGSdbMatchesException(self._database_name, self._scheme_id, sequence_string.name if isinstance(sequence_string, NamedString) else None)
except (ConnectionError, ServerDisconnectedError, ClientOSError) as e: # Errors we will retry
last_error = e
success = False
await asyncio.sleep(5) # In case the connection issue is due to rate issues
else:
raise NoBIGSdbMatchesException(self._database_name, self._scheme_id, sequence_string.name if isinstance(sequence_string, NamedString) else None)
success = True
if not success and last_error is not None:
try:
raise last_error
except (ConnectionError, ServerDisconnectedError, ClientOSError) as e: # Non-fatal errors
yield Allele("error", "error", None)
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]:
uri_path = "designations"
@@ -113,23 +133,43 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
request_json = {
"designations": allele_request_dict
}
async with self._http_client.post(uri_path, json=request_json) as response:
response_json: dict = await response.json()
allele_set: Set[Allele] = set()
response_json.setdefault("fields", dict())
scheme_fields_returned: dict[str, str] = response_json["fields"]
scheme_fields_returned.setdefault("ST", "unknown")
scheme_fields_returned.setdefault("clonal_complex", "unknown")
scheme_exact_matches: dict = response_json["exact_matches"]
for exact_match_locus, exact_match_alleles in scheme_exact_matches.items():
allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None))
if len(allele_set) == 0:
raise ValueError("Passed in no alleles.")
result_mlst_profile = MLSTProfile(allele_set, scheme_fields_returned["ST"], scheme_fields_returned["clonal_complex"])
if len(names_list) > 0:
result_mlst_profile = NamedMLSTProfile(str(tuple(names_list)) if len(set(names_list)) > 1 else names_list[0], result_mlst_profile)
return result_mlst_profile
attempts = 0
success = False
last_error = None
while attempts < self._retry_limit and not success:
attempts += 1
try:
async with self._http_client.post(uri_path, json=request_json) as response:
response_json: dict = await response.json()
allele_set: Set[Allele] = set()
response_json.setdefault("fields", dict())
scheme_fields_returned: dict[str, str] = response_json["fields"]
scheme_fields_returned.setdefault("ST", "unknown")
scheme_fields_returned.setdefault("clonal_complex", "unknown")
scheme_exact_matches: dict = response_json["exact_matches"]
for exact_match_locus, exact_match_alleles in scheme_exact_matches.items():
allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None))
if len(allele_set) == 0:
raise ValueError("Passed in no alleles.")
result_mlst_profile = MLSTProfile(allele_set, scheme_fields_returned["ST"], scheme_fields_returned["clonal_complex"])
if len(names_list) > 0:
result_mlst_profile = NamedMLSTProfile(str(tuple(names_list)) if len(set(names_list)) > 1 else names_list[0], result_mlst_profile)
return result_mlst_profile
except (ConnectionError, ServerDisconnectedError, ClientOSError) as e:
last_error = e
success = False
await asyncio.sleep(5)
else:
success = True
try:
if last_error is not None:
raise last_error
except (ConnectionError, ServerDisconnectedError, ClientOSError) as e:
result_mlst_profile = NamedMLSTProfile((str(tuple(names_list)) if len(set(names_list)) > 1 else names_list[0]) + ":Error", None)
raise ValueError("Last error was not recorded.")
async def profile_string(self, query_sequence_strings: Iterable[Union[NamedString, str]]) -> Union[NamedMLSTProfile, MLSTProfile]:
alleles = self.determine_mlst_allele_variants(query_sequence_strings)
return await self.determine_mlst_st(alleles)
@@ -211,6 +251,16 @@ class BIGSdbIndex(AbstractAsyncContextManager):
async def build_profiler_from_seqdefdb(self, local: bool, dbseqdef_name: str, scheme_id: int) -> BIGSdbMLSTProfiler:
return get_BIGSdb_MLST_profiler(local, await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, scheme_id)
async def get_scheme_loci(self, dbseqdef_name: str, scheme_id: int) -> list[str]:
uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name)}/db/{dbseqdef_name}/schemes/{scheme_id}"
async with self._http_client.get(uri_path) as response:
response_json = await response.json()
loci = response_json["loci"]
results = []
for locus in loci:
results.append(path.basename(locus))
return results
async def close(self):
await self._http_client.close()

View File

@@ -1,5 +1,6 @@
import asyncio
from io import TextIOWrapper
from os import path
from typing import Any, AsyncGenerator, Iterable, Union
from Bio import SeqIO
@@ -9,7 +10,7 @@ async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
results = []
for fasta_sequence in await fasta_sequences:
results.append(NamedString(fasta_sequence.id, str(fasta_sequence.seq)))
results.append(NamedString("{0}:{1}".format(path.basename(handle.name if isinstance(handle, TextIOWrapper) else handle), fasta_sequence.id), str(fasta_sequence.seq)))
return results
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:

View File

@@ -1,7 +1,7 @@
from collections import defaultdict
import csv
from os import PathLike
from typing import AsyncIterable, Collection, Mapping, Sequence, Union
from typing import AsyncIterable, Collection, Iterable, Mapping, Sequence, Union
from autobigs.engine.structures.mlst import Allele, MLSTProfile, NamedMLSTProfile
@@ -17,7 +17,7 @@ def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Seque
result[locus] = tuple(result[locus]) # type: ignore
return dict(result)
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[NamedMLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[NamedMLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]], allele_names: Iterable[str]) -> Sequence[str]:
failed = list()
with open(handle, "w", newline='') as filehandle:
header = None
@@ -30,7 +30,7 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[Named
continue
allele_mapping = alleles_to_text_map(mlst_profile.alleles)
if writer is None:
header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
header = ["id", "st", "clonal-complex", *sorted(allele_names)]
writer = csv.DictWriter(filehandle, fieldnames=header)
writer.writeheader()
row_dictionary = {

View File

@@ -222,3 +222,12 @@ class TestBIGSdbIndex:
assert isinstance(profile, MLSTProfile)
assert profile.clonal_complex == "ST-2 complex"
assert profile.sequence_type == "1"
@pytest.mark.parametrize(["bigsdb_name", "scheme_id", "expected"], [
("pubmlst_bordetella_seqdef", 3, ["adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"])
])
async def test_bigsdb_index_fetches_loci_names(self, bigsdb_name, scheme_id, expected):
async with BIGSdbIndex() as bigsdb_index:
loci = await bigsdb_index.get_scheme_loci(bigsdb_name, scheme_id)
assert set(loci) == set(expected)

View File

@@ -4,4 +4,9 @@ from autobigs.engine.reading import read_fasta
async def test_fasta_reader_not_none():
named_strings = await read_fasta("tests/resources/tohama_I_bpertussis.fasta")
for named_string in named_strings:
assert named_string.name == "BX470248.1"
assert named_string.name is not None
async def test_fasta_reader_name_contains_file_and_id():
named_strings = await read_fasta("tests/resources/tohama_I_bpertussis.fasta")
for named_string in named_strings:
assert named_string.name == "tohama_I_bpertussis.fasta:BX470248.1"

View File

@@ -27,7 +27,7 @@ async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile
dummy_profiles = [dummy_alphabet_mlst_profile]
with tempfile.TemporaryDirectory() as temp_dir:
output_path = path.join(temp_dir, "out.csv")
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path, ["A", "D", "B", "C"])
with open(output_path) as csv_handle:
csv_reader = reader(csv_handle)
lines = list(csv_reader)
@@ -38,7 +38,7 @@ async def test_csv_writing_sample_name_not_repeated_when_single_sequence(dummy_a
dummy_profiles = [dummy_alphabet_mlst_profile]
with tempfile.TemporaryDirectory() as temp_dir:
output_path = path.join(temp_dir, "out.csv")
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path, ["A", "D", "B", "C"])
with open(output_path) as csv_handle:
csv_reader = reader(csv_handle)
lines = list(csv_reader)
@@ -63,7 +63,7 @@ async def test_csv_writing_includes_asterisk_for_non_exact(dummy_alphabet_mlst_p
dummy_profiles = [dummy_alphabet_mlst_profile]
with tempfile.TemporaryDirectory() as temp_dir:
output_path = path.join(temp_dir, "out.csv")
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path, ["A", "D", "B", "C"])
with open(output_path) as csv_handle:
csv_reader = reader(csv_handle)
lines = list(csv_reader)