Compare commits

...

14 Commits

Author SHA1 Message Date
5a03c7e8d8 Multiple string profiling now respects grouped queries (for non-WGS)
All checks were successful
automlst.engine/pipeline/head This commit looks good
2025-02-18 15:34:18 +00:00
ddf9cde175 Added a license text to pyproject.toml 2025-02-14 20:47:06 +00:00
2e8cdd8da9 Updated URL links
All checks were successful
automlst.engine/pipeline/head This commit looks good
2025-02-14 20:37:13 +00:00
d0318536b2 Changed FASTA reading to group based on file for merging partial targets 2025-02-14 14:35:53 +00:00
765cf9d418 Merge branch 'features/improved-oop-architecture' into features/non-exact-notation 2025-02-12 17:53:25 +00:00
348c3d00b4 Updated README.md to be more clear 2025-02-12 17:52:53 +00:00
1c3f7f9ed8 Removed test for instantiating local MLST profiler 2025-02-12 17:46:55 +00:00
e4ddaf2e8c Changed to a MLST typable sequence for pubMLST tests 2025-02-12 17:43:26 +00:00
73aade2bde Merge branch 'features/improved-oop-architecture' into features/non-exact-notation 2025-02-12 17:07:51 +00:00
af8590baa7 Removed import of deleted feature 2025-02-12 17:07:10 +00:00
36bca1b70d Merge branch 'features/improved-oop-architecture' into features/non-exact-notation 2025-02-12 17:02:22 +00:00
fbfd993269 Copied tests over from CSV tests and updated to reflect current code base 2025-02-12 16:36:59 +00:00
ba606c35a9 conversion of collection of alleles to map now produces results with tuples instead of lists 2025-02-12 16:36:31 +00:00
4183840ba0 Added notation to indicate inexact matching in CSV 2025-02-12 15:59:19 +00:00
13 changed files with 56082 additions and 50929 deletions

View File

@ -1,6 +1,6 @@
# autoBIGS.Engine # autoBIGS.Engine
A python library implementing common BIGSdb MLST schemes and databases. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`. A python library implementing common BIGSdb MLST schemes and database access for the purpose of typing sequences automatically. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
## Features ## Features

View File

@ -13,11 +13,12 @@ dependencies = [
] ]
requires-python = ">=3.12" requires-python = ">=3.12"
description = "A library to rapidly fetch MLST profiles given sequences for various diseases." description = "A library to rapidly fetch MLST profiles given sequences for various diseases."
license = {text = "GPL-3.0-or-later"}
[project.urls] [project.urls]
Homepage = "https://github.com/RealYHD/autoBIGS.engine" Homepage = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
Source = "https://github.com/RealYHD/autoBIGS.engine" Source = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
Issues = "https://github.com/RealYHD/autoBIGS.engine/issues" Issues = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine/issues"
[tool.setuptools_scm] [tool.setuptools_scm]

View File

@ -11,7 +11,6 @@ from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequen
from aiohttp import ClientSession, ClientTimeout from aiohttp import ClientSession, ClientTimeout
from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
from autobigs.engine.reading import read_fasta from autobigs.engine.reading import read_fasta
from autobigs.engine.structures.alignment import PairwiseAlignment from autobigs.engine.structures.alignment import PairwiseAlignment
from autobigs.engine.structures.genomics import NamedString from autobigs.engine.structures.genomics import NamedString
@ -125,13 +124,17 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]: async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
async for named_strings in query_named_string_groups: async for named_strings in query_named_string_groups:
names: list[str] = list()
sequences: list[str] = list()
for named_string in named_strings: for named_string in named_strings:
try: names.append(named_string.name)
yield NamedMLSTProfile(named_string.name, (await self.profile_string([named_string.sequence]))) sequences.append(named_string.sequence)
except NoBIGSdbMatchesException as e: try:
if stop_on_fail: yield NamedMLSTProfile("-".join(names), (await self.profile_string(sequences)))
raise e except NoBIGSdbMatchesException as e:
yield NamedMLSTProfile(named_string.name, None) if stop_on_fail:
raise e
yield NamedMLSTProfile("-".join(names), None)
async def close(self): async def close(self):
await self._http_client.close() await self._http_client.close()

View File

@ -5,12 +5,13 @@ from Bio import SeqIO
from autobigs.engine.structures.genomics import NamedString from autobigs.engine.structures.genomics import NamedString
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]: async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]:
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta") fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
results = []
for fasta_sequence in await fasta_sequences: for fasta_sequence in await fasta_sequences:
yield NamedString(fasta_sequence.id, str(fasta_sequence.seq)) results.append(NamedString(fasta_sequence.id, str(fasta_sequence.seq)))
return results
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]: async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
for handle in handles: for handle in handles:
async for named_seq in read_fasta(handle): yield await read_fasta(handle)
yield named_seq

View File

@ -6,13 +6,15 @@ from typing import AsyncIterable, Collection, Mapping, Sequence, Union
from autobigs.engine.structures.mlst import Allele, MLSTProfile from autobigs.engine.structures.mlst import Allele, MLSTProfile
def alleles_to_map(alleles: Collection[Allele]) -> Mapping[str, Union[list[str], str]]: def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Sequence[str], str]]:
result = defaultdict(list) result = defaultdict(list)
for allele in alleles: for allele in alleles:
result[allele.allele_locus].append(allele.allele_variant) result[allele.allele_locus].append(allele.allele_variant + ("*" if allele.partial_match_profile is not None else ""))
for locus in result.keys(): for locus in result.keys():
if len(result[locus]) == 1: if len(result[locus]) == 1:
result[locus] = result[locus][0] # Take the only one result[locus] = result[locus][0] # Take the only one
else:
result[locus] = tuple(result[locus]) # type: ignore
return dict(result) return dict(result)
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]: async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
@ -24,7 +26,7 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
if mlst_profile is None: if mlst_profile is None:
failed.append(name) failed.append(name)
continue continue
allele_mapping = alleles_to_map(mlst_profile.alleles) allele_mapping = alleles_to_text_map(mlst_profile.alleles)
if writer is None: if writer is None:
header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())] header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
writer = csv.DictWriter(filehandle, fieldnames=header) writer = csv.DictWriter(filehandle, fieldnames=header)

View File

@ -50,17 +50,17 @@ bpertussis_tohamaI_bad_profile = MLSTProfile((
Allele("pgm", "5", None), Allele("pgm", "5", None),
), "unknown", "unknown") ), "unknown", "unknown")
hinfluenzae_fdaargos_profile = MLSTProfile(( hinfluenzae_2014_102_profile = MLSTProfile((
Allele("adk", "1", None), Allele("adk", "28", None),
Allele("atpG", "1", None), Allele("atpG", "33", None),
Allele("frdB", "1", None), Allele("frdB", "7", None),
Allele("fucK", "1", None), Allele("fucK", "18", None),
Allele("mdh", "1", None), Allele("mdh", "11", None),
Allele("pgi", "1", None), Allele("pgi", "125", None),
Allele("recA", "5", None) Allele("recA", "89", None)
), "3", "ST-3 complex") ), "478", "unknown")
hinfluenzae_fdaargos_bad_profile = MLSTProfile(( hinfluenzae_2014_102_bad_profile = MLSTProfile((
Allele("adk", "3", None), Allele("adk", "3", None),
Allele("atpG", "121", None), Allele("atpG", "121", None),
Allele("frdB", "6", None), Allele("frdB", "6", None),
@ -68,15 +68,12 @@ hinfluenzae_fdaargos_bad_profile = MLSTProfile((
Allele("mdh", "12", None), Allele("mdh", "12", None),
Allele("pgi", "4", None), Allele("pgi", "4", None),
Allele("recA", "5", None) Allele("recA", "5", None)
), "3", "ST-3 complex") ), "unknown", "unknown")
hinfluenzae_fdaargos_sequence = str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", "fasta").seq)
hinfluenzae_fdaargos_fragmented_sequence = tuple(SeqIO.parse("tests/resources/tohama_I_bpertussis_features.fasta", "fasta"))
@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [ @pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
(False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile), (False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
(False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "fdaargos_1560_hinfluenza.fasta", "fdaargos_1560_hinfluenza_features.fasta", hinfluenzae_fdaargos_profile, hinfluenzae_fdaargos_bad_profile), (False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "2014-102_hinfluenza.fasta", "2014-102_hinfluenza_features.fasta", hinfluenzae_2014_102_profile, hinfluenzae_2014_102_bad_profile),
]) ])
class TestBIGSdbMLSTProfiler: class TestBIGSdbMLSTProfiler:
async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
@ -202,7 +199,6 @@ class TestBIGSdbIndex:
assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api" assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
@pytest.mark.parametrize("local", [ @pytest.mark.parametrize("local", [
(True),
(False) (False)
]) ])
async def test_bigsdb_index_instantiates_correct_profiler(self, local): async def test_bigsdb_index_instantiates_correct_profiler(self, local):

View File

@ -2,6 +2,6 @@ from autobigs.engine.reading import read_fasta
async def test_fasta_reader_not_none(): async def test_fasta_reader_not_none():
named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta") named_strings = await read_fasta("tests/resources/tohama_I_bpertussis.fasta")
async for named_string in named_strings: for named_string in named_strings:
assert named_string.name == "BX470248.1" assert named_string.name == "BX470248.1"

View File

@ -0,0 +1,47 @@
from typing import AsyncIterable, Iterable
import pytest
from autobigs.engine.structures.alignment import AlignmentStats
from autobigs.engine.writing import alleles_to_text_map, write_mlst_profiles_as_csv
from autobigs.engine.structures.mlst import Allele, MLSTProfile
import tempfile
from csv import reader
from os import path
@pytest.fixture
def dummy_alphabet_mlst_profile():
    """Provide an MLST profile whose loci are unsorted and contain a duplicate.

    The loci are listed as A, D, B, C, C so that consumers which sort or group
    by locus have something to do. The second "C" allele is the only one whose
    third argument is an ``AlignmentStats`` instead of ``None``.
    """
    # Alleles built with no alignment statistics attached.
    plain_alleles = (
        Allele("A", "1", None),
        Allele("D", "1", None),
        Allele("B", "1", None),
        Allele("C", "1", None),
    )
    # Second allele for locus "C", this time carrying alignment statistics.
    allele_with_stats = Allele("C", "2", AlignmentStats(90, 10, 0, 90))
    return MLSTProfile(
        (*plain_alleles, allele_with_stats),
        "mysterious",
        "very mysterious",
    )
async def iterable_to_asynciterable(iterable: Iterable):
    """Re-expose a synchronous iterable as an async generator.

    Each element of ``iterable`` is yielded once, in the original order, so the
    result can be consumed with ``async for``.
    """
    exhausted = object()  # Unique marker; cannot collide with a real element.
    source = iter(iterable)
    while (element := next(source, exhausted)) is not exhausted:
        yield element
async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile: MLSTProfile):
    """Check that the locus columns of the written CSV appear in sorted order.

    A single dummy profile is written to a temporary CSV file, and the header
    row is read back. The first three columns are the fixed ones ("id", "st",
    "clonal-complex"); everything after them is a locus column and must be
    sorted.
    """
    dummy_profiles = [("test_1", dummy_alphabet_mlst_profile)]
    with tempfile.TemporaryDirectory() as temp_dir:
        output_path = path.join(temp_dir, "out.csv")
        await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
        # newline="" is what the csv module expects for files handed to reader().
        with open(output_path, newline="") as csv_handle:
            lines = list(reader(csv_handle))
    assert lines, "CSV output should contain at least a header row"
    header = lines[0]
    assert header[:3] == ["id", "st", "clonal-complex"]
    # Slice the header's columns, not the list of rows: slicing rows from
    # index 4 yields an empty list for a two-line file, which made the
    # ordering assertion pass without checking anything.
    target_columns = header[3:]
    assert target_columns, "expected at least one locus column in the header"
    assert target_columns == sorted(target_columns)
async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: MLSTProfile):
    """Check that ``alleles_to_text_map`` produces exactly the expected mapping.

    Loci with one allele map to a plain string. Locus "C" has two alleles and
    maps to a tuple, with "*" appended to the variant whose allele carries
    alignment statistics.
    """
    mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.alleles)
    expected_mapping = {
        "A": "1",
        "B": "1",
        "C": ("1", "2*"),
        "D": "1",
    }
    # Compare the key sets first: looping only over the produced mapping would
    # let a missing locus (or an entirely empty result) pass unnoticed.
    assert set(mapping) == set(expected_mapping)
    for allele_name, expected_ids in expected_mapping.items():
        assert mapping[allele_name] == expected_ids

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,11 +0,0 @@
>lcl|CP085952.1_gene_371 [gene=adk] [locus_tag=LK401_01855] [location=complement(365128..365772)] [gbkey=Gene]
ATGAAAATTATTCTTTTAGGTGCACCGGGTGCAGGTAAAGGCACTCAAGCACAATTTATTATGAACAAAT
TTGGTATCCCGCAAATTTCAACTGGTGATATGTTCCGTGCTGCAATCAAAGCGGGGACTGAACTTGGCAA
ACAAGCTAAAGCATTAATGGATGAAGGTAAATTAGTGCCAGATGAATTAACCGTTGCCCTTGTAAAAGAT
CGTATTGCTCAAGCTGACTGCACAAATGGTTTCTTGTTAGATGGTTTCCCTCGTACTATTCCACAAGCGG
ATGCACTGAAAGATTCAGGTGTTAAAATTGACTTTGTTTTAGAATTTGATGTGCCAGACGAAGTGATTGT
TGAACGTATGAGTGGCCGTCGCGTACACCAAGCGTCTGGCCGTTCTTACCACATCGTTTATAATCCACCA
AAAGTGGAAGGTAAAGATGATGTAACAGGCGAAGATTTAATTATTCGTGCAGACGATAAACCAGAAACTG
TATTAGATCGTTTAGCCGTATATCATAAACAAACTAGCCCATTAATTGATTATTACCAAGCAGAAGCGAA
AGCGGGGAATACTCAATATTTCCGTTTAGACGGTACACAAAAAGTAGAAGAAGTTAGCCAAGAGTTAGAT
AAAATCTTAGGCTAA

File diff suppressed because it is too large Load Diff