Compare commits
14 Commits
09a693b696
...
5a03c7e8d8
Author | SHA1 | Date | |
---|---|---|---|
5a03c7e8d8 | |||
ddf9cde175 | |||
2e8cdd8da9 | |||
d0318536b2 | |||
765cf9d418 | |||
348c3d00b4 | |||
1c3f7f9ed8 | |||
e4ddaf2e8c | |||
73aade2bde | |||
af8590baa7 | |||
36bca1b70d | |||
fbfd993269 | |||
ba606c35a9 | |||
4183840ba0 |
@ -1,6 +1,6 @@
|
|||||||
# autoBIGS.Engine
|
# autoBIGS.Engine
|
||||||
|
|
||||||
A python library implementing common BIGSdb MLST schemes and databases. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
|
A Python library implementing common BIGSdb MLST schemes and database access for the purpose of typing sequences automatically. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
|
@ -13,11 +13,12 @@ dependencies = [
|
|||||||
]
|
]
|
||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
description = "A library to rapidly fetch MLST profiles given sequences for various diseases."
|
description = "A library to rapidly fetch MLST profiles given sequences for various diseases."
|
||||||
|
license = {text = "GPL-3.0-or-later"}
|
||||||
|
|
||||||
[project.urls]
|
[project.urls]
|
||||||
Homepage = "https://github.com/RealYHD/autoBIGS.engine"
|
Homepage = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
|
||||||
Source = "https://github.com/RealYHD/autoBIGS.engine"
|
Source = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine"
|
||||||
Issues = "https://github.com/RealYHD/autoBIGS.engine/issues"
|
Issues = "https://github.com/Syph-and-VPD-Lab/autoBIGS.engine/issues"
|
||||||
|
|
||||||
[tool.setuptools_scm]
|
[tool.setuptools_scm]
|
||||||
|
|
||||||
|
@ -11,7 +11,6 @@ from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequen
|
|||||||
|
|
||||||
from aiohttp import ClientSession, ClientTimeout
|
from aiohttp import ClientSession, ClientTimeout
|
||||||
|
|
||||||
from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
|
|
||||||
from autobigs.engine.reading import read_fasta
|
from autobigs.engine.reading import read_fasta
|
||||||
from autobigs.engine.structures.alignment import PairwiseAlignment
|
from autobigs.engine.structures.alignment import PairwiseAlignment
|
||||||
from autobigs.engine.structures.genomics import NamedString
|
from autobigs.engine.structures.genomics import NamedString
|
||||||
@ -125,13 +124,17 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
|
|||||||
|
|
||||||
async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
|
async def profile_multiple_strings(self, query_named_string_groups: AsyncIterable[Iterable[NamedString]], stop_on_fail: bool = False) -> AsyncGenerator[NamedMLSTProfile, Any]:
|
||||||
async for named_strings in query_named_string_groups:
|
async for named_strings in query_named_string_groups:
|
||||||
|
names: list[str] = list()
|
||||||
|
sequences: list[str] = list()
|
||||||
for named_string in named_strings:
|
for named_string in named_strings:
|
||||||
|
names.append(named_string.name)
|
||||||
|
sequences.append(named_string.sequence)
|
||||||
try:
|
try:
|
||||||
yield NamedMLSTProfile(named_string.name, (await self.profile_string([named_string.sequence])))
|
yield NamedMLSTProfile("-".join(names), (await self.profile_string(sequences)))
|
||||||
except NoBIGSdbMatchesException as e:
|
except NoBIGSdbMatchesException as e:
|
||||||
if stop_on_fail:
|
if stop_on_fail:
|
||||||
raise e
|
raise e
|
||||||
yield NamedMLSTProfile(named_string.name, None)
|
yield NamedMLSTProfile("-".join(names), None)
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
await self._http_client.close()
|
await self._http_client.close()
|
||||||
|
@ -5,12 +5,13 @@ from Bio import SeqIO
|
|||||||
|
|
||||||
from autobigs.engine.structures.genomics import NamedString
|
from autobigs.engine.structures.genomics import NamedString
|
||||||
|
|
||||||
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
|
async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]:
|
||||||
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
|
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
|
||||||
|
results = []
|
||||||
for fasta_sequence in await fasta_sequences:
|
for fasta_sequence in await fasta_sequences:
|
||||||
yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
|
results.append(NamedString(fasta_sequence.id, str(fasta_sequence.seq)))
|
||||||
|
return results
|
||||||
|
|
||||||
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
|
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
|
||||||
for handle in handles:
|
for handle in handles:
|
||||||
async for named_seq in read_fasta(handle):
|
yield await read_fasta(handle)
|
||||||
yield named_seq
|
|
@ -6,13 +6,15 @@ from typing import AsyncIterable, Collection, Mapping, Sequence, Union
|
|||||||
from autobigs.engine.structures.mlst import Allele, MLSTProfile
|
from autobigs.engine.structures.mlst import Allele, MLSTProfile
|
||||||
|
|
||||||
|
|
||||||
def alleles_to_map(alleles: Collection[Allele]) -> Mapping[str, Union[list[str], str]]:
|
def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Sequence[str], str]]:
|
||||||
result = defaultdict(list)
|
result = defaultdict(list)
|
||||||
for allele in alleles:
|
for allele in alleles:
|
||||||
result[allele.allele_locus].append(allele.allele_variant)
|
result[allele.allele_locus].append(allele.allele_variant + ("*" if allele.partial_match_profile is not None else ""))
|
||||||
for locus in result.keys():
|
for locus in result.keys():
|
||||||
if len(result[locus]) == 1:
|
if len(result[locus]) == 1:
|
||||||
result[locus] = result[locus][0] # Take the only one
|
result[locus] = result[locus][0] # Take the only one
|
||||||
|
else:
|
||||||
|
result[locus] = tuple(result[locus]) # type: ignore
|
||||||
return dict(result)
|
return dict(result)
|
||||||
|
|
||||||
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
|
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
|
||||||
@ -24,7 +26,7 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
|
|||||||
if mlst_profile is None:
|
if mlst_profile is None:
|
||||||
failed.append(name)
|
failed.append(name)
|
||||||
continue
|
continue
|
||||||
allele_mapping = alleles_to_map(mlst_profile.alleles)
|
allele_mapping = alleles_to_text_map(mlst_profile.alleles)
|
||||||
if writer is None:
|
if writer is None:
|
||||||
header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
|
header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
|
||||||
writer = csv.DictWriter(filehandle, fieldnames=header)
|
writer = csv.DictWriter(filehandle, fieldnames=header)
|
||||||
|
@ -50,17 +50,17 @@ bpertussis_tohamaI_bad_profile = MLSTProfile((
|
|||||||
Allele("pgm", "5", None),
|
Allele("pgm", "5", None),
|
||||||
), "unknown", "unknown")
|
), "unknown", "unknown")
|
||||||
|
|
||||||
hinfluenzae_fdaargos_profile = MLSTProfile((
|
hinfluenzae_2014_102_profile = MLSTProfile((
|
||||||
Allele("adk", "1", None),
|
Allele("adk", "28", None),
|
||||||
Allele("atpG", "1", None),
|
Allele("atpG", "33", None),
|
||||||
Allele("frdB", "1", None),
|
Allele("frdB", "7", None),
|
||||||
Allele("fucK", "1", None),
|
Allele("fucK", "18", None),
|
||||||
Allele("mdh", "1", None),
|
Allele("mdh", "11", None),
|
||||||
Allele("pgi", "1", None),
|
Allele("pgi", "125", None),
|
||||||
Allele("recA", "5", None)
|
Allele("recA", "89", None)
|
||||||
), "3", "ST-3 complex")
|
), "478", "unknown")
|
||||||
|
|
||||||
hinfluenzae_fdaargos_bad_profile = MLSTProfile((
|
hinfluenzae_2014_102_bad_profile = MLSTProfile((
|
||||||
Allele("adk", "3", None),
|
Allele("adk", "3", None),
|
||||||
Allele("atpG", "121", None),
|
Allele("atpG", "121", None),
|
||||||
Allele("frdB", "6", None),
|
Allele("frdB", "6", None),
|
||||||
@ -68,15 +68,12 @@ hinfluenzae_fdaargos_bad_profile = MLSTProfile((
|
|||||||
Allele("mdh", "12", None),
|
Allele("mdh", "12", None),
|
||||||
Allele("pgi", "4", None),
|
Allele("pgi", "4", None),
|
||||||
Allele("recA", "5", None)
|
Allele("recA", "5", None)
|
||||||
), "3", "ST-3 complex")
|
), "unknown", "unknown")
|
||||||
|
|
||||||
hinfluenzae_fdaargos_sequence = str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", "fasta").seq)
|
|
||||||
|
|
||||||
hinfluenzae_fdaargos_fragmented_sequence = tuple(SeqIO.parse("tests/resources/tohama_I_bpertussis_features.fasta", "fasta"))
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
|
@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
|
||||||
(False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
|
(False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
|
||||||
(False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "fdaargos_1560_hinfluenza.fasta", "fdaargos_1560_hinfluenza_features.fasta", hinfluenzae_fdaargos_profile, hinfluenzae_fdaargos_bad_profile),
|
(False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "2014-102_hinfluenza.fasta", "2014-102_hinfluenza_features.fasta", hinfluenzae_2014_102_profile, hinfluenzae_2014_102_bad_profile),
|
||||||
])
|
])
|
||||||
class TestBIGSdbMLSTProfiler:
|
class TestBIGSdbMLSTProfiler:
|
||||||
async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
|
async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
|
||||||
@ -202,7 +199,6 @@ class TestBIGSdbIndex:
|
|||||||
assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
|
assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
|
||||||
|
|
||||||
@pytest.mark.parametrize("local", [
|
@pytest.mark.parametrize("local", [
|
||||||
(True),
|
|
||||||
(False)
|
(False)
|
||||||
])
|
])
|
||||||
async def test_bigsdb_index_instantiates_correct_profiler(self, local):
|
async def test_bigsdb_index_instantiates_correct_profiler(self, local):
|
||||||
|
@ -2,6 +2,6 @@ from autobigs.engine.reading import read_fasta
|
|||||||
|
|
||||||
|
|
||||||
async def test_fasta_reader_not_none():
|
async def test_fasta_reader_not_none():
|
||||||
named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
|
named_strings = await read_fasta("tests/resources/tohama_I_bpertussis.fasta")
|
||||||
async for named_string in named_strings:
|
for named_string in named_strings:
|
||||||
assert named_string.name == "BX470248.1"
|
assert named_string.name == "BX470248.1"
|
||||||
|
@ -0,0 +1,47 @@
|
|||||||
|
from typing import AsyncIterable, Iterable
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from autobigs.engine.structures.alignment import AlignmentStats
|
||||||
|
from autobigs.engine.writing import alleles_to_text_map, write_mlst_profiles_as_csv
|
||||||
|
from autobigs.engine.structures.mlst import Allele, MLSTProfile
|
||||||
|
import tempfile
|
||||||
|
from csv import reader
|
||||||
|
from os import path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def dummy_alphabet_mlst_profile():
|
||||||
|
return MLSTProfile((
|
||||||
|
Allele("A", "1", None),
|
||||||
|
Allele("D", "1", None),
|
||||||
|
Allele("B", "1", None),
|
||||||
|
Allele("C", "1", None),
|
||||||
|
Allele("C", "2", AlignmentStats(90, 10, 0, 90))
|
||||||
|
), "mysterious", "very mysterious")
|
||||||
|
|
||||||
|
async def iterable_to_asynciterable(iterable: Iterable):
|
||||||
|
for iterated in iterable:
|
||||||
|
yield iterated
|
||||||
|
|
||||||
|
async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile: MLSTProfile):
|
||||||
|
dummy_profiles = [("test_1", dummy_alphabet_mlst_profile)]
|
||||||
|
with tempfile.TemporaryDirectory() as temp_dir:
|
||||||
|
output_path = path.join(temp_dir, "out.csv")
|
||||||
|
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
|
||||||
|
with open(output_path) as csv_handle:
|
||||||
|
csv_reader = reader(csv_handle)
|
||||||
|
lines = list(csv_reader)
|
||||||
|
target_columns = lines[4:]
|
||||||
|
assert target_columns == sorted(target_columns)
|
||||||
|
|
||||||
|
async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: MLSTProfile):
|
||||||
|
mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.alleles)
|
||||||
|
expected_mapping = {
|
||||||
|
"A": "1",
|
||||||
|
"B": "1",
|
||||||
|
"C": ("1", "2*"),
|
||||||
|
"D": "1"
|
||||||
|
}
|
||||||
|
for allele_name, allele_ids in mapping.items():
|
||||||
|
assert allele_name in expected_mapping
|
||||||
|
assert allele_ids == expected_mapping[allele_name]
|
28244
tests/resources/2014-102_hinfluenza.fasta
Normal file
28244
tests/resources/2014-102_hinfluenza.fasta
Normal file
File diff suppressed because it is too large
Load Diff
27751
tests/resources/2014-102_hinfluenza_features.fasta
Normal file
27751
tests/resources/2014-102_hinfluenza_features.fasta
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,11 +0,0 @@
|
|||||||
>lcl|CP085952.1_gene_371 [gene=adk] [locus_tag=LK401_01855] [location=complement(365128..365772)] [gbkey=Gene]
|
|
||||||
ATGAAAATTATTCTTTTAGGTGCACCGGGTGCAGGTAAAGGCACTCAAGCACAATTTATTATGAACAAAT
|
|
||||||
TTGGTATCCCGCAAATTTCAACTGGTGATATGTTCCGTGCTGCAATCAAAGCGGGGACTGAACTTGGCAA
|
|
||||||
ACAAGCTAAAGCATTAATGGATGAAGGTAAATTAGTGCCAGATGAATTAACCGTTGCCCTTGTAAAAGAT
|
|
||||||
CGTATTGCTCAAGCTGACTGCACAAATGGTTTCTTGTTAGATGGTTTCCCTCGTACTATTCCACAAGCGG
|
|
||||||
ATGCACTGAAAGATTCAGGTGTTAAAATTGACTTTGTTTTAGAATTTGATGTGCCAGACGAAGTGATTGT
|
|
||||||
TGAACGTATGAGTGGCCGTCGCGTACACCAAGCGTCTGGCCGTTCTTACCACATCGTTTATAATCCACCA
|
|
||||||
AAAGTGGAAGGTAAAGATGATGTAACAGGCGAAGATTTAATTATTCGTGCAGACGATAAACCAGAAACTG
|
|
||||||
TATTAGATCGTTTAGCCGTATATCATAAACAAACTAGCCCATTAATTGATTATTACCAAGCAGAAGCGAA
|
|
||||||
AGCGGGGAATACTCAATATTTCCGTTTAGACGGTACACAAAAAGTAGAAGAAGTTAGCCAAGAGTTAGAT
|
|
||||||
AAAATCTTAGGCTAA
|
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user