11 Commits

12 changed files with 56068 additions and 50920 deletions

View File

@@ -1,6 +1,6 @@
# autoBIGS.Engine
A python library implementing common BIGSdb MLST schemes and databases. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
A python library implementing common BIGSdb MLST schemes and databases accesses for the purpose of typing sequences automatically. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
## Features

View File

@@ -11,7 +11,6 @@ from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Mapping, Sequen
from aiohttp import ClientSession, ClientTimeout
from autobigs.engine.analysis.aligners import AsyncBiopythonPairwiseAlignmentEngine
from autobigs.engine.reading import read_fasta
from autobigs.engine.structures.alignment import PairwiseAlignment
from autobigs.engine.structures.genomics import NamedString

View File

@@ -5,12 +5,13 @@ from Bio import SeqIO
from autobigs.engine.structures.genomics import NamedString
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
async def read_fasta(handle: Union[str, TextIOWrapper]) -> Iterable[NamedString]:
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
results = []
for fasta_sequence in await fasta_sequences:
yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
results.append(NamedString(fasta_sequence.id, str(fasta_sequence.seq)))
return results
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[NamedString, Any]:
async def read_multiple_fastas(handles: Iterable[Union[str, TextIOWrapper]]) -> AsyncGenerator[Iterable[NamedString], Any]:
for handle in handles:
async for named_seq in read_fasta(handle):
yield named_seq
yield await read_fasta(handle)

View File

@@ -6,13 +6,15 @@ from typing import AsyncIterable, Collection, Mapping, Sequence, Union
from autobigs.engine.structures.mlst import Allele, MLSTProfile
def alleles_to_map(alleles: Collection[Allele]) -> Mapping[str, Union[list[str], str]]:
def alleles_to_text_map(alleles: Collection[Allele]) -> Mapping[str, Union[Sequence[str], str]]:
result = defaultdict(list)
for allele in alleles:
result[allele.allele_locus].append(allele.allele_variant)
result[allele.allele_locus].append(allele.allele_variant + ("*" if allele.partial_match_profile is not None else ""))
for locus in result.keys():
if len(result[locus]) == 1:
result[locus] = result[locus][0] # Take the only one
else:
result[locus] = tuple(result[locus]) # type: ignore
return dict(result)
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple[str, Union[MLSTProfile, None]]], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]) -> Sequence[str]:
@@ -24,7 +26,7 @@ async def write_mlst_profiles_as_csv(mlst_profiles_iterable: AsyncIterable[tuple
if mlst_profile is None:
failed.append(name)
continue
allele_mapping = alleles_to_map(mlst_profile.alleles)
allele_mapping = alleles_to_text_map(mlst_profile.alleles)
if writer is None:
header = ["id", "st", "clonal-complex", *sorted(allele_mapping.keys())]
writer = csv.DictWriter(filehandle, fieldnames=header)

View File

@@ -50,17 +50,17 @@ bpertussis_tohamaI_bad_profile = MLSTProfile((
Allele("pgm", "5", None),
), "unknown", "unknown")
hinfluenzae_fdaargos_profile = MLSTProfile((
Allele("adk", "1", None),
Allele("atpG", "1", None),
Allele("frdB", "1", None),
Allele("fucK", "1", None),
Allele("mdh", "1", None),
Allele("pgi", "1", None),
Allele("recA", "5", None)
), "3", "ST-3 complex")
hinfluenzae_2014_102_profile = MLSTProfile((
Allele("adk", "28", None),
Allele("atpG", "33", None),
Allele("frdB", "7", None),
Allele("fucK", "18", None),
Allele("mdh", "11", None),
Allele("pgi", "125", None),
Allele("recA", "89", None)
), "478", "unknown")
hinfluenzae_fdaargos_bad_profile = MLSTProfile((
hinfluenzae_2014_102_bad_profile = MLSTProfile((
Allele("adk", "3", None),
Allele("atpG", "121", None),
Allele("frdB", "6", None),
@@ -68,15 +68,12 @@ hinfluenzae_fdaargos_bad_profile = MLSTProfile((
Allele("mdh", "12", None),
Allele("pgi", "4", None),
Allele("recA", "5", None)
), "3", "ST-3 complex")
), "unknown", "unknown")
hinfluenzae_fdaargos_sequence = str(SeqIO.read("tests/resources/fdaargos_1560_hinfluenza.fasta", "fasta").seq)
hinfluenzae_fdaargos_fragmented_sequence = tuple(SeqIO.parse("tests/resources/tohama_I_bpertussis_features.fasta", "fasta"))
@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
(False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
(False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "fdaargos_1560_hinfluenza.fasta", "fdaargos_1560_hinfluenza_features.fasta", hinfluenzae_fdaargos_profile, hinfluenzae_fdaargos_bad_profile),
(False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "2014-102_hinfluenza.fasta", "2014-102_hinfluenza_features.fasta", hinfluenzae_2014_102_profile, hinfluenzae_2014_102_bad_profile),
])
class TestBIGSdbMLSTProfiler:
async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
@@ -202,7 +199,6 @@ class TestBIGSdbIndex:
assert databases["pubmlst_bordetella_seqdef"] == "https://bigsdb.pasteur.fr/api"
@pytest.mark.parametrize("local", [
(True),
(False)
])
async def test_bigsdb_index_instantiates_correct_profiler(self, local):

View File

@@ -2,6 +2,6 @@ from autobigs.engine.reading import read_fasta
async def test_fasta_reader_not_none():
named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
async for named_string in named_strings:
named_strings = await read_fasta("tests/resources/tohama_I_bpertussis.fasta")
for named_string in named_strings:
assert named_string.name == "BX470248.1"

View File

@@ -0,0 +1,47 @@
from typing import AsyncIterable, Iterable
import pytest
from autobigs.engine.structures.alignment import AlignmentStats
from autobigs.engine.writing import alleles_to_text_map, write_mlst_profiles_as_csv
from autobigs.engine.structures.mlst import Allele, MLSTProfile
import tempfile
from csv import reader
from os import path
@pytest.fixture
def dummy_alphabet_mlst_profile():
return MLSTProfile((
Allele("A", "1", None),
Allele("D", "1", None),
Allele("B", "1", None),
Allele("C", "1", None),
Allele("C", "2", AlignmentStats(90, 10, 0, 90))
), "mysterious", "very mysterious")
async def iterable_to_asynciterable(iterable: Iterable):
for iterated in iterable:
yield iterated
async def test_column_order_is_same_as_expected_file(dummy_alphabet_mlst_profile: MLSTProfile):
dummy_profiles = [("test_1", dummy_alphabet_mlst_profile)]
with tempfile.TemporaryDirectory() as temp_dir:
output_path = path.join(temp_dir, "out.csv")
await write_mlst_profiles_as_csv(iterable_to_asynciterable(dummy_profiles), output_path)
with open(output_path) as csv_handle:
csv_reader = reader(csv_handle)
lines = list(csv_reader)
target_columns = lines[4:]
assert target_columns == sorted(target_columns)
async def test_alleles_to_text_map_mapping_is_correct(dummy_alphabet_mlst_profile: MLSTProfile):
mapping = alleles_to_text_map(dummy_alphabet_mlst_profile.alleles)
expected_mapping = {
"A": "1",
"B": "1",
"C": ("1", "2*"),
"D": "1"
}
for allele_name, allele_ids in mapping.items():
assert allele_name in expected_mapping
assert allele_ids == expected_mapping[allele_name]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,11 +0,0 @@
>lcl|CP085952.1_gene_371 [gene=adk] [locus_tag=LK401_01855] [location=complement(365128..365772)] [gbkey=Gene]
ATGAAAATTATTCTTTTAGGTGCACCGGGTGCAGGTAAAGGCACTCAAGCACAATTTATTATGAACAAAT
TTGGTATCCCGCAAATTTCAACTGGTGATATGTTCCGTGCTGCAATCAAAGCGGGGACTGAACTTGGCAA
ACAAGCTAAAGCATTAATGGATGAAGGTAAATTAGTGCCAGATGAATTAACCGTTGCCCTTGTAAAAGAT
CGTATTGCTCAAGCTGACTGCACAAATGGTTTCTTGTTAGATGGTTTCCCTCGTACTATTCCACAAGCGG
ATGCACTGAAAGATTCAGGTGTTAAAATTGACTTTGTTTTAGAATTTGATGTGCCAGACGAAGTGATTGT
TGAACGTATGAGTGGCCGTCGCGTACACCAAGCGTCTGGCCGTTCTTACCACATCGTTTATAATCCACCA
AAAGTGGAAGGTAAAGATGATGTAACAGGCGAAGATTTAATTATTCGTGCAGACGATAAACCAGAAACTG
TATTAGATCGTTTAGCCGTATATCATAAACAAACTAGCCCATTAATTGATTATTACCAAGCAGAAGCGAA
AGCGGGGAATACTCAATATTTCCGTTTAGACGGTACACAAAAAGTAGAAGAAGTTAGCCAAGAGTTAGAT
AAAATCTTAGGCTAA

File diff suppressed because it is too large Load Diff