Replaced schema with scheme
All checks were successful
autoBIGS.engine/pipeline/head This commit looks good

This commit is contained in:
Harrison Deng 2025-02-26 04:50:54 +00:00
parent 06dbb56c28
commit 27ae89fde7
6 changed files with 51 additions and 76 deletions

25
.vscode/launch.json vendored
View File

@ -1,25 +0,0 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "autobigs info -lschema pubmlst_bordetella_seqdef",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/src/autobigs/cli/program.py",
"console": "integratedTerminal",
"args": [
"info",
"-lschemas",
"pubmlst_bordetella_seqdef"
],
"cwd": "${workspaceFolder}/src",
"env": {
"PYTHONPATH": "${workspaceFolder}/src"
}
}
]
}

View File

@ -7,7 +7,7 @@ A python library implementing common BIGSdb MLST schemes and databases accesses
Briefly, this library can:
- Import multiple `FASTA` files
- Fetch the available BIGSdb databases that are currently live and available
- Fetch the available BIGSdb database schemas for a given MLST database
- Fetch the available BIGSdb database schemes for a given MLST database
- Retrieve exact/non-exact MLST allele variant IDs based on a sequence
- Retrieve MLST sequence type IDs based on a sequence
- Output all results to a single CSV

View File

@ -43,10 +43,10 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
def __init__(self, database_api: str, database_name: str, schema_id: int):
def __init__(self, database_api: str, database_name: str, scheme_id: int):
self._database_name = database_name
self._schema_id = schema_id
self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/"
self._scheme_id = scheme_id
self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._scheme_id}/"
self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60))
async def __aenter__(self):
@ -90,7 +90,7 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
)
yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele)
else:
raise NoBIGSdbMatchesException(self._database_name, self._schema_id, sequence_string.name if isinstance(sequence_string, NamedString) else None)
raise NoBIGSdbMatchesException(self._database_name, self._scheme_id, sequence_string.name if isinstance(sequence_string, NamedString) else None)
async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]:
uri_path = "designations"
@ -117,15 +117,15 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler):
response_json: dict = await response.json()
allele_set: Set[Allele] = set()
response_json.setdefault("fields", dict())
schema_fields_returned: dict[str, str] = response_json["fields"]
schema_fields_returned.setdefault("ST", "unknown")
schema_fields_returned.setdefault("clonal_complex", "unknown")
schema_exact_matches: dict = response_json["exact_matches"]
for exact_match_locus, exact_match_alleles in schema_exact_matches.items():
scheme_fields_returned: dict[str, str] = response_json["fields"]
scheme_fields_returned.setdefault("ST", "unknown")
scheme_fields_returned.setdefault("clonal_complex", "unknown")
scheme_exact_matches: dict = response_json["exact_matches"]
for exact_match_locus, exact_match_alleles in scheme_exact_matches.items():
allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None))
if len(allele_set) == 0:
raise ValueError("Passed in no alleles.")
result_mlst_profile = MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
result_mlst_profile = MLSTProfile(allele_set, scheme_fields_returned["ST"], scheme_fields_returned["clonal_complex"])
if len(names_list) > 0:
result_mlst_profile = NamedMLSTProfile(str(tuple(names_list)), result_mlst_profile)
return result_mlst_profile
@ -165,7 +165,7 @@ class BIGSdbIndex(AbstractAsyncContextManager):
def __init__(self):
self._http_client = ClientSession()
self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None
self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict()
self._seqdefdb_schemes: dict[str, Union[Mapping[str, int], None]] = dict()
super().__init__()
async def __aenter__(self):
@ -191,22 +191,22 @@ class BIGSdbIndex(AbstractAsyncContextManager):
raise NoSuchBIGSdbDatabaseException(seqdef_db_name)
return known_databases[seqdef_db_name]
async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
if seqdef_db_name in self._seqdefdb_schemas and not force:
return self._seqdefdb_schemas[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional
async def get_schemes_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]:
if seqdef_db_name in self._seqdefdb_schemes and not force:
return self._seqdefdb_schemes[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional
uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)}/db/{seqdef_db_name}/schemes"
async with self._http_client.get(uri_path) as response:
response_json = await response.json()
schema_descriptions: Mapping[str, int] = dict()
scheme_descriptions: Mapping[str, int] = dict()
for scheme_definition in response_json["schemes"]:
scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1])
scheme_desc: str = scheme_definition["description"]
schema_descriptions[scheme_desc] = scheme_id
self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions
return self._seqdefdb_schemas[seqdef_db_name] # type: ignore
scheme_descriptions[scheme_desc] = scheme_id
self._seqdefdb_schemes[seqdef_db_name] = scheme_descriptions
return self._seqdefdb_schemes[seqdef_db_name] # type: ignore
async def build_profiler_from_seqdefdb(self, local: bool, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler:
return get_BIGSdb_MLST_profiler(local, await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id)
async def build_profiler_from_seqdefdb(self, local: bool, dbseqdef_name: str, scheme_id: int) -> BIGSdbMLSTProfiler:
return get_BIGSdb_MLST_profiler(local, await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, scheme_id)
async def close(self):
await self._http_client.close()
@ -214,7 +214,7 @@ class BIGSdbIndex(AbstractAsyncContextManager):
async def __aexit__(self, exc_type, exc_value, traceback):
await self.close()
def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, schema_id: int):
def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, scheme_id: int):
if local:
raise NotImplementedError()
return RemoteBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, schema_id=schema_id)
return RemoteBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, scheme_id=scheme_id)

View File

@ -5,21 +5,21 @@ class BIGSDbDatabaseAPIException(Exception):
class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_schema_id: int, query_name: Union[None, str], *args):
def __init__(self, database_name: str, database_scheme_id: int, query_name: Union[None, str], *args):
self._query_name = query_name
super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
super().__init__(f"No matches found with scheme with ID {database_scheme_id} in the database \"{database_name}\".", *args)
def get_causal_query_name(self) -> Union[str, None]:
return self._query_name
class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException):
def __init__(self, database_name: str, database_schema_id: int, *args):
super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args)
def __init__(self, database_name: str, database_scheme_id: int, *args):
super().__init__(f"No exact match found with scheme with ID {database_scheme_id} in the database \"{database_name}\".", *args)
class NoSuchBIGSdbDatabaseException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, *args):
super().__init__(f"No database \"{database_name}\" found.", *args)
class NoSuchBigSdbSchemaException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_schema_id: int, *args):
super().__init__(f"No schema with ID {database_schema_id} in \"{database_name}\" found.", *args)
class NoSuchBigSdbschemeException(BIGSDbDatabaseAPIException):
def __init__(self, database_name: str, database_scheme_id: int, *args):
super().__init__(f"No scheme with ID {database_scheme_id} in \"{database_name}\" found.", *args)

View File

@ -25,7 +25,7 @@ class SangerTraceData(NamedString):
analysis_proto_settings_name: str
analysis_rpto_settings_ver: str
analysis_proto_xml_data: str
analysis_proto_xml_schema_ver: str
analysis_proto_xml_scheme_ver: str
sample_comment: Union[None, str]
capillary_machine: bool
container_identifier: str

View File

@ -71,14 +71,14 @@ hinfluenzae_2014_102_bad_profile = MLSTProfile((
), "unknown", "unknown")
@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
@pytest.mark.parametrize("local_db,database_api,database_name,scheme_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [
(False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile),
(False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "2014-102_hinfluenza.fasta", "2014-102_hinfluenza_features.fasta", hinfluenzae_2014_102_profile, hinfluenzae_2014_102_bad_profile),
])
class TestBIGSdbMLSTProfiler:
async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
sequence = get_first_sequence_from_fasta(seq_path)
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler:
expected_alleles = mlst.alleles_to_mapping(expected_profile.alleles)
targets_left = set(mlst.alleles_to_mapping(expected_profile.alleles).keys())
async for exact_match in dummy_profiler.determine_mlst_allele_variants(query_sequence_strings=[sequence]):
@ -89,10 +89,10 @@ class TestBIGSdbMLSTProfiler:
assert len(targets_left) == 0
async def test_sequence_profiling_non_exact_returns_non_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
async def test_sequence_profiling_non_exact_returns_non_exact(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
target_sequences = get_multiple_sequences_from_fasta(feature_seqs_path)
mlst_targets = {x.lower() for x in mlst.alleles_to_mapping(expected_profile.alleles).keys()}
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as profiler:
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as profiler:
for target_sequence in target_sequences:
match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", target_sequence.description)
if match is None:
@ -107,26 +107,26 @@ class TestBIGSdbMLSTProfiler:
assert len(mlst_targets) == 0
async def test_profiling_results_in_correct_mlst_st(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
async def test_profiling_results_in_correct_mlst_st(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler:
mlst_st_data = await dummy_profiler.determine_mlst_st(expected_profile.alleles)
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == expected_profile.clonal_complex
assert mlst_st_data.sequence_type == expected_profile.sequence_type
async def test_profiling_non_exact_results_in_list_of_mlsts(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
async def test_profiling_non_exact_results_in_list_of_mlsts(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
dummy_alleles = bad_profile.alleles
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler:
mlst_profile = await dummy_profiler.determine_mlst_st(dummy_alleles)
assert mlst_profile.clonal_complex == "unknown"
assert mlst_profile.sequence_type == "unknown"
async def test_bigsdb_profile_multiple_strings_same_string_twice(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
async def test_bigsdb_profile_multiple_strings_same_string_twice(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
sequence = get_first_sequence_from_fasta(seq_path)
dummy_sequences = [[NamedString("seq1", sequence)], [NamedString("seq2", sequence)]]
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler:
async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences)):
name, profile = named_profile.name, named_profile.mlst_profile
assert profile is not None
@ -134,10 +134,10 @@ class TestBIGSdbMLSTProfiler:
assert profile.clonal_complex == expected_profile.clonal_complex
assert profile.sequence_type == expected_profile.sequence_type
async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
valid_seq = get_first_sequence_from_fasta(seq_path)
dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]]
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler:
async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), True):
name, profile = name_profile.name, name_profile.mlst_profile
@ -151,11 +151,11 @@ class TestBIGSdbMLSTProfiler:
assert profile.clonal_complex == expected_profile.clonal_complex
assert profile.sequence_type == expected_profile.sequence_type
async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile):
valid_seq = get_first_sequence_from_fasta(seq_path)
dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]]
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler:
async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler:
async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), False):
name, profile = named_profile.name, named_profile.mlst_profile
@ -183,12 +183,12 @@ class TestBIGSdbIndex:
async with BIGSdbIndex() as bigsdb_index:
assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api"
async def test_bigsdb_index_get_schemas_for_bordetella(self):
async def test_bigsdb_index_get_schemes_for_bordetella(self):
async with BIGSdbIndex() as index:
schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
assert len(schemas.keys()) > 0
assert "MLST" in schemas
assert isinstance(schemas["MLST"], int)
schemes = await index.get_schemes_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef")
assert len(schemes.keys()) > 0
assert "MLST" in schemes
assert isinstance(schemes["MLST"], int)
async def test_bigsdb_index_get_databases_has_only_seqdef(self):
async with BIGSdbIndex() as index: