diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index 448fd4b..0000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - - { - "name": "autobigs info -lschema pubmlst_bordetella_seqdef", - "type": "debugpy", - "request": "launch", - "program": "${workspaceFolder}/src/autobigs/cli/program.py", - "console": "integratedTerminal", - "args": [ - "info", - "-lschemas", - "pubmlst_bordetella_seqdef" - ], - "cwd": "${workspaceFolder}/src", - "env": { - "PYTHONPATH": "${workspaceFolder}/src" - } - } - ] -} \ No newline at end of file diff --git a/README.md b/README.md index 0563e02..03e7e87 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ A python library implementing common BIGSdb MLST schemes and databases accesses Briefly, this library can: - Import multiple `FASTA` files - Fetch the available BIGSdb databases that is currently live and available -- Fetch the available BIGSdb database schemas for a given MLST database +- Fetch the available BIGSdb database schemes for a given MLST database - Retrieve exact/non-exact MLST allele variant IDs based off a sequence - Retrieve MLST sequence type IDs based off a sequence - Output all results to a single CSV diff --git a/src/autobigs/engine/analysis/bigsdb.py b/src/autobigs/engine/analysis/bigsdb.py index d9e11e9..d186753 100644 --- a/src/autobigs/engine/analysis/bigsdb.py +++ b/src/autobigs/engine/analysis/bigsdb.py @@ -43,10 +43,10 @@ class BIGSdbMLSTProfiler(AbstractAsyncContextManager): class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): - def __init__(self, database_api: str, database_name: str, schema_id: int): + def __init__(self, database_api: str, database_name: str, scheme_id: int): self._database_name = database_name - self._schema_id = schema_id - self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._schema_id}/" + self._scheme_id = scheme_id + self._base_url = f"{database_api}/db/{self._database_name}/schemes/{self._scheme_id}/" self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(60)) async def __aenter__(self): @@ -90,7 +90,7 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): ) yield result_allele if isinstance(sequence_string, str) else (sequence_string.name, result_allele) else: - raise NoBIGSdbMatchesException(self._database_name, self._schema_id, sequence_string.name if isinstance(sequence_string, NamedString) else None) + raise NoBIGSdbMatchesException(self._database_name, self._scheme_id, sequence_string.name if isinstance(sequence_string, NamedString) else None) async def determine_mlst_st(self, alleles: Union[AsyncIterable[Union[Allele, tuple[str, Allele]]], Iterable[Union[Allele, tuple[str, Allele]]]]) -> Union[MLSTProfile, NamedMLSTProfile]: uri_path = "designations" @@ -117,15 +117,15 @@ class RemoteBIGSdbMLSTProfiler(BIGSdbMLSTProfiler): response_json: dict = await response.json() allele_set: Set[Allele] = set() response_json.setdefault("fields", dict()) - schema_fields_returned: dict[str, str] = response_json["fields"] - schema_fields_returned.setdefault("ST", "unknown") - schema_fields_returned.setdefault("clonal_complex", "unknown") - schema_exact_matches: dict = response_json["exact_matches"] - for exact_match_locus, exact_match_alleles in schema_exact_matches.items(): + scheme_fields_returned: dict[str, str] = response_json["fields"] + scheme_fields_returned.setdefault("ST", "unknown") + scheme_fields_returned.setdefault("clonal_complex", "unknown") + scheme_exact_matches: dict = response_json["exact_matches"] + for exact_match_locus, exact_match_alleles in scheme_exact_matches.items(): allele_set.add(Allele(exact_match_locus, exact_match_alleles[0]["allele_id"], None)) if len(allele_set) == 0: raise ValueError("Passed in no alleles.") - result_mlst_profile = MLSTProfile(allele_set, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"]) + result_mlst_profile = MLSTProfile(allele_set, scheme_fields_returned["ST"], scheme_fields_returned["clonal_complex"]) if len(names_list) > 0: result_mlst_profile = NamedMLSTProfile(str(tuple(names_list)), result_mlst_profile) return result_mlst_profile @@ -165,7 +165,7 @@ class BIGSdbIndex(AbstractAsyncContextManager): def __init__(self): self._http_client = ClientSession() self._known_seqdef_dbs_origin: Union[Mapping[str, str], None] = None - self._seqdefdb_schemas: dict[str, Union[Mapping[str, int], None]] = dict() + self._seqdefdb_schemes: dict[str, Union[Mapping[str, int], None]] = dict() super().__init__() async def __aenter__(self): @@ -191,22 +191,22 @@ class BIGSdbIndex(AbstractAsyncContextManager): raise NoSuchBIGSdbDatabaseException(seqdef_db_name) return known_databases[seqdef_db_name] - async def get_schemas_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]: - if seqdef_db_name in self._seqdefdb_schemas and not force: - return self._seqdefdb_schemas[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional + async def get_schemes_for_seqdefdb(self, seqdef_db_name: str, force: bool = False) -> Mapping[str, int]: + if seqdef_db_name in self._seqdefdb_schemes and not force: + return self._seqdefdb_schemes[seqdef_db_name] # type: ignore since it's guaranteed to not be none by conditional uri_path = f"{await self.get_bigsdb_api_from_seqdefdb(seqdef_db_name)}/db/{seqdef_db_name}/schemes" async with self._http_client.get(uri_path) as response: response_json = await response.json() - schema_descriptions: Mapping[str, int] = dict() + scheme_descriptions: Mapping[str, int] = dict() for scheme_definition in response_json["schemes"]: scheme_id: int = int(str(scheme_definition["scheme"]).split("/")[-1]) scheme_desc: str = scheme_definition["description"] - schema_descriptions[scheme_desc] = scheme_id - self._seqdefdb_schemas[seqdef_db_name] = schema_descriptions - return self._seqdefdb_schemas[seqdef_db_name] # type: ignore + scheme_descriptions[scheme_desc] = scheme_id + self._seqdefdb_schemes[seqdef_db_name] = scheme_descriptions + return self._seqdefdb_schemes[seqdef_db_name] # type: ignore - async def build_profiler_from_seqdefdb(self, local: bool, dbseqdef_name: str, schema_id: int) -> BIGSdbMLSTProfiler: - return get_BIGSdb_MLST_profiler(local, await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, schema_id) + async def build_profiler_from_seqdefdb(self, local: bool, dbseqdef_name: str, scheme_id: int) -> BIGSdbMLSTProfiler: + return get_BIGSdb_MLST_profiler(local, await self.get_bigsdb_api_from_seqdefdb(dbseqdef_name), dbseqdef_name, scheme_id) async def close(self): await self._http_client.close() @@ -214,7 +214,7 @@ class BIGSdbIndex(AbstractAsyncContextManager): async def __aexit__(self, exc_type, exc_value, traceback): await self.close() -def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, schema_id: int): +def get_BIGSdb_MLST_profiler(local: bool, database_api: str, database_name: str, scheme_id: int): if local: raise NotImplementedError() - return RemoteBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, schema_id=schema_id) \ No newline at end of file + return RemoteBIGSdbMLSTProfiler(database_api=database_api, database_name=database_name, scheme_id=scheme_id) \ No newline at end of file diff --git a/src/autobigs/engine/exceptions/database.py b/src/autobigs/engine/exceptions/database.py index 10787d2..ab88535 100644 --- a/src/autobigs/engine/exceptions/database.py +++ b/src/autobigs/engine/exceptions/database.py @@ -5,21 +5,21 @@ class BIGSDbDatabaseAPIException(Exception): class NoBIGSdbMatchesException(BIGSDbDatabaseAPIException): - def __init__(self, database_name: str, database_schema_id: int, query_name: Union[None, str], *args): + def __init__(self, database_name: str, database_scheme_id: int, query_name: Union[None, str], *args): self._query_name = query_name - super().__init__(f"No matches found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args) + super().__init__(f"No matches found with scheme with ID {database_scheme_id} in the database \"{database_name}\".", *args) def get_causal_query_name(self) -> Union[str, None]: return self._query_name class NoBIGSdbExactMatchesException(NoBIGSdbMatchesException): - def __init__(self, database_name: str, database_schema_id: int, *args): - super().__init__(f"No exact match found with schema with ID {database_schema_id} in the database \"{database_name}\".", *args) + def __init__(self, database_name: str, database_scheme_id: int, *args): + super().__init__(f"No exact match found with scheme with ID {database_scheme_id} in the database \"{database_name}\".", *args) class NoSuchBIGSdbDatabaseException(BIGSDbDatabaseAPIException): def __init__(self, database_name: str, *args): super().__init__(f"No database \"{database_name}\" found.", *args) -class NoSuchBigSdbSchemaException(BIGSDbDatabaseAPIException): - def __init__(self, database_name: str, database_schema_id: int, *args): - super().__init__(f"No schema with ID {database_schema_id} in \"{database_name}\" found.", *args) +class NoSuchBigSdbschemeException(BIGSDbDatabaseAPIException): + def __init__(self, database_name: str, database_scheme_id: int, *args): + super().__init__(f"No scheme with ID {database_scheme_id} in \"{database_name}\" found.", *args) diff --git a/src/autobigs/engine/structures/genomics.py b/src/autobigs/engine/structures/genomics.py index 6dfb59b..cd76c70 100644 --- a/src/autobigs/engine/structures/genomics.py +++ b/src/autobigs/engine/structures/genomics.py @@ -25,7 +25,7 @@ class SangerTraceData(NamedString): analysis_proto_settings_name: str analysis_rpto_settings_ver: str analysis_proto_xml_data: str - analysis_proto_xml_schema_ver: str + analysis_proto_xml_scheme_ver: str sample_comment: Union[None, str] capillary_machine: bool container_identifier: str diff --git a/tests/autobigs/engine/analysis/test_bigsdb.py b/tests/autobigs/engine/analysis/test_bigsdb.py index c49f1ae..ed01fd3 100644 --- a/tests/autobigs/engine/analysis/test_bigsdb.py +++ b/tests/autobigs/engine/analysis/test_bigsdb.py @@ -71,14 +71,14 @@ hinfluenzae_2014_102_bad_profile = MLSTProfile(( ), "unknown", "unknown") -@pytest.mark.parametrize("local_db,database_api,database_name,schema_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [ +@pytest.mark.parametrize("local_db,database_api,database_name,scheme_id,seq_path,feature_seqs_path,expected_profile,bad_profile", [ (False, "https://bigsdb.pasteur.fr/api", "pubmlst_bordetella_seqdef", 3, "tohama_I_bpertussis.fasta", "tohama_I_bpertussis_features.fasta", bpertussis_tohamaI_profile, bpertussis_tohamaI_bad_profile), (False, "https://rest.pubmlst.org", "pubmlst_hinfluenzae_seqdef", 1, "2014-102_hinfluenza.fasta", "2014-102_hinfluenza_features.fasta", hinfluenzae_2014_102_profile, hinfluenzae_2014_102_bad_profile), ]) class TestBIGSdbMLSTProfiler: - async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_profiling_results_in_exact_matches_when_exact(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): sequence = get_first_sequence_from_fasta(seq_path) - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: expected_alleles = mlst.alleles_to_mapping(expected_profile.alleles) targets_left = set(mlst.alleles_to_mapping(expected_profile.alleles).keys()) async for exact_match in dummy_profiler.determine_mlst_allele_variants(query_sequence_strings=[sequence]): @@ -89,10 +89,10 @@ class TestBIGSdbMLSTProfiler: assert len(targets_left) == 0 - async def test_sequence_profiling_non_exact_returns_non_exact(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_sequence_profiling_non_exact_returns_non_exact(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): target_sequences = get_multiple_sequences_from_fasta(feature_seqs_path) mlst_targets = {x.lower() for x in mlst.alleles_to_mapping(expected_profile.alleles).keys()} - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as profiler: for target_sequence in target_sequences: match = re.fullmatch(r".*\[gene=([\w\d]+)\].*", target_sequence.description) if match is None: @@ -107,26 +107,26 @@ class TestBIGSdbMLSTProfiler: assert len(mlst_targets) == 0 - async def test_profiling_results_in_correct_mlst_st(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async def test_profiling_results_in_correct_mlst_st(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: mlst_st_data = await dummy_profiler.determine_mlst_st(expected_profile.alleles) assert mlst_st_data is not None assert isinstance(mlst_st_data, MLSTProfile) assert mlst_st_data.clonal_complex == expected_profile.clonal_complex assert mlst_st_data.sequence_type == expected_profile.sequence_type - async def test_profiling_non_exact_results_in_list_of_mlsts(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_profiling_non_exact_results_in_list_of_mlsts(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): dummy_alleles = bad_profile.alleles - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: mlst_profile = await dummy_profiler.determine_mlst_st(dummy_alleles) assert mlst_profile.clonal_complex == "unknown" assert mlst_profile.sequence_type == "unknown" - async def test_bigsdb_profile_multiple_strings_same_string_twice(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_bigsdb_profile_multiple_strings_same_string_twice(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): sequence = get_first_sequence_from_fasta(seq_path) dummy_sequences = [[NamedString("seq1", sequence)], [NamedString("seq2", sequence)]] - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences)): name, profile = named_profile.name, named_profile.mlst_profile assert profile is not None @@ -134,10 +134,10 @@ class TestBIGSdbMLSTProfiler: assert profile.clonal_complex == expected_profile.clonal_complex assert profile.sequence_type == expected_profile.sequence_type - async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_bigsdb_profile_multiple_strings_exactmatch_fail_second_no_stop(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): valid_seq = get_first_sequence_from_fasta(seq_path) dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]] - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: async for name_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), True): name, profile = name_profile.name, name_profile.mlst_profile @@ -151,11 +151,11 @@ class TestBIGSdbMLSTProfiler: assert profile.clonal_complex == expected_profile.clonal_complex assert profile.sequence_type == expected_profile.sequence_type - async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(self, local_db, database_api, database_name, schema_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): + async def test_bigsdb_profile_multiple_strings_nonexact_second_no_stop(self, local_db, database_api, database_name, scheme_id, seq_path: str, feature_seqs_path: str, expected_profile: MLSTProfile, bad_profile: MLSTProfile): valid_seq = get_first_sequence_from_fasta(seq_path) dummy_sequences = [[NamedString("seq1", valid_seq)], [NamedString("should_fail", gene_scrambler(valid_seq, 0.3))], [NamedString("seq3", valid_seq)]] - async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, schema_id) as dummy_profiler: + async with bigsdb.get_BIGSdb_MLST_profiler(local_db, database_api, database_name, scheme_id) as dummy_profiler: async for named_profile in dummy_profiler.profile_multiple_strings(generate_async_iterable(dummy_sequences), False): name, profile = named_profile.name, named_profile.mlst_profile @@ -183,12 +183,12 @@ class TestBIGSdbIndex: async with BIGSdbIndex() as bigsdb_index: assert (await bigsdb_index.get_bigsdb_api_from_seqdefdb("pubmlst_bordetella_seqdef")) == "https://bigsdb.pasteur.fr/api" - async def test_bigsdb_index_get_schemas_for_bordetella(self): + async def test_bigsdb_index_get_schemes_for_bordetella(self): async with BIGSdbIndex() as index: - schemas = await index.get_schemas_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef") - assert len(schemas.keys()) > 0 - assert "MLST" in schemas - assert isinstance(schemas["MLST"], int) + schemes = await index.get_schemes_for_seqdefdb(seqdef_db_name="pubmlst_bordetella_seqdef") + assert len(schemes.keys()) > 0 + assert "MLST" in schemes + assert isinstance(schemes["MLST"], int) async def test_bigsdb_index_get_databases_has_only_seqdef(self): async with BIGSdbIndex() as index: