9 Commits
0.2.0 ... 0.5.1

18 changed files with 75 additions and 195 deletions

25
Jenkinsfile vendored
View File

@@ -30,11 +30,26 @@ pipeline {
}
}
stage("publish") {
environment {
CREDS = credentials('4d6f64be-d26d-4f95-8de3-b6a9b0beb311')
}
steps {
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/${CREDS_USR}/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
parallel {
stage ("git.reslate.systems") {
environment {
TOKEN = credentials('git.reslate.systems')
}
steps {
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*'
}
}
stage ("pypi.org") {
when {
tag '*.*'
}
environment {
TOKEN = credentials('pypi.org')
}
steps {
sh returnStatus: true, script: 'python -m twine upload -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*'
}
}
}
}
}

View File

@@ -1,12 +1,25 @@
# autoMLST
# autoMLST.Engine
A CLI/library for rapidly performing MLST typing via accessing pubMLST and InstitutPasteur MSLT databases.
A python library implementing common BIGSdb MLST schemes and databases. Implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
# Components
## Features
## automlst.cli
Briefly, this library can:
- Import multiple `FASTA` files
- Fetch the available BIGSdb databases that is currently live and available
- Fetch the available BIGSdb database schemas for a given MLST database
- Retrieve exact/non-exact MLST allele variant IDs based off a sequence
- Retrieve MLST sequence type IDs based off a sequence
- Output all results to a single CSV
The command line interface, sets up very minimal and mostly makes calls to the library. Uses argparse and is split into two parts:
Furthermore, this library is highly asynchronous where any potentially blocking operation, ranging from parsing FASTAs to performing HTTP requests are at least asynchronous, if not fully multithreaded.
- `automlst info`: Provides user information on available databases to pull from, and the schemas available.
- `automlst exactmatch`: Provides users the ability to request exact match results from a given database and schema
## Usage
This library can be installed through pip. Learn how to [setup and install pip first](https://pip.pypa.io/en/stable/installation/).
Then, it's as easy as running `pip install automlst-engine` in any terminal that has pip in it's path (any terminal where `pip --version` returns a valid version and install path).
### CLI usage
This is a independent python library and thus does not have any form of direct user interface. One way of using it could be to create your own Python script that makes calls to this libraries functions. Alternatively, you may use `automlst-cli`, a `Python` package that implements a CLI for calling this library.

View File

@@ -5,6 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "automlst.engine"
dynamic = ["version"]
readme = "README.md"
dependencies = [
"biopython",

View File

@@ -1,9 +1,8 @@
import csv
from io import TextIOWrapper
from os import PathLike
from typing import AsyncIterable, Iterable, Mapping, Sequence, Union
from typing import AsyncIterable, Mapping, Sequence, Union
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.structures.mlst import Allele, MLSTProfile
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
@@ -11,10 +10,11 @@ def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Alle
for loci, alleles in alleles_map.items():
if len(alleles) == 1:
result_dict[loci] = alleles[0].allele_variant
for allele in alleles:
else:
result_locis = list()
result_locis.append(allele.allele_variant)
result_dict[loci] = result_locis
for allele in alleles:
result_locis.append(allele.allele_variant)
result_dict[loci] = result_locis
return result_dict

View File

@@ -3,7 +3,7 @@ from io import TextIOWrapper
from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
from Bio import SeqIO
from automlst.engine.data.genomics import NamedString
from automlst.engine.data.structures.genomics import NamedString
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")

View File

@@ -5,8 +5,8 @@ from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, It
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.genomics import NamedString
from automlst.engine.data.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
from automlst.engine.data.structures.genomics import NamedString
from automlst.engine.data.structures.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):

View File

@@ -1,126 +0,0 @@
import asyncio
from numbers import Number
from os import path
from typing import Any, AsyncGenerator, Collection, Iterable, Sequence, Union
from automlst.engine.data.genomics import NamedString, SangerTraceData
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, Align
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
with open(seq_path, "rb") as seq_handle:
return SeqIO.read(seq_handle, "abi")
async def read_abif(seq_path: str) -> SangerTraceData:
ext = path.splitext(seq_path)[1]
if ext.lower() != ".ab1" and ext.lower() != "abi":
raise ValueError(
'seq_path must have file extension of "ab1", or "abi".')
biopython_seq = await asyncio.to_thread(_biopython_read_abif_sequence, seq_path)
biopython_annotations = biopython_seq.annotations
# Lot of type ignoring since Biopython did not define their typing.
biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore
trace_data = SangerTraceData(
path.basename(seq_path),
biopython_seq.seq,
biopython_abif_raw.get("APFN2"), # type: ignore
biopython_abif_raw.get("APrN1"), # type: ignore
biopython_abif_raw.get("APrV1"), # type: ignore
biopython_abif_raw.get("APrX1"), # type: ignore
biopython_abif_raw.get("APXV1"), # type: ignore
biopython_abif_raw.get("CMNT1"), # type: ignore
biopython_abif_raw.get("CpEP1"), # type: ignore
biopython_abif_raw.get("CTID1"), # type: ignore
biopython_abif_raw.get("CTNM1"), # type: ignore
biopython_abif_raw.get("CTTL1"), # type: ignore
biopython_abif_raw.get("DATA1"), # type: ignore
biopython_abif_raw.get("DATA2"), # type: ignore
biopython_abif_raw.get("DATA3"), # type: ignore
biopython_abif_raw.get("DATA4"), # type: ignore
biopython_abif_raw.get("DATA5"), # type: ignore
biopython_abif_raw.get("DATA6"), # type: ignore
biopython_abif_raw.get("DATA7"), # type: ignore
biopython_abif_raw.get("DATA8"), # type: ignore
biopython_abif_raw.get("DSam1"), # type: ignore
biopython_abif_raw.get("DyeN1"), # type: ignore
biopython_abif_raw.get("DyeN2"), # type: ignore
biopython_abif_raw.get("DyeN3"), # type: ignore
biopython_abif_raw.get("DyeN4"), # type: ignore
biopython_abif_raw.get("DyeW1"), # type: ignore
biopython_abif_raw.get("DyeW2"), # type: ignore
biopython_abif_raw.get("DyeW3"), # type: ignore
biopython_abif_raw.get("DyeW4"), # type: ignore
biopython_abif_raw.get("DySN1"), # type: ignore
biopython_abif_raw.get("EPVt1"), # type: ignore
biopython_abif_raw.get("EVNT1"), # type: ignore
biopython_abif_raw.get("EVNT2"), # type: ignore
biopython_abif_raw.get("EVNT3"), # type: ignore
biopython_abif_raw.get("EVNT4"), # type: ignore
biopython_abif_raw.get("FWO_1"), # type: ignore
biopython_abif_raw.get("GTyp1"), # type: ignore
biopython_abif_raw.get("InSc1"), # type: ignore
biopython_abif_raw.get("InVt1"), # type: ignore
biopython_abif_raw.get("LANE1"), # type: ignore
biopython_abif_raw.get("LIMS1"), # type: ignore
biopython_abif_raw.get("LNTD1"), # type: ignore
biopython_abif_raw.get("LsrP1"), # type: ignore
biopython_abif_raw.get("MCHN1"), # type: ignore
biopython_abif_raw.get("MODF1"), # type: ignore
biopython_abif_raw.get("MODL1"), # type: ignore
biopython_abif_raw.get("NAVG1"), # type: ignore
biopython_abif_raw.get("NLNE1"), # type: ignore
biopython_abif_raw.get("OfSc1"), # type: ignore
biopython_abif_raw.get("PDMF1"), # type: ignore
biopython_abif_raw.get("PXLB1"), # type: ignore
biopython_abif_raw.get("RGCm1"), # type: ignore
biopython_abif_raw.get("RGNm1"), # type: ignore
biopython_abif_raw.get("RMdV1"), # type: ignore
biopython_abif_raw.get("RMdX1"), # type: ignore
biopython_abif_raw.get("RMXV1"), # type: ignore
biopython_abif_raw.get("RPrN1"), # type: ignore
biopython_abif_raw.get("RPrV1"), # type: ignore
biopython_abif_raw.get("RUND1"), # type: ignore
biopython_abif_raw.get("RUND2"), # type: ignore
biopython_abif_raw.get("RUND3"), # type: ignore
biopython_abif_raw.get("RUND4"), # type: ignore
biopython_abif_raw.get("RunN1"), # type: ignore
biopython_abif_raw.get("RUNT1"), # type: ignore
biopython_abif_raw.get("RUNT2"), # type: ignore
biopython_abif_raw.get("RUNT3"), # type: ignore
biopython_abif_raw.get("RUNT4"), # type: ignore
biopython_abif_raw.get("Satd"), # type: ignore
biopython_abif_raw.get("Scal1"), # type: ignore
biopython_abif_raw.get("SCAN1"), # type: ignore
biopython_abif_raw.get("SMED1"), # type: ignore
biopython_abif_raw.get("SMLt"), # type: ignore
biopython_abif_raw.get("SMPL1"), # type: ignore
biopython_abif_raw.get("SVER1"), # type: ignore
biopython_abif_raw.get("SVER3"), # type: ignore
biopython_abif_raw.get("Tmpr1"), # type: ignore
biopython_abif_raw.get("TUBE"), # type: ignore
biopython_abif_raw.get("User") # type: ignore
)
return trace_data
def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedString) -> tuple[NamedString, NamedString]:
aligner = Align.PairwiseAligner(scoring="blastn")
aligner.mode = "local"
alignment_result = sorted(aligner.align(reference.sequence, query.sequence))[
0] # take the best alignment
# TODO actually assemble the consensus sequence here
raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
async def reference_consensus_assembly(reference: Union[NamedString, str], sanger_traces: Iterable[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
if isinstance(reference, str):
reference_seq = NamedString(name=reference, sequence=(await fetch_ncbi_genbank(reference)).sequence)
else:
reference_seq: NamedString = reference
for sanger_trace in sanger_traces:
yield NamedString("NA", "NA")
raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")

View File

@@ -1,27 +0,0 @@
import asyncio
from Bio import Entrez
from Bio import SeqIO
# TODO Change this out for a more professional approach
Entrez.email = "yunyangdeng@outlook.com"
from automlst.engine.data.genomics import AnnotatedString, StringAnnotation
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
record = SeqIO.read(fetch_stream, "genbank")
sequence_features = list()
for feature in record.features:
start = int(feature.location.start)
end = int(feature.location.end)
qualifiers = feature.qualifiers
for qualifier_key in qualifiers:
qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
sequence_features.append(StringAnnotation(
type=feature.type,
start=start,
end=end+1, # Position is exclusive
feature_properties=qualifiers
))
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)

View File

@@ -0,0 +1,21 @@
from automlst.engine.data.local.csv import dict_loci_alleles_variants_from_loci
from automlst.engine.data.structures.mlst import Allele
def test_dict_loci_alleles_variants_from_loci_single_loci_not_list():
alleles_map = {
"adk": [Allele("adk", "1", None)]
}
results = dict_loci_alleles_variants_from_loci(alleles_map)
for loci, variant in results.items():
assert isinstance(variant, str)
assert variant == "1"
def test_dict_loci_alleles_variants_from_loci_multi_loci_is_list():
alleles_map = {
"adk": [Allele("adk", "1", None), Allele("adk", "2", None)]
}
results = dict_loci_alleles_variants_from_loci(alleles_map)
for loci, variant in results.items():
assert isinstance(variant, list)
assert len(variant) == 2

View File

@@ -1,4 +1,4 @@
from automlst.engine.local.fasta import read_fasta
from automlst.engine.data.local.fasta import read_fasta
async def test_fasta_reader_not_none():

View File

@@ -3,10 +3,10 @@ import re
from typing import Collection, Sequence, Union
from Bio import SeqIO
import pytest
from automlst.engine.data.genomics import NamedString
from automlst.engine.data.mlst import Allele, MLSTProfile
from automlst.engine.data.structures.genomics import NamedString
from automlst.engine.data.structures.mlst import Allele, MLSTProfile
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
from automlst.engine.data.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
rand = random.Random(gene)

View File

@@ -1,12 +0,0 @@
import os
from automlst.engine.local.abif import read_abif, reference_consensus_assembly
async def test_load_sanger_sequence_has_data():
assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1")
result_data = await read_abif("tests/resources/1I1_F_P1815443_047.ab1")
assert result_data is not None
async def test_consensus_assembly_with_ncbi():
consensus = reference_consensus_assembly("ON685494.1", [await read_abif("tests/resources/1I1_F_P1815443_047.ab1"), await read_abif("tests/resources/1I1_R_P1815443_094.ab1")])
# TODO complete implementing this

View File

@@ -1,5 +0,0 @@
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
async def test_fetch_ncbi_genbank_with_id_works():
assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0