Compare commits
7 Commits
Author | SHA1 | Date | |
---|---|---|---|
ab44dfaa48 | |||
611b956d88 | |||
cb22dfac9b | |||
7ea7ead46a | |||
a3c864b565 | |||
bad7dfc3a8 | |||
4fe0f0f287 |
25
Jenkinsfile
vendored
25
Jenkinsfile
vendored
@@ -30,11 +30,26 @@ pipeline {
|
||||
}
|
||||
}
|
||||
stage("publish") {
|
||||
environment {
|
||||
CREDS = credentials('4d6f64be-d26d-4f95-8de3-b6a9b0beb311')
|
||||
}
|
||||
steps {
|
||||
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/${CREDS_USR}/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
|
||||
parallel {
|
||||
stage ("git.reslate.systems") {
|
||||
environment {
|
||||
TOKEN = credentials('git.reslate.systems')
|
||||
}
|
||||
steps {
|
||||
sh returnStatus: true, script: 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/ydeng/pypi -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*'
|
||||
}
|
||||
}
|
||||
stage ("pypi.org") {
|
||||
when {
|
||||
tag '*.*'
|
||||
}
|
||||
environment {
|
||||
TOKEN = credentials('pypi.org')
|
||||
}
|
||||
steps {
|
||||
sh returnStatus: true, script: 'python -m twine upload -u __token__ -p ${TOKEN} --non-interactive --disable-progress-bar --verbose dist/*'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
27
README.md
27
README.md
@@ -1,12 +1,25 @@
|
||||
# autoMLST
|
||||
# autoMLST.Engine
|
||||
|
||||
A CLI/library for rapidly performing MLST typing by accessing the pubMLST and Institut Pasteur MLST databases.
|
||||
A Python library implementing common BIGSdb MLST schemes and databases. The implementation follows the RESTful API outlined by the official [BIGSdb documentation](https://bigsdb.readthedocs.io/en/latest/rest.html) up to `V1.50.0`.
|
||||
|
||||
# Components
|
||||
## Features
|
||||
|
||||
## automlst.cli
|
||||
Briefly, this library can:
|
||||
- Import multiple `FASTA` files
|
||||
- Fetch the BIGSdb databases that are currently live and available
|
||||
- Fetch the available BIGSdb database schemas for a given MLST database
|
||||
- Retrieve exact/non-exact MLST allele variant IDs based off a sequence
|
||||
- Retrieve MLST sequence type IDs based off a sequence
|
||||
- Output all results to a single CSV
|
||||
|
||||
The command line interface performs minimal setup and mostly makes calls to the library. It uses argparse and is split into two parts:
|
||||
Furthermore, this library is highly asynchronous: any potentially blocking operation, ranging from parsing FASTAs to performing HTTP requests, is at least asynchronous, if not fully multithreaded.
|
||||
|
||||
- `automlst info`: Provides user information on available databases to pull from, and the schemas available.
|
||||
- `automlst exactmatch`: Provides users the ability to request exact match results from a given database and schema
|
||||
## Usage
|
||||
|
||||
This library can be installed through pip. Learn how to [setup and install pip first](https://pip.pypa.io/en/stable/installation/).
|
||||
|
||||
Then, it's as easy as running `pip install automlst-engine` in any terminal that has pip in its path (any terminal where `pip --version` returns a valid version and install path).
|
||||
|
||||
### CLI usage
|
||||
|
||||
This is an independent Python library and thus does not have any form of direct user interface. One way of using it is to create your own Python script that makes calls to this library's functions. Alternatively, you may use `automlst-cli`, a `Python` package that implements a CLI for calling this library.
|
@@ -5,6 +5,7 @@ build-backend = "setuptools.build_meta"
|
||||
[project]
|
||||
name = "automlst.engine"
|
||||
dynamic = ["version"]
|
||||
readme = "README.md"
|
||||
|
||||
dependencies = [
|
||||
"biopython",
|
||||
|
@@ -3,7 +3,7 @@ from io import TextIOWrapper
|
||||
from os import PathLike
|
||||
from typing import AsyncIterable, Iterable, Mapping, Sequence, Union
|
||||
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.data.structures.mlst import Allele, MLSTProfile
|
||||
|
||||
|
||||
def dict_loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
|
@@ -3,7 +3,7 @@ from io import TextIOWrapper
|
||||
from typing import Any, AsyncGenerator, Generator, Iterable, Sequence, Union
|
||||
from Bio import SeqIO
|
||||
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
from automlst.engine.data.structures.genomics import NamedString
|
||||
|
||||
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
|
||||
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
|
0
src/automlst/engine/data/remote/__init__.py
Normal file
0
src/automlst/engine/data/remote/__init__.py
Normal file
@@ -5,8 +5,8 @@ from typing import Any, AsyncGenerator, AsyncIterable, Collection, Generator, It
|
||||
|
||||
from aiohttp import ClientSession, ClientTimeout
|
||||
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
from automlst.engine.data.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
|
||||
from automlst.engine.data.structures.genomics import NamedString
|
||||
from automlst.engine.data.structures.mlst import Allele, PartialAllelicMatchProfile, MLSTProfile
|
||||
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException, NoSuchBIGSdbDatabaseException
|
||||
|
||||
class BIGSdbMLSTProfiler(AbstractAsyncContextManager):
|
0
src/automlst/engine/data/structures/__init__.py
Normal file
0
src/automlst/engine/data/structures/__init__.py
Normal file
@@ -1,126 +0,0 @@
|
||||
import asyncio
|
||||
from numbers import Number
|
||||
from os import path
|
||||
from typing import Any, AsyncGenerator, Collection, Iterable, Sequence, Union
|
||||
from automlst.engine.data.genomics import NamedString, SangerTraceData
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
from Bio import SeqIO, Align
|
||||
|
||||
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
|
||||
|
||||
|
||||
def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
    """Read the single ABIF record stored in the file at ``seq_path``.

    The file is opened in binary mode and handed to ``SeqIO.read`` with the
    ``"abi"`` format. This call is blocking; ``read_abif`` runs it through
    ``asyncio.to_thread``.
    """
    with open(seq_path, mode="rb") as abif_stream:
        record = SeqIO.read(abif_stream, "abi")
    return record
|
||||
|
||||
|
||||
# ABIF tags forwarded to SangerTraceData, listed in the exact positional order
# its constructor is called with (after the file name and the sequence).
# NOTE(review): "Satd", "SMLt", "TUBE" and "User" carry no tag number, unlike
# every other entry — confirm these keys actually occur in Biopython's
# "abif_raw" mapping; a missing key silently becomes None via .get().
_ABIF_TAG_ORDER = (
    "APFN2", "APrN1", "APrV1", "APrX1", "APXV1",
    "CMNT1", "CpEP1", "CTID1", "CTNM1", "CTTL1",
    "DATA1", "DATA2", "DATA3", "DATA4", "DATA5", "DATA6", "DATA7", "DATA8",
    "DSam1",
    "DyeN1", "DyeN2", "DyeN3", "DyeN4",
    "DyeW1", "DyeW2", "DyeW3", "DyeW4",
    "DySN1", "EPVt1",
    "EVNT1", "EVNT2", "EVNT3", "EVNT4",
    "FWO_1", "GTyp1", "InSc1", "InVt1",
    "LANE1", "LIMS1", "LNTD1", "LsrP1",
    "MCHN1", "MODF1", "MODL1",
    "NAVG1", "NLNE1", "OfSc1", "PDMF1", "PXLB1",
    "RGCm1", "RGNm1", "RMdV1", "RMdX1", "RMXV1", "RPrN1", "RPrV1",
    "RUND1", "RUND2", "RUND3", "RUND4",
    "RunN1",
    "RUNT1", "RUNT2", "RUNT3", "RUNT4",
    "Satd", "Scal1", "SCAN1", "SMED1", "SMLt", "SMPL1",
    "SVER1", "SVER3", "Tmpr1", "TUBE", "User",
)


async def read_abif(seq_path: str) -> SangerTraceData:
    """Load a Sanger trace file and wrap its contents in a ``SangerTraceData``.

    Args:
        seq_path: Path to the trace file. Its extension must be ``.ab1`` or
            ``.abi`` (case-insensitive).

    Returns:
        A ``SangerTraceData`` built from the file's base name, the record's
        sequence, and the raw ABIF tag values in ``_ABIF_TAG_ORDER`` order.
        Tags absent from the file are passed as ``None``.

    Raises:
        ValueError: If ``seq_path`` does not end in ``.ab1`` or ``.abi``.
    """
    ext = path.splitext(seq_path)[1].lower()
    # splitext() keeps the leading dot, so both accepted extensions need one;
    # comparing against a bare "abi" rejected every ".abi" file.
    if ext not in (".ab1", ".abi"):
        raise ValueError(
            'seq_path must have file extension of "ab1", or "abi".')

    # Parsing is blocking file I/O, so it is pushed onto a worker thread.
    biopython_seq = await asyncio.to_thread(_biopython_read_abif_sequence, seq_path)

    # Type checking is silenced here since Biopython does not define typing
    # for its annotations.
    biopython_abif_raw = biopython_seq.annotations["abif_raw"]  # type: ignore
    return SangerTraceData(
        path.basename(seq_path),
        biopython_seq.seq,
        *(biopython_abif_raw.get(tag) for tag in _ABIF_TAG_ORDER),  # type: ignore
    )
|
||||
|
||||
|
||||
def _biopython_local_pairwise_alignment(reference: NamedString, query: NamedString) -> tuple[NamedString, NamedString]:
    """Locally align ``query`` against ``reference`` (unfinished stub).

    A ``PairwiseAligner`` with ``blastn`` scoring is switched to local mode and
    run over the two sequences, but the consensus step is not written yet, so
    the function always ends by raising.

    Raises:
        NotImplementedError: Always, once the alignment has been computed.
    """
    local_aligner = Align.PairwiseAligner(scoring="blastn")
    local_aligner.mode = "local"
    ranked_alignments = sorted(
        local_aligner.align(reference.sequence, query.sequence))
    # NOTE(review): the original comment calls element 0 "the best alignment";
    # confirm the sort order really puts the best-scoring alignment first.
    alignment_result = ranked_alignments[0]
    # TODO actually assemble the consensus sequence here
    raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
|
||||
|
||||
|
||||
async def reference_consensus_assembly(reference: Union[NamedString, str], sanger_traces: Iterable[SangerTraceData]) -> AsyncGenerator[NamedString, Any]:
    """Yield consensus sequences for ``sanger_traces`` (unfinished stub).

    Args:
        reference: Either a ready-made ``NamedString``, or a string that is
            passed to ``fetch_ncbi_genbank`` to obtain the reference sequence.
        sanger_traces: Traces to assemble against the reference.

    Yields:
        A placeholder ``NamedString("NA", "NA")`` for every trace.

    Raises:
        NotImplementedError: Always, after the traces have been consumed.
    """
    reference_seq: NamedString
    if not isinstance(reference, str):
        reference_seq = reference
    else:
        genbank_record = await fetch_ncbi_genbank(reference)
        reference_seq = NamedString(name=reference, sequence=genbank_record.sequence)

    for _sanger_trace in sanger_traces:
        # Placeholder output until the real assembly exists.
        yield NamedString("NA", "NA")
    raise NotImplementedError("Pairwise alignment unto reference consensus assembly function not ready.")
|
@@ -1,27 +0,0 @@
|
||||
import asyncio
|
||||
from Bio import Entrez
|
||||
from Bio import SeqIO
|
||||
|
||||
# TODO Change this out for a more professional approach
|
||||
Entrez.email = "yunyangdeng@outlook.com"
|
||||
|
||||
from automlst.engine.data.genomics import AnnotatedString, StringAnnotation
|
||||
|
||||
|
||||
def _fetch_genbank_record(genbank_id: str):
    """Download and parse one GenBank record; blocking, meant for a worker thread."""
    with Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text") as fetch_stream:
        return SeqIO.read(fetch_stream, "genbank")


async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
    """Fetch a nucleotide record from NCBI and convert it to an ``AnnotatedString``.

    Args:
        genbank_id: Identifier passed to ``Entrez.efetch`` as ``id``.

    Returns:
        An ``AnnotatedString`` named after ``genbank_id`` that holds the record's
        sequence as text and one ``StringAnnotation`` per record feature.
    """
    # Both the request and the parse read from the network stream, so the
    # whole round trip runs off the event loop rather than only the request.
    record = await asyncio.to_thread(_fetch_genbank_record, genbank_id)

    sequence_features = [
        StringAnnotation(
            type=feature.type,
            start=int(feature.location.start),
            # NOTE(review): Biopython documents location.end as already
            # end-exclusive; confirm what StringAnnotation.end expects before
            # keeping this +1.
            end=int(feature.location.end) + 1,  # Position is exclusive
            # Build a fresh mapping rather than rewriting Biopython's
            # qualifier dict in place.
            feature_properties={
                qualifier_key: set(qualifier_values)
                for qualifier_key, qualifier_values in feature.qualifiers.items()
            },
        )
        for feature in record.features
    ]
    return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
|
@@ -1,4 +1,4 @@
|
||||
from automlst.engine.local.fasta import read_fasta
|
||||
from automlst.engine.data.local.fasta import read_fasta
|
||||
|
||||
|
||||
async def test_fasta_reader_not_none():
|
@@ -3,10 +3,10 @@ import re
|
||||
from typing import Collection, Sequence, Union
|
||||
from Bio import SeqIO
|
||||
import pytest
|
||||
from automlst.engine.data.genomics import NamedString
|
||||
from automlst.engine.data.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.data.structures.genomics import NamedString
|
||||
from automlst.engine.data.structures.mlst import Allele, MLSTProfile
|
||||
from automlst.engine.exceptions.database import NoBIGSdbExactMatchesException, NoBIGSdbMatchesException
|
||||
from automlst.engine.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
|
||||
from automlst.engine.data.remote.databases.bigsdb import BIGSdbIndex, BIGSdbMLSTProfiler
|
||||
|
||||
def gene_scrambler(gene: str, mutation_site_count: Union[int, float], alphabet: Sequence[str] = ["A", "T", "C", "G"]):
|
||||
rand = random.Random(gene)
|
@@ -1,12 +0,0 @@
|
||||
import os
|
||||
|
||||
from automlst.engine.local.abif import read_abif, reference_consensus_assembly
|
||||
|
||||
async def test_load_sanger_sequence_has_data():
    """The bundled forward trace exists on disk and ``read_abif`` returns data for it."""
    trace_path = "tests/resources/1I1_F_P1815443_047.ab1"
    assert os.path.exists(trace_path)
    loaded_trace = await read_abif(trace_path)
    assert loaded_trace is not None
|
||||
|
||||
async def test_consensus_assembly_with_ncbi():
    """Build the consensus generator from both bundled traces (no assertions yet)."""
    forward_trace = await read_abif("tests/resources/1I1_F_P1815443_047.ab1")
    reverse_trace = await read_abif("tests/resources/1I1_R_P1815443_094.ab1")
    # The generator is only created here; nothing iterates it yet.
    consensus = reference_consensus_assembly("ON685494.1", [forward_trace, reverse_trace])
    # TODO complete implementing this
|
@@ -1,5 +0,0 @@
|
||||
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
|
||||
|
||||
|
||||
async def test_fetch_ncbi_genbank_with_id_works():
    """Fetching ``CP011448.1`` yields a record with a non-empty sequence."""
    fetched_record = await fetch_ncbi_genbank("CP011448.1")
    assert len(fetched_record.sequence) > 0
|
Reference in New Issue
Block a user