Copied over most code from another of my projects

Harrison Deng 2025-01-08 15:14:06 +00:00
commit 02985d5e37
32 changed files with 59455 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
"name": "Python 3",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye",
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],
// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "pip3 install --user -r requirements.txt"
// Configure tool-specific properties.
// "customizations": {},
// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}

359
.gitignore vendored Normal file
View File

@@ -0,0 +1,359 @@
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,svelte,python,linux,node
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,svelte,python,linux,node
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### Node ###
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
.cache/
# Comment in the 'public' line if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
### Node Patch ###
# Serverless Webpack directories
.webpack/
# Optional stylelint cache
# SvelteKit build / generate output
.svelte-kit
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### Svelte ###
# gitignore template for the SvelteKit, frontend web component framework
# website: https://kit.svelte.dev/
.svelte-kit/
package
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,svelte,python,linux,node
# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)
output

27
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,27 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "CLI ipdbmlst",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/src/automlst/cli/root.py",
"console": "integratedTerminal",
"args": [
"-fa",
"${workspaceFolder}/tests/resources/tohama_I_bpertussis.fasta",
"-ipdbmlst",
"pubmlst_bordetella_seqdef",
"${workspaceFolder}/output"
],
"cwd": "${workspaceFolder}/src",
"env": {
"PYTHONPATH": "${workspaceFolder}/src"
}
}
]
}

4
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,4 @@
{
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}

50
Jenkinsfile vendored Normal file
View File

@@ -0,0 +1,50 @@
pipeline {
agent {
kubernetes {
cloud 'rsys-devel'
defaultContainer 'homebrew'
inheritFrom 'homebrew'
}
}
stages {
stage("install") {
steps {
sh 'brew install python@3.11 sphinx-doc'
sh 'python3.11 -m pip install -r requirements.txt'
}
}
stage("unit tests") {
steps {
sh returnStatus: true, script: "python3.11 -m pytest --junitxml=test_results.xml"
xunit checksName: '', tools: [JUnit(excludesPattern: '', pattern: 'test_results.xml', stopProcessingIfError: true)]
}
}
stage("build") {
steps {
sh "python3.11 -m build"
}
}
stage("test installation") {
steps {
sh "python3.11 -m pip install dist/*.whl --force-reinstall"
sh "automlst -h"
}
}
stage("archive") {
steps {
archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true
}
}
stage("publish") {
environment {
CREDS = credentials('4d6f64be-d26d-4f95-8de3-b6a9b0beb311')
}
when {
branch '**/main'
}
steps {
sh returnStatus: true, script: 'python3.11 -m twine upload --repository-url https://git.reslate.systems/api/packages/${CREDS_USR}/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
}
}
}
}

3
README.md Normal file
View File

@@ -0,0 +1,3 @@
# FASTA-MLST
A CLI tool for rapidly performing multilocus sequence typing (MLST) by querying the PubMLST and Institut Pasteur MLST databases.
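For example, a typical invocation (mirroring the bundled VS Code launch configuration; the FASTA path and output folder are placeholders) might look like:

automlst -fa genome.fasta -ipdbmlst pubmlst_bordetella_seqdef output

This profiles the sequences in genome.fasta against the Institut Pasteur Bordetella MLST database and writes the resulting profiles as a CSV into the output folder.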

50
pyproject.toml Normal file
View File

@@ -0,0 +1,50 @@
[build-system]
requires = ["setuptools >= 61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "automlst"
dynamic = ["version"]
dependencies = [
"biopython",
"aiohttp[speedups]",
]
requires-python = ">=3.11"
description = "A tool to rapidly fetch MLST profiles for given sequences of various pathogens."
[project.scripts]
automlst = "automlst.cli.root:cli"
nsbdiagtk = "automlst.cli.root:cli"
[tool.pyright]
extraPaths = ["src"]
exclude = [
"**/node_modules",
"**/__pycache__"
]
executionEnvironments = [
{root = "src"}
]
[tool.setuptools]
package-dir = {"" = "src"}
[tool.pytest.ini_options]
testpaths = [
"tests"
]
pythonpath = [
"src"
]
addopts = [
"--import-mode=importlib",
]
asyncio_mode = "auto"
[tool.pylint.main]
source-roots = "src"
[tool.pylint.format]
# Maximum number of characters on a single line.
max-line-length = 88

6
requirements.txt Normal file
View File

@@ -0,0 +1,6 @@
aiohttp[speedups]
biopython
pytest
pytest-asyncio
build
twine

23
src/automlst/cli/aggregator.py Normal file
View File

@@ -0,0 +1,23 @@
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence
from automlst.engine.data.MLST import MLSTProfile
from automlst.engine.local.abif import read_abif
from automlst.engine.local.fasta import read_fasta
from automlst.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
async def aggregate_sequences(fastas: Iterable[str], abifs: Iterable[str]) -> AsyncGenerator[str, Any]:
for fasta_path in fastas:
async for fasta in read_fasta(fasta_path):
yield fasta.sequence
for abif_path in abifs:
abif_data = await read_abif(abif_path)
yield "".join(abif_data.sequence)
async def profile_all_genetic_strings(strings: AsyncIterable[str], database_name: str) -> Sequence[MLSTProfile]:
profiles = list()
async with InstitutPasteurProfiler(database_name=database_name) as profiler:
async for string in strings:
profiles.append(await profiler.profile_string(string))
return profiles
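A minimal sketch of how these two coroutines compose, assuming placeholder file and database names (the call pattern mirrors root.py below):

import asyncio

async def demo():
    # aggregate_sequences lazily yields raw sequence strings from the inputs
    strings = aggregate_sequences(["genome.fasta"], [])
    profiles = await profile_all_genetic_strings(strings, "pubmlst_bordetella_seqdef")
    print(profiles)

asyncio.run(demo())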

70
src/automlst/cli/root.py Normal file
View File

@@ -0,0 +1,70 @@
import argparse
import asyncio
import datetime
from os import path
import os
from automlst.cli import aggregator
from automlst.engine.local.csv import write_mlst_profiles_as_csv
parser = argparse.ArgumentParser()
parser.add_argument(
"--run-name", "-name",
dest="run_name",
required=False,
default=datetime.datetime.now().strftime(r"%Y%m%d%H%M%S"),
type=str,
help="The name of the run. Will use a date and time string if not provided."
)
parser.add_argument(
"--fasta", "-fa", "-fst",
nargs="+",
action='extend',
dest="fastas",
required=False,
default=[],
type=str,
help="The FASTA files to process. Multiple can be listed."
)
parser.add_argument(
"--abif", "-abi", "-ab1",
    nargs="+",
    action='extend',
dest="abifs",
required=False,
default=[],
type=str,
help="The ABIF files to process. Multiple can be listed."
)
parser.add_argument(
"--institut-pasteur-mlst",
"-ipdbmlst",
dest="institut_pasteur_db",
required=False,
default=None,
type=str,
help="The Institut Pasteur MLST database to use."
)
parser.add_argument(
"out",
default="./.",
help="The output folder. Files will be named by the provided (or default) run name."
)
def cli():
    args = parser.parse_args()
    os.makedirs(args.out, exist_ok=True)
    gen_strings = aggregator.aggregate_sequences(args.fastas, args.abifs)
    if args.institut_pasteur_db is not None:
        async def profile_and_write():
            # Profile and write within a single event loop rather than
            # nesting two separate asyncio.run() calls.
            mlst_profiles = await aggregator.profile_all_genetic_strings(
                gen_strings, args.institut_pasteur_db)
            await write_mlst_profiles_as_csv(
                mlst_profiles, str(path.join(args.out, "MLST_" + args.run_name + ".csv")))
        asyncio.run(profile_and_write())
if __name__ == "__main__":
cli()

View File

44
src/automlst/engine/annotate.py Normal file
View File

@@ -0,0 +1,44 @@
from collections.abc import Set
from Bio.Align import PairwiseAligner
import numpy as np
from automlst.engine.data.genomics import AnnotatedString, StringAnnotation
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length: int = 512, gene_targets: Set = frozenset()):
# TODO implement asynchronous alignment algorithm
reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
query_annotations = list()
aligner = PairwiseAligner("blastn")
aligner.mode = "local"
for annotation in reference_annotations.annotations:
if annotation.type != "gene" or "gene" not in annotation.feature_properties:
continue
if len(gene_targets) > 0 and "gene" in annotation.feature_properties:
if not annotation.feature_properties["gene"].intersection(gene_targets):
continue
if max_annotation_length > 0 and annotation.end - annotation.start > max_annotation_length:
# TODO implement a failsafe
continue
feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation)
alignments = aligner.align(query_string, feature_string_sequence)
if len(alignments) < 1:
# TODO implement a failsafe
continue
        top_alignment = alignments[0]  # all returned alignments share the optimal score; take the first
# TODO Check if alternatives are better
query_annotations.append(StringAnnotation(
type=annotation.type, # same as original
start=np.min(top_alignment.aligned[0]), # We only care about the start of first chunk
end=np.max(top_alignment.aligned[0]), # and the end of the last chunk
feature_properties=dict(annotation.feature_properties) # same as original
))
return AnnotatedString(name=query_name, sequence=query_string, annotations=query_annotations)
def get_feature_coding(annotated_string: AnnotatedString, string_annotation: StringAnnotation) -> str:
return annotated_string.sequence[string_annotation.start:string_annotation.end]
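A usage sketch mirroring the annotation test at the end of this commit (the accession, resource path, and gene target come from that test; network access to NCBI is assumed):

import asyncio
from Bio import SeqIO

async def demo():
    query = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    annotated = await annotate_from_genbank(
        "CP011448.1", "bpertussis_tohamaI", query,
        max_annotation_length=750, gene_targets={"adk"})
    for annotation in annotated.annotations:
        print(annotation.type, annotation.start, annotation.end)

asyncio.run(demo())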

13
src/automlst/engine/data/MLST.py Normal file
View File

@@ -0,0 +1,13 @@
from dataclasses import dataclass
from typing import Mapping, Sequence
@dataclass
class Allele:
allele_loci: str
allele_variant: str
@dataclass
class MLSTProfile:
alleles: Mapping[str, Sequence[Allele]]
    sequence_type: str
clonal_complex: str
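For illustration, a profile for the Tohama I reference strain used in the tests below would be constructed roughly as follows (values taken from those tests):

adk = Allele(allele_loci="adk", allele_variant="1")
profile = MLSTProfile(
    alleles={"adk": [adk]},
    sequence_type="1",
    clonal_complex="ST-2 complex",
)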

View File

105
src/automlst/engine/data/genomics.py Normal file
View File

@@ -0,0 +1,105 @@
from dataclasses import dataclass
from numbers import Number
from typing import Mapping, Sequence, Set, Union
@dataclass
class StringAnnotation:
type: str
start: int
end: int
feature_properties: Mapping[str, Set[str]]
@dataclass
class NamedString:
name: str
sequence: str
@dataclass
class AnnotatedString(NamedString):
annotations: Sequence[StringAnnotation]
@dataclass
class SangerTraceData:
sequence: Sequence[str]
seq_param_file_name: str
analysis_proto_settings_name: str
    analysis_proto_settings_ver: str
analysis_proto_xml_data: str
analysis_proto_xml_schema_ver: str
sample_comment: Union[None, str]
capillary_machine: bool
container_identifier: str
container_name: str
comment_title: str
channel_1: Sequence[Number]
channel_2: Sequence[Number]
channel_3: Sequence[Number]
channel_4: Sequence[Number]
measured_voltage_dv: Sequence[Number]
measured_current_ma: Sequence[Number]
measured_power_mw: Sequence[Number]
measured_temperature_celsius: Sequence[Number]
down_sample_factor: Number
dye_1: str
dye_2: str
dye_3: str
dye_4: str
dye_wavelength_1: str
dye_wavelength_2: str
dye_wavelength_3: str
dye_wavelength_4: str
dye_set_name: str
electrophoresis_voltage_setting_v: Number
start_run_event: str
stop_run_event: str
start_collection_event: str
stop_collection_event: str
base_order: Sequence[str]
gel_type_desc: str
injection_time_sec: Number
    injection_voltage_v: Number
lane_or_capillary: Number
sample_tracking_id: str
length_to_detector_cm: Number
laser_power_mw: Number
instrument_name_and_serial: str
data_collection_module_file: str
model_number: str
pixels_avg_per_lane: Number
number_of_capillaries: Number
marked_off_scale_scans: Union[None, Sequence[Number]]
# Skipped Ovrl, OvrV
mobility_file: str
# Skipped PRJT, PROJ
pixel_bin_size: Number
# Skipped scan rate
results_group_comment: Union[None, str]
results_group_name: str
run_module_ver: str
run_module_xml: str
run_module_xml_ver: str
run_proto_name: str
run_proto_ver: str
run_start_date: str # Date time object
run_stop_date: str # Date time object
data_collection_start_date: str
data_collection_stop_date: str
run_name: str
run_start_time: str # time object
run_stop_time: str # time object
collection_start_time: str # time object
collection_stop_time: str # time object
saturated_data_points: Union[None, Sequence[Number]]
color_rescaling_divisor: Number
scan_count: Number
polymer_lot_expiration: str # date time object
polymer_lot_number: Number
sample_name: str
# Skipped genescan data
# Skipped size standard file name
data_collection_software_ver: str
data_collection_firmware_ver: str
    run_temperature_setting_celsius: Number
well_id: str
plate_user_name: str

104
src/automlst/engine/local/abif.py Normal file
View File

@@ -0,0 +1,104 @@
import asyncio
from os import path
from automlst.engine.data.genomics import SangerTraceData
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
with open(seq_path, "rb") as seq_handle:
return SeqIO.read(seq_handle, "abi")
async def read_abif(seq_path: str) -> SangerTraceData:
ext = path.splitext(seq_path)[1]
if ext.lower() != ".ab1" and ext.lower() != "abi":
raise ValueError(
'seq_path must have file extension of "ab1", or "abi".')
biopython_seq = await asyncio.to_thread(_biopython_read_abif_sequence, seq_path)
biopython_annotations = biopython_seq.annotations
# Lot of type ignoring since Biopython did not define their typing.
biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore
trace_data = SangerTraceData(
biopython_seq.seq,
biopython_abif_raw.get("APFN2"), # type: ignore
biopython_abif_raw.get("APrN1"), # type: ignore
biopython_abif_raw.get("APrV1"), # type: ignore
biopython_abif_raw.get("APrX1"), # type: ignore
biopython_abif_raw.get("APXV1"), # type: ignore
biopython_abif_raw.get("CMNT1"), # type: ignore
biopython_abif_raw.get("CpEP1"), # type: ignore
biopython_abif_raw.get("CTID1"), # type: ignore
biopython_abif_raw.get("CTNM1"), # type: ignore
biopython_abif_raw.get("CTTL1"), # type: ignore
biopython_abif_raw.get("DATA1"), # type: ignore
biopython_abif_raw.get("DATA2"), # type: ignore
biopython_abif_raw.get("DATA3"), # type: ignore
biopython_abif_raw.get("DATA4"), # type: ignore
biopython_abif_raw.get("DATA5"), # type: ignore
biopython_abif_raw.get("DATA6"), # type: ignore
biopython_abif_raw.get("DATA7"), # type: ignore
biopython_abif_raw.get("DATA8"), # type: ignore
biopython_abif_raw.get("DSam1"), # type: ignore
biopython_abif_raw.get("DyeN1"), # type: ignore
biopython_abif_raw.get("DyeN2"), # type: ignore
biopython_abif_raw.get("DyeN3"), # type: ignore
biopython_abif_raw.get("DyeN4"), # type: ignore
biopython_abif_raw.get("DyeW1"), # type: ignore
biopython_abif_raw.get("DyeW2"), # type: ignore
biopython_abif_raw.get("DyeW3"), # type: ignore
biopython_abif_raw.get("DyeW4"), # type: ignore
biopython_abif_raw.get("DySN1"), # type: ignore
biopython_abif_raw.get("EPVt1"), # type: ignore
biopython_abif_raw.get("EVNT1"), # type: ignore
biopython_abif_raw.get("EVNT2"), # type: ignore
biopython_abif_raw.get("EVNT3"), # type: ignore
biopython_abif_raw.get("EVNT4"), # type: ignore
biopython_abif_raw.get("FWO_1"), # type: ignore
biopython_abif_raw.get("GTyp1"), # type: ignore
biopython_abif_raw.get("InSc1"), # type: ignore
biopython_abif_raw.get("InVt1"), # type: ignore
biopython_abif_raw.get("LANE1"), # type: ignore
biopython_abif_raw.get("LIMS1"), # type: ignore
biopython_abif_raw.get("LNTD1"), # type: ignore
biopython_abif_raw.get("LsrP1"), # type: ignore
biopython_abif_raw.get("MCHN1"), # type: ignore
biopython_abif_raw.get("MODF1"), # type: ignore
biopython_abif_raw.get("MODL1"), # type: ignore
biopython_abif_raw.get("NAVG1"), # type: ignore
biopython_abif_raw.get("NLNE1"), # type: ignore
biopython_abif_raw.get("OfSc1"), # type: ignore
biopython_abif_raw.get("PDMF1"), # type: ignore
biopython_abif_raw.get("PXLB1"), # type: ignore
biopython_abif_raw.get("RGCm1"), # type: ignore
biopython_abif_raw.get("RGNm1"), # type: ignore
biopython_abif_raw.get("RMdV1"), # type: ignore
biopython_abif_raw.get("RMdX1"), # type: ignore
biopython_abif_raw.get("RMXV1"), # type: ignore
biopython_abif_raw.get("RPrN1"), # type: ignore
biopython_abif_raw.get("RPrV1"), # type: ignore
biopython_abif_raw.get("RUND1"), # type: ignore
biopython_abif_raw.get("RUND2"), # type: ignore
biopython_abif_raw.get("RUND3"), # type: ignore
biopython_abif_raw.get("RUND4"), # type: ignore
biopython_abif_raw.get("RunN1"), # type: ignore
biopython_abif_raw.get("RUNT1"), # type: ignore
biopython_abif_raw.get("RUNT2"), # type: ignore
biopython_abif_raw.get("RUNT3"), # type: ignore
biopython_abif_raw.get("RUNT4"), # type: ignore
biopython_abif_raw.get("Satd"), # type: ignore
biopython_abif_raw.get("Scal1"), # type: ignore
biopython_abif_raw.get("SCAN1"), # type: ignore
biopython_abif_raw.get("SMED1"), # type: ignore
biopython_abif_raw.get("SMLt"), # type: ignore
biopython_abif_raw.get("SMPL1"), # type: ignore
biopython_abif_raw.get("SVER1"), # type: ignore
biopython_abif_raw.get("SVER3"), # type: ignore
biopython_abif_raw.get("Tmpr1"), # type: ignore
biopython_abif_raw.get("TUBE"), # type: ignore
biopython_abif_raw.get("User") # type: ignore
)
return trace_data
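A quick usage sketch (the trace file path comes from the ABIF test below):

import asyncio

trace = asyncio.run(read_abif("tests/resources/1I1_F_P1815443_047.ab1"))
print(trace.sample_name, len(trace.sequence))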

31
src/automlst/engine/local/csv.py Normal file
View File

@@ -0,0 +1,31 @@
import csv
from os import PathLike
from typing import Iterable, Mapping, Sequence, Union
from automlst.engine.data.MLST import Allele, MLSTProfile
def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
result_dict: dict[str, list[str]] = {}
for loci, alleles in alleles_map.items():
result_dict[loci] = list()
for allele in alleles:
result_dict[loci].append(allele.allele_variant)
return result_dict
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: Iterable[MLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
    mlst_profiles = list(mlst_profiles_iterable)
    if len(mlst_profiles) < 1:
        return
    header = ["st", "clonal-complex", *mlst_profiles[0].alleles.keys()]
with open(handle, "w", newline='') as filehandle:
writer = csv.DictWriter(filehandle, fieldnames=header)
writer.writeheader()
for mlst_profile in mlst_profiles:
row_dictionary = {
"st": mlst_profile.sequence_type,
"clonal-complex": mlst_profile.clonal_complex,
**loci_alleles_variants_from_loci(mlst_profile.alleles)
}
            writer.writerow(row_dictionary)
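A short sketch of the writer, with a placeholder file name and allele values mirroring the tests below; the resulting CSV has the header st,clonal-complex,adk and one row per profile:

import asyncio
from automlst.engine.data.MLST import Allele, MLSTProfile

profiles = [MLSTProfile({"adk": [Allele("adk", "1")]}, "1", "ST-2 complex")]
asyncio.run(write_mlst_profiles_as_csv(profiles, "MLST_demo.csv"))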

11
src/automlst/engine/local/fasta.py Normal file
View File

@@ -0,0 +1,11 @@
import asyncio
from io import TextIOWrapper
from typing import Any, AsyncGenerator, Union
from Bio import SeqIO
from automlst.engine.data.genomics import NamedString
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
for fasta_sequence in await fasta_sequences:
yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
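Because read_fasta is an async generator, it is consumed with async for (resource path taken from the FASTA test below):

import asyncio

async def demo():
    async for named_string in read_fasta("tests/resources/tohama_I_bpertussis.fasta"):
        print(named_string.name, len(named_string.sequence))

asyncio.run(demo())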

62
src/automlst/engine/remote/databases/institutpasteur/profiling.py Normal file
View File

@@ -0,0 +1,62 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.MLST import Allele, MLSTProfile
class InstitutPasteurProfiler(AbstractAsyncContextManager):
    def __init__(self, database_name: str):
        self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
    async def __aenter__(self):
        return self
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "schemes/3/sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
})
sequence_response: dict = await response.json()
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
                allele_id = allele["allele_id"]
                yield Allele(allele_loci=allele_loci, allele_variant=allele_id)
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
uri_path = "schemes/3/designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
if isinstance(alleles, AsyncIterable):
async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
else:
for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
response = await self._http_client.post(uri_path, json={
"designations": allele_request_dict
})
response_json = await response.json()
schema_fields_returned = response_json["fields"]
schema_exact_matches = response_json["exact_matches"]
allele_map: dict[str, list[Allele]] = defaultdict(list)
for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
for exact_match_allele in exact_match_alleles:
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, string: str) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string)
return await self.fetch_mlst_st(alleles)
async def close(self):
await self._http_client.close()
async def __aexit__(self, exc_type, exc_value, traceback):
await self.close()
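The intended call pattern, as used by the aggregator above (database name from the tests; network access to bigsdb.pasteur.fr is required):

import asyncio
from Bio import SeqIO

async def demo():
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as profiler:
        profile = await profiler.profile_string(sequence)
        print(profile.sequence_type, profile.clonal_complex)

asyncio.run(demo())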

27
src/automlst/engine/remote/databases/ncbi/genbank.py Normal file
View File

@@ -0,0 +1,27 @@
import asyncio
from Bio import Entrez
from Bio import SeqIO
# TODO Change this out for a more professional approach
Entrez.email = "yunyangdeng@outlook.com"
from automlst.engine.data.genomics import AnnotatedString, StringAnnotation
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
record = SeqIO.read(fetch_stream, "genbank")
sequence_features = list()
for feature in record.features:
start = int(feature.location.start)
end = int(feature.location.end)
qualifiers = feature.qualifiers
for qualifier_key in qualifiers:
qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
sequence_features.append(StringAnnotation(
type=feature.type,
start=start,
            end=end,  # Biopython location ends are already exclusive
feature_properties=qualifiers
))
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
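A quick sketch (the accession comes from the GenBank test below; network access and the Entrez email set above are assumed):

import asyncio

annotated = asyncio.run(fetch_ncbi_genbank("CP011448.1"))
print(annotated.name, len(annotated.sequence), len(annotated.annotations))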

View File

@@ -0,0 +1,8 @@
import os
from automlst.engine.local.abif import read_abif
async def test_load_sanger_sequence_has_data():
assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1")
result_data = await read_abif("tests/resources/1I1_F_P1815443_047.ab1")
assert result_data is not None

View File

@@ -0,0 +1,7 @@
from automlst.engine.local.fasta import read_fasta
async def test_fasta_reader_not_none():
named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
async for named_string in named_strings:
assert named_string.name == "BX470248.1"

View File

@@ -0,0 +1,35 @@
from Bio import SeqIO
from automlst.engine.data.MLST import Allele, MLSTProfile
from automlst.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
async def test_profiling_results_in_exact_matches_when_exact():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
async for exact_match in exact_matches:
assert isinstance(exact_match, Allele)
            assert exact_match.allele_variant == '1'  # all Tohama I loci have allele id 1
targets_left.remove(exact_match.allele_loci)
assert len(targets_left) == 0
async def test_profiling_results_in_correct_st():
dummy_alleles = [
Allele("adk", "1"),
Allele("fumC", "1"),
Allele("glyA", "1"),
Allele("tyrB", "1"),
Allele("icd", "1"),
Allele("pepA", "1"),
Allele("pgm", "1"),
]
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_alleles)
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == "ST-2 complex"
assert mlst_st_data.sequence_type == "1"

View File

@@ -0,0 +1,5 @@
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
async def test_fetch_ncbi_genbank_with_id_works():
assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0

View File

@@ -0,0 +1,12 @@
from automlst.engine.annotate import annotate_from_genbank, fetch_ncbi_genbank
from Bio import SeqIO
from automlst.engine.data.genomics import AnnotatedString
async def test_annotate_from_genbank_for_adk_annotation():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", sequence, max_annotation_length=750, gene_targets=set(["adk"]))
assert isinstance(annotated_sequence, AnnotatedString)
assert len(annotated_sequence.annotations) >= 1
assert annotated_sequence.annotations[0].type == "gene"
assert "adk" in annotated_sequence.annotations[0].feature_properties["gene"]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large