Copied over most code from another of my projects

Harrison Deng 2025-01-08 15:14:06 +00:00
commit 02985d5e37
32 changed files with 59455 additions and 0 deletions

View File

@@ -0,0 +1,22 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
"name": "Python 3",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye",
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],
// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "pip3 install --user -r requirements.txt"
// Configure tool-specific properties.
// "customizations": {},
// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}

359
.gitignore vendored Normal file
View File

@@ -0,0 +1,359 @@
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,svelte,python,linux,node
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,svelte,python,linux,node
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### Node ###
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
.cache/
# Comment in the 'public' line if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
### Node Patch ###
# Serverless Webpack directories
.webpack/
# Optional stylelint cache
# SvelteKit build / generate output
.svelte-kit
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### Svelte ###
# gitignore template for the SvelteKit, frontend web component framework
# website: https://kit.svelte.dev/
.svelte-kit/
package
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,svelte,python,linux,node
# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)
output

27
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,27 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "CLI ipdbmlst",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/src/automlst/cli/root.py",
"console": "integratedTerminal",
"args": [
"-fa",
"${workspaceFolder}/tests/resources/tohama_I_bpertussis.fasta",
"-ipdbmlst",
"pubmlst_bordetella_seqdef",
"${workspaceFolder}/output"
],
"cwd": "${workspaceFolder}/src",
"env": {
"PYTHONPATH": "${workspaceFolder}/src"
}
}
]
}

4
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,4 @@
{
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}

50
Jenkinsfile vendored Normal file
View File

@@ -0,0 +1,50 @@
pipeline {
agent {
kubernetes {
cloud 'rsys-devel'
defaultContainer 'homebrew'
inheritFrom 'homebrew'
}
}
stages {
stage("install") {
steps {
sh 'brew install python@3.11 sphinx-doc'
sh 'python3.11 -m pip install -r requirements.txt'
}
}
stage("unit tests") {
steps {
sh returnStatus: true, script: "python3.11 -m pytest --junitxml=test_results.xml"
xunit checksName: '', tools: [JUnit(excludesPattern: '', pattern: 'test_results.xml', stopProcessingIfError: true)]
}
}
stage("build") {
steps {
sh "python3.11 -m build"
}
}
stage("test installation") {
steps {
sh "python3.11 -m pip install dist/*.whl --force-reinstall"
sh "automlst -h"
}
}
stage("archive") {
steps {
archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl', fingerprint: true, followSymlinks: false, onlyIfSuccessful: true
}
}
stage("publish") {
environment {
CREDS = credentials('4d6f64be-d26d-4f95-8de3-b6a9b0beb311')
}
when {
branch '**/main'
}
steps {
sh returnStatus: true, script: 'python3.11 -m twine upload --repository-url https://git.reslate.systems/api/packages/${CREDS_USR}/pypi -u ${CREDS_USR} -p ${CREDS_PSW} --non-interactive --disable-progress-bar --verbose dist/*'
}
}
}
}

3
README.md Normal file
View File

@@ -0,0 +1,3 @@
# FASTA-MLST
A CLI tool for rapidly performing multilocus sequence typing (MLST) by querying the PubMLST and Institut Pasteur MLST databases.
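For example, a typical invocation (mirroring the bundled VS Code launch configuration; the FASTA path and output folder are placeholders) might look like:

automlst -fa genome.fasta -ipdbmlst pubmlst_bordetella_seqdef output

This profiles the sequences in genome.fasta against the Institut Pasteur Bordetella MLST database and writes the resulting profiles as a CSV into the output folder.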

50
pyproject.toml Normal file
View File

@@ -0,0 +1,50 @@
[build-system]
requires = ["setuptools >= 61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "automlst"
dynamic = ["version"]
dependencies = [
"biopython",
"aiohttp[speedups]",
]
requires-python = ">=3.11"
description = "A tool to rapidly fetch MLST profiles for given sequences of various pathogens."
[project.scripts]
automlst = "automlst.cli.root:cli"
nsbdiagtk = "automlst.cli.root:cli"
[tool.pyright]
extraPaths = ["src"]
exclude = [
"**/node_modules",
"**/__pycache__"
]
executionEnvironments = [
{root = "src"}
]
[tool.setuptools]
package-dir = {"" = "src"}
[tool.pytest.ini_options]
testpaths = [
"tests"
]
pythonpath = [
"src"
]
addopts = [
"--import-mode=importlib",
]
asyncio_mode = "auto"
[tool.pylint.main]
source-roots = "src"
[tool.pylint.format]
# Maximum number of characters on a single line.
max-line-length = 88

6
requirements.txt Normal file
View File

@@ -0,0 +1,6 @@
aiohttp[speedups]
biopython
pytest
pytest-asyncio
build
twine

23
src/automlst/cli/aggregator.py Normal file
View File

@@ -0,0 +1,23 @@
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence
from automlst.engine.data.MLST import MLSTProfile
from automlst.engine.local.abif import read_abif
from automlst.engine.local.fasta import read_fasta
from automlst.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
async def aggregate_sequences(fastas: Iterable[str], abifs: Iterable[str]) -> AsyncGenerator[str, Any]:
for fasta_path in fastas:
async for fasta in read_fasta(fasta_path):
yield fasta.sequence
for abif_path in abifs:
abif_data = await read_abif(abif_path)
yield "".join(abif_data.sequence)
async def profile_all_genetic_strings(strings: AsyncIterable[str], database_name: str) -> Sequence[MLSTProfile]:
profiles = list()
async with InstitutPasteurProfiler(database_name=database_name) as profiler:
async for string in strings:
profiles.append(await profiler.profile_string(string))
return profiles
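A minimal sketch of how these two coroutines compose, assuming placeholder file and database names (the call pattern mirrors root.py below):

import asyncio

async def demo():
    # aggregate_sequences lazily yields raw sequence strings from the inputs
    strings = aggregate_sequences(["genome.fasta"], [])
    profiles = await profile_all_genetic_strings(strings, "pubmlst_bordetella_seqdef")
    print(profiles)

asyncio.run(demo())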

70
src/automlst/cli/root.py Normal file
View File

@@ -0,0 +1,70 @@
import argparse
import asyncio
import datetime
from os import path
import os
from automlst.cli import aggregator
from automlst.engine.local.csv import write_mlst_profiles_as_csv
parser = argparse.ArgumentParser()
parser.add_argument(
"--run-name", "-name",
dest="run_name",
required=False,
default=datetime.datetime.now().strftime(r"%Y%m%d%H%M%S"),
type=str,
help="The name of the run. Will use a date and time string if not provided."
)
parser.add_argument(
"--fasta", "-fa", "-fst",
nargs="+",
action='extend',
dest="fastas",
required=False,
default=[],
type=str,
help="The FASTA files to process. Multiple can be listed."
)
parser.add_argument(
"--abif", "-abi", "-ab1",
    nargs="+",
    action='extend',
dest="abifs",
required=False,
default=[],
type=str,
help="The ABIF files to process. Multiple can be listed."
)
parser.add_argument(
"--institut-pasteur-mlst",
"-ipdbmlst",
dest="institut_pasteur_db",
required=False,
default=None,
type=str,
help="The Institut Pasteur MLST database to use."
)
parser.add_argument(
"out",
default="./.",
help="The output folder. Files will be named by the provided (or default) run name."
)
def cli():
    args = parser.parse_args()
    os.makedirs(args.out, exist_ok=True)
    gen_strings = aggregator.aggregate_sequences(args.fastas, args.abifs)
    if args.institut_pasteur_db is not None:
        async def profile_and_write():
            # Profile and write within a single event loop rather than
            # nesting two separate asyncio.run() calls.
            mlst_profiles = await aggregator.profile_all_genetic_strings(
                gen_strings, args.institut_pasteur_db)
            await write_mlst_profiles_as_csv(
                mlst_profiles, str(path.join(args.out, "MLST_" + args.run_name + ".csv")))
        asyncio.run(profile_and_write())
if __name__ == "__main__":
cli()

View File

44
src/automlst/engine/annotate.py Normal file
View File

@@ -0,0 +1,44 @@
from collections.abc import Set
from Bio.Align import PairwiseAligner
import numpy as np
from automlst.engine.data.genomics import AnnotatedString, StringAnnotation
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
async def annotate_from_genbank(genbank_id: str, query_name: str, query_string: str, max_annotation_length: int = 512, gene_targets: Set = frozenset()):
# TODO implement asynchronous alignment algorithm
reference_annotations = await fetch_ncbi_genbank(genbank_id=genbank_id)
query_annotations = list()
aligner = PairwiseAligner("blastn")
aligner.mode = "local"
for annotation in reference_annotations.annotations:
if annotation.type != "gene" or "gene" not in annotation.feature_properties:
continue
if len(gene_targets) > 0 and "gene" in annotation.feature_properties:
if not annotation.feature_properties["gene"].intersection(gene_targets):
continue
if max_annotation_length > 0 and annotation.end - annotation.start > max_annotation_length:
# TODO implement a failsafe
continue
feature_string_sequence = get_feature_coding(annotated_string=reference_annotations, string_annotation=annotation)
alignments = aligner.align(query_string, feature_string_sequence)
if len(alignments) < 1:
# TODO implement a failsafe
continue
        top_alignment = alignments[0]  # all returned alignments share the optimal score; take the first
# TODO Check if alternatives are better
query_annotations.append(StringAnnotation(
type=annotation.type, # same as original
start=np.min(top_alignment.aligned[0]), # We only care about the start of first chunk
end=np.max(top_alignment.aligned[0]), # and the end of the last chunk
feature_properties=dict(annotation.feature_properties) # same as original
))
return AnnotatedString(name=query_name, sequence=query_string, annotations=query_annotations)
def get_feature_coding(annotated_string: AnnotatedString, string_annotation: StringAnnotation) -> str:
return annotated_string.sequence[string_annotation.start:string_annotation.end]
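A usage sketch mirroring the annotation test at the end of this commit (the accession, resource path, and gene target come from that test; network access to NCBI is assumed):

import asyncio
from Bio import SeqIO

async def demo():
    query = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    annotated = await annotate_from_genbank(
        "CP011448.1", "bpertussis_tohamaI", query,
        max_annotation_length=750, gene_targets={"adk"})
    for annotation in annotated.annotations:
        print(annotation.type, annotation.start, annotation.end)

asyncio.run(demo())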

13
src/automlst/engine/data/MLST.py Normal file
View File

@@ -0,0 +1,13 @@
from dataclasses import dataclass
from typing import Mapping, Sequence
@dataclass
class Allele:
allele_loci: str
allele_variant: str
@dataclass
class MLSTProfile:
alleles: Mapping[str, Sequence[Allele]]
    sequence_type: str
clonal_complex: str
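For illustration, a profile for the Tohama I reference strain used in the tests below would be constructed roughly as follows (values taken from those tests):

adk = Allele(allele_loci="adk", allele_variant="1")
profile = MLSTProfile(
    alleles={"adk": [adk]},
    sequence_type="1",
    clonal_complex="ST-2 complex",
)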

View File

105
src/automlst/engine/data/genomics.py Normal file
View File

@@ -0,0 +1,105 @@
from dataclasses import dataclass
from numbers import Number
from typing import Mapping, Sequence, Set, Union
@dataclass
class StringAnnotation:
type: str
start: int
end: int
feature_properties: Mapping[str, Set[str]]
@dataclass
class NamedString:
name: str
sequence: str
@dataclass
class AnnotatedString(NamedString):
annotations: Sequence[StringAnnotation]
@dataclass
class SangerTraceData:
sequence: Sequence[str]
seq_param_file_name: str
analysis_proto_settings_name: str
    analysis_proto_settings_ver: str
analysis_proto_xml_data: str
analysis_proto_xml_schema_ver: str
sample_comment: Union[None, str]
capillary_machine: bool
container_identifier: str
container_name: str
comment_title: str
channel_1: Sequence[Number]
channel_2: Sequence[Number]
channel_3: Sequence[Number]
channel_4: Sequence[Number]
measured_voltage_dv: Sequence[Number]
measured_current_ma: Sequence[Number]
measured_power_mw: Sequence[Number]
measured_temperature_celsius: Sequence[Number]
down_sample_factor: Number
dye_1: str
dye_2: str
dye_3: str
dye_4: str
dye_wavelength_1: str
dye_wavelength_2: str
dye_wavelength_3: str
dye_wavelength_4: str
dye_set_name: str
electrophoresis_voltage_setting_v: Number
start_run_event: str
stop_run_event: str
start_collection_event: str
stop_collection_event: str
base_order: Sequence[str]
gel_type_desc: str
injection_time_sec: Number
    injection_voltage_v: Number
lane_or_capillary: Number
sample_tracking_id: str
length_to_detector_cm: Number
laser_power_mw: Number
instrument_name_and_serial: str
data_collection_module_file: str
model_number: str
pixels_avg_per_lane: Number
number_of_capillaries: Number
marked_off_scale_scans: Union[None, Sequence[Number]]
# Skipped Ovrl, OvrV
mobility_file: str
# Skipped PRJT, PROJ
pixel_bin_size: Number
# Skipped scan rate
results_group_comment: Union[None, str]
results_group_name: str
run_module_ver: str
run_module_xml: str
run_module_xml_ver: str
run_proto_name: str
run_proto_ver: str
run_start_date: str # Date time object
run_stop_date: str # Date time object
data_collection_start_date: str
data_collection_stop_date: str
run_name: str
run_start_time: str # time object
run_stop_time: str # time object
collection_start_time: str # time object
collection_stop_time: str # time object
saturated_data_points: Union[None, Sequence[Number]]
color_rescaling_divisor: Number
scan_count: Number
polymer_lot_expiration: str # date time object
polymer_lot_number: Number
sample_name: str
# Skipped genescan data
# Skipped size standard file name
data_collection_software_ver: str
data_collection_firmware_ver: str
    run_temperature_setting_celsius: Number
well_id: str
plate_user_name: str

104
src/automlst/engine/local/abif.py Normal file
View File

@@ -0,0 +1,104 @@
import asyncio
from os import path
from automlst.engine.data.genomics import SangerTraceData
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
def _biopython_read_abif_sequence(seq_path: str) -> SeqRecord:
with open(seq_path, "rb") as seq_handle:
return SeqIO.read(seq_handle, "abi")
async def read_abif(seq_path: str) -> SangerTraceData:
ext = path.splitext(seq_path)[1]
if ext.lower() != ".ab1" and ext.lower() != "abi":
raise ValueError(
'seq_path must have file extension of "ab1", or "abi".')
biopython_seq = await asyncio.to_thread(_biopython_read_abif_sequence, seq_path)
biopython_annotations = biopython_seq.annotations
# Lot of type ignoring since Biopython did not define their typing.
biopython_abif_raw = biopython_annotations["abif_raw"] # type: ignore
trace_data = SangerTraceData(
biopython_seq.seq,
biopython_abif_raw.get("APFN2"), # type: ignore
biopython_abif_raw.get("APrN1"), # type: ignore
biopython_abif_raw.get("APrV1"), # type: ignore
biopython_abif_raw.get("APrX1"), # type: ignore
biopython_abif_raw.get("APXV1"), # type: ignore
biopython_abif_raw.get("CMNT1"), # type: ignore
biopython_abif_raw.get("CpEP1"), # type: ignore
biopython_abif_raw.get("CTID1"), # type: ignore
biopython_abif_raw.get("CTNM1"), # type: ignore
biopython_abif_raw.get("CTTL1"), # type: ignore
biopython_abif_raw.get("DATA1"), # type: ignore
biopython_abif_raw.get("DATA2"), # type: ignore
biopython_abif_raw.get("DATA3"), # type: ignore
biopython_abif_raw.get("DATA4"), # type: ignore
biopython_abif_raw.get("DATA5"), # type: ignore
biopython_abif_raw.get("DATA6"), # type: ignore
biopython_abif_raw.get("DATA7"), # type: ignore
biopython_abif_raw.get("DATA8"), # type: ignore
biopython_abif_raw.get("DSam1"), # type: ignore
biopython_abif_raw.get("DyeN1"), # type: ignore
biopython_abif_raw.get("DyeN2"), # type: ignore
biopython_abif_raw.get("DyeN3"), # type: ignore
biopython_abif_raw.get("DyeN4"), # type: ignore
biopython_abif_raw.get("DyeW1"), # type: ignore
biopython_abif_raw.get("DyeW2"), # type: ignore
biopython_abif_raw.get("DyeW3"), # type: ignore
biopython_abif_raw.get("DyeW4"), # type: ignore
biopython_abif_raw.get("DySN1"), # type: ignore
biopython_abif_raw.get("EPVt1"), # type: ignore
biopython_abif_raw.get("EVNT1"), # type: ignore
biopython_abif_raw.get("EVNT2"), # type: ignore
biopython_abif_raw.get("EVNT3"), # type: ignore
biopython_abif_raw.get("EVNT4"), # type: ignore
biopython_abif_raw.get("FWO_1"), # type: ignore
biopython_abif_raw.get("GTyp1"), # type: ignore
biopython_abif_raw.get("InSc1"), # type: ignore
biopython_abif_raw.get("InVt1"), # type: ignore
biopython_abif_raw.get("LANE1"), # type: ignore
biopython_abif_raw.get("LIMS1"), # type: ignore
biopython_abif_raw.get("LNTD1"), # type: ignore
biopython_abif_raw.get("LsrP1"), # type: ignore
biopython_abif_raw.get("MCHN1"), # type: ignore
biopython_abif_raw.get("MODF1"), # type: ignore
biopython_abif_raw.get("MODL1"), # type: ignore
biopython_abif_raw.get("NAVG1"), # type: ignore
biopython_abif_raw.get("NLNE1"), # type: ignore
biopython_abif_raw.get("OfSc1"), # type: ignore
biopython_abif_raw.get("PDMF1"), # type: ignore
biopython_abif_raw.get("PXLB1"), # type: ignore
biopython_abif_raw.get("RGCm1"), # type: ignore
biopython_abif_raw.get("RGNm1"), # type: ignore
biopython_abif_raw.get("RMdV1"), # type: ignore
biopython_abif_raw.get("RMdX1"), # type: ignore
biopython_abif_raw.get("RMXV1"), # type: ignore
biopython_abif_raw.get("RPrN1"), # type: ignore
biopython_abif_raw.get("RPrV1"), # type: ignore
biopython_abif_raw.get("RUND1"), # type: ignore
biopython_abif_raw.get("RUND2"), # type: ignore
biopython_abif_raw.get("RUND3"), # type: ignore
biopython_abif_raw.get("RUND4"), # type: ignore
biopython_abif_raw.get("RunN1"), # type: ignore
biopython_abif_raw.get("RUNT1"), # type: ignore
biopython_abif_raw.get("RUNT2"), # type: ignore
biopython_abif_raw.get("RUNT3"), # type: ignore
biopython_abif_raw.get("RUNT4"), # type: ignore
biopython_abif_raw.get("Satd"), # type: ignore
biopython_abif_raw.get("Scal1"), # type: ignore
biopython_abif_raw.get("SCAN1"), # type: ignore
biopython_abif_raw.get("SMED1"), # type: ignore
biopython_abif_raw.get("SMLt"), # type: ignore
biopython_abif_raw.get("SMPL1"), # type: ignore
biopython_abif_raw.get("SVER1"), # type: ignore
biopython_abif_raw.get("SVER3"), # type: ignore
biopython_abif_raw.get("Tmpr1"), # type: ignore
biopython_abif_raw.get("TUBE"), # type: ignore
biopython_abif_raw.get("User") # type: ignore
)
return trace_data
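A quick usage sketch (the trace file path comes from the ABIF test below):

import asyncio

trace = asyncio.run(read_abif("tests/resources/1I1_F_P1815443_047.ab1"))
print(trace.sample_name, len(trace.sequence))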

31
src/automlst/engine/local/csv.py Normal file
View File

@@ -0,0 +1,31 @@
import csv
from os import PathLike
from typing import Iterable, Mapping, Sequence, Union
from automlst.engine.data.MLST import Allele, MLSTProfile
def loci_alleles_variants_from_loci(alleles_map: Mapping[str, Sequence[Allele]]):
result_dict: dict[str, list[str]] = {}
for loci, alleles in alleles_map.items():
result_dict[loci] = list()
for allele in alleles:
result_dict[loci].append(allele.allele_variant)
return result_dict
async def write_mlst_profiles_as_csv(mlst_profiles_iterable: Iterable[MLSTProfile], handle: Union[str, bytes, PathLike[str], PathLike[bytes]]):
    mlst_profiles = list(mlst_profiles_iterable)
    if len(mlst_profiles) < 1:
        return
    header = ["st", "clonal-complex", *mlst_profiles[0].alleles.keys()]
with open(handle, "w", newline='') as filehandle:
writer = csv.DictWriter(filehandle, fieldnames=header)
writer.writeheader()
for mlst_profile in mlst_profiles:
row_dictionary = {
"st": mlst_profile.sequence_type,
"clonal-complex": mlst_profile.clonal_complex,
**loci_alleles_variants_from_loci(mlst_profile.alleles)
}
            writer.writerow(row_dictionary)
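A short sketch of the writer, with a placeholder file name and allele values mirroring the tests below; the resulting CSV has the header st,clonal-complex,adk and one row per profile:

import asyncio
from automlst.engine.data.MLST import Allele, MLSTProfile

profiles = [MLSTProfile({"adk": [Allele("adk", "1")]}, "1", "ST-2 complex")]
asyncio.run(write_mlst_profiles_as_csv(profiles, "MLST_demo.csv"))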

11
src/automlst/engine/local/fasta.py Normal file
View File

@@ -0,0 +1,11 @@
import asyncio
from io import TextIOWrapper
from typing import Any, AsyncGenerator, Union
from Bio import SeqIO
from automlst.engine.data.genomics import NamedString
async def read_fasta(handle: Union[str, TextIOWrapper]) -> AsyncGenerator[NamedString, Any]:
fasta_sequences = asyncio.to_thread(SeqIO.parse, handle=handle, format="fasta")
for fasta_sequence in await fasta_sequences:
yield NamedString(fasta_sequence.id, str(fasta_sequence.seq))
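Because read_fasta is an async generator, it is consumed with async for (resource path taken from the FASTA test below):

import asyncio

async def demo():
    async for named_string in read_fasta("tests/resources/tohama_I_bpertussis.fasta"):
        print(named_string.name, len(named_string.sequence))

asyncio.run(demo())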

62
src/automlst/engine/remote/databases/institutpasteur/profiling.py Normal file
View File

@@ -0,0 +1,62 @@
from collections import defaultdict
from contextlib import AbstractAsyncContextManager
from typing import Any, AsyncGenerator, AsyncIterable, Iterable, Sequence, Union
from aiohttp import ClientSession, ClientTimeout
from automlst.engine.data.MLST import Allele, MLSTProfile
class InstitutPasteurProfiler(AbstractAsyncContextManager):
    def __init__(self, database_name: str):
        self._base_url = f"https://bigsdb.pasteur.fr/api/db/{database_name}/"
        self._http_client = ClientSession(self._base_url, timeout=ClientTimeout(10000))
    async def __aenter__(self):
        return self
async def fetch_mlst_allele_variants(self, sequence_string: str) -> AsyncGenerator[Allele, Any]:
# See https://bigsdb.pasteur.fr/api/db/pubmlst_bordetella_seqdef/schemes
uri_path = "schemes/3/sequence"
response = await self._http_client.post(uri_path, json={
"sequence": sequence_string
})
sequence_response: dict = await response.json()
exact_matches: dict[str, Sequence[dict[str, str]]] = sequence_response["exact_matches"]
for allele_loci, alleles in exact_matches.items():
for allele in alleles:
                allele_id = allele["allele_id"]
                yield Allele(allele_loci=allele_loci, allele_variant=allele_id)
async def fetch_mlst_st(self, alleles: Union[AsyncIterable[Allele], Iterable[Allele]]) -> MLSTProfile:
uri_path = "schemes/3/designations"
allele_request_dict: dict[str, list[dict[str, str]]] = defaultdict(list)
if isinstance(alleles, AsyncIterable):
async for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
else:
for allele in alleles:
allele_request_dict[allele.allele_loci].append({"allele": str(allele.allele_variant)})
response = await self._http_client.post(uri_path, json={
"designations": allele_request_dict
})
response_json = await response.json()
schema_fields_returned = response_json["fields"]
schema_exact_matches = response_json["exact_matches"]
allele_map: dict[str, list[Allele]] = defaultdict(list)
for exact_match_loci, exact_match_alleles in schema_exact_matches.items():
for exact_match_allele in exact_match_alleles:
allele_map[exact_match_loci].append(Allele(exact_match_loci, exact_match_allele["allele_id"]))
return MLSTProfile(allele_map, schema_fields_returned["ST"], schema_fields_returned["clonal_complex"])
async def profile_string(self, string: str) -> MLSTProfile:
alleles = self.fetch_mlst_allele_variants(string)
return await self.fetch_mlst_st(alleles)
async def close(self):
await self._http_client.close()
async def __aexit__(self, exc_type, exc_value, traceback):
await self.close()
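The intended call pattern, as used by the aggregator above (database name from the tests; network access to bigsdb.pasteur.fr is required):

import asyncio
from Bio import SeqIO

async def demo():
    sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
    async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as profiler:
        profile = await profiler.profile_string(sequence)
        print(profile.sequence_type, profile.clonal_complex)

asyncio.run(demo())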

27
src/automlst/engine/remote/databases/ncbi/genbank.py Normal file
View File

@@ -0,0 +1,27 @@
import asyncio
from Bio import Entrez
from Bio import SeqIO
# TODO Change this out for a more professional approach
Entrez.email = "yunyangdeng@outlook.com"
from automlst.engine.data.genomics import AnnotatedString, StringAnnotation
async def fetch_ncbi_genbank(genbank_id: str) -> AnnotatedString:
with (await asyncio.to_thread(Entrez.efetch, db="nucleotide", id=genbank_id, rettype="gb", retmode="text")) as fetch_stream:
record = SeqIO.read(fetch_stream, "genbank")
sequence_features = list()
for feature in record.features:
start = int(feature.location.start)
end = int(feature.location.end)
qualifiers = feature.qualifiers
for qualifier_key in qualifiers:
qualifiers[qualifier_key] = set(qualifiers[qualifier_key])
sequence_features.append(StringAnnotation(
type=feature.type,
start=start,
            end=end,  # Biopython location ends are already exclusive
feature_properties=qualifiers
))
return AnnotatedString(name=genbank_id, sequence=str(record.seq), annotations=sequence_features)
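A quick sketch (the accession comes from the GenBank test below; network access and the Entrez email set above are assumed):

import asyncio

annotated = asyncio.run(fetch_ncbi_genbank("CP011448.1"))
print(annotated.name, len(annotated.sequence), len(annotated.annotations))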

View File

@@ -0,0 +1,8 @@
import os
from automlst.engine.local.abif import read_abif
async def test_load_sanger_sequence_has_data():
assert os.path.exists("tests/resources/1I1_F_P1815443_047.ab1")
result_data = await read_abif("tests/resources/1I1_F_P1815443_047.ab1")
assert result_data is not None

View File

@@ -0,0 +1,7 @@
from automlst.engine.local.fasta import read_fasta
async def test_fasta_reader_not_none():
named_strings = read_fasta("tests/resources/tohama_I_bpertussis.fasta")
async for named_string in named_strings:
assert named_string.name == "BX470248.1"

View File

@@ -0,0 +1,35 @@
from Bio import SeqIO
from automlst.engine.data.MLST import Allele, MLSTProfile
from automlst.engine.remote.databases.institutpasteur.profiling import InstitutPasteurProfiler
async def test_profiling_results_in_exact_matches_when_exact():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
exact_matches = dummy_profiler.fetch_mlst_allele_variants(sequence_string=sequence)
targets_left = {"adk", "fumC", "glyA", "tyrB", "icd", "pepA", "pgm"}
async for exact_match in exact_matches:
assert isinstance(exact_match, Allele)
            assert exact_match.allele_variant == '1'  # all Tohama I loci have allele id 1
targets_left.remove(exact_match.allele_loci)
assert len(targets_left) == 0
async def test_profiling_results_in_correct_st():
dummy_alleles = [
Allele("adk", "1"),
Allele("fumC", "1"),
Allele("glyA", "1"),
Allele("tyrB", "1"),
Allele("icd", "1"),
Allele("pepA", "1"),
Allele("pgm", "1"),
]
async with InstitutPasteurProfiler(database_name="pubmlst_bordetella_seqdef") as dummy_profiler:
mlst_st_data = await dummy_profiler.fetch_mlst_st(dummy_alleles)
assert mlst_st_data is not None
assert isinstance(mlst_st_data, MLSTProfile)
assert mlst_st_data.clonal_complex == "ST-2 complex"
assert mlst_st_data.sequence_type == "1"

View File

@@ -0,0 +1,5 @@
from automlst.engine.remote.databases.ncbi.genbank import fetch_ncbi_genbank
async def test_fetch_ncbi_genbank_with_id_works():
assert len((await fetch_ncbi_genbank("CP011448.1")).sequence) > 0

View File

@@ -0,0 +1,12 @@
from automlst.engine.annotate import annotate_from_genbank, fetch_ncbi_genbank
from Bio import SeqIO
from automlst.engine.data.genomics import AnnotatedString
async def test_annotate_from_genbank_for_adk_annotation():
sequence = str(SeqIO.read("tests/resources/tohama_I_bpertussis.fasta", "fasta").seq)
annotated_sequence = await annotate_from_genbank("CP011448.1", "bpertussis_tohamaI", sequence, max_annotation_length=750, gene_targets=set(["adk"]))
assert isinstance(annotated_sequence, AnnotatedString)
assert len(annotated_sequence.annotations) >= 1
assert annotated_sequence.annotations[0].type == "gene"
assert "adk" in annotated_sequence.annotations[0].feature_properties["gene"]

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large