commit fed4709d7902f962808206f36a5c34ea5e684be3 Author: Harrison Date: Thu Apr 20 09:48:19 2023 -0500 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a31886c --- /dev/null +++ b/.gitignore @@ -0,0 +1,214 @@ +# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig +# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python +# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,linux,python + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python + +# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) + diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..de288e1 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.formatting.provider": "black" +} \ No newline at end of file diff --git a/protannot/Datatypes.py b/protannot/Datatypes.py new file mode 100644 index 0000000..5bd6374 --- /dev/null +++ b/protannot/Datatypes.py @@ -0,0 +1,46 @@ +class Annotation: + def __init__( + self, + id: str, + sequence: str, + name: str = None, + description: str = None, + start: int = None, + end: int = None, + score: int = None, + ) -> None: + self._description = description + self._start = start + self._end = end + self._id = id + self._name = name + self._sequence = sequence + self._score = score + + @property + def start(self): + return self._start + + @property + def end(self): + return self._end + + @property + def id(self): + return self._id + + @property + def name(self): + return self._name + + @property + def description(self): + return self._description + + @property + def sequence(self): + return self._sequence + + @property + def score(self): + return self._score diff --git a/protannot/__init__.py b/protannot/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/protannot/protannot.py b/protannot/protannot.py new file mode 100644 index 0000000..e1bcff7 --- /dev/null +++ b/protannot/protannot.py @@ -0,0 +1,193 @@ +import csv +import os +from skbio.alignment import StripedSmithWaterman +import argparse +from Bio import SeqIO + +from Datatypes import Annotation + + +def read_annotations_from_csv( + csv_path: str, + id_header: str, + name_header: str, + desc_header: str, + start_header: str, + end_header: str, + sequence_header: str, +): + annotations = {} + with open(csv_path, "r") as csv_fd: + reader = csv.reader(csv_fd) + id_ind = None + name_ind = None + desc_ind = None + start_ind = None + end_ind = None + sequence_ind = None + headers_parsed = False + for row in reader: + if not headers_parsed: + id_ind = row.index(id_header) + name_ind = row.index(name_header) if name_header else None + desc_ind = row.index(desc_header) if desc_header else None + start_ind = row.index(start_header) if start_header else None + end_ind = row.index(end_header) if end_header else None + sequence_ind = row.index(sequence_header) + headers_parsed = True + continue + id = row[id_ind] + name = row[name_ind] if name_header else None + desc = row[desc_ind] if desc_header else None + start = row[start_ind] if start_header else None + end = row[end_ind] if end_header else None + sequence = row[sequence_ind] + annotations[id] = Annotation( + id, + sequence, + name, + desc, + int(start) if start else None, + int(end) if end else None, + ) + return annotations + + +def generate_annotated_positions(sequence: str, annotations: dict[str, Annotation]): + annotation_pairs = {} + align = StripedSmithWaterman(sequence) + for id, annot in annotations.items(): + alignment = align(annot.sequence) + score, aligned_start, aligned_end = ( + alignment.optimal_alignment_score, + alignment.query_begin, + alignment.query_end, + ) + annotation_pairs[id] = Annotation( + id, + annot.sequence, + annot.name, + annot.description, + annot.start, + annot.end, + annot.score, + ), Annotation( + id, + alignment.aligned_target_sequence, + annot.name, + annot.description, + aligned_start, + aligned_end, + score, + ) + return annotation_pairs + + +def save_alignments_to_csv( + aligned_pairs: dict[str, tuple[Annotation, Annotation]], output_path: str +): + with open(output_path, "w") as output_fd: + writer = csv.writer(output_fd) + header_wrote = False + header_order = None + for id, annotations in aligned_pairs.items(): + original, aligned = annotations + original_vars = vars(original) + aligned_vars = vars(aligned) + if not header_wrote: + header_order = list(original_vars.keys()) + header = ["original" + key for key in header_order] + header.extend(["aligned" + key for key in header_order]) + writer.writerow(header) + header_wrote = True + row_data = [original_vars[header] for header in header_order] + row_data.extend([aligned_vars[header] for header in header_order]) + writer.writerow(row_data) + + +def main(): + argparser = argparse.ArgumentParser("protannot") + argparser.add_argument( + "annotations", + type=str, + help=( + "Path to CSV containing the sequences to align as well as the " + "annotations for the respective sequences" + ), + metavar="a", + ) + argparser.add_argument( + "sequence", + type=str, + help=( + "Path to the sequence to annotate in FASTA format. " + "If multiple sequences are present, annotations will be run on each" + ), + metavar="s", + ) + argparser.add_argument( + "output", type=str, help="Path to output location", metavar="o" + ) + argparser.add_argument( + "-I", "--id-header", type=str, help="The header for the ID of the annotation" + ) + argparser.add_argument( + "-N", + "--name-header", + type=str, + help="The header for the name of the annotation", + required=False, + ) + argparser.add_argument( + "-D", + "--desc-header", + type=str, + help="The header for the description of the annotation", + required=False, + ) + argparser.add_argument( + "-T", + "--start-header", + type=str, + help="The header for the start of the annotation", + required=False, + ) + argparser.add_argument( + "-E", + "--end-header", + type=str, + help="The header for the end of the annotation", + required=False, + ) + argparser.add_argument( + "-S", + "--seq-header", + type=str, + help="The header for the sequence of the annotation", + ) + args = argparser.parse_args() + given_annotations = read_annotations_from_csv( + args.annotations, + args.id_header, + args.name_header, + args.desc_header, + args.start_header, + args.end_header, + args.seq_header, + ) + with open(args.sequence, "r") as sequence_fd: + for sequence in SeqIO.parse(sequence_fd, "fasta"): + aligned_annotations = generate_annotated_positions( + str(sequence.seq), given_annotations + ) + save_alignments_to_csv( + aligned_annotations, + os.path.join( + args.output, + sequence.id.replace("|", "+").replace(".", "_") + ".csv", + ), + ) + + +if __name__ == "__main__": + main() diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..e0ea542 --- /dev/null +++ b/tox.ini @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 88 +extend-ignore = E203 \ No newline at end of file