Initial commit

This commit is contained in:
Harrison Deng 2023-04-20 09:48:19 -05:00
commit fed4709d79
6 changed files with 459 additions and 0 deletions

214
.gitignore vendored Normal file
View File

@ -0,0 +1,214 @@
# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
# Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python
# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,linux,python
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,linux,python
# Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)

3
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,3 @@
{
"python.formatting.provider": "black"
}

46
protannot/Datatypes.py Normal file
View File

@ -0,0 +1,46 @@
class Annotation:
    """An annotation over a sequence, with optional position and score.

    Instances are effectively read-only: every value is stored in a
    private attribute by ``__init__`` and exposed through a property.
    """

    def __init__(
        self,
        id: str,
        sequence: str,
        name: "str | None" = None,
        description: "str | None" = None,
        start: "int | None" = None,
        end: "int | None" = None,
        score: "int | None" = None,
    ) -> None:
        """Create an annotation.

        :param id: identifier for this annotation.
        :param sequence: the annotated sequence text.
        :param name: optional human-readable name.
        :param description: optional free-form description.
        :param start: optional start position of the annotation.
        :param end: optional end position of the annotation.
        :param score: optional alignment score.
        """
        self._description = description
        self._start = start
        self._end = end
        self._id = id
        self._name = name
        self._sequence = sequence
        self._score = score

    def __repr__(self) -> str:
        # Debugging aid; field order mirrors the constructor.
        return (
            f"{type(self).__name__}(id={self._id!r}, "
            f"sequence={self._sequence!r}, name={self._name!r}, "
            f"description={self._description!r}, start={self._start!r}, "
            f"end={self._end!r}, score={self._score!r})"
        )

    @property
    def start(self):
        return self._start

    @property
    def end(self):
        return self._end

    @property
    def id(self):
        return self._id

    @property
    def name(self):
        return self._name

    @property
    def description(self):
        return self._description

    @property
    def sequence(self):
        return self._sequence

    @property
    def score(self):
        return self._score

0
protannot/__init__.py Normal file
View File

193
protannot/protannot.py Normal file
View File

@ -0,0 +1,193 @@
import csv
import os
from skbio.alignment import StripedSmithWaterman
import argparse
from Bio import SeqIO
from Datatypes import Annotation
def read_annotations_from_csv(
    csv_path: str,
    id_header: str,
    name_header: str,
    desc_header: str,
    start_header: str,
    end_header: str,
    sequence_header: str,
):
    """Read annotations from a CSV file, keyed by annotation ID.

    :param csv_path: path of the CSV file to read.
    :param id_header: column header holding the annotation ID (required).
    :param name_header: column header for the name, or None/empty to skip.
    :param desc_header: column header for the description, or None/empty to skip.
    :param start_header: column header for the start position, or None/empty to skip.
    :param end_header: column header for the end position, or None/empty to skip.
    :param sequence_header: column header holding the sequence (required).
    :return: dict mapping annotation ID to :class:`Annotation`.
    :raises ValueError: if a given header is not present in the CSV.
    """
    annotations = {}
    # newline="" is required by the csv module so embedded newlines in
    # quoted fields are handled correctly.
    with open(csv_path, "r", newline="") as csv_fd:
        reader = csv.reader(csv_fd)
        try:
            headers = next(reader)
        except StopIteration:
            # Completely empty file: no header row, no annotations.
            return annotations
        id_ind = headers.index(id_header)
        sequence_ind = headers.index(sequence_header)
        # Optional columns resolve to None when no header was supplied.
        name_ind = headers.index(name_header) if name_header else None
        desc_ind = headers.index(desc_header) if desc_header else None
        start_ind = headers.index(start_header) if start_header else None
        end_ind = headers.index(end_header) if end_header else None
        for row in reader:
            annot_id = row[id_ind]
            start = row[start_ind] if start_ind is not None else None
            end = row[end_ind] if end_ind is not None else None
            annotations[annot_id] = Annotation(
                annot_id,
                row[sequence_ind],
                row[name_ind] if name_ind is not None else None,
                row[desc_ind] if desc_ind is not None else None,
                # Empty cells stay None rather than crashing int().
                int(start) if start else None,
                int(end) if end else None,
            )
    return annotations
def generate_annotated_positions(sequence: str, annotations: dict[str, Annotation]):
    """Align every annotation's sequence against *sequence*.

    :param sequence: target sequence to align annotations onto.
    :param annotations: dict of annotation ID to :class:`Annotation`.
    :return: dict mapping each ID to a pair ``(original, aligned)`` where
        ``original`` is a copy of the input annotation and ``aligned``
        carries the aligned target sequence, the query begin/end
        positions, and the optimal alignment score.
    """
    # Build the aligner once; it is reused for every annotation.
    aligner = StripedSmithWaterman(sequence)
    pairs = {}
    for annot_id, annot in annotations.items():
        result = aligner(annot.sequence)
        original_copy = Annotation(
            annot_id,
            annot.sequence,
            annot.name,
            annot.description,
            annot.start,
            annot.end,
            annot.score,
        )
        aligned = Annotation(
            annot_id,
            result.aligned_target_sequence,
            annot.name,
            annot.description,
            result.query_begin,
            result.query_end,
            result.optimal_alignment_score,
        )
        pairs[annot_id] = (original_copy, aligned)
    return pairs
def save_alignments_to_csv(
    aligned_pairs: "dict[str, tuple[Annotation, Annotation]]", output_path: str
):
    """Write original/aligned annotation pairs to a CSV file.

    The header row is derived from the attribute names of the first
    pair's objects, prefixed with ``original`` and ``aligned``; since the
    attributes are private, columns read e.g. ``original_id``. An empty
    input produces an empty file (no header row).

    :param aligned_pairs: dict of ID to (original, aligned) annotations.
    :param output_path: path of the CSV file to create or overwrite.
    """
    # newline="" stops the csv module emitting blank lines on Windows.
    with open(output_path, "w", newline="") as output_fd:
        writer = csv.writer(output_fd)
        header_written = False
        attr_order = None
        # The IDs are unused here; only the pairs are serialized.
        for original, aligned in aligned_pairs.values():
            original_vars = vars(original)
            aligned_vars = vars(aligned)
            if not header_written:
                # Column order is fixed by the first object's attributes.
                attr_order = list(original_vars.keys())
                header = ["original" + key for key in attr_order]
                header.extend(["aligned" + key for key in attr_order])
                writer.writerow(header)
                header_written = True
            row_data = [original_vars[key] for key in attr_order]
            row_data.extend([aligned_vars[key] for key in attr_order])
            writer.writerow(row_data)
def main():
    """Command-line entry point for protannot.

    Reads annotations from a CSV, aligns each annotation against every
    sequence in the given FASTA file, and writes one CSV of aligned
    annotation positions per input sequence into the output directory.
    """
    argparser = argparse.ArgumentParser("protannot")
    argparser.add_argument(
        "annotations",
        type=str,
        help=(
            "Path to CSV containing the sequences to align as well as the "
            "annotations for the respective sequences"
        ),
        metavar="a",
    )
    argparser.add_argument(
        "sequence",
        type=str,
        help=(
            "Path to the sequence to annotate in FASTA format. "
            "If multiple sequences are present, annotations will be run on each"
        ),
        metavar="s",
    )
    argparser.add_argument(
        "output", type=str, help="Path to output location", metavar="o"
    )
    # The ID and sequence headers are mandatory: read_annotations_from_csv
    # unconditionally looks them up, so surface that at parse time.
    argparser.add_argument(
        "-I",
        "--id-header",
        type=str,
        help="The header for the ID of the annotation",
        required=True,
    )
    argparser.add_argument(
        "-N",
        "--name-header",
        type=str,
        help="The header for the name of the annotation",
        required=False,
    )
    argparser.add_argument(
        "-D",
        "--desc-header",
        type=str,
        help="The header for the description of the annotation",
        required=False,
    )
    argparser.add_argument(
        "-T",
        "--start-header",
        type=str,
        help="The header for the start of the annotation",
        required=False,
    )
    argparser.add_argument(
        "-E",
        "--end-header",
        type=str,
        help="The header for the end of the annotation",
        required=False,
    )
    argparser.add_argument(
        "-S",
        "--seq-header",
        type=str,
        help="The header for the sequence of the annotation",
        required=True,
    )
    args = argparser.parse_args()
    given_annotations = read_annotations_from_csv(
        args.annotations,
        args.id_header,
        args.name_header,
        args.desc_header,
        args.start_header,
        args.end_header,
        args.seq_header,
    )
    # Make sure the output directory exists before writing into it.
    os.makedirs(args.output, exist_ok=True)
    with open(args.sequence, "r") as sequence_fd:
        for sequence in SeqIO.parse(sequence_fd, "fasta"):
            aligned_annotations = generate_annotated_positions(
                str(sequence.seq), given_annotations
            )
            save_alignments_to_csv(
                aligned_annotations,
                os.path.join(
                    args.output,
                    # Sanitize the FASTA record ID into a safe filename.
                    sequence.id.replace("|", "+").replace(".", "_") + ".csv",
                ),
            )


if __name__ == "__main__":
    main()

3
tox.ini Normal file
View File

@ -0,0 +1,3 @@
[flake8]
max-line-length = 88
extend-ignore = E203