commit 99f75942ac5758ceaf686fb6d0babd6a93388376
Author: Harrison Deng
Date:   Wed Mar 22 14:30:53 2023 -0500

    Initial commit.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bb21ddc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,200 @@
+# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
+
+output
+.vscode
+input_fasta.fasta
+gene_list.csv
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..96efe71
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# MSA Splitter
+
+Simple FASTA file splitter. Capable of batch trimming a large number of sequences from a multiple sequence alignment (MSA) in FASTA format.
diff --git a/msa_splitter.py b/msa_splitter.py
new file mode 100755
index 0000000..232d429
--- /dev/null
+++ b/msa_splitter.py
@@ -0,0 +1,358 @@
+#!/usr/bin/env python3
+
+# General Outline of What Needs to be Done
+# Multiple sequence alignments in FASTA format
+# -> Split into individual gene files ->
+# Two files for each gene: one with the full nucleotide sequence, one without the stop codon.
+# Looks like we also want the amino acid sequence; we'll do that too.
+
+
+import argparse
+from logging import debug, error, info, warning
+import logging
+import os
+from Bio import SeqIO, SeqRecord
+import csv
+
+
+def read_genes_from_csv(batch_genes_csv_path: str):
+    csv_header = None
+    genes = []
+    with open(batch_genes_csv_path) as batch_csv_fd:
+        reader = csv.reader(batch_csv_fd)
+        for row in reader:
+            if not csv_header:
+                csv_header = list(row)
+            else:
+                gene_name = row[csv_header.index("Gene")]
+                # "Start" is 1-based in the CSV and converted to a 0-based index;
+                # "End" is kept as-is and used as an exclusive slice bound.
+                gene_start = int(row[csv_header.index("Start")]) - 1
+                gene_end = int(row[csv_header.index("End")])
+                genes.append((gene_name, gene_start, gene_end))
+    return genes
+
+
+def read_msa_file(input: str):
+    if not os.path.isfile(input):
+        error(f'"{input}" could not be found.')
+        exit(3)
+    for s_record in SeqIO.parse(input, "fasta"):
+        yield s_record
+
+
+def write_to_file(
+    output_dir: str,
+    gene_name: str,
+    start: int,
+    end: int,
+    full_suffix: str,
+    ns_suffix: str,
+    aa_suffix: str,
+    nt_sequence_records: list,
+    nt_no_stop_sequence_records: list,
+    aa_sequence_records: list = None,
+    aa_no_stop_sequence_records: list = None,
+):
+
+    output_path = os.path.abspath(output_dir)
+    if not os.path.isdir(output_dir):
+        os.makedirs(output_path)
+
+    SeqIO.write(
+        nt_sequence_records,
+        os.path.join(
+            output_path,
+            f'{gene_name or f"{start} - {end - 3}"}' f" - {full_suffix}.fasta",
+        ),
+        "fasta",
+    )
+    SeqIO.write(
+        nt_no_stop_sequence_records,
+        os.path.join(
+            output_path,
+            f'{gene_name or f"{start} - {end - 3}"}' f" - {ns_suffix}.fasta",
+        ),
+        "fasta",
+    )
+
+    if aa_sequence_records:
+        SeqIO.write(
+            aa_sequence_records,
+            os.path.join(
+                output_path,
+                f'{gene_name or f"{start} - {end - 3}"}'
+                f" - {aa_suffix}"
+                f" - {full_suffix}.fasta",
+            ),
+            "fasta",
+        )
+
+    if aa_no_stop_sequence_records:
+        SeqIO.write(
+            aa_no_stop_sequence_records,
+            os.path.join(
+                output_path,
+                f'{gene_name or f"{start} - {end - 3}"}'
+                f" - {aa_suffix}"
+                f" - {ns_suffix}.fasta",
+            ),
+            "fasta",
+        )
+
+
+STOP_CODONS = {"TAG", "TAA", "TGA"}
+
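+# STOP_CODONS and START_CODON are the standard-genetic-code stop codons and the
+# canonical start codon on the forward strand. trim() below compares the codon at
+# each candidate offset against them when nudging the annotated start/end onto a
+# real codon; e.g. with start=100 and correction_range=9 (illustrative numbers),
+# the positions tried are 100, 101, 99, 102, 98, ... out to 108 and 92.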
+START_CODON = "ATG" + + +def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction_range=16): + + nt_sequence_records = [] + nt_no_stop_sequence_records = [] + aa_sequence_records = [] + aa_no_stop_sequence_records = [] + + debug(f"Beginning sequence trimming for {start} - {end}.") + for s_record in msa_records: + record_metadata = ( + s_record.id, + s_record.name, + s_record.description, + s_record.dbxrefs, + s_record.features, + s_record.annotations, + s_record.letter_annotations, + ) + found_start_codon = False + start_shift = 0 + for i in range(correction_range): + if s_record.seq[start + i: start + i + 3] == START_CODON: + found_start_codon = True + start_shift = i + break + if s_record.seq[start - i: start - i + 3] == START_CODON: + found_start_codon = True + start_shift = -i + break + if not found_start_codon: + warning( + f"Could not find start codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...") + if start_shift != 0: + warning( + f"Start codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting start location to {start + start_shift}") + + end_shift = 0 + found_stop_codon = False + for i in range(correction_range): + if s_record.seq[end + i - 3: end + i] in STOP_CODONS: + found_stop_codon = True + end_shift = i + break + if s_record.seq[end - i - 3: end - i] in STOP_CODONS: + found_stop_codon = True + end_shift = -i + break + if not found_stop_codon: + warning( + f"Could not find stop codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...") + if end_shift != 0: + warning( + f"Stop codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting end location to: {end + end_shift}.") + nt_sequence = s_record.seq[start + + start_shift: end + end_shift] # Cropping step + nt_no_stop_sequence = s_record.seq[start + + start_shift: end - 3 + end_shift] + nt_sequence_records.append( + SeqRecord.SeqRecord(nt_sequence, *record_metadata)) + nt_no_stop_sequence_records.append( + SeqRecord.SeqRecord(nt_no_stop_sequence, *record_metadata) + ) + + if gen_cut_stop_codon: + aa_sequence = nt_sequence.translate() + aa_no_stop_sequence = nt_no_stop_sequence.translate() + + aa_sequence_records.append( + SeqRecord.SeqRecord(aa_sequence, *record_metadata) + ) + + aa_no_stop_sequence_records.append( + SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata) + ) + debug(f"Trimming for {s_record.id} complete.") + debug(f"Sequence trimming for {start} - {end} complete.") + return ( + nt_sequence_records, + nt_no_stop_sequence_records, + aa_sequence_records, + aa_no_stop_sequence_records, + ) + + +def main(args): + logging.basicConfig(level=args.log_level.upper()) + + msa_records = list(read_msa_file(args.input)) + info(f"MSA records read complete. Found {len(msa_records)} records.") + genes = [] + if args.gene_list: + genes = read_genes_from_csv(args.gene_list) + info( + f"Gene list read from {args.gene_list} resulted in {len(genes)} genes.") + else: + if args.gene_name and args.start and args.end: + genes.append([args.gene_name, args.start, args.end]) + info( + f"Extracting {args.gene_name} starting at {args.start} to {args.end}.") + else: + raise Exception( + "Need either a gene list by --gene-list or a start and end via --start, and --end respectively." 
+ ) + for gene_name, start, end in genes: + ( + nt_sequence_records, + nt_no_stop_sequence_records, + aa_sequence_records, + aa_no_stop_sequence_records, + ) = trim(start, end, args.gen_cut_stop_codon, msa_records, correction_range=args.correction_range) + write_to_file( + args.output_dir, + gene_name, + start, + end, + args.full_suffix, + args.ns_suffix, + args.aa_suffix, + nt_sequence_records, + nt_no_stop_sequence_records, + aa_sequence_records, + aa_no_stop_sequence_records, + ) + info(f"Extracted gene sequence for {gene_name} ({start} - {end})") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="msa_splitter", + description=""" + The MSA splitter is a simple program that takes in two positions + and a MSA file and produces two separate FASTA files + """, + ) + + parser.add_argument( + "input", + help="The MSA file in FASTA format.", + type=str, + metavar="i", + ) + + parser.add_argument( + "-L", + "--gene-list", + help="A list of genes in CSV file.", + type=str, + required=False, + default=None, + ) + + parser.add_argument( + "-s", + "--start", + help="The start of the desired sequence.", + type=int, + required=False, + default=None, + ) + + parser.add_argument( + "-e", + "--end", + help="The end of the desired sequence.", + type=int, + required=False, + default=None, + ) + + parser.add_argument( + "-n", + "--gene-name", + help="The name of the gene with the given start and end locations.", + type=str, + required=False, + default=None, + ) + + parser.add_argument( + "-o", + "--output-dir", + help="""The directory (folder) of which the + generated files should reside. + """, + type=str, + required=False, + default="output", + ) + + parser.add_argument( + "-G", + "--gen-cut-stop-codon", + help="""If specified, for each FASTA file generated, + another FASTA will be generated without the stop codon. + """, + required=False, + default=False, + action="store_true" + ) + + parser.add_argument( + "-nt", + "--full-suffix", + help="""The suffix to append to the full nucleotide sequence + FASTA output files. + """, + type=str, + required=False, + default="full", + ) + + parser.add_argument( + "-ns", + "--ns-suffix", + help="""The suffix to append to the full nucleotide sequence + without the stop FASTA output files. + """, + type=str, + required=False, + default="no stop", + ) + + parser.add_argument( + "-aa", + "--aa-suffix", + help="""The suffix to append to the amino acid sequence FASTA + output files. + """, + type=str, + required=False, + default="aa", + ) + + parser.add_argument( + "--correction-range", + "-R", + help="The number of offsets in terms of nucleotides to try in both directions to correct start and stop codons before giving up.", + type=int, + required=False, + default=9 + ) + + parser.add_argument( + "-E", + "--log-level", + help="The verbosity of the program.", + type=str, + required=False, + default="INFO" + ) + + main(parser.parse_args()) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..808664b --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +Bio==1.5.6
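For reference, the splitter is run either with explicit coordinates (-n, -s and -e) or with a gene list CSV (-L) whose header row provides Gene, Start and End columns (Start is read as 1-based and converted to a 0-based index), for example: ./msa_splitter.py input_fasta.fasta -L gene_list.csv -G. The same functions can also be driven from Python; the sketch below is illustrative only: it assumes Biopython is installed and an aligned FASTA exists at input_fasta.fasta, and the gene name and 0-based coordinates are placeholders ("output", "full", "no stop" and "aa" mirror the CLI defaults).

# Illustrative sketch only; placeholder gene name and coordinates.
from msa_splitter import read_msa_file, trim, write_to_file

# Parse the aligned FASTA once and reuse the records for every gene.
records = list(read_msa_file("input_fasta.fasta"))

# Trim a placeholder 0-based region [0, 300) and also produce amino acid translations.
nt_full, nt_no_stop, aa_full, aa_no_stop = trim(0, 300, True, records, correction_range=9)

# Write the four FASTA outputs for this gene into the "output" directory.
write_to_file(
    "output", "example_gene", 0, 300, "full", "no stop", "aa",
    nt_full, nt_no_stop, aa_full, aa_no_stop,
)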