From 99f75942ac5758ceaf686fb6d0babd6a93388376 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Wed, 22 Mar 2023 14:30:53 -0500 Subject: [PATCH] Initial commit. --- .gitignore | 200 ++++++++++++++++++++++++++ README.md | 3 + msa_splitter.py | 358 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 4 files changed, 562 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100755 msa_splitter.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bb21ddc --- /dev/null +++ b/.gitignore @@ -0,0 +1,200 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode + +output +.vscode +input_fasta.fasta +gene_list.csv \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..96efe71 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# MSA Splitter + +Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file. 
#!/usr/bin/env python3
"""Split a multiple sequence alignment (MSA) in FASTA format into per-gene FASTA files.

For each requested gene region (given either directly on the command line or
via a CSV gene list with "Gene", "Start" and "End" columns), two nucleotide
FASTA files are written: one with the full trimmed sequence (stop codon
included) and one with the stop codon removed.  With ``--gen-cut-stop-codon``,
matching amino-acid translation files are written as well.

Requires biopython (``pip install biopython`` — note: the PyPI package is
``biopython``, not ``Bio``).
"""

import argparse
import csv
import logging
import os
import sys
from logging import debug, error, info, warning

from Bio import SeqIO, SeqRecord


def read_genes_from_csv(batch_genes_csv_path: str):
    """Read gene regions from a CSV file with "Gene", "Start" and "End" columns.

    Start positions are converted from the 1-based, inclusive coordinates in
    the CSV to 0-based Python slice coordinates; "End" is kept as-is so it can
    be used directly as an exclusive slice bound.

    :param batch_genes_csv_path: path to the CSV gene list.
    :return: list of ``(gene_name, start, end)`` tuples.
    """
    genes = []
    # DictReader resolves the header once instead of recomputing
    # csv_header.index(...) for every row as the original did.
    with open(batch_genes_csv_path, newline="") as batch_csv_fd:
        for row in csv.DictReader(batch_csv_fd):
            genes.append((row["Gene"], int(row["Start"]) - 1, int(row["End"])))
    return genes


def read_msa_file(input: str):
    """Return an iterator over the FASTA records of the given MSA file.

    Exits the program with status 3 if the file does not exist.  Unlike the
    original generator version, the existence check runs immediately at call
    time rather than being deferred to the first iteration, and ``sys.exit``
    is used instead of the site-dependent bare ``exit``.
    """
    if not os.path.isfile(input):
        error(f'"{input}" could not be found.')
        sys.exit(3)
    return SeqIO.parse(input, "fasta")


def write_to_file(
    output_dir: str,
    gene_name: str,
    start: int,
    end: int,
    full_suffix: str,
    ns_suffix: str,
    aa_suffix: str,
    nt_sequence_records: list,
    nt_no_stop_sequence_records: list,
    aa_sequence_records: list = None,
    aa_no_stop_sequence_records: list = None,
):
    """Write the trimmed record lists as FASTA files under ``output_dir``.

    Always writes the full and no-stop nucleotide files; the amino-acid files
    are written only when their record lists are non-empty.

    :param output_dir: directory the FASTA files are written into (created if
        missing).
    :param gene_name: label used in file names; falls back to the coordinate
        range when empty.
    :param start: region start (only used for the fallback label).
    :param end: region end (only used for the fallback label).
    :param full_suffix: file-name suffix for full nucleotide sequences.
    :param ns_suffix: file-name suffix for sequences without the stop codon.
    :param aa_suffix: file-name suffix for amino-acid sequences.
    """
    output_path = os.path.abspath(output_dir)
    # exist_ok avoids the check-then-create race of the original isdir() test.
    os.makedirs(output_path, exist_ok=True)

    # Fallback label mirrors the original naming: the stop codon (3 nt) is
    # excluded from the displayed end coordinate.
    base_name = gene_name or f"{start} - {end - 3}"

    def _fasta_path(*suffixes: str) -> str:
        # e.g. "geneA - aa - full.fasta"
        return os.path.join(
            output_path, " - ".join((base_name,) + suffixes) + ".fasta"
        )

    SeqIO.write(nt_sequence_records, _fasta_path(full_suffix), "fasta")
    SeqIO.write(nt_no_stop_sequence_records, _fasta_path(ns_suffix), "fasta")

    if aa_sequence_records:
        SeqIO.write(
            aa_sequence_records, _fasta_path(aa_suffix, full_suffix), "fasta"
        )

    if aa_no_stop_sequence_records:
        SeqIO.write(
            aa_no_stop_sequence_records, _fasta_path(aa_suffix, ns_suffix), "fasta"
        )


STOP_CODONS = {"TAG", "TAA", "TGA"}

START_CODON = "ATG"


def _find_shift(matches, pos: int, correction_range: int):
    """Search offsets 0, +1, -1, ... +/-(correction_range - 1) around ``pos``.

    ``matches`` is a predicate taking a candidate boundary position and
    returning True when a codon is found there.  Positive offsets are tried
    before negative ones at each distance, preserving the original search
    order.

    :return: ``(found, shift)`` where ``shift`` is 0 when nothing matched.
    """
    for i in range(correction_range):
        if matches(pos + i):
            return True, i
        if matches(pos - i):
            return True, -i
    return False, 0


def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction_range=16):
    """Trim every record of ``msa_records`` to the region ``[start, end)``.

    The region boundaries are corrected by searching up to
    ``correction_range - 1`` nucleotides in both directions for a start codon
    at ``start`` and a stop codon ending at ``end``; a warning is logged when
    a boundary is shifted or no codon is found.

    :param start: 0-based inclusive start of the region.
    :param end: 0-based exclusive end of the region (stop codon included).
    :param gen_cut_stop_codon: when True, also produce amino-acid
        translations of both trimmed sequences.
    :param msa_records: iterable of ``SeqRecord`` objects to trim.
    :param correction_range: number of offsets to try in each direction.
    :return: 4-tuple of record lists: full nt, no-stop nt, full aa, no-stop
        aa (the aa lists are empty unless ``gen_cut_stop_codon`` is set).
    """
    nt_sequence_records = []
    nt_no_stop_sequence_records = []
    aa_sequence_records = []
    aa_no_stop_sequence_records = []

    debug(f"Beginning sequence trimming for {start} - {end}.")
    for s_record in msa_records:
        # Carry the source record's metadata over to every trimmed record
        # (positional args of the SeqRecord constructor).
        record_metadata = (
            s_record.id,
            s_record.name,
            s_record.description,
            s_record.dbxrefs,
            s_record.features,
            s_record.annotations,
            s_record.letter_annotations,
        )
        seq = s_record.seq

        # The p >= 0 / p - 3 >= 0 guards fix a latent bug: a negative slice
        # bound would silently wrap around to the far end of the sequence.
        found_start_codon, start_shift = _find_shift(
            lambda p: p >= 0 and seq[p:p + 3] == START_CODON,
            start,
            correction_range,
        )
        if not found_start_codon:
            warning(
                f"Could not find start codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
        if start_shift != 0:
            warning(
                f"Start codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting start location to {start + start_shift}")

        found_stop_codon, end_shift = _find_shift(
            lambda p: p - 3 >= 0 and seq[p - 3:p] in STOP_CODONS,
            end,
            correction_range,
        )
        if not found_stop_codon:
            warning(
                f"Could not find stop codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
        if end_shift != 0:
            warning(
                f"Stop codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting end location to: {end + end_shift}.")

        # Cropping step: full region, then the same region minus the 3-nt
        # stop codon.
        nt_sequence = seq[start + start_shift: end + end_shift]
        nt_no_stop_sequence = seq[start + start_shift: end - 3 + end_shift]
        nt_sequence_records.append(
            SeqRecord.SeqRecord(nt_sequence, *record_metadata))
        nt_no_stop_sequence_records.append(
            SeqRecord.SeqRecord(nt_no_stop_sequence, *record_metadata)
        )

        if gen_cut_stop_codon:
            # NOTE(review): despite its name, this flag triggers amino-acid
            # translation output; gapped ("-") MSA columns may make
            # translate() warn or fail — confirm inputs are ungapped.
            aa_sequence_records.append(
                SeqRecord.SeqRecord(nt_sequence.translate(), *record_metadata)
            )
            aa_no_stop_sequence_records.append(
                SeqRecord.SeqRecord(
                    nt_no_stop_sequence.translate(), *record_metadata)
            )
        debug(f"Trimming for {s_record.id} complete.")
    debug(f"Sequence trimming for {start} - {end} complete.")
    return (
        nt_sequence_records,
        nt_no_stop_sequence_records,
        aa_sequence_records,
        aa_no_stop_sequence_records,
    )


def main(args):
    """Drive the split: read the MSA, resolve the gene list, trim and write.

    :param args: parsed ``argparse`` namespace (see the parser below).
    :raises Exception: when neither a gene list nor a complete
        name/start/end triple is supplied.
    """
    logging.basicConfig(level=args.log_level.upper())

    msa_records = list(read_msa_file(args.input))
    info(f"MSA records read complete. Found {len(msa_records)} records.")
    genes = []
    if args.gene_list:
        genes = read_genes_from_csv(args.gene_list)
        info(
            f"Gene list read from {args.gene_list} resulted in {len(genes)} genes.")
    # "is not None" (rather than truthiness) so that --start 0 is accepted.
    elif args.gene_name and args.start is not None and args.end is not None:
        # NOTE(review): unlike the CSV path, command-line coordinates are
        # used as-is (no 1-based -> 0-based conversion) — confirm intended.
        genes.append((args.gene_name, args.start, args.end))
        info(
            f"Extracting {args.gene_name} starting at {args.start} to {args.end}.")
    else:
        raise Exception(
            "Need either a gene list by --gene-list or a start and end via --start, and --end respectively."
        )
    for gene_name, start, end in genes:
        (
            nt_sequence_records,
            nt_no_stop_sequence_records,
            aa_sequence_records,
            aa_no_stop_sequence_records,
        ) = trim(start, end, args.gen_cut_stop_codon, msa_records,
                 correction_range=args.correction_range)
        write_to_file(
            args.output_dir,
            gene_name,
            start,
            end,
            args.full_suffix,
            args.ns_suffix,
            args.aa_suffix,
            nt_sequence_records,
            nt_no_stop_sequence_records,
            aa_sequence_records,
            aa_no_stop_sequence_records,
        )
        info(f"Extracted gene sequence for {gene_name} ({start} - {end})")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="msa_splitter",
        description="""
        The MSA splitter is a simple program that takes in two positions
        and a MSA file and produces two separate FASTA files
        """,
    )

    parser.add_argument(
        "input",
        help="The MSA file in FASTA format.",
        type=str,
        metavar="i",
    )

    parser.add_argument(
        "-L",
        "--gene-list",
        help="A list of genes in CSV file.",
        type=str,
        required=False,
        default=None,
    )

    parser.add_argument(
        "-s",
        "--start",
        help="The start of the desired sequence.",
        type=int,
        required=False,
        default=None,
    )

    parser.add_argument(
        "-e",
        "--end",
        help="The end of the desired sequence.",
        type=int,
        required=False,
        default=None,
    )

    parser.add_argument(
        "-n",
        "--gene-name",
        help="The name of the gene with the given start and end locations.",
        type=str,
        required=False,
        default=None,
    )

    parser.add_argument(
        "-o",
        "--output-dir",
        help="""The directory (folder) of which the
        generated files should reside.
        """,
        type=str,
        required=False,
        default="output",
    )

    parser.add_argument(
        "-G",
        "--gen-cut-stop-codon",
        help="""If specified, for each FASTA file generated,
        another FASTA will be generated without the stop codon.
        """,
        required=False,
        default=False,
        action="store_true"
    )

    parser.add_argument(
        "-nt",
        "--full-suffix",
        help="""The suffix to append to the full nucleotide sequence
        FASTA output files.
        """,
        type=str,
        required=False,
        default="full",
    )

    parser.add_argument(
        "-ns",
        "--ns-suffix",
        help="""The suffix to append to the full nucleotide sequence
        without the stop FASTA output files.
        """,
        type=str,
        required=False,
        default="no stop",
    )

    parser.add_argument(
        "-aa",
        "--aa-suffix",
        help="""The suffix to append to the amino acid sequence FASTA
        output files.
        """,
        type=str,
        required=False,
        default="aa",
    )

    parser.add_argument(
        "--correction-range",
        "-R",
        help="The number of offsets in terms of nucleotides to try in both directions to correct start and stop codons before giving up.",
        type=int,
        required=False,
        default=9
    )

    parser.add_argument(
        "-E",
        "--log-level",
        help="The verbosity of the program.",
        type=str,
        required=False,
        default="INFO"
    )

    main(parser.parse_args())