Initial commit.
This commit is contained in:
commit
99f75942ac
200
.gitignore
vendored
Normal file
200
.gitignore
vendored
Normal file
@ -0,0 +1,200 @@
|
|||||||
|
# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
|
||||||
|
# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode
|
||||||
|
|
||||||
|
### Python ###
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/#use-with-ide
|
||||||
|
.pdm.toml
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
|
|
||||||
|
### Python Patch ###
|
||||||
|
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
||||||
|
poetry.toml
|
||||||
|
|
||||||
|
# ruff
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# LSP config files
|
||||||
|
pyrightconfig.json
|
||||||
|
|
||||||
|
### VisualStudioCode ###
|
||||||
|
.vscode/*
|
||||||
|
!.vscode/settings.json
|
||||||
|
!.vscode/tasks.json
|
||||||
|
!.vscode/launch.json
|
||||||
|
!.vscode/extensions.json
|
||||||
|
!.vscode/*.code-snippets
|
||||||
|
|
||||||
|
# Local History for Visual Studio Code
|
||||||
|
.history/
|
||||||
|
|
||||||
|
# Built Visual Studio Code Extensions
|
||||||
|
*.vsix
|
||||||
|
|
||||||
|
### VisualStudioCode Patch ###
|
||||||
|
# Ignore all local history of files
|
||||||
|
.history
|
||||||
|
.ionide
|
||||||
|
|
||||||
|
# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
|
||||||
|
|
||||||
|
output
|
||||||
|
.vscode
|
||||||
|
input_fasta.fasta
|
||||||
|
gene_list.csv
|
3
README.md
Normal file
3
README.md
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# MSA Splitter
|
||||||
|
|
||||||
|
Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file.
|
358
msa_splitter.py
Executable file
358
msa_splitter.py
Executable file
@ -0,0 +1,358 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# General Outline of What Needs to be Done
|
||||||
|
# Multiple sequence alignments in FASTA format
|
||||||
|
# -> Split into individual gene files ->
|
||||||
|
# 2 Files for each gene, one with full nucleotide sequence, one without.
|
||||||
|
# Looks like we also want the amino acid sequence, we'll do that too.
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from logging import debug, error, info, warning
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from Bio import SeqIO, SeqRecord
|
||||||
|
import csv
|
||||||
|
|
||||||
|
|
||||||
|
def read_genes_from_csv(batch_genes_csv_path: str):
|
||||||
|
csv_header = None
|
||||||
|
genes = []
|
||||||
|
with open(batch_genes_csv_path) as batch_csv_fd:
|
||||||
|
reader = csv.reader(batch_csv_fd)
|
||||||
|
for row in reader:
|
||||||
|
if not csv_header:
|
||||||
|
csv_header = list(row)
|
||||||
|
else:
|
||||||
|
gene_name = row[csv_header.index("Gene")]
|
||||||
|
gene_start = int(row[csv_header.index("Start")]) - 1
|
||||||
|
gene_end = int(row[csv_header.index("End")])
|
||||||
|
genes.append((gene_name, gene_start, gene_end))
|
||||||
|
return genes
|
||||||
|
|
||||||
|
|
||||||
|
def read_msa_file(input: str):
|
||||||
|
if not os.path.isfile(input):
|
||||||
|
error(f'"{input}" could not be found.')
|
||||||
|
exit(3)
|
||||||
|
for s_record in SeqIO.parse(input, "fasta"):
|
||||||
|
yield s_record
|
||||||
|
|
||||||
|
|
||||||
|
def write_to_file(
|
||||||
|
output_dir: str,
|
||||||
|
gene_name: str,
|
||||||
|
start: int,
|
||||||
|
end: int,
|
||||||
|
full_suffix: str,
|
||||||
|
ns_suffix: str,
|
||||||
|
aa_suffix: str,
|
||||||
|
nt_sequence_records: list,
|
||||||
|
nt_no_stop_sequence_records: list,
|
||||||
|
aa_sequence_records: list = None,
|
||||||
|
aa_no_stop_sequence_records: list = None,
|
||||||
|
):
|
||||||
|
|
||||||
|
output_path = os.path.abspath(output_dir)
|
||||||
|
if not os.path.isdir(output_dir):
|
||||||
|
os.makedirs(output_path)
|
||||||
|
|
||||||
|
SeqIO.write(
|
||||||
|
nt_sequence_records,
|
||||||
|
os.path.join(
|
||||||
|
output_path,
|
||||||
|
f'{gene_name or f"{start} - {end - 3}"}' f" - {full_suffix}.fasta",
|
||||||
|
),
|
||||||
|
"fasta",
|
||||||
|
)
|
||||||
|
SeqIO.write(
|
||||||
|
nt_no_stop_sequence_records,
|
||||||
|
os.path.join(
|
||||||
|
output_path,
|
||||||
|
f'{gene_name or f"{start} - {end - 3}"}' f" - {ns_suffix}.fasta",
|
||||||
|
),
|
||||||
|
"fasta",
|
||||||
|
)
|
||||||
|
|
||||||
|
if aa_sequence_records:
|
||||||
|
SeqIO.write(
|
||||||
|
aa_sequence_records,
|
||||||
|
os.path.join(
|
||||||
|
output_path,
|
||||||
|
f'{gene_name or f"{start} - {end - 3}"}'
|
||||||
|
f" - {aa_suffix}"
|
||||||
|
f" - {full_suffix}.fasta",
|
||||||
|
),
|
||||||
|
"fasta",
|
||||||
|
)
|
||||||
|
|
||||||
|
if aa_no_stop_sequence_records:
|
||||||
|
SeqIO.write(
|
||||||
|
aa_no_stop_sequence_records,
|
||||||
|
os.path.join(
|
||||||
|
output_path,
|
||||||
|
f'{gene_name or f"{start} - {end - 3}"}'
|
||||||
|
f" - {aa_suffix}"
|
||||||
|
f" - {ns_suffix}.fasta",
|
||||||
|
),
|
||||||
|
"fasta",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
STOP_CODONS = {"TAG", "TAA", "TGA"}
|
||||||
|
|
||||||
|
START_CODON = "ATG"
|
||||||
|
|
||||||
|
|
||||||
|
def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction_range=16):
|
||||||
|
|
||||||
|
nt_sequence_records = []
|
||||||
|
nt_no_stop_sequence_records = []
|
||||||
|
aa_sequence_records = []
|
||||||
|
aa_no_stop_sequence_records = []
|
||||||
|
|
||||||
|
debug(f"Beginning sequence trimming for {start} - {end}.")
|
||||||
|
for s_record in msa_records:
|
||||||
|
record_metadata = (
|
||||||
|
s_record.id,
|
||||||
|
s_record.name,
|
||||||
|
s_record.description,
|
||||||
|
s_record.dbxrefs,
|
||||||
|
s_record.features,
|
||||||
|
s_record.annotations,
|
||||||
|
s_record.letter_annotations,
|
||||||
|
)
|
||||||
|
found_start_codon = False
|
||||||
|
start_shift = 0
|
||||||
|
for i in range(correction_range):
|
||||||
|
if s_record.seq[start + i: start + i + 3] == START_CODON:
|
||||||
|
found_start_codon = True
|
||||||
|
start_shift = i
|
||||||
|
break
|
||||||
|
if s_record.seq[start - i: start - i + 3] == START_CODON:
|
||||||
|
found_start_codon = True
|
||||||
|
start_shift = -i
|
||||||
|
break
|
||||||
|
if not found_start_codon:
|
||||||
|
warning(
|
||||||
|
f"Could not find start codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
|
||||||
|
if start_shift != 0:
|
||||||
|
warning(
|
||||||
|
f"Start codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting start location to {start + start_shift}")
|
||||||
|
|
||||||
|
end_shift = 0
|
||||||
|
found_stop_codon = False
|
||||||
|
for i in range(correction_range):
|
||||||
|
if s_record.seq[end + i - 3: end + i] in STOP_CODONS:
|
||||||
|
found_stop_codon = True
|
||||||
|
end_shift = i
|
||||||
|
break
|
||||||
|
if s_record.seq[end - i - 3: end - i] in STOP_CODONS:
|
||||||
|
found_stop_codon = True
|
||||||
|
end_shift = -i
|
||||||
|
break
|
||||||
|
if not found_stop_codon:
|
||||||
|
warning(
|
||||||
|
f"Could not find stop codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
|
||||||
|
if end_shift != 0:
|
||||||
|
warning(
|
||||||
|
f"Stop codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting end location to: {end + end_shift}.")
|
||||||
|
nt_sequence = s_record.seq[start +
|
||||||
|
start_shift: end + end_shift] # Cropping step
|
||||||
|
nt_no_stop_sequence = s_record.seq[start +
|
||||||
|
start_shift: end - 3 + end_shift]
|
||||||
|
nt_sequence_records.append(
|
||||||
|
SeqRecord.SeqRecord(nt_sequence, *record_metadata))
|
||||||
|
nt_no_stop_sequence_records.append(
|
||||||
|
SeqRecord.SeqRecord(nt_no_stop_sequence, *record_metadata)
|
||||||
|
)
|
||||||
|
|
||||||
|
if gen_cut_stop_codon:
|
||||||
|
aa_sequence = nt_sequence.translate()
|
||||||
|
aa_no_stop_sequence = nt_no_stop_sequence.translate()
|
||||||
|
|
||||||
|
aa_sequence_records.append(
|
||||||
|
SeqRecord.SeqRecord(aa_sequence, *record_metadata)
|
||||||
|
)
|
||||||
|
|
||||||
|
aa_no_stop_sequence_records.append(
|
||||||
|
SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
|
||||||
|
)
|
||||||
|
debug(f"Trimming for {s_record.id} complete.")
|
||||||
|
debug(f"Sequence trimming for {start} - {end} complete.")
|
||||||
|
return (
|
||||||
|
nt_sequence_records,
|
||||||
|
nt_no_stop_sequence_records,
|
||||||
|
aa_sequence_records,
|
||||||
|
aa_no_stop_sequence_records,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
|
||||||
|
logging.basicConfig(level=args.log_level.upper())
|
||||||
|
|
||||||
|
msa_records = list(read_msa_file(args.input))
|
||||||
|
info(f"MSA records read complete. Found {len(msa_records)} records.")
|
||||||
|
genes = []
|
||||||
|
if args.gene_list:
|
||||||
|
genes = read_genes_from_csv(args.gene_list)
|
||||||
|
info(
|
||||||
|
f"Gene list read from {args.gene_list} resulted in {len(genes)} genes.")
|
||||||
|
else:
|
||||||
|
if args.gene_name and args.start and args.end:
|
||||||
|
genes.append([args.gene_name, args.start, args.end])
|
||||||
|
info(
|
||||||
|
f"Extracting {args.gene_name} starting at {args.start} to {args.end}.")
|
||||||
|
else:
|
||||||
|
raise Exception(
|
||||||
|
"Need either a gene list by --gene-list or a start and end via --start, and --end respectively."
|
||||||
|
)
|
||||||
|
for gene_name, start, end in genes:
|
||||||
|
(
|
||||||
|
nt_sequence_records,
|
||||||
|
nt_no_stop_sequence_records,
|
||||||
|
aa_sequence_records,
|
||||||
|
aa_no_stop_sequence_records,
|
||||||
|
) = trim(start, end, args.gen_cut_stop_codon, msa_records, correction_range=args.correction_range)
|
||||||
|
write_to_file(
|
||||||
|
args.output_dir,
|
||||||
|
gene_name,
|
||||||
|
start,
|
||||||
|
end,
|
||||||
|
args.full_suffix,
|
||||||
|
args.ns_suffix,
|
||||||
|
args.aa_suffix,
|
||||||
|
nt_sequence_records,
|
||||||
|
nt_no_stop_sequence_records,
|
||||||
|
aa_sequence_records,
|
||||||
|
aa_no_stop_sequence_records,
|
||||||
|
)
|
||||||
|
info(f"Extracted gene sequence for {gene_name} ({start} - {end})")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
prog="msa_splitter",
|
||||||
|
description="""
|
||||||
|
The MSA splitter is a simple program that takes in two positions
|
||||||
|
and a MSA file and produces two separate FASTA files
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"input",
|
||||||
|
help="The MSA file in FASTA format.",
|
||||||
|
type=str,
|
||||||
|
metavar="i",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-L",
|
||||||
|
"--gene-list",
|
||||||
|
help="A list of genes in CSV file.",
|
||||||
|
type=str,
|
||||||
|
required=False,
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-s",
|
||||||
|
"--start",
|
||||||
|
help="The start of the desired sequence.",
|
||||||
|
type=int,
|
||||||
|
required=False,
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-e",
|
||||||
|
"--end",
|
||||||
|
help="The end of the desired sequence.",
|
||||||
|
type=int,
|
||||||
|
required=False,
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-n",
|
||||||
|
"--gene-name",
|
||||||
|
help="The name of the gene with the given start and end locations.",
|
||||||
|
type=str,
|
||||||
|
required=False,
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-o",
|
||||||
|
"--output-dir",
|
||||||
|
help="""The directory (folder) of which the
|
||||||
|
generated files should reside.
|
||||||
|
""",
|
||||||
|
type=str,
|
||||||
|
required=False,
|
||||||
|
default="output",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-G",
|
||||||
|
"--gen-cut-stop-codon",
|
||||||
|
help="""If specified, for each FASTA file generated,
|
||||||
|
another FASTA will be generated without the stop codon.
|
||||||
|
""",
|
||||||
|
required=False,
|
||||||
|
default=False,
|
||||||
|
action="store_true"
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-nt",
|
||||||
|
"--full-suffix",
|
||||||
|
help="""The suffix to append to the full nucleotide sequence
|
||||||
|
FASTA output files.
|
||||||
|
""",
|
||||||
|
type=str,
|
||||||
|
required=False,
|
||||||
|
default="full",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-ns",
|
||||||
|
"--ns-suffix",
|
||||||
|
help="""The suffix to append to the full nucleotide sequence
|
||||||
|
without the stop FASTA output files.
|
||||||
|
""",
|
||||||
|
type=str,
|
||||||
|
required=False,
|
||||||
|
default="no stop",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-aa",
|
||||||
|
"--aa-suffix",
|
||||||
|
help="""The suffix to append to the amino acid sequence FASTA
|
||||||
|
output files.
|
||||||
|
""",
|
||||||
|
type=str,
|
||||||
|
required=False,
|
||||||
|
default="aa",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--correction-range",
|
||||||
|
"-R",
|
||||||
|
help="The number of offsets in terms of nucleotides to try in both directions to correct start and stop codons before giving up.",
|
||||||
|
type=int,
|
||||||
|
required=False,
|
||||||
|
default=9
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-E",
|
||||||
|
"--log-level",
|
||||||
|
help="The verbosity of the program.",
|
||||||
|
type=str,
|
||||||
|
required=False,
|
||||||
|
default="INFO"
|
||||||
|
)
|
||||||
|
|
||||||
|
main(parser.parse_args())
|
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
Bio==1.5.6
|
Loading…
x
Reference in New Issue
Block a user