From 42d73270b39d4dcff712a82d409e2e7b663566a6 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Fri, 10 Mar 2023 15:40:42 -0600 Subject: [PATCH] Initial commit. Finished program with additional functions such as: * Custom search string * Custom search location --- .gitignore | 195 ++++++++++++++++++++++++++++++++++++++++++++ .vscode/launch.json | 21 +++++ README.md | 18 ++++ fasta_filter.py | 65 +++++++++++++++ 4 files changed, 299 insertions(+) create mode 100644 .gitignore create mode 100644 .vscode/launch.json create mode 100644 README.md create mode 100755 fasta_filter.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4934ad1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,195 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..13dd9c7 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,21 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. 
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Run Fasta Filter", + "type": "python", + "request": "launch", + "program": "fasta_filter.py", + "args": [ + "reference_standards.fas", + "reference_stasndards_filtered.fas", + "-c", "rt" + ], + "console": "integratedTerminal", + "justMyCode": true + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..deee741 --- /dev/null +++ b/README.md @@ -0,0 +1,18 @@ +# Fasta Filter + +A simple program to remove sequences in a fasta file containing specific strings. + +## Usage + + 0. Install python3. + + 1. Run `pip install -r ./requirements.txt` if `biopython` is not installed. If installed, skip to step 2. + + 2. Run `./fasta_filter.py -h` if on Linux, or on Windows `python3 ./fasta_filter.py -h`. See help for more information. + +### Example Usage + + 1. Pretend `input.fasta` exists in the same folder as `fasta_filter.py` (the program file). + + 2. 
To remove all sequences in `input.fasta` that contain `rt` in the header and output to a new file called `input_filtered.fasta`, run: `python3 ./fasta_filter.py input.fasta input_filtered.fasta -c rt`
+
diff --git a/fasta_filter.py b/fasta_filter.py
new file mode 100755
index 0000000..b3abbcf
--- /dev/null
+++ b/fasta_filter.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+"""Remove sequences from a FASTA file whose header contains a string."""
+
+import os
+from Bio import SeqIO
+import argparse
+
+# Record attributes that can be searched; "all" checks every one of them.
+SEARCHABLE_PROPERTIES = ("id", "name", "description")
+
+
+def _record_matches(record, needle, prop):
+    """Return True if ``needle`` occurs in the selected part of ``record``.
+
+    ``prop`` is one of SEARCHABLE_PROPERTIES or "all". With "all" the record
+    matches when the string occurs in ANY searchable attribute. A ``needle``
+    of None (no -c/--contains given) never matches, so nothing is removed.
+    """
+    if needle is None:
+        return False
+    if prop == "all":
+        return any(
+            needle in getattr(record, name) for name in SEARCHABLE_PROPERTIES
+        )
+    return needle in getattr(record, prop)
+
+
+def main(args):
+    """Copy ``args.input`` to ``args.output`` without the matching records.
+
+    ``args`` must provide ``input``, ``output``, ``contains`` and ``property``
+    (see the argument parser below).
+    """
+    # The whole input is read into a list before SeqIO.write() is called, so
+    # filtering a file in place (input path == output path) keeps working.
+    kept_records = [
+        s_record
+        for s_record in SeqIO.parse(args.input, "fasta")
+        if not _record_matches(s_record, args.contains, args.property)
+    ]
+    SeqIO.write(kept_records, os.path.abspath(args.output), "fasta")
+
+
+if __name__ == "__main__":
+    # The text must be passed as ``description``; the first positional
+    # parameter of ArgumentParser is ``prog`` (the program name).
+    argparser = argparse.ArgumentParser(
+        description="Removes sequences where id contains certain string."
+    )
+    argparser.add_argument(
+        "input",
+        type=str,
+        metavar="i",
+        help="The input fasta file."
+    )
+
+    argparser.add_argument(
+        "output",
+        type=str,
+        metavar="o",
+        help="The output file path."
+    )
+
+    argparser.add_argument(
+        "-c",
+        "--contains",
+        type=str,
+        default=None,
+        required=False,
+        help="The string to search for."
+    )
+
+    argparser.add_argument(
+        "-p",
+        "--property",
+        type=str,
+        default="all",
+        choices=SEARCHABLE_PROPERTIES + ("all",),
+        required=False,
+        help="""
+        The part of the fasta file to look through. Valid options are: id,
+        name, description, or all.
+        """
+    )
+
+    main(argparser.parse_args())