From eccb08153ef92610db0dfeb46bb010fd47d4a9e9 Mon Sep 17 00:00:00 2001 From: Harrison Date: Tue, 11 Apr 2023 12:33:52 -0500 Subject: [PATCH] Renamed project to "SplitMSA" and added pipeline file --- Jenkinsfile | 26 +++++ README.md | 2 +- environment.yaml | 46 -------- environment.yml | 5 + pyproject.toml | 3 + setup.cfg | 12 +++ setup.py | 3 + msa_splitter.py => splitmsa/splitmsa.py | 137 ++++++++++++------------ 8 files changed, 120 insertions(+), 114 deletions(-) create mode 100644 Jenkinsfile delete mode 100644 environment.yaml create mode 100644 environment.yml create mode 100644 pyproject.toml create mode 100644 setup.cfg create mode 100644 setup.py rename msa_splitter.py => splitmsa/splitmsa.py (99%) diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000..6fb36f0 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,26 @@ +pipeline { + agent any + stages { + stage("install") { + steps { + sh 'conda env update --file environment.yml' + sh 'echo "conda activate splitmsa" >> ~/.bashrc' + } + } + stage("build") { + steps { + sh "python -m build" + } + } + stage("publish") { + when { + branch '**/master' + } + steps { + withCredentials([usernamePassword(credentialsId: 'rs-git-package-registry-ydeng', passwordVariable: 'PASS', usernameVariable: 'USER')]) { + sh "python -m twine upload --repository-url https://git.reslate.systems/api/packages/${USER}/pypi -u ${USER} -p ${PASS} --non-interactive --disable-progress-bar --verbose dist/*" + } + } + } + } +} \ No newline at end of file diff --git a/README.md b/README.md index d560a84..8ca6806 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# MSA Splitter +# SplitMSA Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file. diff --git a/environment.yaml b/environment.yaml deleted file mode 100644 index 0d4902a..0000000 --- a/environment.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: /home/ydeng/msa-splitter/envs -channels: - - conda-forge -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu - - biopython=1.81=py311h2582759_0 - - black=23.3.0=py311h38be061_0 - - bzip2=1.0.8=h7f98852_4 - - ca-certificates=2022.12.7=ha878542_0 - - click=8.1.3=unix_pyhd8ed1ab_2 - - ld_impl_linux-64=2.40=h41732ed_0 - - libblas=3.9.0=16_linux64_openblas - - libcblas=3.9.0=16_linux64_openblas - - libexpat=2.5.0=hcb278e6_1 - - libffi=3.4.2=h7f98852_5 - - libgcc-ng=12.2.0=h65d4601_19 - - libgfortran-ng=12.2.0=h69a702a_19 - - libgfortran5=12.2.0=h337968e_19 - - libgomp=12.2.0=h65d4601_19 - - liblapack=3.9.0=16_linux64_openblas - - libnsl=2.0.0=h7f98852_0 - - libopenblas=0.3.21=pthreads_h78a6416_3 - - libsqlite=3.40.0=h753d276_0 - - libstdcxx-ng=12.2.0=h46fd767_19 - - libuuid=2.38.1=h0b41bf4_0 - - libzlib=1.2.13=h166bdaf_4 - - mypy_extensions=1.0.0=pyha770c72_0 - - ncurses=6.3=h27087fc_1 - - numpy=1.24.2=py311h8e6699e_0 - - openssl=3.1.0=h0b41bf4_0 - - packaging=23.0=pyhd8ed1ab_0 - - pathspec=0.11.1=pyhd8ed1ab_0 - - pip=23.0.1=pyhd8ed1ab_0 - - platformdirs=3.2.0=pyhd8ed1ab_0 - - python=3.11.1=h2755cc3_0_cpython - - python_abi=3.11=3_cp311 - - readline=8.2=h8228510_1 - - setuptools=67.6.1=pyhd8ed1ab_0 - - tk=8.6.12=h27826a3_0 - - typing-extensions=4.5.0=hd8ed1ab_0 - - typing_extensions=4.5.0=pyha770c72_0 - - tzdata=2023c=h71feb2d_0 - - wheel=0.40.0=pyhd8ed1ab_0 - - xz=5.2.6=h166bdaf_0 -prefix: /home/ydeng/msa-splitter/envs diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..3ced613 --- /dev/null +++ b/environment.yml @@ -0,0 +1,5 @@ +name: splitmsa +channels: + - conda-forge +dependencies: + - biopython=1.81 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..5f8f5e3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools", "wheel"] \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..3f5f3de --- /dev/null +++ b/setup.cfg @@ -0,0 +1,12 @@ +[metadata] +name = splitmsa +version = 0.0.1 + +[options] +packages = splitmsa +install_requires = + Bio + +[options.entry_points] +console_scripts = + splitmsa = splitmsa.splitmsa:main \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..6068493 --- /dev/null +++ b/setup.py @@ -0,0 +1,3 @@ +from setuptools import setup + +setup() diff --git a/msa_splitter.py b/splitmsa/splitmsa.py similarity index 99% rename from msa_splitter.py rename to splitmsa/splitmsa.py index 14fb3a4..af66656 100755 --- a/msa_splitter.py +++ b/splitmsa/splitmsa.py @@ -203,7 +203,7 @@ def trim( ) if perform_translation and not skip_translation: - if '-' in nt_sequence: + if "-" in nt_sequence: sequence_with_ambiguity = [] for codon_in_sequence in range(0, len(nt_sequence), 3): codon = nt_sequence[codon_in_sequence : codon_in_sequence + 3] @@ -247,72 +247,9 @@ def output_as_csv(gene: str, problems: list[list[str]], output_path: str): writer.writerows(problems) -def main(args): - logging.basicConfig(level=args.log_level.upper()) - - msa_records = list(read_msa_file(args.input)) - info(f"MSA records read complete. Found {len(msa_records)} records.") - genes = [] - if args.gene_list: - genes = read_genes_from_csv(args.gene_list) - info(f"Gene list read from {args.gene_list} resulted in {len(genes)} " "genes.") - else: - if args.gene_name and args.start and args.end: - genes.append([args.gene_name, args.start, args.end]) - info( - f"Extracting {args.gene_name} starting at {args.start} to " - f"{args.end}." - ) - else: - raise Exception( - "Need either a gene list by --gene-list or a start and end " - "via --start, and --end respectively." - ) - for gene_name, start, end in genes: - info(f"Started on gene {gene_name} ({start} - {end})") - ( - nt_sequence_records, - nt_no_stop_sequence_records, - aa_sequence_records, - aa_no_stop_sequence_records, - problems, - ) = trim( - start, - end, - args.gen_cut_stop_codon, - args.do_translate, - msa_records, - correction_range=args.correction_range, - ) - if len(problems) > 0: - warning( - f"There were {len(problems)} problems " f"during trimming {gene_name}!" - ) - if args.catalogue_problems: - output_as_csv( - gene_name, - problems, - os.path.join(args.output_dir, f"{gene_name} - problems.csv"), - ) - write_to_file( - args.output_dir, - gene_name, - start, - end, - args.full_suffix, - args.ns_suffix, - args.aa_suffix, - nt_sequence_records, - nt_no_stop_sequence_records, - aa_sequence_records, - aa_no_stop_sequence_records, - ) - info(f"Completed gene {gene_name} ({start} - {end})") - - -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser( - prog="msa_splitter", + prog="splitmsa", description=""" The MSA splitter is a simple program that takes in two positions and a MSA file and produces two separate FASTA files @@ -453,4 +390,70 @@ if __name__ == "__main__": action="store_true", ) - main(parser.parse_args()) + args = parser.parse_args() + + logging.basicConfig(level=args.log_level.upper()) + + msa_records = list(read_msa_file(args.input)) + info(f"MSA records read complete. Found {len(msa_records)} records.") + genes = [] + if args.gene_list: + genes = read_genes_from_csv(args.gene_list) + info(f"Gene list read from {args.gene_list} resulted in {len(genes)} " "genes.") + else: + if args.gene_name and args.start and args.end: + genes.append([args.gene_name, args.start, args.end]) + info( + f"Extracting {args.gene_name} starting at {args.start} to " + f"{args.end}." + ) + else: + raise Exception( + "Need either a gene list by --gene-list or a start and end " + "via --start, and --end respectively." + ) + for gene_name, start, end in genes: + info(f"Started on gene {gene_name} ({start} - {end})") + ( + nt_sequence_records, + nt_no_stop_sequence_records, + aa_sequence_records, + aa_no_stop_sequence_records, + problems, + ) = trim( + start, + end, + args.gen_cut_stop_codon, + args.do_translate, + msa_records, + correction_range=args.correction_range, + ) + if len(problems) > 0: + warning( + f"There were {len(problems)} problems " f"during trimming {gene_name}!" + ) + if args.catalogue_problems: + output_as_csv( + gene_name, + problems, + os.path.join(args.output_dir, f"{gene_name} - problems.csv"), + ) + write_to_file( + args.output_dir, + gene_name, + start, + end, + args.full_suffix, + args.ns_suffix, + args.aa_suffix, + nt_sequence_records, + nt_no_stop_sequence_records, + aa_sequence_records, + aa_no_stop_sequence_records, + ) + info(f"Completed gene {gene_name} ({start} - {end})") + + +if __name__ == "__main__": + + main()