From eccb08153ef92610db0dfeb46bb010fd47d4a9e9 Mon Sep 17 00:00:00 2001
From: Harrison <yunyangdeng@gmail.com>
Date: Tue, 11 Apr 2023 12:33:52 -0500
Subject: [PATCH] Renamed project to "SplitMSA" and added pipeline file

---
 Jenkinsfile                             |  26 +++++
 README.md                               |   2 +-
 environment.yaml                        |  46 --------
 environment.yml                         |   5 +
 pyproject.toml                          |   3 +
 setup.cfg                               |  12 +++
 setup.py                                |   3 +
 msa_splitter.py => splitmsa/splitmsa.py | 137 ++++++++++++------------
 8 files changed, 120 insertions(+), 114 deletions(-)
 create mode 100644 Jenkinsfile
 delete mode 100644 environment.yaml
 create mode 100644 environment.yml
 create mode 100644 pyproject.toml
 create mode 100644 setup.cfg
 create mode 100644 setup.py
 rename msa_splitter.py => splitmsa/splitmsa.py (99%)

diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 0000000..6fb36f0
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,26 @@
+pipeline {
+    agent any
+    stages {
+        stage("install") {
+            steps {
+                sh 'conda env update --file environment.yml'
+                sh 'echo "conda activate splitmsa" >> ~/.bashrc' // NOTE(review): appended on every run, and non-interactive sh steps presumably never source ~/.bashrc - confirm the env is really active in later stages
+            }
+        }
+        stage("build") {
+            steps {
+                sh "python -m build"
+            }
+        }
+        stage("publish") {
+            when {
+                branch '**/master'
+            }
+            steps {
+                withCredentials([usernamePassword(credentialsId: 'rs-git-package-registry-ydeng', passwordVariable: 'PASS', usernameVariable: 'USER')]) { // binds the stored credential to the USER and PASS environment variables for the enclosed steps
+                    sh 'python -m twine upload --repository-url "https://git.reslate.systems/api/packages/${USER}/pypi" -u "${USER}" -p "${PASS}" --non-interactive --disable-progress-bar --verbose dist/*' // single-quoted on purpose: the shell, not Groovy, expands USER/PASS, so the secret is never interpolated into the script text
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index d560a84..8ca6806 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# MSA Splitter
+# SplitMSA
 
 Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file.
 
diff --git a/environment.yaml b/environment.yaml
deleted file mode 100644
index 0d4902a..0000000
--- a/environment.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-name: /home/ydeng/msa-splitter/envs
-channels:
-  - conda-forge
-dependencies:
-  - _libgcc_mutex=0.1=conda_forge
-  - _openmp_mutex=4.5=2_gnu
-  - biopython=1.81=py311h2582759_0
-  - black=23.3.0=py311h38be061_0
-  - bzip2=1.0.8=h7f98852_4
-  - ca-certificates=2022.12.7=ha878542_0
-  - click=8.1.3=unix_pyhd8ed1ab_2
-  - ld_impl_linux-64=2.40=h41732ed_0
-  - libblas=3.9.0=16_linux64_openblas
-  - libcblas=3.9.0=16_linux64_openblas
-  - libexpat=2.5.0=hcb278e6_1
-  - libffi=3.4.2=h7f98852_5
-  - libgcc-ng=12.2.0=h65d4601_19
-  - libgfortran-ng=12.2.0=h69a702a_19
-  - libgfortran5=12.2.0=h337968e_19
-  - libgomp=12.2.0=h65d4601_19
-  - liblapack=3.9.0=16_linux64_openblas
-  - libnsl=2.0.0=h7f98852_0
-  - libopenblas=0.3.21=pthreads_h78a6416_3
-  - libsqlite=3.40.0=h753d276_0
-  - libstdcxx-ng=12.2.0=h46fd767_19
-  - libuuid=2.38.1=h0b41bf4_0
-  - libzlib=1.2.13=h166bdaf_4
-  - mypy_extensions=1.0.0=pyha770c72_0
-  - ncurses=6.3=h27087fc_1
-  - numpy=1.24.2=py311h8e6699e_0
-  - openssl=3.1.0=h0b41bf4_0
-  - packaging=23.0=pyhd8ed1ab_0
-  - pathspec=0.11.1=pyhd8ed1ab_0
-  - pip=23.0.1=pyhd8ed1ab_0
-  - platformdirs=3.2.0=pyhd8ed1ab_0
-  - python=3.11.1=h2755cc3_0_cpython
-  - python_abi=3.11=3_cp311
-  - readline=8.2=h8228510_1
-  - setuptools=67.6.1=pyhd8ed1ab_0
-  - tk=8.6.12=h27826a3_0
-  - typing-extensions=4.5.0=hd8ed1ab_0
-  - typing_extensions=4.5.0=pyha770c72_0
-  - tzdata=2023c=h71feb2d_0
-  - wheel=0.40.0=pyhd8ed1ab_0
-  - xz=5.2.6=h166bdaf_0
-prefix: /home/ydeng/msa-splitter/envs
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..3ced613
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,5 @@
+name: splitmsa
+channels:
+  - conda-forge
+dependencies:
+  - biopython=1.81
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..5f8f5e3
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = ["setuptools", "wheel"]
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..3f5f3de
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,12 @@
+[metadata]
+name = splitmsa
+version = 0.0.1
+
+[options]
+packages = splitmsa
+install_requires =
+    biopython
+
+[options.entry_points]
+console_scripts =
+    splitmsa = splitmsa.splitmsa:main
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..6068493
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,3 @@
+from setuptools import setup
+
+setup()
diff --git a/msa_splitter.py b/splitmsa/splitmsa.py
similarity index 99%
rename from msa_splitter.py
rename to splitmsa/splitmsa.py
index 14fb3a4..af66656 100755
--- a/msa_splitter.py
+++ b/splitmsa/splitmsa.py
@@ -203,7 +203,7 @@ def trim(
             )
 
         if perform_translation and not skip_translation:
-            if '-' in nt_sequence:
+            if "-" in nt_sequence:
                 sequence_with_ambiguity = []
                 for codon_in_sequence in range(0, len(nt_sequence), 3):
                     codon = nt_sequence[codon_in_sequence : codon_in_sequence + 3]
@@ -247,72 +247,9 @@ def output_as_csv(gene: str, problems: list[list[str]], output_path: str):
         writer.writerows(problems)
 
 
-def main(args):
-    logging.basicConfig(level=args.log_level.upper())
-
-    msa_records = list(read_msa_file(args.input))
-    info(f"MSA records read complete. Found {len(msa_records)} records.")
-    genes = []
-    if args.gene_list:
-        genes = read_genes_from_csv(args.gene_list)
-        info(f"Gene list read from {args.gene_list} resulted in {len(genes)} " "genes.")
-    else:
-        if args.gene_name and args.start and args.end:
-            genes.append([args.gene_name, args.start, args.end])
-            info(
-                f"Extracting {args.gene_name} starting at {args.start} to "
-                f"{args.end}."
-            )
-        else:
-            raise Exception(
-                "Need either a gene list by --gene-list or a start and end "
-                "via --start, and --end respectively."
-            )
-    for gene_name, start, end in genes:
-        info(f"Started on gene {gene_name} ({start} - {end})")
-        (
-            nt_sequence_records,
-            nt_no_stop_sequence_records,
-            aa_sequence_records,
-            aa_no_stop_sequence_records,
-            problems,
-        ) = trim(
-            start,
-            end,
-            args.gen_cut_stop_codon,
-            args.do_translate,
-            msa_records,
-            correction_range=args.correction_range,
-        )
-        if len(problems) > 0:
-            warning(
-                f"There were {len(problems)} problems " f"during trimming {gene_name}!"
-            )
-        if args.catalogue_problems:
-            output_as_csv(
-                gene_name,
-                problems,
-                os.path.join(args.output_dir, f"{gene_name} - problems.csv"),
-            )
-        write_to_file(
-            args.output_dir,
-            gene_name,
-            start,
-            end,
-            args.full_suffix,
-            args.ns_suffix,
-            args.aa_suffix,
-            nt_sequence_records,
-            nt_no_stop_sequence_records,
-            aa_sequence_records,
-            aa_no_stop_sequence_records,
-        )
-        info(f"Completed gene {gene_name} ({start} - {end})")
-
-
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser(
-        prog="msa_splitter",
+        prog="splitmsa",
         description="""
             The MSA splitter is a simple program that takes in two positions
             and a MSA file and produces two separate FASTA files
@@ -453,4 +390,70 @@ if __name__ == "__main__":
         action="store_true",
     )
 
-    main(parser.parse_args())
+    args = parser.parse_args()
+
+    logging.basicConfig(level=args.log_level.upper())
+
+    msa_records = list(read_msa_file(args.input))
+    info(f"MSA records read complete. Found {len(msa_records)} records.")
+    genes = []
+    if args.gene_list:
+        genes = read_genes_from_csv(args.gene_list)
+        info(f"Gene list read from {args.gene_list} resulted in {len(genes)} " "genes.")
+    else:
+        if args.gene_name and args.start and args.end:
+            genes.append([args.gene_name, args.start, args.end])
+            info(
+                f"Extracting {args.gene_name} starting at {args.start} to "
+                f"{args.end}."
+            )
+        else:
+            raise Exception(
+                "Need either a gene list by --gene-list or a start and end "
+                "via --start, and --end respectively."
+            )
+    for gene_name, start, end in genes:
+        info(f"Started on gene {gene_name} ({start} - {end})")
+        (
+            nt_sequence_records,
+            nt_no_stop_sequence_records,
+            aa_sequence_records,
+            aa_no_stop_sequence_records,
+            problems,
+        ) = trim(
+            start,
+            end,
+            args.gen_cut_stop_codon,
+            args.do_translate,
+            msa_records,
+            correction_range=args.correction_range,
+        )
+        if len(problems) > 0:
+            warning(
+                f"There were {len(problems)} problems " f"during trimming {gene_name}!"
+            )
+        if args.catalogue_problems:
+            output_as_csv(
+                gene_name,
+                problems,
+                os.path.join(args.output_dir, f"{gene_name} - problems.csv"),
+            )
+        write_to_file(
+            args.output_dir,
+            gene_name,
+            start,
+            end,
+            args.full_suffix,
+            args.ns_suffix,
+            args.aa_suffix,
+            nt_sequence_records,
+            nt_no_stop_sequence_records,
+            aa_sequence_records,
+            aa_no_stop_sequence_records,
+        )
+        info(f"Completed gene {gene_name} ({start} - {end})")
+
+
+if __name__ == "__main__":
+
+    main()