Added python project description files

Added 'Jenkinsfile' and 'environment.yml'
Removed bad parameter
2023-04-21 11:47:28 -05:00 · 2023-04-21 11:44:01 -05:00 · 2023-04-20 15:16:34 -05:00 · 2023-04-20 12:24:26 -05:00 · 2023-04-20 10:07:39 -05:00
12 changed files with 272 additions and 194 deletions
--- a/36
+++ b/36
@@ -0,0 +1,36 @@
+// CI pipeline: clean old artifacts, prepare the mamba environment, build the
+// package, smoke-test the built wheel by installing it, and publish from master.
+pipeline {
+    agent any
+    stages {
+        stage("clean") {
+            steps {
+                // Remove artifacts of earlier builds so only fresh files get published.
+                sh 'rm -rf ./dist/*'
+            }
+        }
+        stage("install") {
+            steps {
+                sh 'mamba env update --file environment.yml'
+                // Append the activation line only when it is not already present,
+                // so ~/.bashrc does not grow by one line on every build.
+                sh 'grep -qxF "mamba activate bmlsa" ~/.bashrc 2>/dev/null || echo "mamba activate bmlsa" >> ~/.bashrc'
+            }
+        }
+        stage("build") {
+            steps {
+                sh "python -m build"
+            }
+        }
+        stage("test") {
+            steps {
+                // Smoke test: the freshly built wheel must at least be installable.
+                sh "pip install dist/*.whl"
+            }
+        }
+        stage("publish") {
+            when {
+                branch '**/master'
+            }
+            steps {
+                withCredentials([usernamePassword(credentialsId: 'rs-git-package-registry-ydeng', passwordVariable: 'PASS', usernameVariable: 'USER')]) {
+                    // Single-quoted on purpose: the shell expands $USER/$PASS from the
+                    // environment set by withCredentials. A double-quoted Groovy string
+                    // would interpolate the secret into the script text itself.
+                    sh 'python -m twine upload --repository-url https://git.reslate.systems/api/packages/$USER/pypi -u "$USER" -p "$PASS" --non-interactive --disable-progress-bar --verbose dist/*'
+                }
+            }
+        }
+    }
+}
--- a/protannot/init.py
+++ b/protannot/init.py
--- a/bmlsa/aligner.py
+++ b/bmlsa/aligner.py
@@ -0,0 +1,43 @@
+from Bio.Align import PairwiseAligner, substitution_matrices
+from exceptions import UnexpectedAlignmentResult
+
+from datatypes import AlignedSequence
+
+
+def protein_align_many_to_one_ssw(sequence: str, queries: dict[str, AlignedSequence]):
+    annotation_pairs = {}
+    aligner = PairwiseAligner()
+    aligner.mode = "local"
+    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
+    aligner.extend_gap_score = -1
+    aligner.open_gap_score = -11
+    for id, query in queries.items():
+        try:
+            alignments = aligner.align(sequence, query.sequence)
+        except ValueError:
+            continue
+        if len(alignments) > 1:
+            raise UnexpectedAlignmentResult(
+                "More than one alignment resulted from a single query."
+            )
+        for alignment in alignments:
+            score, query_aligned = (alignment.score, alignment.aligned[0][0])
+            aligned_start, aligned_end = query_aligned
+            annotation_pairs[id] = AlignedSequence(
+                id,
+                query.sequence,
+                query.name,
+                query.description,
+                query.start,
+                query.end,
+                query.score,
+            ), AlignedSequence(
+                id,
+                alignment.query,
+                query.name,
+                query.description,
+                aligned_start,
+                aligned_end,
+                score,
+            )
+    return annotation_pairs
--- a/bmlsa/cli.py
+++ b/bmlsa/cli.py
@@ -0,0 +1,94 @@
+import os
+import argparse
+from Bio import SeqIO
+from aligner import protein_align_many_to_one_ssw
+
+from persistence import read_annotations_from_csv, save_alignments_to_csv
+
+
+def main():
+    argparser = argparse.ArgumentParser("blmsa")
+    argparser.add_argument(
+        "annotations",
+        type=str,
+        help=(
+            "Path to CSV containing the sequences to align as well as the "
+            "annotations for the respective sequences"
+        ),
+        metavar="a",
+    )
+    argparser.add_argument(
+        "sequence",
+        type=str,
+        help=(
+            "Path to the sequence to annotate in FASTA format. "
+            "If multiple sequences are present, annotations will be run on each"
+        ),
+        metavar="s",
+    )
+    argparser.add_argument(
+        "output", type=str, help="Path to output location", metavar="o"
+    )
+    argparser.add_argument(
+        "-I", "--id-header", type=str, help="The header for the ID of the annotation"
+    )
+    argparser.add_argument(
+        "-N",
+        "--name-header",
+        type=str,
+        help="The header for the name of the annotation",
+        required=False,
+    )
+    argparser.add_argument(
+        "-D",
+        "--desc-header",
+        type=str,
+        help="The header for the description of the annotation",
+        required=False,
+    )
+    argparser.add_argument(
+        "-T",
+        "--start-header",
+        type=str,
+        help="The header for the start of the annotation",
+        required=False,
+    )
+    argparser.add_argument(
+        "-E",
+        "--end-header",
+        type=str,
+        help="The header for the end of the annotation",
+        required=False,
+    )
+    argparser.add_argument(
+        "-S",
+        "--seq-header",
+        type=str,
+        help="The header for the sequence of the annotation",
+    )
+    args = argparser.parse_args()
+    given_annotations = read_annotations_from_csv(
+        args.annotations,
+        args.id_header,
+        args.name_header,
+        args.desc_header,
+        args.start_header,
+        args.end_header,
+        args.seq_header,
+    )
+    with open(args.sequence, "r") as sequence_fd:
+        for sequence in SeqIO.parse(sequence_fd, "fasta"):
+            aligned_annotations = protein_align_many_to_one_ssw(
+                str(sequence.seq), given_annotations
+            )
+            save_alignments_to_csv(
+                aligned_annotations,
+                os.path.join(
+                    args.output,
+                    sequence.id.replace("|", "+").replace(".", "_") + ".csv",
+                ),
+            )
+
+
+if __name__ == "__main__":
+    main()
--- a/protannot/Datatypes.py
+++ b/protannot/Datatypes.py
@@ -1,4 +1,4 @@
-class Annotation:
+class AlignedSequence:
    def __init__(
        self,
        id: str,
--- a/bmlsa/exceptions.py
+++ b/bmlsa/exceptions.py
@@ -0,0 +1,2 @@
+class UnexpectedAlignmentResult(Exception):
+    pass
--- a/bmlsa/persistence.py
+++ b/bmlsa/persistence.py
@@ -0,0 +1,70 @@
+import csv
+from datatypes import AlignedSequence
+
+
+def read_annotations_from_csv(
+    csv_path: str,
+    id_header: str,
+    name_header: str,
+    desc_header: str,
+    start_header: str,
+    end_header: str,
+    sequence_header: str,
+):
+    annotations = {}
+    with open(csv_path, "r") as csv_fd:
+        reader = csv.reader(csv_fd)
+        id_ind = None
+        name_ind = None
+        desc_ind = None
+        start_ind = None
+        end_ind = None
+        sequence_ind = None
+        headers_parsed = False
+        for row in reader:
+            if not headers_parsed:
+                id_ind = row.index(id_header)
+                name_ind = row.index(name_header) if name_header else None
+                desc_ind = row.index(desc_header) if desc_header else None
+                start_ind = row.index(start_header) if start_header else None
+                end_ind = row.index(end_header) if end_header else None
+                sequence_ind = row.index(sequence_header)
+                headers_parsed = True
+                continue
+            id = row[id_ind]
+            name = row[name_ind] if name_header else None
+            desc = row[desc_ind] if desc_header else None
+            start = row[start_ind] if start_header else None
+            end = row[end_ind] if end_header else None
+            sequence = row[sequence_ind]
+            annotations[id] = AlignedSequence(
+                id,
+                sequence,
+                name,
+                desc,
+                int(start) if start else None,
+                int(end) if end else None,
+            )
+    return annotations
+
+
+def save_alignments_to_csv(
+    aligned_pairs: dict[str, tuple[AlignedSequence, AlignedSequence]], output_path: str
+):
+    with open(output_path, "w") as output_fd:
+        writer = csv.writer(output_fd)
+        header_wrote = False
+        header_order = None
+        for id, annotations in aligned_pairs.items():
+            original, aligned = annotations
+            original_vars = vars(original)
+            aligned_vars = vars(aligned)
+            if not header_wrote:
+                header_order = list(original_vars.keys())
+                header = ["original" + key for key in header_order]
+                header.extend(["aligned" + key for key in header_order])
+                writer.writerow(header)
+                header_wrote = True
+            row_data = [original_vars[header] for header in header_order]
+            row_data.extend([aligned_vars[header] for header in header_order])
+            writer.writerow(row_data)
--- a/environment.yml
+++ b/environment.yml
@@ -0,0 +1,8 @@
+name: bmlsa
+channels:
+  - conda-forge
+dependencies:
+  - biopython=1.81
+  - build=0.7
+  - pytest=7
+  - twine=4
--- a/protannot/protannot.py
+++ b/protannot/protannot.py
@@ -1,193 +0,0 @@
-import csv
-import os
-from skbio.alignment import StripedSmithWaterman
-import argparse
-from Bio import SeqIO
-
-from Datatypes import Annotation
-
-
-def read_annotations_from_csv(
-    csv_path: str,
-    id_header: str,
-    name_header: str,
-    desc_header: str,
-    start_header: str,
-    end_header: str,
-    sequence_header: str,
-):
-    annotations = {}
-    with open(csv_path, "r") as csv_fd:
-        reader = csv.reader(csv_fd)
-        id_ind = None
-        name_ind = None
-        desc_ind = None
-        start_ind = None
-        end_ind = None
-        sequence_ind = None
-        headers_parsed = False
-        for row in reader:
-            if not headers_parsed:
-                id_ind = row.index(id_header)
-                name_ind = row.index(name_header) if name_header else None
-                desc_ind = row.index(desc_header) if desc_header else None
-                start_ind = row.index(start_header) if start_header else None
-                end_ind = row.index(end_header) if end_header else None
-                sequence_ind = row.index(sequence_header)
-                headers_parsed = True
-                continue
-            id = row[id_ind]
-            name = row[name_ind] if name_header else None
-            desc = row[desc_ind] if desc_header else None
-            start = row[start_ind] if start_header else None
-            end = row[end_ind] if end_header else None
-            sequence = row[sequence_ind]
-            annotations[id] = Annotation(
-                id,
-                sequence,
-                name,
-                desc,
-                int(start) if start else None,
-                int(end) if end else None,
-            )
-    return annotations
-
-
-def generate_annotated_positions(sequence: str, annotations: dict[str, Annotation]):
-    annotation_pairs = {}
-    align = StripedSmithWaterman(sequence)
-    for id, annot in annotations.items():
-        alignment = align(annot.sequence)
-        score, aligned_start, aligned_end = (
-            alignment.optimal_alignment_score,
-            alignment.query_begin,
-            alignment.query_end,
-        )
-        annotation_pairs[id] = Annotation(
-            id,
-            annot.sequence,
-            annot.name,
-            annot.description,
-            annot.start,
-            annot.end,
-            annot.score,
-        ), Annotation(
-            id,
-            alignment.aligned_target_sequence,
-            annot.name,
-            annot.description,
-            aligned_start,
-            aligned_end,
-            score,
-        )
-    return annotation_pairs
-
-
-def save_alignments_to_csv(
-    aligned_pairs: dict[str, tuple[Annotation, Annotation]], output_path: str
-):
-    with open(output_path, "w") as output_fd:
-        writer = csv.writer(output_fd)
-        header_wrote = False
-        header_order = None
-        for id, annotations in aligned_pairs.items():
-            original, aligned = annotations
-            original_vars = vars(original)
-            aligned_vars = vars(aligned)
-            if not header_wrote:
-                header_order = list(original_vars.keys())
-                header = ["original" + key for key in header_order]
-                header.extend(["aligned" + key for key in header_order])
-                writer.writerow(header)
-                header_wrote = True
-            row_data = [original_vars[header] for header in header_order]
-            row_data.extend([aligned_vars[header] for header in header_order])
-            writer.writerow(row_data)
-
-
-def main():
-    argparser = argparse.ArgumentParser("protannot")
-    argparser.add_argument(
-        "annotations",
-        type=str,
-        help=(
-            "Path to CSV containing the sequences to align as well as the "
-            "annotations for the respective sequences"
-        ),
-        metavar="a",
-    )
-    argparser.add_argument(
-        "sequence",
-        type=str,
-        help=(
-            "Path to the sequence to annotate in FASTA format. "
-            "If multiple sequences are present, annotations will be run on each"
-        ),
-        metavar="s",
-    )
-    argparser.add_argument(
-        "output", type=str, help="Path to output location", metavar="o"
-    )
-    argparser.add_argument(
-        "-I", "--id-header", type=str, help="The header for the ID of the annotation"
-    )
-    argparser.add_argument(
-        "-N",
-        "--name-header",
-        type=str,
-        help="The header for the name of the annotation",
-        required=False,
-    )
-    argparser.add_argument(
-        "-D",
-        "--desc-header",
-        type=str,
-        help="The header for the description of the annotation",
-        required=False,
-    )
-    argparser.add_argument(
-        "-T",
-        "--start-header",
-        type=str,
-        help="The header for the start of the annotation",
-        required=False,
-    )
-    argparser.add_argument(
-        "-E",
-        "--end-header",
-        type=str,
-        help="The header for the end of the annotation",
-        required=False,
-    )
-    argparser.add_argument(
-        "-S",
-        "--seq-header",
-        type=str,
-        help="The header for the sequence of the annotation",
-    )
-    args = argparser.parse_args()
-    given_annotations = read_annotations_from_csv(
-        args.annotations,
-        args.id_header,
-        args.name_header,
-        args.desc_header,
-        args.start_header,
-        args.end_header,
-        args.seq_header,
-    )
-    with open(args.sequence, "r") as sequence_fd:
-        for sequence in SeqIO.parse(sequence_fd, "fasta"):
-            aligned_annotations = generate_annotated_positions(
-                str(sequence.seq), given_annotations
-            )
-            save_alignments_to_csv(
-                aligned_annotations,
-                os.path.join(
-                    args.output,
-                    sequence.id.replace("|", "+").replace(".", "_") + ".csv",
-                ),
-            )
-
-
-if __name__ == "__main__":
-    main()
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = ["setuptools", "wheel"]
--- a/setup.cfg
+++ b/setup.cfg
@@ -0,0 +1,12 @@
+[metadata]
+name = bmlsa
+version = 0.0.1
+
+[options]
+packages = bmlsa
+install_requires =
+    biopython ==1.81
+
+[options.entry_points]
+console_scripts =
+    bmlsa = bmlsa.cli:main
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,3 @@
+from setuptools import setup
+
+setup()
Author	SHA1	Message	Date
Harrison	2a80c2aea2	Added python project description files All checks were successful ydeng/bmlsa/pipeline/head This commit looks good Details	2023-04-21 11:47:28 -05:00
Harrison	6b3c9c312e	Added 'Jenkinsfile' and 'environment.yml' Some checks failed ydeng/bmlsa/pipeline/head There was a failure building this commit Details	2023-04-21 11:44:01 -05:00
Harrison	7f9f9405c3	Removed bad parameter	2023-04-20 15:16:34 -05:00
Harrison	04f730cacb	Changed alignment library	2023-04-20 12:24:26 -05:00
Harrison	f22070e8c3	Renamed project	2023-04-20 10:07:39 -05:00