From 04f730cacbe73581ed8b343ea2bc3ab565166e09 Mon Sep 17 00:00:00 2001
From: Harrison <yunyangdeng@gmail.com>
Date: Thu, 20 Apr 2023 12:24:26 -0500
Subject: [PATCH] Changed alignment library

---
 bmlsa/aligner.py     | 66 +++++++++++++++++++++++++-------------------
 bmlsa/cli.py         |  4 +--
 bmlsa/datatypes.py   |  2 +-
 bmlsa/exceptions.py  |  2 ++
 bmlsa/persistence.py |  6 ++--
 5 files changed, 46 insertions(+), 34 deletions(-)
 create mode 100644 bmlsa/exceptions.py

diff --git a/bmlsa/aligner.py b/bmlsa/aligner.py
index eb3ec45..eaeb509 100644
--- a/bmlsa/aligner.py
+++ b/bmlsa/aligner.py
@@ -1,33 +1,43 @@
-from skbio.alignment import StripedSmithWaterman
+from Bio.Align import PairwiseAligner, substitution_matrices
+from exceptions import UnexpectedAlignmentResult
 
-from datatypes import Annotation
+from datatypes import AlignedSequence
 
 
-def generate_annotated_positions(sequence: str, annotations: dict[str, Annotation]):
+def protein_align_many_to_one_ssw(sequence: str, queries: dict[str, AlignedSequence]):
     annotation_pairs = {}
-    align = StripedSmithWaterman(sequence)
-    for id, annot in annotations.items():
-        alignment = align(annot.sequence)
-        score, aligned_start, aligned_end = (
-            alignment.optimal_alignment_score,
-            alignment.query_begin,
-            alignment.query_end,
-        )
-        annotation_pairs[id] = Annotation(
-            id,
-            annot.sequence,
-            annot.name,
-            annot.description,
-            annot.start,
-            annot.end,
-            annot.score,
-        ), Annotation(
-            id,
-            alignment.aligned_target_sequence,
-            annot.name,
-            annot.description,
-            aligned_start,
-            aligned_end,
-            score,
-        )
+    aligner = PairwiseAligner(one_alignment_only=True)
+    aligner.mode = "local"
+    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
+    aligner.extend_gap_score = -1
+    aligner.open_gap_score = -11
+    for id, query in queries.items():
+        try:
+            alignments = aligner.align(sequence, query.sequence)
+        except ValueError:
+            continue
+        if len(alignments) > 1:
+            raise UnexpectedAlignmentResult(
+                "More than one alignment resulted from a single query."
+            )
+        for alignment in alignments:
+            score, query_aligned = (alignment.score, alignment.aligned[0][0])
+            aligned_start, aligned_end = query_aligned
+            annotation_pairs[id] = AlignedSequence(
+                id,
+                query.sequence,
+                query.name,
+                query.description,
+                query.start,
+                query.end,
+                query.score,
+            ), AlignedSequence(
+                id,
+                alignment.query,
+                query.name,
+                query.description,
+                aligned_start,
+                aligned_end,
+                score,
+            )
     return annotation_pairs
diff --git a/bmlsa/cli.py b/bmlsa/cli.py
index 0bfe373..424dfcd 100644
--- a/bmlsa/cli.py
+++ b/bmlsa/cli.py
@@ -1,7 +1,7 @@
 import os
 import argparse
 from Bio import SeqIO
-from aligner import generate_annotated_positions
+from aligner import protein_align_many_to_one_ssw
 
 from persistence import read_annotations_from_csv, save_alignments_to_csv
 
@@ -78,7 +78,7 @@ def main():
     )
     with open(args.sequence, "r") as sequence_fd:
         for sequence in SeqIO.parse(sequence_fd, "fasta"):
-            aligned_annotations = generate_annotated_positions(
+            aligned_annotations = protein_align_many_to_one_ssw(
                 str(sequence.seq), given_annotations
             )
             save_alignments_to_csv(
diff --git a/bmlsa/datatypes.py b/bmlsa/datatypes.py
index 5bd6374..f007ca6 100644
--- a/bmlsa/datatypes.py
+++ b/bmlsa/datatypes.py
@@ -1,4 +1,4 @@
-class Annotation:
+class AlignedSequence:
     def __init__(
         self,
         id: str,
diff --git a/bmlsa/exceptions.py b/bmlsa/exceptions.py
new file mode 100644
index 0000000..2ce522f
--- /dev/null
+++ b/bmlsa/exceptions.py
@@ -0,0 +1,2 @@
+class UnexpectedAlignmentResult(Exception):
+    pass
diff --git a/bmlsa/persistence.py b/bmlsa/persistence.py
index 28d7e8d..fa4dc9f 100644
--- a/bmlsa/persistence.py
+++ b/bmlsa/persistence.py
@@ -1,5 +1,5 @@
 import csv
-from datatypes import Annotation
+from datatypes import AlignedSequence
 
 
 def read_annotations_from_csv(
@@ -37,7 +37,7 @@ def read_annotations_from_csv(
             start = row[start_ind] if start_header else None
             end = row[end_ind] if end_header else None
             sequence = row[sequence_ind]
-            annotations[id] = Annotation(
+            annotations[id] = AlignedSequence(
                 id,
                 sequence,
                 name,
@@ -49,7 +49,7 @@ def read_annotations_from_csv(
 
 
 def save_alignments_to_csv(
-    aligned_pairs: dict[str, tuple[Annotation, Annotation]], output_path: str
+    aligned_pairs: dict[str, tuple[AlignedSequence, AlignedSequence]], output_path: str
 ):
     with open(output_path, "w") as output_fd:
         writer = csv.writer(output_fd)