From 04f730cacbe73581ed8b343ea2bc3ab565166e09 Mon Sep 17 00:00:00 2001 From: Harrison Date: Thu, 20 Apr 2023 12:24:26 -0500 Subject: [PATCH] Changed alignment library --- bmlsa/aligner.py | 66 +++++++++++++++++++++++++------------------- bmlsa/cli.py | 4 +-- bmlsa/datatypes.py | 2 +- bmlsa/exceptions.py | 2 ++ bmlsa/persistence.py | 6 ++-- 5 files changed, 46 insertions(+), 34 deletions(-) create mode 100644 bmlsa/exceptions.py diff --git a/bmlsa/aligner.py b/bmlsa/aligner.py index eb3ec45..eaeb509 100644 --- a/bmlsa/aligner.py +++ b/bmlsa/aligner.py @@ -1,33 +1,43 @@ -from skbio.alignment import StripedSmithWaterman +from Bio.Align import PairwiseAligner, substitution_matrices +from exceptions import UnexpectedAlignmentResult -from datatypes import Annotation +from datatypes import AlignedSequence -def generate_annotated_positions(sequence: str, annotations: dict[str, Annotation]): +def protein_align_many_to_one_ssw(sequence: str, queries: dict[str, AlignedSequence]): annotation_pairs = {} - align = StripedSmithWaterman(sequence) - for id, annot in annotations.items(): - alignment = align(annot.sequence) - score, aligned_start, aligned_end = ( - alignment.optimal_alignment_score, - alignment.query_begin, - alignment.query_end, - ) - annotation_pairs[id] = Annotation( - id, - annot.sequence, - annot.name, - annot.description, - annot.start, - annot.end, - annot.score, - ), Annotation( - id, - alignment.aligned_target_sequence, - annot.name, - annot.description, - aligned_start, - aligned_end, - score, - ) + aligner = PairwiseAligner(one_alignment_only=True) + aligner.mode = "local" + aligner.substitution_matrix = substitution_matrices.load("BLOSUM62") + aligner.extend_gap_score = -1 + aligner.open_gap_score = -11 + for id, query in queries.items(): + try: + alignments = aligner.align(sequence, query.sequence) + except ValueError: + continue + if len(alignments) > 1: + raise UnexpectedAlignmentResult( + "More than one alignment resulted from a single query." + ) + for alignment in alignments: + score, query_aligned = (alignment.score, alignment.aligned[0][0]) + aligned_start, aligned_end = query_aligned + annotation_pairs[id] = AlignedSequence( + id, + query.sequence, + query.name, + query.description, + query.start, + query.end, + query.score, + ), AlignedSequence( + id, + alignment.query, + query.name, + query.description, + aligned_start, + aligned_end, + score, + ) return annotation_pairs diff --git a/bmlsa/cli.py b/bmlsa/cli.py index 0bfe373..424dfcd 100644 --- a/bmlsa/cli.py +++ b/bmlsa/cli.py @@ -1,7 +1,7 @@ import os import argparse from Bio import SeqIO -from aligner import generate_annotated_positions +from aligner import protein_align_many_to_one_ssw from persistence import read_annotations_from_csv, save_alignments_to_csv @@ -78,7 +78,7 @@ def main(): ) with open(args.sequence, "r") as sequence_fd: for sequence in SeqIO.parse(sequence_fd, "fasta"): - aligned_annotations = generate_annotated_positions( + aligned_annotations = protein_align_many_to_one_ssw( str(sequence.seq), given_annotations ) save_alignments_to_csv( diff --git a/bmlsa/datatypes.py b/bmlsa/datatypes.py index 5bd6374..f007ca6 100644 --- a/bmlsa/datatypes.py +++ b/bmlsa/datatypes.py @@ -1,4 +1,4 @@ -class Annotation: +class AlignedSequence: def __init__( self, id: str, diff --git a/bmlsa/exceptions.py b/bmlsa/exceptions.py new file mode 100644 index 0000000..2ce522f --- /dev/null +++ b/bmlsa/exceptions.py @@ -0,0 +1,2 @@ +class UnexpectedAlignmentResult(Exception): + pass diff --git a/bmlsa/persistence.py b/bmlsa/persistence.py index 28d7e8d..fa4dc9f 100644 --- a/bmlsa/persistence.py +++ b/bmlsa/persistence.py @@ -1,5 +1,5 @@ import csv -from datatypes import Annotation +from datatypes import AlignedSequence def read_annotations_from_csv( @@ -37,7 +37,7 @@ def read_annotations_from_csv( start = row[start_ind] if start_header else None end = row[end_ind] if end_header else None sequence = row[sequence_ind] - annotations[id] = Annotation( + annotations[id] = AlignedSequence( id, sequence, name, @@ -49,7 +49,7 @@ def read_annotations_from_csv( def save_alignments_to_csv( - aligned_pairs: dict[str, tuple[Annotation, Annotation]], output_path: str + aligned_pairs: dict[str, tuple[AlignedSequence, AlignedSequence]], output_path: str ): with open(output_path, "w") as output_fd: writer = csv.writer(output_fd)