From 7e3f43434e553346bc5190f66ccb78ca0fbdc047 Mon Sep 17 00:00:00 2001 From: Harrison Date: Fri, 28 Apr 2023 12:44:59 -0500 Subject: [PATCH] Added documentation --- .vscode/settings.json | 3 ++- src/bmlsa/aligner.py | 36 +++++++++++++++++++++---------- src/bmlsa/cli.py | 14 ++++++------ src/bmlsa/datatypes.py | 4 +++- src/bmlsa/io.py | 43 +++++++++++++++++++++++++++++++------ tests/bmlsa/test_aligner.py | 8 +++---- tests/bmlsa/test_io.py | 18 ++++++---------- 7 files changed, 84 insertions(+), 42 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 5be7edb..b035e8f 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,5 +7,6 @@ "FASTA" ], "python.testing.pytestEnabled": true, - "python.analysis.inlayHints.pytestParameters": true + "python.analysis.inlayHints.pytestParameters": true, + "autoDocstring.docstringFormat": "sphinx" } \ No newline at end of file diff --git a/src/bmlsa/aligner.py b/src/bmlsa/aligner.py index 7b04cf7..70aadc0 100644 --- a/src/bmlsa/aligner.py +++ b/src/bmlsa/aligner.py @@ -1,20 +1,38 @@ import logging -from typing import Iterable +from typing import Generator, Iterable from Bio.Align import PairwiseAligner, substitution_matrices -from bmlsa.datatypes import AlignedSequence -from bmlsa.exceptions import UnexpectedAlignmentResult +from bmlsa.datatypes import QuerySequence logger = logging.getLogger(__name__) def align_many_to_one_ssw( reference_sequence: str, - queries: Iterable[AlignedSequence], + queries: Iterable[QuerySequence], extend_gap_score: int, open_gap_score: int, alignment_mode: str, substitution_matrix: str = "BLOSUM62", -): +) -> Generator[tuple[QuerySequence, QuerySequence], None, None]: + """Aligns :obj:`bmlsa.datatypes.QuerySequence` objects to a given reference sequence + + :param reference_sequence: The reference sequence to align to + :type reference_sequence: str + :param queries: An iterable sequence of :obj:`bmlsa.datatypes.QuerySequence` + :type queries: Iterable[QuerySequence] + 
:param extend_gap_score: The gap extension score for alignment. Typically negative. + :type extend_gap_score: int + :param open_gap_score: The open gap score to use for alignment. Typically negative. + :type open_gap_score: int + :param alignment_mode: The alignment mode to use. Either "local" or "global". + :type alignment_mode: str + :param substitution_matrix: The name of the substitution matrix available + in :mod:`Bio.Align`, defaults to "BLOSUM62" + :type substitution_matrix: str, optional + :yield: Pairs of :obj:`bmlsa.datatypes.QuerySequence` objects where the first is + the original, and the second is the aligned version + :rtype: a generator of pairs of :obj:`bmlsa.datatypes.QuerySequence` objects + """ # TODO Consider using the built in "scoring" parameter aligner = PairwiseAligner() aligner.substitution_matrix = substitution_matrices.load(substitution_matrix) @@ -30,14 +48,10 @@ def align_many_to_one_ssw( ) continue # TODO Implement comparison with input positions to choose best - if len(alignments) > 1: - raise UnexpectedAlignmentResult( - "More than one alignment resulted from a single query." - ) for alignment in alignments: score, query_aligned = (alignment.score, alignment.aligned[0][0]) aligned_start, aligned_end = query_aligned - yield AlignedSequence( + yield QuerySequence( query.id, query.sequence, query.name, @@ -45,7 +59,7 @@ def align_many_to_one_ssw( query.start, query.end, query.score, - ), AlignedSequence( + ), QuerySequence( query.id, alignment.query, query.name, diff --git a/src/bmlsa/cli.py b/src/bmlsa/cli.py index f577737..b1ba931 100644 --- a/src/bmlsa/cli.py +++ b/src/bmlsa/cli.py @@ -4,7 +4,7 @@ from Bio import SeqIO import logging from bmlsa.aligner import align_many_to_one_ssw -from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv +from bmlsa.io import queries_from_csv, save_alignments_to_csv logger = logging.getLogger(__name__) @@ -29,8 +29,8 @@ def run(args): "more information." 
) exit(3) - queries = read_annotations_from_csv( - args.annotations, + queries = queries_from_csv( + args.queries, args.id_header, args.seq_header, args.name_header, @@ -59,11 +59,11 @@ def run(args): os.makedirs(args.output, exist_ok=True) with open(args.sequence, "r") as sequence_fd: for sequence in SeqIO.parse(sequence_fd, "fasta"): - aligned_annotations = align_many_to_one_ssw( + aligned_queries = align_many_to_one_ssw( str(sequence.seq), queries, **scoring_parameter ) save_alignments_to_csv( - aligned_annotations, + aligned_queries, os.path.join( args.output, sequence.id.replace("|", "+").replace(".", "_") + ".csv", @@ -74,11 +74,11 @@ def run(args): def main(): arg_parser = argparse.ArgumentParser("bmlsa") arg_parser.add_argument( - "annotations", + "queries", type=str, help=( "Path to CSV containing the sequences to align as well as the " - "annotations for the respective sequences." + "queries for the respective sequences." ), metavar="a", ) diff --git a/src/bmlsa/datatypes.py b/src/bmlsa/datatypes.py index 09a0504..ad178af 100644 --- a/src/bmlsa/datatypes.py +++ b/src/bmlsa/datatypes.py @@ -1,4 +1,6 @@ -class AlignedSequence: +class QuerySequence: + """Represents a sequence that may be aligned.""" + def __init__( self, id: str, diff --git a/src/bmlsa/io.py b/src/bmlsa/io.py index a47da27..53b9c0d 100644 --- a/src/bmlsa/io.py +++ b/src/bmlsa/io.py @@ -1,10 +1,10 @@ import csv -from typing import Iterable +from typing import Generator, Iterable -from bmlsa.datatypes import AlignedSequence +from bmlsa.datatypes import QuerySequence -def read_annotations_from_csv( +def queries_from_csv( csv_path: str, id_header: str, sequence_header: str, @@ -12,7 +12,29 @@ def read_annotations_from_csv( desc_header: str = None, start_header: str = None, end_header: str = None, -): +) -> Generator[QuerySequence, None, None]: + """Generates :obj:`bmlsa.datatypes.QuerySequence` instances from a CSV file. 
+ + :param csv_path: Path to CSV to use + :type csv_path: str + :param id_header: The column title for the unique identifier for each query sequence + :type id_header: str + :param sequence_header: The column title for the sequences themselves + :type sequence_header: str + :param name_header: The column title for the name of the sequence, defaults to None + :type name_header: str, optional + :param desc_header: The column title for the description of the sequence, + defaults to None + :type desc_header: str, optional + :param start_header: The column title for the start position of the sequence, + defaults to None + :type start_header: str, optional + :param end_header: The column title for the end position of the sequence, + defaults to None + :type end_header: str, optional + :yield: One :obj:`bmlsa.datatypes.QuerySequence` for each row + :rtype: A generator that yields objects of :class:`bmlsa.datatypes.QuerySequence` + """ with open(csv_path, "r") as csv_fd: reader = csv.reader(csv_fd) id_ind = None @@ -38,7 +60,7 @@ def read_annotations_from_csv( desc = row[desc_ind] if desc_header else None start = row[start_ind] if start_header else None end = row[end_ind] if end_header else None - yield AlignedSequence( + yield QuerySequence( id, sequence, name, @@ -49,8 +71,15 @@ def read_annotations_from_csv( def save_alignments_to_csv( - aligned_pairs: Iterable[tuple[AlignedSequence, AlignedSequence]], output_path: str -): + aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]], output_path: str +) -> None: + """Saves alignments to a CSV. 
:param aligned_pairs: An iterable of (original sequence, aligned sequence) pairs + :type aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]] + :param output_path: A path to the output CSV file + :type output_path: str + """ with open(output_path, "w") as output_fd: writer = csv.writer(output_fd) header_wrote = False diff --git a/tests/bmlsa/test_aligner.py b/tests/bmlsa/test_aligner.py index 4bc2544..a13e677 100644 --- a/tests/bmlsa/test_aligner.py +++ b/tests/bmlsa/test_aligner.py @@ -2,7 +2,7 @@ import pytest from Bio import SeqIO from bmlsa.aligner import align_many_to_one_ssw from bmlsa.cli import DEFAULT_ALIGNMENT_PARAMETERS -from bmlsa.datatypes import AlignedSequence +from bmlsa.datatypes import QuerySequence from collections.abc import Iterable @@ -16,7 +16,7 @@ def reference_sequence(): @pytest.fixture def queries(): return [ - AlignedSequence( + QuerySequence( "ORF10", "ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAAT" "GAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAG", @@ -38,8 +38,8 @@ def test_align_many_to_one_returns_correct_data_structure(reference_sequence, qu reference_sequence, queries, **DEFAULT_ALIGNMENT_PARAMETERS["BLASTp"] ) for original, aligned_seq in results: - assert isinstance(original, AlignedSequence) - assert isinstance(aligned_seq, AlignedSequence) + assert isinstance(original, QuerySequence) + assert isinstance(aligned_seq, QuerySequence) def test_align_many_to_one_returns_correct_data(reference_sequence, queries): diff --git a/tests/bmlsa/test_io.py b/tests/bmlsa/test_io.py index ac60ae8..835e08c 100644 --- a/tests/bmlsa/test_io.py +++ b/tests/bmlsa/test_io.py @@ -1,21 +1,17 @@ from csv import reader from os import path -from bmlsa.datatypes import AlignedSequence -from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv +from bmlsa.datatypes import QuerySequence +from bmlsa.io import queries_from_csv, save_alignments_to_csv from collections.abc import Iterable -def 
test_read_annotations_from_csv_has_data(): - results = read_annotations_from_csv( - "tests/resources/SARS_CoV-2_genes.csv", "id", "sequence" - ) +def test_queries_from_csv_has_data(): + results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence") assert isinstance(results, Iterable) -def test_read_annotations_from_csv_data_valid(): - results = read_annotations_from_csv( - "tests/resources/SARS_CoV-2_genes.csv", "id", "sequence" - ) +def test_queries_from_csv_data_valid(): + results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence") for aligned_seq in results: assert isinstance(aligned_seq.id, str) assert isinstance(aligned_seq.sequence, str) @@ -23,7 +19,7 @@ def test_read_annotations_from_csv_data_valid(): def test_save_alignments_to_csv_produces_correct_headers_in_csv(tmpdir): output_path = path.join(tmpdir, "alignment_results.csv") - dummy_sequence = AlignedSequence("DUMMY", "ATACTGGAAAA", name="test_sequence") + dummy_sequence = QuerySequence("DUMMY", "ATACTGGAAAA", name="test_sequence") alignments = [(dummy_sequence, dummy_sequence)] save_alignments_to_csv(alignments, output_path)