From 7e3f43434e553346bc5190f66ccb78ca0fbdc047 Mon Sep 17 00:00:00 2001
From: Harrison <yunyangdeng@gmail.com>
Date: Fri, 28 Apr 2023 12:44:59 -0500
Subject: [PATCH] Added documentation

---
 .vscode/settings.json       |  3 ++-
 src/bmlsa/aligner.py        | 36 +++++++++++++++++++++----------
 src/bmlsa/cli.py            | 14 ++++++------
 src/bmlsa/datatypes.py      |  4 +++-
 src/bmlsa/io.py             | 43 +++++++++++++++++++++++++++++++------
 tests/bmlsa/test_aligner.py |  8 +++----
 tests/bmlsa/test_io.py      | 18 ++++++----------
 7 files changed, 84 insertions(+), 42 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 5be7edb..b035e8f 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -7,5 +7,6 @@
         "FASTA"
     ],
     "python.testing.pytestEnabled": true,
-    "python.analysis.inlayHints.pytestParameters": true
+    "python.analysis.inlayHints.pytestParameters": true,
+    "autoDocstring.docstringFormat": "sphinx"
 }
\ No newline at end of file
diff --git a/src/bmlsa/aligner.py b/src/bmlsa/aligner.py
index 7b04cf7..70aadc0 100644
--- a/src/bmlsa/aligner.py
+++ b/src/bmlsa/aligner.py
@@ -1,20 +1,38 @@
 import logging
-from typing import Iterable
+from typing import Generator, Iterable
 from Bio.Align import PairwiseAligner, substitution_matrices
-from bmlsa.datatypes import AlignedSequence
-from bmlsa.exceptions import UnexpectedAlignmentResult
+from bmlsa.datatypes import QuerySequence
 
 logger = logging.getLogger(__name__)
 
 
 def align_many_to_one_ssw(
     reference_sequence: str,
-    queries: Iterable[AlignedSequence],
+    queries: Iterable[QuerySequence],
     extend_gap_score: int,
     open_gap_score: int,
     alignment_mode: str,
     substitution_matrix: str = "BLOSUM62",
-):
+) -> Generator[tuple[QuerySequence, QuerySequence], None, None]:
+    """Aligns :obj:`bmlsa.datatypes.QuerySequence` objects to a given reference sequence
+
+    :param reference_sequence: The reference sequence to align to
+    :type reference_sequence: str
+    :param queries: An iterable of :obj:`bmlsa.datatypes.QuerySequence`
+    :type queries: Iterable[QuerySequence]
+    :param extend_gap_score: The gap extension score for alignment. Typically negative.
+    :type extend_gap_score: int
+    :param open_gap_score: The open gap score to use for alignment. Typically negative.
+    :type open_gap_score: int
+    :param alignment_mode: The alignment mode to use. Either "local" or "global".
+    :type alignment_mode: str
+    :param substitution_matrix: The name of the substitution matrix available
+    in :mod:`Bio.Align`, defaults to "BLOSUM62"
+    :type substitution_matrix: str, optional
+    :yield: Pairs of :obj:`bmlsa.datatypes.QuerySequence` objects where the first is
+    the original, and the second is the aligned version
+    :rtype: a generator of pairs of :obj:`bmlsa.datatypes.QuerySequence` objects
+    """
     # TODO Consider using the built in "scoring" parameter
     aligner = PairwiseAligner()
     aligner.substitution_matrix = substitution_matrices.load(substitution_matrix)
@@ -30,14 +48,10 @@ def align_many_to_one_ssw(
             )
             continue
         # TODO Implement comparison with input positions to choose best
-        if len(alignments) > 1:
-            raise UnexpectedAlignmentResult(
-                "More than one alignment resulted from a single query."
-            )
         for alignment in alignments:
             score, query_aligned = (alignment.score, alignment.aligned[0][0])
             aligned_start, aligned_end = query_aligned
-            yield AlignedSequence(
+            yield QuerySequence(
                 query.id,
                 query.sequence,
                 query.name,
@@ -45,7 +59,7 @@ def align_many_to_one_ssw(
                 query.start,
                 query.end,
                 query.score,
-            ), AlignedSequence(
+            ), QuerySequence(
                 query.id,
                 alignment.query,
                 query.name,
diff --git a/src/bmlsa/cli.py b/src/bmlsa/cli.py
index f577737..b1ba931 100644
--- a/src/bmlsa/cli.py
+++ b/src/bmlsa/cli.py
@@ -4,7 +4,7 @@ from Bio import SeqIO
 import logging
 from bmlsa.aligner import align_many_to_one_ssw
 
-from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv
+from bmlsa.io import queries_from_csv, save_alignments_to_csv
 
 logger = logging.getLogger(__name__)
 
@@ -29,8 +29,8 @@ def run(args):
             "more information."
         )
         exit(3)
-    queries = read_annotations_from_csv(
-        args.annotations,
+    queries = queries_from_csv(
+        args.queries,
         args.id_header,
         args.seq_header,
         args.name_header,
@@ -59,11 +59,11 @@ def run(args):
     os.makedirs(args.output, exist_ok=True)
     with open(args.sequence, "r") as sequence_fd:
         for sequence in SeqIO.parse(sequence_fd, "fasta"):
-            aligned_annotations = align_many_to_one_ssw(
+            aligned_queries = align_many_to_one_ssw(
                 str(sequence.seq), queries, **scoring_parameter
             )
             save_alignments_to_csv(
-                aligned_annotations,
+                aligned_queries,
                 os.path.join(
                     args.output,
                     sequence.id.replace("|", "+").replace(".", "_") + ".csv",
@@ -74,11 +74,11 @@ def run(args):
 def main():
     arg_parser = argparse.ArgumentParser("bmlsa")
     arg_parser.add_argument(
-        "annotations",
+        "queries",
         type=str,
         help=(
             "Path to CSV containing the sequences to align as well as the "
-            "annotations for the respective sequences."
+            "queries for the respective sequences."
         ),
         metavar="a",
     )
diff --git a/src/bmlsa/datatypes.py b/src/bmlsa/datatypes.py
index 09a0504..ad178af 100644
--- a/src/bmlsa/datatypes.py
+++ b/src/bmlsa/datatypes.py
@@ -1,4 +1,6 @@
-class AlignedSequence:
+class QuerySequence:
+    """Represents a sequence that may be aligned."""
+
     def __init__(
         self,
         id: str,
diff --git a/src/bmlsa/io.py b/src/bmlsa/io.py
index a47da27..53b9c0d 100644
--- a/src/bmlsa/io.py
+++ b/src/bmlsa/io.py
@@ -1,10 +1,10 @@
 import csv
-from typing import Iterable
+from typing import Generator, Iterable
 
-from bmlsa.datatypes import AlignedSequence
+from bmlsa.datatypes import QuerySequence
 
 
-def read_annotations_from_csv(
+def queries_from_csv(
     csv_path: str,
     id_header: str,
     sequence_header: str,
@@ -12,7 +12,29 @@ def read_annotations_from_csv(
     desc_header: str = None,
     start_header: str = None,
     end_header: str = None,
-):
+) -> Generator[QuerySequence, None, None]:
+    """Generates :obj:`bmlsa.datatypes.QuerySequence` instances from a CSV file.
+
+    :param csv_path: Path to CSV to use
+    :type csv_path: str
+    :param id_header: The column title for the unique identifier for each query sequence
+    :type id_header: str
+    :param sequence_header: The column title for the sequences themselves
+    :type sequence_header: str
+    :param name_header: The column title for the name of the sequence, defaults to None
+    :type name_header: str, optional
+    :param desc_header: The column title for the description of the sequence,
+    defaults to None
+    :type desc_header: str, optional
+    :param start_header: The column title for the start position of the sequence,
+    defaults to None
+    :type start_header: str, optional
+    :param end_header: The column title for the end position of the sequence,
+    defaults to None
+    :type end_header: str, optional
+    :yield: One :obj:`bmlsa.datatypes.QuerySequence` for each row
+    :rtype: A generator that yields objects of :class:`bmlsa.datatypes.QuerySequence`
+    """
     with open(csv_path, "r") as csv_fd:
         reader = csv.reader(csv_fd)
         id_ind = None
@@ -38,7 +60,7 @@ def read_annotations_from_csv(
             desc = row[desc_ind] if desc_header else None
             start = row[start_ind] if start_header else None
             end = row[end_ind] if end_header else None
-            yield AlignedSequence(
+            yield QuerySequence(
                 id,
                 sequence,
                 name,
@@ -49,8 +71,15 @@ def read_annotations_from_csv(
 
 
 def save_alignments_to_csv(
-    aligned_pairs: Iterable[tuple[AlignedSequence, AlignedSequence]], output_path: str
-):
+    aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]], output_path: str
+) -> None:
+    """Saves alignments to a CSV.
+
+    :param aligned_pairs: An iterable of the original sequence and aligned sequences
+    :type aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]]
+    :param output_path: Path of the CSV file to write
+    :type output_path: str
+    """
     with open(output_path, "w") as output_fd:
         writer = csv.writer(output_fd)
         header_wrote = False
diff --git a/tests/bmlsa/test_aligner.py b/tests/bmlsa/test_aligner.py
index 4bc2544..a13e677 100644
--- a/tests/bmlsa/test_aligner.py
+++ b/tests/bmlsa/test_aligner.py
@@ -2,7 +2,7 @@ import pytest
 from Bio import SeqIO
 from bmlsa.aligner import align_many_to_one_ssw
 from bmlsa.cli import DEFAULT_ALIGNMENT_PARAMETERS
-from bmlsa.datatypes import AlignedSequence
+from bmlsa.datatypes import QuerySequence
 from collections.abc import Iterable
 
 
@@ -16,7 +16,7 @@ def reference_sequence():
 @pytest.fixture
 def queries():
     return [
-        AlignedSequence(
+        QuerySequence(
             "ORF10",
             "ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAAT"
             "GAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAG",
@@ -38,8 +38,8 @@ def test_align_many_to_one_returns_correct_data_structure(reference_sequence, qu
         reference_sequence, queries, **DEFAULT_ALIGNMENT_PARAMETERS["BLASTp"]
     )
     for original, aligned_seq in results:
-        assert isinstance(original, AlignedSequence)
-        assert isinstance(aligned_seq, AlignedSequence)
+        assert isinstance(original, QuerySequence)
+        assert isinstance(aligned_seq, QuerySequence)
 
 
 def test_align_many_to_one_returns_correct_data(reference_sequence, queries):
diff --git a/tests/bmlsa/test_io.py b/tests/bmlsa/test_io.py
index ac60ae8..835e08c 100644
--- a/tests/bmlsa/test_io.py
+++ b/tests/bmlsa/test_io.py
@@ -1,21 +1,17 @@
 from csv import reader
 from os import path
-from bmlsa.datatypes import AlignedSequence
-from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv
+from bmlsa.datatypes import QuerySequence
+from bmlsa.io import queries_from_csv, save_alignments_to_csv
 from collections.abc import Iterable
 
 
-def test_read_annotations_from_csv_has_data():
-    results = read_annotations_from_csv(
-        "tests/resources/SARS_CoV-2_genes.csv", "id", "sequence"
-    )
+def test_queries_from_csv_has_data():
+    results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence")
     assert isinstance(results, Iterable)
 
 
-def test_read_annotations_from_csv_data_valid():
-    results = read_annotations_from_csv(
-        "tests/resources/SARS_CoV-2_genes.csv", "id", "sequence"
-    )
+def test_queries_from_csv_data_valid():
+    results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence")
     for aligned_seq in results:
         assert isinstance(aligned_seq.id, str)
         assert isinstance(aligned_seq.sequence, str)
@@ -23,7 +19,7 @@ def test_read_annotations_from_csv_data_valid():
 
 def test_save_alignments_to_csv_produces_correct_headers_in_csv(tmpdir):
     output_path = path.join(tmpdir, "alignment_results.csv")
-    dummy_sequence = AlignedSequence("DUMMY", "ATACTGGAAAA", name="test_sequence")
+    dummy_sequence = QuerySequence("DUMMY", "ATACTGGAAAA", name="test_sequence")
     alignments = [(dummy_sequence, dummy_sequence)]
     save_alignments_to_csv(alignments, output_path)