Added documentation

2023-04-28 12:44:59 -05:00
parent 587c2e753a
commit 7e3f43434e
7 changed files with 84 additions and 42 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -7,5 +7,6 @@
        "FASTA"
    ],
    "python.testing.pytestEnabled": true,
-    "python.analysis.inlayHints.pytestParameters": true
+    "python.analysis.inlayHints.pytestParameters": true,
+    "autoDocstring.docstringFormat": "sphinx"
 }
--- a/src/bmlsa/aligner.py
+++ b/src/bmlsa/aligner.py
@@ -1,20 +1,38 @@
 import logging
-from typing import Iterable
+from typing import Generator, Iterable
 from Bio.Align import PairwiseAligner, substitution_matrices
-from bmlsa.datatypes import AlignedSequence
-from bmlsa.exceptions import UnexpectedAlignmentResult
+from bmlsa.datatypes import QuerySequence

 logger = logging.getLogger(__name__)


 def align_many_to_one_ssw(
    reference_sequence: str,
-    queries: Iterable[AlignedSequence],
+    queries: Iterable[QuerySequence],
    extend_gap_score: int,
    open_gap_score: int,
    alignment_mode: str,
    substitution_matrix: str = "BLOSUM62",
-):
+) -> Generator[tuple[QuerySequence, QuerySequence], None, None]:
+    """Aligns :obj:`bmlsa.datatypes.QuerySequence` objects to a given reference sequence
+
+    :param reference_sequence: The reference sequence to align to
+    :type reference_sequence: str
+    :param queries: An iterable sequence of :obj:`bmlsa.datatypes.QuerySequence`
+    :type queries: Iterable[QuerySequence]
+    :param extend_gap_score: The extend gap score to use for alignment. Typically negative.
+    :type extend_gap_score: int
+    :param open_gap_score: The open gap score to use for alignment. Typically negative.
+    :type open_gap_score: int
+    :param alignment_mode: The alignment mode to use. Either "local" or "global".
+    :type alignment_mode: str
+    :param substitution_matrix: The name of the substitution matrix available
+    in :mod:`Bio.Align.substitution_matrices`, defaults to "BLOSUM62"
+    :type substitution_matrix: str, optional
+    :yield: Pairs of :obj:`bmlsa.datatypes.QuerySequence` objects where the first is
+    the original, and the second is the aligned version
+    :rtype: a generator of pairs (tuples) of :obj:`bmlsa.datatypes.QuerySequence` objects
+    """
    # TODO Consider using the built in "scoring" parameter
    aligner = PairwiseAligner()
    aligner.substitution_matrix = substitution_matrices.load(substitution_matrix)
@@ -30,14 +48,10 @@ def align_many_to_one_ssw(
            )
            continue
        # TODO Implement comparison with input positions to choose best
-        if len(alignments) > 1:
-            raise UnexpectedAlignmentResult(
-                "More than one alignment resulted from a single query."
-            )
        for alignment in alignments:
            score, query_aligned = (alignment.score, alignment.aligned[0][0])
            aligned_start, aligned_end = query_aligned
-            yield AlignedSequence(
+            yield QuerySequence(
                query.id,
                query.sequence,
                query.name,
@@ -45,7 +59,7 @@ def align_many_to_one_ssw(
                query.start,
                query.end,
                query.score,
-            ), AlignedSequence(
+            ), QuerySequence(
                query.id,
                alignment.query,
                query.name,
--- a/src/bmlsa/cli.py
+++ b/src/bmlsa/cli.py
@@ -4,7 +4,7 @@ from Bio import SeqIO
 import logging
 from bmlsa.aligner import align_many_to_one_ssw

-from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv
+from bmlsa.io import queries_from_csv, save_alignments_to_csv

 logger = logging.getLogger(__name__)

@@ -29,8 +29,8 @@ def run(args):
            "more information."
        )
        exit(3)
-    queries = read_annotations_from_csv(
-        args.annotations,
+    queries = queries_from_csv(
+        args.queries,
        args.id_header,
        args.seq_header,
        args.name_header,
@@ -59,11 +59,11 @@ def run(args):
    os.makedirs(args.output, exist_ok=True)
    with open(args.sequence, "r") as sequence_fd:
        for sequence in SeqIO.parse(sequence_fd, "fasta"):
-            aligned_annotations = align_many_to_one_ssw(
+            aligned_queries = align_many_to_one_ssw(
                str(sequence.seq), queries, **scoring_parameter
            )
            save_alignments_to_csv(
-                aligned_annotations,
+                aligned_queries,
                os.path.join(
                    args.output,
                    sequence.id.replace("|", "+").replace(".", "_") + ".csv",
@@ -74,11 +74,11 @@ def run(args):
 def main():
    arg_parser = argparse.ArgumentParser("bmlsa")
    arg_parser.add_argument(
-        "annotations",
+        "queries",
        type=str,
        help=(
            "Path to CSV containing the sequences to align as well as the "
-            "annotations for the respective sequences."
+            "queries for the respective sequences."
        ),
        metavar="a",
    )
--- a/src/bmlsa/datatypes.py
+++ b/src/bmlsa/datatypes.py
@@ -1,4 +1,6 @@
-class AlignedSequence:
+class QuerySequence:
+    """Represents a sequence that may be aligned."""
+
    def __init__(
        self,
        id: str,
--- a/src/bmlsa/io.py
+++ b/src/bmlsa/io.py
@@ -1,10 +1,10 @@
 import csv
-from typing import Iterable
+from typing import Generator, Iterable

-from bmlsa.datatypes import AlignedSequence
+from bmlsa.datatypes import QuerySequence


-def read_annotations_from_csv(
+def queries_from_csv(
    csv_path: str,
    id_header: str,
    sequence_header: str,
@@ -12,7 +12,29 @@ def read_annotations_from_csv(
    desc_header: str = None,
    start_header: str = None,
    end_header: str = None,
-):
+) -> Generator[QuerySequence, None, None]:
+    """Generates :obj:`bmlsa.datatypes.QuerySequence` instances from a CSV file.
+
+    :param csv_path: Path to CSV to use
+    :type csv_path: str
+    :param id_header: The column title for the unique identifier for each query sequence
+    :type id_header: str
+    :param sequence_header: The column title for the sequences themselves
+    :type sequence_header: str
+    :param name_header: The column title for the name of the sequence, defaults to None
+    :type name_header: str, optional
+    :param desc_header: The column title for the description of the sequence,
+    defaults to None
+    :type desc_header: str, optional
+    :param start_header: The column title for the start position of the sequence,
+    defaults to None
+    :type start_header: str, optional
+    :param end_header: The column title for the end position of the sequence,
+    defaults to None
+    :type end_header: str, optional
+    :yield: One :obj:`bmlsa.datatypes.QuerySequence` for each row
+    :rtype: A generator that yields objects of :class:`bmlsa.datatypes.QuerySequence`
+    """
    with open(csv_path, "r") as csv_fd:
        reader = csv.reader(csv_fd)
        id_ind = None
@@ -38,7 +60,7 @@ def read_annotations_from_csv(
            desc = row[desc_ind] if desc_header else None
            start = row[start_ind] if start_header else None
            end = row[end_ind] if end_header else None
-            yield AlignedSequence(
+            yield QuerySequence(
                id,
                sequence,
                name,
@@ -49,8 +71,15 @@ def read_annotations_from_csv(


 def save_alignments_to_csv(
-    aligned_pairs: Iterable[tuple[AlignedSequence, AlignedSequence]], output_path: str
-):
+    aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]], output_path: str
+) -> None:
+    """Saves alignments to a CSV.
+
+    :param aligned_pairs: An iterable of the original sequence and aligned sequences
+    :type aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]]
+    :param output_path: Path of the CSV file to write the alignments to
+    :type output_path: str
+    """
    with open(output_path, "w") as output_fd:
        writer = csv.writer(output_fd)
        header_wrote = False
--- a/tests/bmlsa/test_aligner.py
+++ b/tests/bmlsa/test_aligner.py
@@ -2,7 +2,7 @@ import pytest
 from Bio import SeqIO
 from bmlsa.aligner import align_many_to_one_ssw
 from bmlsa.cli import DEFAULT_ALIGNMENT_PARAMETERS
-from bmlsa.datatypes import AlignedSequence
+from bmlsa.datatypes import QuerySequence
 from collections.abc import Iterable


@@ -16,7 +16,7 @@ def reference_sequence():
@pytest.fixture
 def queries():
    return [
-        AlignedSequence(
+        QuerySequence(
            "ORF10",
            "ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAAT"
            "GAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAG",
@@ -38,8 +38,8 @@ def test_align_many_to_one_returns_correct_data_structure(reference_sequence, qu
        reference_sequence, queries, **DEFAULT_ALIGNMENT_PARAMETERS["BLASTp"]
    )
    for original, aligned_seq in results:
-        assert isinstance(original, AlignedSequence)
-        assert isinstance(aligned_seq, AlignedSequence)
+        assert isinstance(original, QuerySequence)
+        assert isinstance(aligned_seq, QuerySequence)


 def test_align_many_to_one_returns_correct_data(reference_sequence, queries):
--- a/tests/bmlsa/test_io.py
+++ b/tests/bmlsa/test_io.py
@@ -1,21 +1,17 @@
 from csv import reader
 from os import path
-from bmlsa.datatypes import AlignedSequence
-from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv
+from bmlsa.datatypes import QuerySequence
+from bmlsa.io import queries_from_csv, save_alignments_to_csv
 from collections.abc import Iterable


-def test_read_annotations_from_csv_has_data():
-    results = read_annotations_from_csv(
-        "tests/resources/SARS_CoV-2_genes.csv", "id", "sequence"
-    )
+def test_queries_from_csv_has_data():
+    results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence")
    assert isinstance(results, Iterable)


-def test_read_annotations_from_csv_data_valid():
-    results = read_annotations_from_csv(
-        "tests/resources/SARS_CoV-2_genes.csv", "id", "sequence"
-    )
+def test_queries_from_csv_data_valid():
+    results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence")
    for aligned_seq in results:
        assert isinstance(aligned_seq.id, str)
        assert isinstance(aligned_seq.sequence, str)
@@ -23,7 +19,7 @@ def test_read_annotations_from_csv_data_valid():

 def test_save_alignments_to_csv_produces_correct_headers_in_csv(tmpdir):
    output_path = path.join(tmpdir, "alignment_results.csv")
-    dummy_sequence = AlignedSequence("DUMMY", "ATACTGGAAAA", name="test_sequence")
+    dummy_sequence = QuerySequence("DUMMY", "ATACTGGAAAA", name="test_sequence")
    alignments = [(dummy_sequence, dummy_sequence)]
    save_alignments_to_csv(alignments, output_path)