Added documentation
Some checks failed
ydeng/bmlsa/pipeline/head There was a failure building this commit

This commit is contained in:
Harrison Deng 2023-04-28 12:44:59 -05:00
parent 587c2e753a
commit 7e3f43434e
7 changed files with 84 additions and 42 deletions

View File

@ -7,5 +7,6 @@
"FASTA"
],
"python.testing.pytestEnabled": true,
"python.analysis.inlayHints.pytestParameters": true
"python.analysis.inlayHints.pytestParameters": true,
"autoDocstring.docstringFormat": "sphinx"
}

View File

@ -1,20 +1,38 @@
import logging
from typing import Iterable
from typing import Generator, Iterable
from Bio.Align import PairwiseAligner, substitution_matrices
from bmlsa.datatypes import AlignedSequence
from bmlsa.exceptions import UnexpectedAlignmentResult
from bmlsa.datatypes import QuerySequence
logger = logging.getLogger(__name__)
def align_many_to_one_ssw(
reference_sequence: str,
queries: Iterable[AlignedSequence],
queries: Iterable[QuerySequence],
extend_gap_score: int,
open_gap_score: int,
alignment_mode: str,
substitution_matrix: str = "BLOSUM62",
):
) -> Generator[tuple[QuerySequence, QuerySequence], None, None]:
"""Aligns :obj:`bmlsa.datatypes.QuerySequence` objects to a given reference sequence
:param reference_sequence: The reference sequence to align to
:type reference_sequence: str
:param queries: An iterable sequence of :obj:`bmlsa.datatypes.QuerySequence`
:type queries: Iterable[QuerySequence]
:param extend_gap_score: The extend gap score to use for alignment. Typically negative.
:type extend_gap_score: int
:param open_gap_score: The open gap score to use for alignment. Typically negative.
:type open_gap_score: int
:param alignment_mode: The alignment mode to use. Either "local" or "global".
:type alignment_mode: str
:param substitution_matrix: The name of the substitution matrix available
in :mod:`Bio.Align`, defaults to "BLOSUM62"
:type substitution_matrix: str, optional
:yield: Pairs of :obj:`bmlsa.datatypes.QuerySequence` objects where the first is
the original, and the second is the aligned version
:rtype: a generator of pairs of :obj:`bmlsa.datatypes.QuerySequence` objects
"""
# TODO Consider using the built in "scoring" parameter
aligner = PairwiseAligner()
aligner.substitution_matrix = substitution_matrices.load(substitution_matrix)
@ -30,14 +48,10 @@ def align_many_to_one_ssw(
)
continue
# TODO Implement comparison with input positions to choose best
if len(alignments) > 1:
raise UnexpectedAlignmentResult(
"More than one alignment resulted from a single query."
)
for alignment in alignments:
score, query_aligned = (alignment.score, alignment.aligned[0][0])
aligned_start, aligned_end = query_aligned
yield AlignedSequence(
yield QuerySequence(
query.id,
query.sequence,
query.name,
@ -45,7 +59,7 @@ def align_many_to_one_ssw(
query.start,
query.end,
query.score,
), AlignedSequence(
), QuerySequence(
query.id,
alignment.query,
query.name,

View File

@ -4,7 +4,7 @@ from Bio import SeqIO
import logging
from bmlsa.aligner import align_many_to_one_ssw
from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv
from bmlsa.io import queries_from_csv, save_alignments_to_csv
logger = logging.getLogger(__name__)
@ -29,8 +29,8 @@ def run(args):
"more information."
)
exit(3)
queries = read_annotations_from_csv(
args.annotations,
queries = queries_from_csv(
args.queries,
args.id_header,
args.seq_header,
args.name_header,
@ -59,11 +59,11 @@ def run(args):
os.makedirs(args.output, exist_ok=True)
with open(args.sequence, "r") as sequence_fd:
for sequence in SeqIO.parse(sequence_fd, "fasta"):
aligned_annotations = align_many_to_one_ssw(
aligned_queries = align_many_to_one_ssw(
str(sequence.seq), queries, **scoring_parameter
)
save_alignments_to_csv(
aligned_annotations,
aligned_queries,
os.path.join(
args.output,
sequence.id.replace("|", "+").replace(".", "_") + ".csv",
@ -74,11 +74,11 @@ def run(args):
def main():
arg_parser = argparse.ArgumentParser("bmlsa")
arg_parser.add_argument(
"annotations",
"queries",
type=str,
help=(
"Path to CSV containing the sequences to align as well as the "
"annotations for the respective sequences."
"queries for the respective sequences."
),
metavar="a",
)

View File

@ -1,4 +1,6 @@
class AlignedSequence:
class QuerySequence:
"""Represents a sequence that may be aligned."""
def __init__(
self,
id: str,

View File

@ -1,10 +1,10 @@
import csv
from typing import Iterable
from typing import Generator, Iterable
from bmlsa.datatypes import AlignedSequence
from bmlsa.datatypes import QuerySequence
def read_annotations_from_csv(
def queries_from_csv(
csv_path: str,
id_header: str,
sequence_header: str,
@ -12,7 +12,29 @@ def read_annotations_from_csv(
desc_header: str = None,
start_header: str = None,
end_header: str = None,
):
) -> Generator[QuerySequence, None, None]:
"""Generates :obj:`bmlsa.datatypes.QuerySequence` instances from a CSV file.
:param csv_path: Path to CSV to use
:type csv_path: str
:param id_header: The column title for the unique identifier for each query sequence
:type id_header: str
:param sequence_header: The column title for the sequences themselves
:type sequence_header: str
:param name_header: The column title for the name of the sequence, defaults to None
:type name_header: str, optional
:param desc_header: The column title for the description of the sequence,
defaults to None
:type desc_header: str, optional
:param start_header: The column title for the start position of the sequence,
defaults to None
:type start_header: str, optional
:param end_header: The column title for the end position of the sequence,
defaults to None
:type end_header: str, optional
:yield: One :obj:`bmlsa.datatypes.QuerySequence` for each row
:rtype: A generator that yields objects of :class:`bmlsa.datatypes.QuerySequence`
"""
with open(csv_path, "r") as csv_fd:
reader = csv.reader(csv_fd)
id_ind = None
@ -38,7 +60,7 @@ def read_annotations_from_csv(
desc = row[desc_ind] if desc_header else None
start = row[start_ind] if start_header else None
end = row[end_ind] if end_header else None
yield AlignedSequence(
yield QuerySequence(
id,
sequence,
name,
@ -49,8 +71,15 @@ def read_annotations_from_csv(
def save_alignments_to_csv(
aligned_pairs: Iterable[tuple[AlignedSequence, AlignedSequence]], output_path: str
):
aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]], output_path: str
) -> None:
"""Saves alignments to a CSV.
:param aligned_pairs: An iterable of the original sequence and aligned sequences
:type aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]]
:param output_path: The path of the CSV file to write
:type output_path: str
"""
with open(output_path, "w") as output_fd:
writer = csv.writer(output_fd)
header_wrote = False

View File

@ -2,7 +2,7 @@ import pytest
from Bio import SeqIO
from bmlsa.aligner import align_many_to_one_ssw
from bmlsa.cli import DEFAULT_ALIGNMENT_PARAMETERS
from bmlsa.datatypes import AlignedSequence
from bmlsa.datatypes import QuerySequence
from collections.abc import Iterable
@ -16,7 +16,7 @@ def reference_sequence():
@pytest.fixture
def queries():
return [
AlignedSequence(
QuerySequence(
"ORF10",
"ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAAT"
"GAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAG",
@ -38,8 +38,8 @@ def test_align_many_to_one_returns_correct_data_structure(reference_sequence, qu
reference_sequence, queries, **DEFAULT_ALIGNMENT_PARAMETERS["BLASTp"]
)
for original, aligned_seq in results:
assert isinstance(original, AlignedSequence)
assert isinstance(aligned_seq, AlignedSequence)
assert isinstance(original, QuerySequence)
assert isinstance(aligned_seq, QuerySequence)
def test_align_many_to_one_returns_correct_data(reference_sequence, queries):

View File

@ -1,21 +1,17 @@
from csv import reader
from os import path
from bmlsa.datatypes import AlignedSequence
from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv
from bmlsa.datatypes import QuerySequence
from bmlsa.io import queries_from_csv, save_alignments_to_csv
from collections.abc import Iterable
def test_read_annotations_from_csv_has_data():
results = read_annotations_from_csv(
"tests/resources/SARS_CoV-2_genes.csv", "id", "sequence"
)
def test_queries_from_csv_has_data():
results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence")
assert isinstance(results, Iterable)
def test_read_annotations_from_csv_data_valid():
results = read_annotations_from_csv(
"tests/resources/SARS_CoV-2_genes.csv", "id", "sequence"
)
def test_queries_from_csv_data_valid():
results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence")
for aligned_seq in results:
assert isinstance(aligned_seq.id, str)
assert isinstance(aligned_seq.sequence, str)
@ -23,7 +19,7 @@ def test_read_annotations_from_csv_data_valid():
def test_save_alignments_to_csv_produces_correct_headers_in_csv(tmpdir):
output_path = path.join(tmpdir, "alignment_results.csv")
dummy_sequence = AlignedSequence("DUMMY", "ATACTGGAAAA", name="test_sequence")
dummy_sequence = QuerySequence("DUMMY", "ATACTGGAAAA", name="test_sequence")
alignments = [(dummy_sequence, dummy_sequence)]
save_alignments_to_csv(alignments, output_path)