Added documentation
Some checks failed
ydeng/bmlsa/pipeline/head There was a failure building this commit

This commit is contained in:
Harrison Deng 2023-04-28 12:44:59 -05:00
parent 587c2e753a
commit 7e3f43434e
7 changed files with 84 additions and 42 deletions

View File

@ -7,5 +7,6 @@
"FASTA" "FASTA"
], ],
"python.testing.pytestEnabled": true, "python.testing.pytestEnabled": true,
"python.analysis.inlayHints.pytestParameters": true "python.analysis.inlayHints.pytestParameters": true,
"autoDocstring.docstringFormat": "sphinx"
} }

View File

@ -1,20 +1,38 @@
import logging import logging
from typing import Iterable from typing import Generator, Iterable
from Bio.Align import PairwiseAligner, substitution_matrices from Bio.Align import PairwiseAligner, substitution_matrices
from bmlsa.datatypes import AlignedSequence from bmlsa.datatypes import QuerySequence
from bmlsa.exceptions import UnexpectedAlignmentResult
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def align_many_to_one_ssw( def align_many_to_one_ssw(
reference_sequence: str, reference_sequence: str,
queries: Iterable[AlignedSequence], queries: Iterable[QuerySequence],
extend_gap_score: int, extend_gap_score: int,
open_gap_score: int, open_gap_score: int,
alignment_mode: str, alignment_mode: str,
substitution_matrix: str = "BLOSUM62", substitution_matrix: str = "BLOSUM62",
): ) -> Generator[tuple[QuerySequence, QuerySequence], None, None]:
"""Aligns :obj:`bmlsa.datatypes.QuerySequence` objects to a given reference sequence
:param reference_sequence: The reference sequence to align to
:type reference_sequence: str
:param queries: An iterable sequence of :obj:`bmlsa.datatypes.QuerySequence`
:type queries: Iterable[QuerySequence]
:param extend_gap_score: The gap extension score to use for alignment. Typically negative.
:type extend_gap_score: int
:param open_gap_score: The open gap score to use for alignment. Typically negative.
:type open_gap_score: int
:param alignment_mode: The alignment mode to use. Either "local" or "global".
:type alignment_mode: str
:param substitution_matrix: The name of the substitution matrix available
in :mod:`Bio.Align.substitution_matrices`, defaults to "BLOSUM62"
:type substitution_matrix: str, optional
:yield: Pairs of :obj:`bmlsa.datatypes.QuerySequence` objects where the first is
the original, and the second is the aligned version
:rtype: a generator of pairs (tuples) of :obj:`bmlsa.datatypes.QuerySequence` objects
"""
# TODO Consider using the built in "scoring" parameter # TODO Consider using the built in "scoring" parameter
aligner = PairwiseAligner() aligner = PairwiseAligner()
aligner.substitution_matrix = substitution_matrices.load(substitution_matrix) aligner.substitution_matrix = substitution_matrices.load(substitution_matrix)
@ -30,14 +48,10 @@ def align_many_to_one_ssw(
) )
continue continue
# TODO Implement comparison with input positions to choose best # TODO Implement comparison with input positions to choose best
if len(alignments) > 1:
raise UnexpectedAlignmentResult(
"More than one alignment resulted from a single query."
)
for alignment in alignments: for alignment in alignments:
score, query_aligned = (alignment.score, alignment.aligned[0][0]) score, query_aligned = (alignment.score, alignment.aligned[0][0])
aligned_start, aligned_end = query_aligned aligned_start, aligned_end = query_aligned
yield AlignedSequence( yield QuerySequence(
query.id, query.id,
query.sequence, query.sequence,
query.name, query.name,
@ -45,7 +59,7 @@ def align_many_to_one_ssw(
query.start, query.start,
query.end, query.end,
query.score, query.score,
), AlignedSequence( ), QuerySequence(
query.id, query.id,
alignment.query, alignment.query,
query.name, query.name,

View File

@ -4,7 +4,7 @@ from Bio import SeqIO
import logging import logging
from bmlsa.aligner import align_many_to_one_ssw from bmlsa.aligner import align_many_to_one_ssw
from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv from bmlsa.io import queries_from_csv, save_alignments_to_csv
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -29,8 +29,8 @@ def run(args):
"more information." "more information."
) )
exit(3) exit(3)
queries = read_annotations_from_csv( queries = queries_from_csv(
args.annotations, args.queries,
args.id_header, args.id_header,
args.seq_header, args.seq_header,
args.name_header, args.name_header,
@ -59,11 +59,11 @@ def run(args):
os.makedirs(args.output, exist_ok=True) os.makedirs(args.output, exist_ok=True)
with open(args.sequence, "r") as sequence_fd: with open(args.sequence, "r") as sequence_fd:
for sequence in SeqIO.parse(sequence_fd, "fasta"): for sequence in SeqIO.parse(sequence_fd, "fasta"):
aligned_annotations = align_many_to_one_ssw( aligned_queries = align_many_to_one_ssw(
str(sequence.seq), queries, **scoring_parameter str(sequence.seq), queries, **scoring_parameter
) )
save_alignments_to_csv( save_alignments_to_csv(
aligned_annotations, aligned_queries,
os.path.join( os.path.join(
args.output, args.output,
sequence.id.replace("|", "+").replace(".", "_") + ".csv", sequence.id.replace("|", "+").replace(".", "_") + ".csv",
@ -74,11 +74,11 @@ def run(args):
def main(): def main():
arg_parser = argparse.ArgumentParser("bmlsa") arg_parser = argparse.ArgumentParser("bmlsa")
arg_parser.add_argument( arg_parser.add_argument(
"annotations", "queries",
type=str, type=str,
help=( help=(
"Path to CSV containing the sequences to align as well as the " "Path to CSV containing the sequences to align as well as the "
"annotations for the respective sequences." "queries for the respective sequences."
), ),
metavar="a", metavar="a",
) )

View File

@ -1,4 +1,6 @@
class AlignedSequence: class QuerySequence:
"""Represents a sequence that may be aligned."""
def __init__( def __init__(
self, self,
id: str, id: str,

View File

@ -1,10 +1,10 @@
import csv import csv
from typing import Iterable from typing import Generator, Iterable
from bmlsa.datatypes import AlignedSequence from bmlsa.datatypes import QuerySequence
def read_annotations_from_csv( def queries_from_csv(
csv_path: str, csv_path: str,
id_header: str, id_header: str,
sequence_header: str, sequence_header: str,
@ -12,7 +12,29 @@ def read_annotations_from_csv(
desc_header: str = None, desc_header: str = None,
start_header: str = None, start_header: str = None,
end_header: str = None, end_header: str = None,
): ) -> Generator[QuerySequence, None, None]:
"""Generates :obj:`bmlsa.datatypes.QuerySequence` instances from a CSV file.
:param csv_path: Path to CSV to use
:type csv_path: str
:param id_header: The column title for the unique identifier for each query sequence
:type id_header: str
:param sequence_header: The column title for the sequences themselves
:type sequence_header: str
:param name_header: The column title for the name of the sequence, defaults to None
:type name_header: str, optional
:param desc_header: The column title for the description of the sequence,
defaults to None
:type desc_header: str, optional
:param start_header: The column title for the start position of the sequence,
defaults to None
:type start_header: str, optional
:param end_header: The column title for the end position of the sequence,
defaults to None
:type end_header: str, optional
:yield: One :obj:`bmlsa.datatypes.QuerySequence` for each row
:rtype: A generator that yields objects of :class:`bmlsa.datatypes.QuerySequence`
"""
with open(csv_path, "r") as csv_fd: with open(csv_path, "r") as csv_fd:
reader = csv.reader(csv_fd) reader = csv.reader(csv_fd)
id_ind = None id_ind = None
@ -38,7 +60,7 @@ def read_annotations_from_csv(
desc = row[desc_ind] if desc_header else None desc = row[desc_ind] if desc_header else None
start = row[start_ind] if start_header else None start = row[start_ind] if start_header else None
end = row[end_ind] if end_header else None end = row[end_ind] if end_header else None
yield AlignedSequence( yield QuerySequence(
id, id,
sequence, sequence,
name, name,
@ -49,8 +71,15 @@ def read_annotations_from_csv(
def save_alignments_to_csv( def save_alignments_to_csv(
aligned_pairs: Iterable[tuple[AlignedSequence, AlignedSequence]], output_path: str aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]], output_path: str
): ) -> None:
"""Saves alignments to a CSV.
:param aligned_pairs: An iterable of the original sequence and aligned sequences
:type aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]]
:param output_path: A path to the output CSV file
:type output_path: str
"""
with open(output_path, "w") as output_fd: with open(output_path, "w") as output_fd:
writer = csv.writer(output_fd) writer = csv.writer(output_fd)
header_wrote = False header_wrote = False

View File

@ -2,7 +2,7 @@ import pytest
from Bio import SeqIO from Bio import SeqIO
from bmlsa.aligner import align_many_to_one_ssw from bmlsa.aligner import align_many_to_one_ssw
from bmlsa.cli import DEFAULT_ALIGNMENT_PARAMETERS from bmlsa.cli import DEFAULT_ALIGNMENT_PARAMETERS
from bmlsa.datatypes import AlignedSequence from bmlsa.datatypes import QuerySequence
from collections.abc import Iterable from collections.abc import Iterable
@ -16,7 +16,7 @@ def reference_sequence():
@pytest.fixture @pytest.fixture
def queries(): def queries():
return [ return [
AlignedSequence( QuerySequence(
"ORF10", "ORF10",
"ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAAT" "ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAAT"
"GAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAG", "GAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAG",
@ -38,8 +38,8 @@ def test_align_many_to_one_returns_correct_data_structure(reference_sequence, qu
reference_sequence, queries, **DEFAULT_ALIGNMENT_PARAMETERS["BLASTp"] reference_sequence, queries, **DEFAULT_ALIGNMENT_PARAMETERS["BLASTp"]
) )
for original, aligned_seq in results: for original, aligned_seq in results:
assert isinstance(original, AlignedSequence) assert isinstance(original, QuerySequence)
assert isinstance(aligned_seq, AlignedSequence) assert isinstance(aligned_seq, QuerySequence)
def test_align_many_to_one_returns_correct_data(reference_sequence, queries): def test_align_many_to_one_returns_correct_data(reference_sequence, queries):

View File

@ -1,21 +1,17 @@
from csv import reader from csv import reader
from os import path from os import path
from bmlsa.datatypes import AlignedSequence from bmlsa.datatypes import QuerySequence
from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv from bmlsa.io import queries_from_csv, save_alignments_to_csv
from collections.abc import Iterable from collections.abc import Iterable
def test_read_annotations_from_csv_has_data(): def test_queries_from_csv_has_data():
results = read_annotations_from_csv( results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence")
"tests/resources/SARS_CoV-2_genes.csv", "id", "sequence"
)
assert isinstance(results, Iterable) assert isinstance(results, Iterable)
def test_read_annotations_from_csv_data_valid(): def test_queries_from_csv_data_valid():
results = read_annotations_from_csv( results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence")
"tests/resources/SARS_CoV-2_genes.csv", "id", "sequence"
)
for aligned_seq in results: for aligned_seq in results:
assert isinstance(aligned_seq.id, str) assert isinstance(aligned_seq.id, str)
assert isinstance(aligned_seq.sequence, str) assert isinstance(aligned_seq.sequence, str)
@ -23,7 +19,7 @@ def test_read_annotations_from_csv_data_valid():
def test_save_alignments_to_csv_produces_correct_headers_in_csv(tmpdir): def test_save_alignments_to_csv_produces_correct_headers_in_csv(tmpdir):
output_path = path.join(tmpdir, "alignment_results.csv") output_path = path.join(tmpdir, "alignment_results.csv")
dummy_sequence = AlignedSequence("DUMMY", "ATACTGGAAAA", name="test_sequence") dummy_sequence = QuerySequence("DUMMY", "ATACTGGAAAA", name="test_sequence")
alignments = [(dummy_sequence, dummy_sequence)] alignments = [(dummy_sequence, dummy_sequence)]
save_alignments_to_csv(alignments, output_path) save_alignments_to_csv(alignments, output_path)