Added documentation
Some checks failed
ydeng/bmlsa/pipeline/head There was a failure building this commit
Some checks failed
ydeng/bmlsa/pipeline/head There was a failure building this commit
This commit is contained in:
parent
587c2e753a
commit
7e3f43434e
3
.vscode/settings.json
vendored
3
.vscode/settings.json
vendored
@ -7,5 +7,6 @@
|
||||
"FASTA"
|
||||
],
|
||||
"python.testing.pytestEnabled": true,
|
||||
"python.analysis.inlayHints.pytestParameters": true
|
||||
"python.analysis.inlayHints.pytestParameters": true,
|
||||
"autoDocstring.docstringFormat": "sphinx"
|
||||
}
|
@ -1,20 +1,38 @@
|
||||
import logging
|
||||
from typing import Iterable
|
||||
from typing import Generator, Iterable
|
||||
from Bio.Align import PairwiseAligner, substitution_matrices
|
||||
from bmlsa.datatypes import AlignedSequence
|
||||
from bmlsa.exceptions import UnexpectedAlignmentResult
|
||||
from bmlsa.datatypes import QuerySequence
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def align_many_to_one_ssw(
|
||||
reference_sequence: str,
|
||||
queries: Iterable[AlignedSequence],
|
||||
queries: Iterable[QuerySequence],
|
||||
extend_gap_score: int,
|
||||
open_gap_score: int,
|
||||
alignment_mode: str,
|
||||
substitution_matrix: str = "BLOSUM62",
|
||||
):
|
||||
) -> Generator[tuple[QuerySequence, QuerySequence], None, None]:
|
||||
"""Aligns :obj:`bmlsa.datatypes.QuerySequence` objects to a given reference sequence
|
||||
|
||||
:param reference_sequence: The reference sequence to align to
|
||||
:type reference_sequence: str
|
||||
:param queries: An iterable sequence of :obj:`bmlsa.datatypes.QuerySequence`
|
||||
:type queries: Iterable[QuerySequence]
|
||||
:param extend_gap_score: The extend gap score to use for alignment. Typically negative.
|
||||
:type extend_gap_score: int
|
||||
:param open_gap_score: The open gap score to use for alignment. Typically negative.
|
||||
:type open_gap_score: int
|
||||
:param alignment_mode: The alignment mode to use. Either "local" or "global".
|
||||
:type alignment_mode: str
|
||||
:param substitution_matrix: The name of the substitution matrix available
|
||||
in :mod:`Bio.Align`, defaults to "BLOSUM62"
|
||||
:type substitution_matrix: str, optional
|
||||
:yield: Pairs of :obj:`bmlsa.datatypes.QuerySequence` objects where the first is
|
||||
the original, and the second is the aligned version
|
||||
:rtype: a generator of pairs of :obj:`bmlsa.datatypes.QuerySequence` objects
|
||||
"""
|
||||
# TODO Consider using the built in "scoring" parameter
|
||||
aligner = PairwiseAligner()
|
||||
aligner.substitution_matrix = substitution_matrices.load(substitution_matrix)
|
||||
@ -30,14 +48,10 @@ def align_many_to_one_ssw(
|
||||
)
|
||||
continue
|
||||
# TODO Implement comparison with input positions to choose best
|
||||
if len(alignments) > 1:
|
||||
raise UnexpectedAlignmentResult(
|
||||
"More than one alignment resulted from a single query."
|
||||
)
|
||||
for alignment in alignments:
|
||||
score, query_aligned = (alignment.score, alignment.aligned[0][0])
|
||||
aligned_start, aligned_end = query_aligned
|
||||
yield AlignedSequence(
|
||||
yield QuerySequence(
|
||||
query.id,
|
||||
query.sequence,
|
||||
query.name,
|
||||
@ -45,7 +59,7 @@ def align_many_to_one_ssw(
|
||||
query.start,
|
||||
query.end,
|
||||
query.score,
|
||||
), AlignedSequence(
|
||||
), QuerySequence(
|
||||
query.id,
|
||||
alignment.query,
|
||||
query.name,
|
||||
|
@ -4,7 +4,7 @@ from Bio import SeqIO
|
||||
import logging
|
||||
from bmlsa.aligner import align_many_to_one_ssw
|
||||
|
||||
from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv
|
||||
from bmlsa.io import queries_from_csv, save_alignments_to_csv
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -29,8 +29,8 @@ def run(args):
|
||||
"more information."
|
||||
)
|
||||
exit(3)
|
||||
queries = read_annotations_from_csv(
|
||||
args.annotations,
|
||||
queries = queries_from_csv(
|
||||
args.queries,
|
||||
args.id_header,
|
||||
args.seq_header,
|
||||
args.name_header,
|
||||
@ -59,11 +59,11 @@ def run(args):
|
||||
os.makedirs(args.output, exist_ok=True)
|
||||
with open(args.sequence, "r") as sequence_fd:
|
||||
for sequence in SeqIO.parse(sequence_fd, "fasta"):
|
||||
aligned_annotations = align_many_to_one_ssw(
|
||||
aligned_queries = align_many_to_one_ssw(
|
||||
str(sequence.seq), queries, **scoring_parameter
|
||||
)
|
||||
save_alignments_to_csv(
|
||||
aligned_annotations,
|
||||
aligned_queries,
|
||||
os.path.join(
|
||||
args.output,
|
||||
sequence.id.replace("|", "+").replace(".", "_") + ".csv",
|
||||
@ -74,11 +74,11 @@ def run(args):
|
||||
def main():
|
||||
arg_parser = argparse.ArgumentParser("bmlsa")
|
||||
arg_parser.add_argument(
|
||||
"annotations",
|
||||
"queries",
|
||||
type=str,
|
||||
help=(
|
||||
"Path to CSV containing the sequences to align as well as the "
|
||||
"annotations for the respective sequences."
|
||||
"queries for the respective sequences."
|
||||
),
|
||||
metavar="a",
|
||||
)
|
||||
|
@ -1,4 +1,6 @@
|
||||
class AlignedSequence:
|
||||
class QuerySequence:
|
||||
"""Represents a sequence that may be aligned."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
id: str,
|
||||
|
@ -1,10 +1,10 @@
|
||||
import csv
|
||||
from typing import Iterable
|
||||
from typing import Generator, Iterable
|
||||
|
||||
from bmlsa.datatypes import AlignedSequence
|
||||
from bmlsa.datatypes import QuerySequence
|
||||
|
||||
|
||||
def read_annotations_from_csv(
|
||||
def queries_from_csv(
|
||||
csv_path: str,
|
||||
id_header: str,
|
||||
sequence_header: str,
|
||||
@ -12,7 +12,29 @@ def read_annotations_from_csv(
|
||||
desc_header: str = None,
|
||||
start_header: str = None,
|
||||
end_header: str = None,
|
||||
):
|
||||
) -> Generator[QuerySequence, None, None]:
|
||||
"""Generates :obj:`bmlsa.datatypes.QuerySequence` instances from a CSV file.
|
||||
|
||||
:param csv_path: Path to CSV to use
|
||||
:type csv_path: str
|
||||
:param id_header: The column title for the unique identifier for each query sequence
|
||||
:type id_header: str
|
||||
:param sequence_header: The column title for the sequences themselves
|
||||
:type sequence_header: str
|
||||
:param name_header: The column title for the name of the sequence, defaults to None
|
||||
:type name_header: str, optional
|
||||
:param desc_header: The column title for the description of the sequence,
|
||||
defaults to None
|
||||
:type desc_header: str, optional
|
||||
:param start_header: The column title for the start position of the sequence,
|
||||
defaults to None
|
||||
:type start_header: str, optional
|
||||
:param end_header: The column title for the end position of the sequence,
|
||||
defaults to None
|
||||
:type end_header: str, optional
|
||||
:yield: One :obj:`bmlsa.datatypes.QuerySequence` for each row
|
||||
:rtype: A generator that yields objects of :class:`bmlsa.datatypes.QuerySequence`
|
||||
"""
|
||||
with open(csv_path, "r") as csv_fd:
|
||||
reader = csv.reader(csv_fd)
|
||||
id_ind = None
|
||||
@ -38,7 +60,7 @@ def read_annotations_from_csv(
|
||||
desc = row[desc_ind] if desc_header else None
|
||||
start = row[start_ind] if start_header else None
|
||||
end = row[end_ind] if end_header else None
|
||||
yield AlignedSequence(
|
||||
yield QuerySequence(
|
||||
id,
|
||||
sequence,
|
||||
name,
|
||||
@ -49,8 +71,15 @@ def read_annotations_from_csv(
|
||||
|
||||
|
||||
def save_alignments_to_csv(
|
||||
aligned_pairs: Iterable[tuple[AlignedSequence, AlignedSequence]], output_path: str
|
||||
):
|
||||
aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]], output_path: str
|
||||
) -> None:
|
||||
"""Saves alignments to a CSV.
|
||||
|
||||
:param aligned_pairs: An iterable of the original sequence and aligned sequences
|
||||
:type aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]]
|
||||
:param output_path: A path to the output CSV file
|
||||
:type output_path: str
|
||||
"""
|
||||
with open(output_path, "w") as output_fd:
|
||||
writer = csv.writer(output_fd)
|
||||
header_wrote = False
|
||||
|
@ -2,7 +2,7 @@ import pytest
|
||||
from Bio import SeqIO
|
||||
from bmlsa.aligner import align_many_to_one_ssw
|
||||
from bmlsa.cli import DEFAULT_ALIGNMENT_PARAMETERS
|
||||
from bmlsa.datatypes import AlignedSequence
|
||||
from bmlsa.datatypes import QuerySequence
|
||||
from collections.abc import Iterable
|
||||
|
||||
|
||||
@ -16,7 +16,7 @@ def reference_sequence():
|
||||
@pytest.fixture
|
||||
def queries():
|
||||
return [
|
||||
AlignedSequence(
|
||||
QuerySequence(
|
||||
"ORF10",
|
||||
"ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAAT"
|
||||
"GAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAG",
|
||||
@ -38,8 +38,8 @@ def test_align_many_to_one_returns_correct_data_structure(reference_sequence, qu
|
||||
reference_sequence, queries, **DEFAULT_ALIGNMENT_PARAMETERS["BLASTp"]
|
||||
)
|
||||
for original, aligned_seq in results:
|
||||
assert isinstance(original, AlignedSequence)
|
||||
assert isinstance(aligned_seq, AlignedSequence)
|
||||
assert isinstance(original, QuerySequence)
|
||||
assert isinstance(aligned_seq, QuerySequence)
|
||||
|
||||
|
||||
def test_align_many_to_one_returns_correct_data(reference_sequence, queries):
|
||||
|
@ -1,21 +1,17 @@
|
||||
from csv import reader
|
||||
from os import path
|
||||
from bmlsa.datatypes import AlignedSequence
|
||||
from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv
|
||||
from bmlsa.datatypes import QuerySequence
|
||||
from bmlsa.io import queries_from_csv, save_alignments_to_csv
|
||||
from collections.abc import Iterable
|
||||
|
||||
|
||||
def test_read_annotations_from_csv_has_data():
|
||||
results = read_annotations_from_csv(
|
||||
"tests/resources/SARS_CoV-2_genes.csv", "id", "sequence"
|
||||
)
|
||||
def test_queries_from_csv_has_data():
|
||||
results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence")
|
||||
assert isinstance(results, Iterable)
|
||||
|
||||
|
||||
def test_read_annotations_from_csv_data_valid():
|
||||
results = read_annotations_from_csv(
|
||||
"tests/resources/SARS_CoV-2_genes.csv", "id", "sequence"
|
||||
)
|
||||
def test_queries_from_csv_data_valid():
|
||||
results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence")
|
||||
for aligned_seq in results:
|
||||
assert isinstance(aligned_seq.id, str)
|
||||
assert isinstance(aligned_seq.sequence, str)
|
||||
@ -23,7 +19,7 @@ def test_read_annotations_from_csv_data_valid():
|
||||
|
||||
def test_save_alignments_to_csv_produces_correct_headers_in_csv(tmpdir):
|
||||
output_path = path.join(tmpdir, "alignment_results.csv")
|
||||
dummy_sequence = AlignedSequence("DUMMY", "ATACTGGAAAA", name="test_sequence")
|
||||
dummy_sequence = QuerySequence("DUMMY", "ATACTGGAAAA", name="test_sequence")
|
||||
alignments = [(dummy_sequence, dummy_sequence)]
|
||||
save_alignments_to_csv(alignments, output_path)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user