Added documentation
	
		
			
	
		
	
	
		
	
		
			Some checks failed
		
		
	
	
		
			
				
	
				ydeng/bmlsa/pipeline/head There was a failure building this commit
				
			
		
		
	
	
				
					
				
			
		
			Some checks failed
		
		
	
	ydeng/bmlsa/pipeline/head There was a failure building this commit
				
			This commit is contained in:
		
							
								
								
									
										3
									
								
								.vscode/settings.json
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.vscode/settings.json
									
									
									
									
										vendored
									
									
								
							| @@ -7,5 +7,6 @@ | |||||||
|         "FASTA" |         "FASTA" | ||||||
|     ], |     ], | ||||||
|     "python.testing.pytestEnabled": true, |     "python.testing.pytestEnabled": true, | ||||||
|     "python.analysis.inlayHints.pytestParameters": true |     "python.analysis.inlayHints.pytestParameters": true, | ||||||
|  |     "autoDocstring.docstringFormat": "sphinx" | ||||||
| } | } | ||||||
| @@ -1,20 +1,38 @@ | |||||||
| import logging | import logging | ||||||
| from typing import Iterable | from typing import Generator, Iterable | ||||||
| from Bio.Align import PairwiseAligner, substitution_matrices | from Bio.Align import PairwiseAligner, substitution_matrices | ||||||
| from bmlsa.datatypes import AlignedSequence | from bmlsa.datatypes import QuerySequence | ||||||
| from bmlsa.exceptions import UnexpectedAlignmentResult |  | ||||||
|  |  | ||||||
| logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||||
|  |  | ||||||
|  |  | ||||||
| def align_many_to_one_ssw( | def align_many_to_one_ssw( | ||||||
|     reference_sequence: str, |     reference_sequence: str, | ||||||
|     queries: Iterable[AlignedSequence], |     queries: Iterable[QuerySequence], | ||||||
|     extend_gap_score: int, |     extend_gap_score: int, | ||||||
|     open_gap_score: int, |     open_gap_score: int, | ||||||
|     alignment_mode: str, |     alignment_mode: str, | ||||||
|     substitution_matrix: str = "BLOSUM62", |     substitution_matrix: str = "BLOSUM62", | ||||||
| ): | ) -> Generator[tuple[QuerySequence, QuerySequence], None, None]: | ||||||
|  |     """Aligns :obj:`bmlsa.datatypes.QuerySequence` objects to a given reference sequence | ||||||
|  |  | ||||||
|  |     :param reference_sequence: The reference sequence to align to | ||||||
|  |     :type reference_sequence: str | ||||||
|  |     :param queries: An iterable sequence of :obj:`bmlsa.datatypes.QuerySequence` | ||||||
|  |     :type queries: Iterable[QuerySequence] | ||||||
|  |     :param extend_gap_score: The gap extension score to use for alignment. Typically negative. | ||||||
|  |     :type extend_gap_score: int | ||||||
|  |     :param open_gap_score: The open gap score to use for alignment. Typically negative. | ||||||
|  |     :type open_gap_score: int | ||||||
|  |     :param alignment_mode: The alignment mode to use. Either "local" or "global". | ||||||
|  |     :type alignment_mode: str | ||||||
|  |     :param substitution_matrix: The name of the substitution matrix available | ||||||
|  |     in :mod:`Bio.Align`, defaults to "BLOSUM62" | ||||||
|  |     :type substitution_matrix: str, optional | ||||||
|  |     :yield: Pairs of :obj:`bmlsa.datatypes.QuerySequence` objects where the first is | ||||||
|  |     the original, and the second is the aligned version | ||||||
|  |     :rtype: a generator of pairs of :obj:`bmlsa.datatypes.QuerySequence` objects | ||||||
|  |     """ | ||||||
|     # TODO Consider using the built in "scoring" parameter |     # TODO Consider using the built in "scoring" parameter | ||||||
|     aligner = PairwiseAligner() |     aligner = PairwiseAligner() | ||||||
|     aligner.substitution_matrix = substitution_matrices.load(substitution_matrix) |     aligner.substitution_matrix = substitution_matrices.load(substitution_matrix) | ||||||
| @@ -30,14 +48,10 @@ def align_many_to_one_ssw( | |||||||
|             ) |             ) | ||||||
|             continue |             continue | ||||||
|         # TODO Implement comparison with input positions to choose best |         # TODO Implement comparison with input positions to choose best | ||||||
|         if len(alignments) > 1: |  | ||||||
|             raise UnexpectedAlignmentResult( |  | ||||||
|                 "More than one alignment resulted from a single query." |  | ||||||
|             ) |  | ||||||
|         for alignment in alignments: |         for alignment in alignments: | ||||||
|             score, query_aligned = (alignment.score, alignment.aligned[0][0]) |             score, query_aligned = (alignment.score, alignment.aligned[0][0]) | ||||||
|             aligned_start, aligned_end = query_aligned |             aligned_start, aligned_end = query_aligned | ||||||
|             yield AlignedSequence( |             yield QuerySequence( | ||||||
|                 query.id, |                 query.id, | ||||||
|                 query.sequence, |                 query.sequence, | ||||||
|                 query.name, |                 query.name, | ||||||
| @@ -45,7 +59,7 @@ def align_many_to_one_ssw( | |||||||
|                 query.start, |                 query.start, | ||||||
|                 query.end, |                 query.end, | ||||||
|                 query.score, |                 query.score, | ||||||
|             ), AlignedSequence( |             ), QuerySequence( | ||||||
|                 query.id, |                 query.id, | ||||||
|                 alignment.query, |                 alignment.query, | ||||||
|                 query.name, |                 query.name, | ||||||
|   | |||||||
| @@ -4,7 +4,7 @@ from Bio import SeqIO | |||||||
| import logging | import logging | ||||||
| from bmlsa.aligner import align_many_to_one_ssw | from bmlsa.aligner import align_many_to_one_ssw | ||||||
|  |  | ||||||
| from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv | from bmlsa.io import queries_from_csv, save_alignments_to_csv | ||||||
|  |  | ||||||
| logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||||
|  |  | ||||||
| @@ -29,8 +29,8 @@ def run(args): | |||||||
|             "more information." |             "more information." | ||||||
|         ) |         ) | ||||||
|         exit(3) |         exit(3) | ||||||
|     queries = read_annotations_from_csv( |     queries = queries_from_csv( | ||||||
|         args.annotations, |         args.queries, | ||||||
|         args.id_header, |         args.id_header, | ||||||
|         args.seq_header, |         args.seq_header, | ||||||
|         args.name_header, |         args.name_header, | ||||||
| @@ -59,11 +59,11 @@ def run(args): | |||||||
|     os.makedirs(args.output, exist_ok=True) |     os.makedirs(args.output, exist_ok=True) | ||||||
|     with open(args.sequence, "r") as sequence_fd: |     with open(args.sequence, "r") as sequence_fd: | ||||||
|         for sequence in SeqIO.parse(sequence_fd, "fasta"): |         for sequence in SeqIO.parse(sequence_fd, "fasta"): | ||||||
|             aligned_annotations = align_many_to_one_ssw( |             aligned_queries = align_many_to_one_ssw( | ||||||
|                 str(sequence.seq), queries, **scoring_parameter |                 str(sequence.seq), queries, **scoring_parameter | ||||||
|             ) |             ) | ||||||
|             save_alignments_to_csv( |             save_alignments_to_csv( | ||||||
|                 aligned_annotations, |                 aligned_queries, | ||||||
|                 os.path.join( |                 os.path.join( | ||||||
|                     args.output, |                     args.output, | ||||||
|                     sequence.id.replace("|", "+").replace(".", "_") + ".csv", |                     sequence.id.replace("|", "+").replace(".", "_") + ".csv", | ||||||
| @@ -74,11 +74,11 @@ def run(args): | |||||||
| def main(): | def main(): | ||||||
|     arg_parser = argparse.ArgumentParser("bmlsa") |     arg_parser = argparse.ArgumentParser("bmlsa") | ||||||
|     arg_parser.add_argument( |     arg_parser.add_argument( | ||||||
|         "annotations", |         "queries", | ||||||
|         type=str, |         type=str, | ||||||
|         help=( |         help=( | ||||||
|             "Path to CSV containing the sequences to align as well as the " |             "Path to CSV containing the sequences to align as well as the " | ||||||
|             "annotations for the respective sequences." |             "queries for the respective sequences." | ||||||
|         ), |         ), | ||||||
|         metavar="a", |         metavar="a", | ||||||
|     ) |     ) | ||||||
|   | |||||||
| @@ -1,4 +1,6 @@ | |||||||
| class AlignedSequence: | class QuerySequence: | ||||||
|  |     """Represents a sequence that may be aligned.""" | ||||||
|  |  | ||||||
|     def __init__( |     def __init__( | ||||||
|         self, |         self, | ||||||
|         id: str, |         id: str, | ||||||
|   | |||||||
| @@ -1,10 +1,10 @@ | |||||||
| import csv | import csv | ||||||
| from typing import Iterable | from typing import Generator, Iterable | ||||||
|  |  | ||||||
| from bmlsa.datatypes import AlignedSequence | from bmlsa.datatypes import QuerySequence | ||||||
|  |  | ||||||
|  |  | ||||||
| def read_annotations_from_csv( | def queries_from_csv( | ||||||
|     csv_path: str, |     csv_path: str, | ||||||
|     id_header: str, |     id_header: str, | ||||||
|     sequence_header: str, |     sequence_header: str, | ||||||
| @@ -12,7 +12,29 @@ def read_annotations_from_csv( | |||||||
|     desc_header: str = None, |     desc_header: str = None, | ||||||
|     start_header: str = None, |     start_header: str = None, | ||||||
|     end_header: str = None, |     end_header: str = None, | ||||||
| ): | ) -> Generator[QuerySequence, None, None]: | ||||||
|  |     """Generates :obj:`bmlsa.datatypes.QuerySequence` instances from a CSV file. | ||||||
|  |  | ||||||
|  |     :param csv_path: Path to CSV to use | ||||||
|  |     :type csv_path: str | ||||||
|  |     :param id_header: The column title for the unique identifier for each query sequence | ||||||
|  |     :type id_header: str | ||||||
|  |     :param sequence_header: The column title for the sequences themselves | ||||||
|  |     :type sequence_header: str | ||||||
|  |     :param name_header: The column title for the name of the sequence, defaults to None | ||||||
|  |     :type name_header: str, optional | ||||||
|  |     :param desc_header: The column title for the description of the sequence, | ||||||
|  |     defaults to None | ||||||
|  |     :type desc_header: str, optional | ||||||
|  |     :param start_header: The column title for the start position of the sequence, | ||||||
|  |     defaults to None | ||||||
|  |     :type start_header: str, optional | ||||||
|  |     :param end_header: The column title for the end position of the sequence, | ||||||
|  |     defaults to None | ||||||
|  |     :type end_header: str, optional | ||||||
|  |     :yield: One :obj:`bmlsa.datatypes.QuerySequence` for each row | ||||||
|  |     :rtype: A generator that yields objects of :class:`bmlsa.datatypes.QuerySequence` | ||||||
|  |     """ | ||||||
|     with open(csv_path, "r") as csv_fd: |     with open(csv_path, "r") as csv_fd: | ||||||
|         reader = csv.reader(csv_fd) |         reader = csv.reader(csv_fd) | ||||||
|         id_ind = None |         id_ind = None | ||||||
| @@ -38,7 +60,7 @@ def read_annotations_from_csv( | |||||||
|             desc = row[desc_ind] if desc_header else None |             desc = row[desc_ind] if desc_header else None | ||||||
|             start = row[start_ind] if start_header else None |             start = row[start_ind] if start_header else None | ||||||
|             end = row[end_ind] if end_header else None |             end = row[end_ind] if end_header else None | ||||||
|             yield AlignedSequence( |             yield QuerySequence( | ||||||
|                 id, |                 id, | ||||||
|                 sequence, |                 sequence, | ||||||
|                 name, |                 name, | ||||||
| @@ -49,8 +71,15 @@ def read_annotations_from_csv( | |||||||
|  |  | ||||||
|  |  | ||||||
| def save_alignments_to_csv( | def save_alignments_to_csv( | ||||||
|     aligned_pairs: Iterable[tuple[AlignedSequence, AlignedSequence]], output_path: str |     aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]], output_path: str | ||||||
| ): | ) -> None: | ||||||
|  |     """Saves alignments to a CSV. | ||||||
|  |  | ||||||
|  |     :param aligned_pairs: An iterable of the original sequence and aligned sequences | ||||||
|  |     :type aligned_pairs: Iterable[tuple[QuerySequence, QuerySequence]] | ||||||
|  |     :param output_path: A path to the output CSV file | ||||||
|  |     :type output_path: str | ||||||
|  |     """ | ||||||
|     with open(output_path, "w") as output_fd: |     with open(output_path, "w") as output_fd: | ||||||
|         writer = csv.writer(output_fd) |         writer = csv.writer(output_fd) | ||||||
|         header_wrote = False |         header_wrote = False | ||||||
|   | |||||||
| @@ -2,7 +2,7 @@ import pytest | |||||||
| from Bio import SeqIO | from Bio import SeqIO | ||||||
| from bmlsa.aligner import align_many_to_one_ssw | from bmlsa.aligner import align_many_to_one_ssw | ||||||
| from bmlsa.cli import DEFAULT_ALIGNMENT_PARAMETERS | from bmlsa.cli import DEFAULT_ALIGNMENT_PARAMETERS | ||||||
| from bmlsa.datatypes import AlignedSequence | from bmlsa.datatypes import QuerySequence | ||||||
| from collections.abc import Iterable | from collections.abc import Iterable | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -16,7 +16,7 @@ def reference_sequence(): | |||||||
| @pytest.fixture | @pytest.fixture | ||||||
| def queries(): | def queries(): | ||||||
|     return [ |     return [ | ||||||
|         AlignedSequence( |         QuerySequence( | ||||||
|             "ORF10", |             "ORF10", | ||||||
|             "ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAAT" |             "ATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAAT" | ||||||
|             "GAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAG", |             "GAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAG", | ||||||
| @@ -38,8 +38,8 @@ def test_align_many_to_one_returns_correct_data_structure(reference_sequence, qu | |||||||
|         reference_sequence, queries, **DEFAULT_ALIGNMENT_PARAMETERS["BLASTp"] |         reference_sequence, queries, **DEFAULT_ALIGNMENT_PARAMETERS["BLASTp"] | ||||||
|     ) |     ) | ||||||
|     for original, aligned_seq in results: |     for original, aligned_seq in results: | ||||||
|         assert isinstance(original, AlignedSequence) |         assert isinstance(original, QuerySequence) | ||||||
|         assert isinstance(aligned_seq, AlignedSequence) |         assert isinstance(aligned_seq, QuerySequence) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_align_many_to_one_returns_correct_data(reference_sequence, queries): | def test_align_many_to_one_returns_correct_data(reference_sequence, queries): | ||||||
|   | |||||||
| @@ -1,21 +1,17 @@ | |||||||
| from csv import reader | from csv import reader | ||||||
| from os import path | from os import path | ||||||
| from bmlsa.datatypes import AlignedSequence | from bmlsa.datatypes import QuerySequence | ||||||
| from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv | from bmlsa.io import queries_from_csv, save_alignments_to_csv | ||||||
| from collections.abc import Iterable | from collections.abc import Iterable | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_read_annotations_from_csv_has_data(): | def test_queries_from_csv_has_data(): | ||||||
|     results = read_annotations_from_csv( |     results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence") | ||||||
|         "tests/resources/SARS_CoV-2_genes.csv", "id", "sequence" |  | ||||||
|     ) |  | ||||||
|     assert isinstance(results, Iterable) |     assert isinstance(results, Iterable) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_read_annotations_from_csv_data_valid(): | def test_queries_from_csv_data_valid(): | ||||||
|     results = read_annotations_from_csv( |     results = queries_from_csv("tests/resources/SARS_CoV-2_genes.csv", "id", "sequence") | ||||||
|         "tests/resources/SARS_CoV-2_genes.csv", "id", "sequence" |  | ||||||
|     ) |  | ||||||
|     for aligned_seq in results: |     for aligned_seq in results: | ||||||
|         assert isinstance(aligned_seq.id, str) |         assert isinstance(aligned_seq.id, str) | ||||||
|         assert isinstance(aligned_seq.sequence, str) |         assert isinstance(aligned_seq.sequence, str) | ||||||
| @@ -23,7 +19,7 @@ def test_read_annotations_from_csv_data_valid(): | |||||||
|  |  | ||||||
| def test_save_alignments_to_csv_produces_correct_headers_in_csv(tmpdir): | def test_save_alignments_to_csv_produces_correct_headers_in_csv(tmpdir): | ||||||
|     output_path = path.join(tmpdir, "alignment_results.csv") |     output_path = path.join(tmpdir, "alignment_results.csv") | ||||||
|     dummy_sequence = AlignedSequence("DUMMY", "ATACTGGAAAA", name="test_sequence") |     dummy_sequence = QuerySequence("DUMMY", "ATACTGGAAAA", name="test_sequence") | ||||||
|     alignments = [(dummy_sequence, dummy_sequence)] |     alignments = [(dummy_sequence, dummy_sequence)] | ||||||
|     save_alignments_to_csv(alignments, output_path) |     save_alignments_to_csv(alignments, output_path) | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user