From 08ba073ef944e60969b85991d2662a010f7f38aa Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Mon, 26 Jun 2023 21:07:55 +0000 Subject: [PATCH] Tested and completed adding call sample filtering --- .vscode/settings.json | 2 + src/modvcfsamples/cli.py | 6 +- src/modvcfsamples/sample.py | 62 +++++++++++++------ tests/modvcfsamples/test_sample.py | 36 +++-------- .../expected_out/test_files_shortened.vcf | 17 +++++ 5 files changed, 73 insertions(+), 50 deletions(-) create mode 100644 tests/resources/expected_out/test_files_shortened.vcf diff --git a/.vscode/settings.json b/.vscode/settings.json index 8c7d484..55b11cb 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -10,8 +10,10 @@ "src" ], "cSpell.words": [ + "CHROM", "pytest", "pyvcf", + "vcfpy", "vcfs" ], } \ No newline at end of file diff --git a/src/modvcfsamples/cli.py b/src/modvcfsamples/cli.py index 5283b96..065fa85 100644 --- a/src/modvcfsamples/cli.py +++ b/src/modvcfsamples/cli.py @@ -4,9 +4,9 @@ from modvcfsamples import sample def run(vcfs: list[str], only: list[str], output_dir: str): for vcf in vcfs: - vcf_records = sample.get_records_from_vcf(vcf) - sample.filter_all_sample_datatypes(vcf_records, *only) - sample.write_records_to_vcf(vcf_records, os.path.join(output_dir, os.path.basename(vcf))) + vcf_records, header = sample.get_records_from_vcf(vcf) + processed_vcfs, modified_header = sample.keep_specific_call_data(vcf_records, header, *only) + sample.write_records_to_vcf(processed_vcfs, modified_header, os.path.join(output_dir, os.path.basename(vcf))) def main(): parser = argparse.ArgumentParser() diff --git a/src/modvcfsamples/sample.py b/src/modvcfsamples/sample.py index 7bd1f90..fc9fb65 100644 --- a/src/modvcfsamples/sample.py +++ b/src/modvcfsamples/sample.py @@ -1,34 +1,58 @@ -import vcf +from typing import Iterable, OrderedDict +import vcfpy from collections import namedtuple from copy import deepcopy +import os + +from vcfpy.record import Call + def get_records_from_vcf(path: str): vcf_records = [] with open(path, "r") as vcf_stream: - reader = vcf.Reader(vcf_stream) + reader = vcfpy.Reader.from_stream(vcf_stream) for record in reader: vcf_records.append(record) - return vcf_records + return vcf_records, reader.header -def filter_sample_datatype(record, *datatypes: str): - call_data = namedtuple("Data", *datatypes) - filtered_calls = [] - modified_record = deepcopy(record) - for call in record.samples: - kept_data = {} - for datatype in datatypes: - kept_data[datatype] = call[datatype] - filtered_calls.append(call_data(**kept_data)) - modified_record.samples = filtered_calls - return modified_record -def filter_all_sample_datatypes(records: list, *datatypes: str): +def keep_specific_call_data(records: list[vcfpy.Record], header: vcfpy.Header, *datatypes: str): + lines_kept = [line for line in header.lines if line.key != "FORMAT"] + lines_kept.extend([line for line in header.get_lines("FORMAT") if line.id in datatypes]) + modified_header = vcfpy.Header(lines_kept, header.samples) + modified_records = [] + for record in records: - yield filter_sample_datatype(record, *datatypes) + modified_calls = [] + for call in record.calls: + kept_data = OrderedDict() + for datatype in datatypes: + kept_data[datatype] = call.data[datatype] + modified_calls.append(Call(call.sample, kept_data, record)) + modified_record = vcfpy.Record( + record.CHROM, + record.POS, + record.ID, + record.REF, + record.ALT, + record.QUAL, + record.FILTER, + record.INFO, + datatypes, + modified_calls, + ) + modified_records.append(modified_record) + return modified_records, modified_header -def write_records_to_vcf(records: list, path: str): +def normalize_gt_to_length(records: list[vcfpy.Record], header: vcfpy.Header, num: int): + for record in records: + for call in record.calls: + pass + +def write_records_to_vcf(records: Iterable[vcfpy.Record], header: vcfpy.Header, path: str): + os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "w") as vcf_stream: - writer = vcf.Writer(vcf_stream, records[0]) + writer = vcfpy.Writer.from_stream(vcf_stream, header) for record in records: - writer.write_record(record) \ No newline at end of file + writer.write_record(record) diff --git a/tests/modvcfsamples/test_sample.py b/tests/modvcfsamples/test_sample.py index ea96ff7..22e4dbc 100644 --- a/tests/modvcfsamples/test_sample.py +++ b/tests/modvcfsamples/test_sample.py @@ -1,38 +1,18 @@ -from modvcfsamples.sample import filter_sample_datatype, filter_all_sample_datatypes, get_records_from_vcf +from modvcfsamples.sample import keep_specific_call_data, get_records_from_vcf import os -def test_get_records_from_vcf_not_none(): - records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf")) - assert len(records) > 0 - -def test_filter_sample_datatype_not_none(): - records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf")) - filter_for = ["GT"] - modified_record = filter_sample_datatype(records[0], *filter_for) - assert modified_record is not None - -def test_filter_sample_datatype_only_filtered(): - records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf")) - filter_for = ["GT"] - modified_record = filter_sample_datatype(records[0], *filter_for) - for sample in modified_record.samples: - assert len(sample) <= len(filter_for) - for key, _ in sample._asdict().items(): - assert key in filter_for - def test_filter_all_sample_datatypes_not_empty(): - records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf")) + records, header = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf")) filter_for = ["GT"] - modified_records = list(filter_all_sample_datatypes(records, *filter_for)) + modified_records, header = keep_specific_call_data(records, header, *filter_for) assert len(modified_records) == 11 def test_filter_all_sample_datatypes_filtered(): - records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf")) + records, header = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf")) filter_for = ["GT"] - modified_records = list(filter_all_sample_datatypes(records, *filter_for)) + modified_records, header = keep_specific_call_data(records, header, *filter_for) for modified_record in modified_records: - for sample in modified_record.samples: - assert len(sample) <= len(filter_for) - for key, _ in sample._asdict().items(): + for call in modified_record.calls: + assert len(call.data.keys()) <= len(filter_for) + for key, _ in call.data.items(): assert key in filter_for - diff --git a/tests/resources/expected_out/test_files_shortened.vcf b/tests/resources/expected_out/test_files_shortened.vcf new file mode 100644 index 0000000..b867dc6 --- /dev/null +++ b/tests/resources/expected_out/test_files_shortened.vcf @@ -0,0 +1,17 @@ +##fileformat=VCFv4.1 +##fileDate=10122015_22h01m13s +##source=SHAPEIT2.v837 +##log_file=shapeit_10122015_22h01m13s_3f764d75-2fbb-42df-ab75-8c2dfd5731ce.log +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Gambian Chinese French Brazilian Nigerian Pakistani English Colombian Indian Japanese +1 846808 rs4475691 C T 100 . AC=1276;AN=5008;DP=20368 GT 1|1 0|0 0|0 ./. 0|0 0|0 0|0 0|0 ./. 0|0 +1 846854 rs111957712 G A 100 . AC=114;AN=5008;DP=20538 GT ./. 0|0 0|0 0|0 0|0 0|0 ./. ./. 0|0 0|0 +1 846864 rs950122 G C 100 . AC=1116;AN=5008;DP=20582 GT 1|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 ./. 0|0 +1 847228 rs3905286 C T 100 . AC=1215;AN=5008;DP=20731 GT 1|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|1 0|0 +1 847297 rs11507768 G A 100 . AC=359;AN=5008;DP=20809 GT 1|0 0|0 0|0 ./. 0|0 0|0 ./. 0|0 0|0 0|0 +1 847491 rs28407778 G A 100 . AC=1262;AN=5008;DP=16939 GT 1|1 0|0 0|0 0|0 0|0 ./. 0|0 0|0 0|1 0|0 +1 848023 rs144407116 C A 100 . AC=52;AN=5008;DP=22562 GT 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 +1 848090 rs4246505 G A 100 . AC=857;AN=5008;DP=19301 GT 0|0 0|0 0|0 0|0 0|0 0|0 ./. 0|0 ./. 0|0 +1 848445 rs4626817 G A 100 . AC=1255;AN=5008;DP=18444 GT 1|1 0|0 0|0 0|0 0|0 0|0 0|0 ./. ./. 0|0 +1 848456 rs11507767 A G 100 . AC=1266;AN=5008;DP=18137 GT 1|1 ./. 0|0 0|0 0|0 0|0 0|0 0|0 0|1 ./. +1 848738 rs3829741 C T 100 . AC=855;AN=5008;DP=16663 GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|1 0|0