Tested and completed adding call sample filtering
Some checks failed
ydeng/modvcfsamples/pipeline/head There was a failure building this commit

This commit is contained in:
Harrison Deng 2023-06-26 21:07:55 +00:00
parent 515d790844
commit 08ba073ef9
5 changed files with 73 additions and 50 deletions

View File

@ -10,8 +10,10 @@
"src" "src"
], ],
"cSpell.words": [ "cSpell.words": [
"CHROM",
"pytest", "pytest",
"pyvcf", "pyvcf",
"vcfpy",
"vcfs" "vcfs"
], ],
} }

View File

@ -4,9 +4,9 @@ from modvcfsamples import sample
def run(vcfs: list[str], only: list[str], output_dir: str): def run(vcfs: list[str], only: list[str], output_dir: str):
for vcf in vcfs: for vcf in vcfs:
vcf_records = sample.get_records_from_vcf(vcf) vcf_records, header = sample.get_records_from_vcf(vcf)
sample.filter_all_sample_datatypes(vcf_records, *only) processed_vcfs, modified_header = sample.keep_specific_call_data(vcf_records, header, *only)
sample.write_records_to_vcf(vcf_records, os.path.join(output_dir, os.path.basename(vcf))) sample.write_records_to_vcf(processed_vcfs, modified_header, os.path.join(output_dir, os.path.basename(vcf)))
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()

View File

@ -1,34 +1,58 @@
import vcf from typing import Iterable, OrderedDict
import vcfpy
from collections import namedtuple from collections import namedtuple
from copy import deepcopy from copy import deepcopy
import os
from vcfpy.record import Call
def get_records_from_vcf(path: str): def get_records_from_vcf(path: str):
vcf_records = [] vcf_records = []
with open(path, "r") as vcf_stream: with open(path, "r") as vcf_stream:
reader = vcf.Reader(vcf_stream) reader = vcfpy.Reader.from_stream(vcf_stream)
for record in reader: for record in reader:
vcf_records.append(record) vcf_records.append(record)
return vcf_records return vcf_records, reader.header
def filter_sample_datatype(record, *datatypes: str):
call_data = namedtuple("Data", *datatypes)
filtered_calls = []
modified_record = deepcopy(record)
for call in record.samples:
kept_data = {}
for datatype in datatypes:
kept_data[datatype] = call[datatype]
filtered_calls.append(call_data(**kept_data))
modified_record.samples = filtered_calls
return modified_record
def filter_all_sample_datatypes(records: list, *datatypes: str): def keep_specific_call_data(records: list[vcfpy.Record], header: vcfpy.Header, *datatypes: str):
lines_kept = [line for line in header.lines if line.key != "FORMAT"]
lines_kept.extend([line for line in header.get_lines("FORMAT") if line.id in datatypes])
modified_header = vcfpy.Header(lines_kept, header.samples)
modified_records = []
for record in records: for record in records:
yield filter_sample_datatype(record, *datatypes) modified_calls = []
for call in record.calls:
kept_data = OrderedDict()
for datatype in datatypes:
kept_data[datatype] = call.data[datatype]
modified_calls.append(Call(call.sample, kept_data, record))
modified_record = vcfpy.Record(
record.CHROM,
record.POS,
record.ID,
record.REF,
record.ALT,
record.QUAL,
record.FILTER,
record.INFO,
datatypes,
modified_calls,
)
modified_records.append(modified_record)
return modified_records, modified_header
def write_records_to_vcf(records: list, path: str): def normalize_gt_to_length(records: list[vcfpy.Record], header: vcfpy.Header, num: int):
for record in records:
for call in record.calls:
pass
def write_records_to_vcf(records: Iterable[vcfpy.Record], header: vcfpy.Header, path: str):
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "w") as vcf_stream: with open(path, "w") as vcf_stream:
writer = vcf.Writer(vcf_stream, records[0]) writer = vcfpy.Writer.from_stream(vcf_stream, header)
for record in records: for record in records:
writer.write_record(record) writer.write_record(record)

View File

@ -1,38 +1,18 @@
from modvcfsamples.sample import filter_sample_datatype, filter_all_sample_datatypes, get_records_from_vcf from modvcfsamples.sample import keep_specific_call_data, get_records_from_vcf
import os import os
def test_get_records_from_vcf_not_none():
records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
assert len(records) > 0
def test_filter_sample_datatype_not_none():
records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
filter_for = ["GT"]
modified_record = filter_sample_datatype(records[0], *filter_for)
assert modified_record is not None
def test_filter_sample_datatype_only_filtered():
records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
filter_for = ["GT"]
modified_record = filter_sample_datatype(records[0], *filter_for)
for sample in modified_record.samples:
assert len(sample) <= len(filter_for)
for key, _ in sample._asdict().items():
assert key in filter_for
def test_filter_all_sample_datatypes_not_empty(): def test_filter_all_sample_datatypes_not_empty():
records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf")) records, header = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
filter_for = ["GT"] filter_for = ["GT"]
modified_records = list(filter_all_sample_datatypes(records, *filter_for)) modified_records, header = keep_specific_call_data(records, header, *filter_for)
assert len(modified_records) == 11 assert len(modified_records) == 11
def test_filter_all_sample_datatypes_filtered(): def test_filter_all_sample_datatypes_filtered():
records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf")) records, header = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
filter_for = ["GT"] filter_for = ["GT"]
modified_records = list(filter_all_sample_datatypes(records, *filter_for)) modified_records, header = keep_specific_call_data(records, header, *filter_for)
for modified_record in modified_records: for modified_record in modified_records:
for sample in modified_record.samples: for call in modified_record.calls:
assert len(sample) <= len(filter_for) assert len(call.data.keys()) <= len(filter_for)
for key, _ in sample._asdict().items(): for key, _ in call.data.items():
assert key in filter_for assert key in filter_for

View File

@ -0,0 +1,17 @@
##fileformat=VCFv4.1
##fileDate=10122015_22h01m13s
##source=SHAPEIT2.v837
##log_file=shapeit_10122015_22h01m13s_3f764d75-2fbb-42df-ab75-8c2dfd5731ce.log
##FORMAT=<ID=GT,Number=1,Type=String,Description="Phased Genotype">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Gambian Chinese French Brazilian Nigerian Pakistani English Colombian Indian Japanese
1 846808 rs4475691 C T 100 . AC=1276;AN=5008;DP=20368 GT 1|1 0|0 0|0 ./. 0|0 0|0 0|0 0|0 ./. 0|0
1 846854 rs111957712 G A 100 . AC=114;AN=5008;DP=20538 GT ./. 0|0 0|0 0|0 0|0 0|0 ./. ./. 0|0 0|0
1 846864 rs950122 G C 100 . AC=1116;AN=5008;DP=20582 GT 1|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 ./. 0|0
1 847228 rs3905286 C T 100 . AC=1215;AN=5008;DP=20731 GT 1|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|1 0|0
1 847297 rs11507768 G A 100 . AC=359;AN=5008;DP=20809 GT 1|0 0|0 0|0 ./. 0|0 0|0 ./. 0|0 0|0 0|0
1 847491 rs28407778 G A 100 . AC=1262;AN=5008;DP=16939 GT 1|1 0|0 0|0 0|0 0|0 ./. 0|0 0|0 0|1 0|0
1 848023 rs144407116 C A 100 . AC=52;AN=5008;DP=22562 GT 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0
1 848090 rs4246505 G A 100 . AC=857;AN=5008;DP=19301 GT 0|0 0|0 0|0 0|0 0|0 0|0 ./. 0|0 ./. 0|0
1 848445 rs4626817 G A 100 . AC=1255;AN=5008;DP=18444 GT 1|1 0|0 0|0 0|0 0|0 0|0 0|0 ./. ./. 0|0
1 848456 rs11507767 A G 100 . AC=1266;AN=5008;DP=18137 GT 1|1 ./. 0|0 0|0 0|0 0|0 0|0 0|0 0|1 ./.
1 848738 rs3829741 C T 100 . AC=855;AN=5008;DP=16663 GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|1 0|0