generated from ydeng/python-program
Tested and completed adding call sample filtering
Some checks failed
ydeng/modvcfsamples/pipeline/head There was a failure building this commit
Some checks failed
ydeng/modvcfsamples/pipeline/head There was a failure building this commit
This commit is contained in:
parent
515d790844
commit
08ba073ef9
2
.vscode/settings.json
vendored
2
.vscode/settings.json
vendored
@ -10,8 +10,10 @@
|
|||||||
"src"
|
"src"
|
||||||
],
|
],
|
||||||
"cSpell.words": [
|
"cSpell.words": [
|
||||||
|
"CHROM",
|
||||||
"pytest",
|
"pytest",
|
||||||
"pyvcf",
|
"pyvcf",
|
||||||
|
"vcfpy",
|
||||||
"vcfs"
|
"vcfs"
|
||||||
],
|
],
|
||||||
}
|
}
|
@ -4,9 +4,9 @@ from modvcfsamples import sample
|
|||||||
|
|
||||||
def run(vcfs: list[str], only: list[str], output_dir: str):
|
def run(vcfs: list[str], only: list[str], output_dir: str):
|
||||||
for vcf in vcfs:
|
for vcf in vcfs:
|
||||||
vcf_records = sample.get_records_from_vcf(vcf)
|
vcf_records, header = sample.get_records_from_vcf(vcf)
|
||||||
sample.filter_all_sample_datatypes(vcf_records, *only)
|
processed_vcfs, modified_header = sample.keep_specific_call_data(vcf_records, header, *only)
|
||||||
sample.write_records_to_vcf(vcf_records, os.path.join(output_dir, os.path.basename(vcf)))
|
sample.write_records_to_vcf(processed_vcfs, modified_header, os.path.join(output_dir, os.path.basename(vcf)))
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
|
@ -1,34 +1,58 @@
|
|||||||
import vcf
|
from typing import Iterable, OrderedDict
|
||||||
|
import vcfpy
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
import os
|
||||||
|
|
||||||
|
from vcfpy.record import Call
|
||||||
|
|
||||||
|
|
||||||
def get_records_from_vcf(path: str):
|
def get_records_from_vcf(path: str):
|
||||||
vcf_records = []
|
vcf_records = []
|
||||||
with open(path, "r") as vcf_stream:
|
with open(path, "r") as vcf_stream:
|
||||||
reader = vcf.Reader(vcf_stream)
|
reader = vcfpy.Reader.from_stream(vcf_stream)
|
||||||
for record in reader:
|
for record in reader:
|
||||||
vcf_records.append(record)
|
vcf_records.append(record)
|
||||||
|
|
||||||
return vcf_records
|
return vcf_records, reader.header
|
||||||
|
|
||||||
def filter_sample_datatype(record, *datatypes: str):
|
|
||||||
call_data = namedtuple("Data", *datatypes)
|
|
||||||
filtered_calls = []
|
|
||||||
modified_record = deepcopy(record)
|
|
||||||
for call in record.samples:
|
|
||||||
kept_data = {}
|
|
||||||
for datatype in datatypes:
|
|
||||||
kept_data[datatype] = call[datatype]
|
|
||||||
filtered_calls.append(call_data(**kept_data))
|
|
||||||
modified_record.samples = filtered_calls
|
|
||||||
return modified_record
|
|
||||||
|
|
||||||
def filter_all_sample_datatypes(records: list, *datatypes: str):
|
def keep_specific_call_data(records: list[vcfpy.Record], header: vcfpy.Header, *datatypes: str):
|
||||||
|
lines_kept = [line for line in header.lines if line.key != "FORMAT"]
|
||||||
|
lines_kept.extend([line for line in header.get_lines("FORMAT") if line.id in datatypes])
|
||||||
|
modified_header = vcfpy.Header(lines_kept, header.samples)
|
||||||
|
modified_records = []
|
||||||
|
|
||||||
for record in records:
|
for record in records:
|
||||||
yield filter_sample_datatype(record, *datatypes)
|
modified_calls = []
|
||||||
|
for call in record.calls:
|
||||||
|
kept_data = OrderedDict()
|
||||||
|
for datatype in datatypes:
|
||||||
|
kept_data[datatype] = call.data[datatype]
|
||||||
|
modified_calls.append(Call(call.sample, kept_data, record))
|
||||||
|
modified_record = vcfpy.Record(
|
||||||
|
record.CHROM,
|
||||||
|
record.POS,
|
||||||
|
record.ID,
|
||||||
|
record.REF,
|
||||||
|
record.ALT,
|
||||||
|
record.QUAL,
|
||||||
|
record.FILTER,
|
||||||
|
record.INFO,
|
||||||
|
datatypes,
|
||||||
|
modified_calls,
|
||||||
|
)
|
||||||
|
modified_records.append(modified_record)
|
||||||
|
return modified_records, modified_header
|
||||||
|
|
||||||
def write_records_to_vcf(records: list, path: str):
|
def normalize_gt_to_length(records: list[vcfpy.Record], header: vcfpy.Header, num: int):
|
||||||
|
for record in records:
|
||||||
|
for call in record.calls:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def write_records_to_vcf(records: Iterable[vcfpy.Record], header: vcfpy.Header, path: str):
|
||||||
|
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||||
with open(path, "w") as vcf_stream:
|
with open(path, "w") as vcf_stream:
|
||||||
writer = vcf.Writer(vcf_stream, records[0])
|
writer = vcfpy.Writer.from_stream(vcf_stream, header)
|
||||||
for record in records:
|
for record in records:
|
||||||
writer.write_record(record)
|
writer.write_record(record)
|
@ -1,38 +1,18 @@
|
|||||||
from modvcfsamples.sample import filter_sample_datatype, filter_all_sample_datatypes, get_records_from_vcf
|
from modvcfsamples.sample import keep_specific_call_data, get_records_from_vcf
|
||||||
import os
|
import os
|
||||||
|
|
||||||
def test_get_records_from_vcf_not_none():
|
|
||||||
records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
|
|
||||||
assert len(records) > 0
|
|
||||||
|
|
||||||
def test_filter_sample_datatype_not_none():
|
|
||||||
records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
|
|
||||||
filter_for = ["GT"]
|
|
||||||
modified_record = filter_sample_datatype(records[0], *filter_for)
|
|
||||||
assert modified_record is not None
|
|
||||||
|
|
||||||
def test_filter_sample_datatype_only_filtered():
|
|
||||||
records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
|
|
||||||
filter_for = ["GT"]
|
|
||||||
modified_record = filter_sample_datatype(records[0], *filter_for)
|
|
||||||
for sample in modified_record.samples:
|
|
||||||
assert len(sample) <= len(filter_for)
|
|
||||||
for key, _ in sample._asdict().items():
|
|
||||||
assert key in filter_for
|
|
||||||
|
|
||||||
def test_filter_all_sample_datatypes_not_empty():
|
def test_filter_all_sample_datatypes_not_empty():
|
||||||
records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
|
records, header = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
|
||||||
filter_for = ["GT"]
|
filter_for = ["GT"]
|
||||||
modified_records = list(filter_all_sample_datatypes(records, *filter_for))
|
modified_records, header = keep_specific_call_data(records, header, *filter_for)
|
||||||
assert len(modified_records) == 11
|
assert len(modified_records) == 11
|
||||||
|
|
||||||
def test_filter_all_sample_datatypes_filtered():
|
def test_filter_all_sample_datatypes_filtered():
|
||||||
records = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
|
records, header = get_records_from_vcf(os.path.abspath("tests/resources/test_files_shortened.vcf"))
|
||||||
filter_for = ["GT"]
|
filter_for = ["GT"]
|
||||||
modified_records = list(filter_all_sample_datatypes(records, *filter_for))
|
modified_records, header = keep_specific_call_data(records, header, *filter_for)
|
||||||
for modified_record in modified_records:
|
for modified_record in modified_records:
|
||||||
for sample in modified_record.samples:
|
for call in modified_record.calls:
|
||||||
assert len(sample) <= len(filter_for)
|
assert len(call.data.keys()) <= len(filter_for)
|
||||||
for key, _ in sample._asdict().items():
|
for key, _ in call.data.items():
|
||||||
assert key in filter_for
|
assert key in filter_for
|
||||||
|
|
||||||
|
17
tests/resources/expected_out/test_files_shortened.vcf
Normal file
17
tests/resources/expected_out/test_files_shortened.vcf
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
##fileformat=VCFv4.1
|
||||||
|
##fileDate=10122015_22h01m13s
|
||||||
|
##source=SHAPEIT2.v837
|
||||||
|
##log_file=shapeit_10122015_22h01m13s_3f764d75-2fbb-42df-ab75-8c2dfd5731ce.log
|
||||||
|
##FORMAT=<ID=GT,Number=1,Type=String,Description="Phased Genotype">
|
||||||
|
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Gambian Chinese French Brazilian Nigerian Pakistani English Colombian Indian Japanese
|
||||||
|
1 846808 rs4475691 C T 100 . AC=1276;AN=5008;DP=20368 GT 1|1 0|0 0|0 ./. 0|0 0|0 0|0 0|0 ./. 0|0
|
||||||
|
1 846854 rs111957712 G A 100 . AC=114;AN=5008;DP=20538 GT ./. 0|0 0|0 0|0 0|0 0|0 ./. ./. 0|0 0|0
|
||||||
|
1 846864 rs950122 G C 100 . AC=1116;AN=5008;DP=20582 GT 1|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 ./. 0|0
|
||||||
|
1 847228 rs3905286 C T 100 . AC=1215;AN=5008;DP=20731 GT 1|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|1 0|0
|
||||||
|
1 847297 rs11507768 G A 100 . AC=359;AN=5008;DP=20809 GT 1|0 0|0 0|0 ./. 0|0 0|0 ./. 0|0 0|0 0|0
|
||||||
|
1 847491 rs28407778 G A 100 . AC=1262;AN=5008;DP=16939 GT 1|1 0|0 0|0 0|0 0|0 ./. 0|0 0|0 0|1 0|0
|
||||||
|
1 848023 rs144407116 C A 100 . AC=52;AN=5008;DP=22562 GT 0|1 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0
|
||||||
|
1 848090 rs4246505 G A 100 . AC=857;AN=5008;DP=19301 GT 0|0 0|0 0|0 0|0 0|0 0|0 ./. 0|0 ./. 0|0
|
||||||
|
1 848445 rs4626817 G A 100 . AC=1255;AN=5008;DP=18444 GT 1|1 0|0 0|0 0|0 0|0 0|0 0|0 ./. ./. 0|0
|
||||||
|
1 848456 rs11507767 A G 100 . AC=1266;AN=5008;DP=18137 GT 1|1 ./. 0|0 0|0 0|0 0|0 0|0 0|0 0|1 ./.
|
||||||
|
1 848738 rs3829741 C T 100 . AC=855;AN=5008;DP=16663 GT 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|0 0|1 0|0
|
Loading…
Reference in New Issue
Block a user