From 515d79084415ee86ac29d5715f3c690059953815 Mon Sep 17 00:00:00 2001
From: Harrison Deng
Date: Mon, 26 Jun 2023 17:20:40 +0000
Subject: [PATCH] Basic filtering system and tests completed

---
 .vscode/settings.json                    | 17 ++++
 Jenkinsfile                              |  7 +-
 environment.yml                          |  1 +
 setup.cfg                                | 14 +--
 setup.py                                 |  3 +
 src/modvcfsamples/cli.py                 | 64 ++++++++++++
 src/modvcfsamples/sample.py              | 82 ++++++++++++++++
 src/program/program.py                   |  1 -
 tests/modvcfsamples/test_sample.py       | 77 +++++++++++++++
 tests/program/test_program.py            |  2 -
 tests/resources/test_files_shortened.vcf | 17 ++++
 11 files changed, 271 insertions(+), 14 deletions(-)
 create mode 100644 .vscode/settings.json
 create mode 100644 src/modvcfsamples/cli.py
 create mode 100644 src/modvcfsamples/sample.py
 delete mode 100644 src/program/program.py
 create mode 100644 tests/modvcfsamples/test_sample.py
 delete mode 100644 tests/program/test_program.py
 create mode 100644 tests/resources/test_files_shortened.vcf

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..8c7d484
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,17 @@
+{
+    "python.testing.pytestArgs": [],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true,
+    "python.linting.pylintArgs": [
+        "--rcfile=setup.cfg"
+    ],
+    "python.analysis.autoSearchPaths": true,
+    "python.analysis.extraPaths": [
+        "src"
+    ],
+    "cSpell.words": [
+        "pytest",
+        "pyvcf",
+        "vcfs"
+    ]
+}
\ No newline at end of file
diff --git a/Jenkinsfile b/Jenkinsfile
index aba3f83..5583cfc 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -9,7 +9,6 @@ pipeline {
         }
         stage("unit tests") {
             steps {
-                // TODO Update the
-                sh returnStatus: true, script: "python -m pytest --junitxml=unit_tests.xml --cov-report xml:test_coverage.xml --cov=program"
+                sh returnStatus: true, script: "python -m pytest --junitxml=unit_tests.xml --cov-report xml:test_coverage.xml --cov=modvcfsamples"
                 xunit checksName: '', tools: [JUnit(excludesPattern: '', pattern: 'test_results.xml', stopProcessingIfError: true)]
                 cobertura autoUpdateHealth: false, autoUpdateStability: false, coberturaReportFile: 'test_coverage.xml', failUnhealthy: false, failUnstable: false, maxNumberOfBuilds: 64, lineCoverageTargets: '50, 0, 0', methodCoverageTargets: '50, 0, 0', onlyStable: false, sourceEncoding: 'ASCII', zoomCoverageChart: false
@@ -18,7 +17,7 @@ pipeline {
         stage("build") {
             steps {
                 sh "python -m build"
-                // Additional build steps go here
+                // TODO Additional build steps go here
             }
         }
         stage("test installation") {
@@ -30,7 +29,7 @@ pipeline {
         stage("archive") {
             steps {
                 archiveArtifacts artifacts: 'dist/*.tar.gz, dist/*.whl'
-                // Additional archival or documentation steps go here
+                // TODO Additional archival or documentation steps go here
             }
         }
         stage("publish") {
diff --git a/environment.yml b/environment.yml
index 17aaf68..77a374c 100644
--- a/environment.yml
+++ b/environment.yml
@@ -8,3 +8,4 @@ dependencies:
   - pytest
   - twine
   - sphinx
+  - pyvcf
diff --git a/setup.cfg b/setup.cfg
index 01e2da6..9176cdd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,17 +1,17 @@
 [metadata]
-name = sample
+name = modvcfsamples
 version = 0.0.1
 
 
 [options]
 package_dir =
-    = src
-# install_requires =
-#     cachier ==2.1
+    = ./src
+install_requires =
+    pyvcf ==0.6.8
 
-# [options.entry_points]
-# console_scripts =
-#     avariantas = program.program:entry_function
+[options.entry_points]
+console_scripts =
+    modvcfsamples = modvcfsamples.cli:main
 
 [tool:pytest]
 pythonpath = src
diff --git a/setup.py b/setup.py
index e69de29..fc1f76c 100644
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,3 @@
+from setuptools import setup
+
+setup()
\ No newline at end of file
diff --git a/src/modvcfsamples/cli.py b/src/modvcfsamples/cli.py
new file mode 100644
index 0000000..5283b96
--- /dev/null
+++ b/src/modvcfsamples/cli.py
@@ -0,0 +1,64 @@
+"""Command-line interface for filtering sample datatypes out of VCF files."""
+import argparse
+import os
+
+from modvcfsamples import sample
+
+
+def run(vcfs: list[str], only: list[str], output_dir: str):
+    """Filter each input VCF and write the result into ``output_dir``.
+
+    Args:
+        vcfs: Paths of the VCF files to process.
+        only: Sample datatypes (FORMAT keys such as ``GT``) to keep. When
+            empty or ``None``, the records are written back unfiltered.
+        output_dir: Directory that receives one file per input, named after
+            the input's basename. Created if it does not exist.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    for vcf_path in vcfs:
+        # The writer needs the source header, which records do not carry.
+        template = sample.get_template_from_vcf(vcf_path)
+        vcf_records = sample.get_records_from_vcf(vcf_path)
+        if only:
+            # filter_all_sample_datatypes is a lazy generator: it has to be
+            # consumed, and its output is what must be written.
+            vcf_records = list(
+                sample.filter_all_sample_datatypes(vcf_records, *only)
+            )
+        output_path = os.path.join(output_dir, os.path.basename(vcf_path))
+        sample.write_records_to_vcf(
+            vcf_records, output_path, template=template
+        )
+
+
+def main():
+    """Parse the command-line arguments and run the filtering."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "vcfs",
+        help="The VCFs to run filtering on",
+        nargs="+",
+        metavar="I",
+        type=str
+    )
+    parser.add_argument(
+        "output_dir",
+        help="The output directory",
+        metavar="O",
+        type=str
+    )
+    parser.add_argument(
+        "--only",
+        "-n",
+        help="Remove everything but the sample datatype",
+        action="append",
+        type=str
+    )
+
+    args = parser.parse_args()
+    run(args.vcfs, args.only, args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/modvcfsamples/sample.py b/src/modvcfsamples/sample.py
new file mode 100644
index 0000000..7bd1f90
--- /dev/null
+++ b/src/modvcfsamples/sample.py
@@ -0,0 +1,82 @@
+"""Helpers for reading, filtering and writing VCF sample data."""
+from collections import namedtuple
+from copy import deepcopy
+
+import vcf
+# PyVCF has no public constructor for calls; _Call is the class its own
+# parser builds and the one vcf.Writer serialises from record.samples.
+from vcf.model import _Call
+
+
+def get_template_from_vcf(path: str):
+    """Return a ``vcf.Reader`` for ``path`` to use as a writer template.
+
+    Only the header information is meant to be used: the underlying file
+    is closed on return, so the reader must not be iterated.
+    """
+    with open(path, "r") as vcf_stream:
+        return vcf.Reader(vcf_stream)
+
+
+def get_records_from_vcf(path: str):
+    """Read every record of the VCF file at ``path`` into a list."""
+    with open(path, "r") as vcf_stream:
+        return list(vcf.Reader(vcf_stream))
+
+
+def filter_sample_datatype(record, *datatypes: str):
+    """Return a copy of ``record`` whose calls keep only ``datatypes``.
+
+    The kept datatypes follow the order of the record's own FORMAT column,
+    and a requested datatype the record does not have is skipped. Each
+    call of the copy is a PyVCF call whose ``data`` holds the kept values,
+    and the copy's FORMAT is updated to match, so it can be written back.
+    The original record is left untouched.
+    """
+    requested = set(datatypes)
+    record_format = record.FORMAT or ""
+    kept = [key for key in record_format.split(":") if key in requested]
+    # namedtuple takes the field names as ONE sequence argument; unpacking
+    # them fails as soon as more than one datatype is requested.
+    call_data = namedtuple("Data", kept)
+    modified_record = deepcopy(record)
+    modified_record.FORMAT = ":".join(kept)
+    modified_record.samples = [
+        _Call(
+            modified_record,
+            call.sample,
+            call_data(*(call[key] for key in kept)),
+        )
+        for call in record.samples
+    ]
+    return modified_record
+
+
+def filter_all_sample_datatypes(records: list, *datatypes: str):
+    """Lazily yield the filtered copy of each record in ``records``.
+
+    This is a generator: nothing is filtered until it is iterated.
+    """
+    for record in records:
+        yield filter_sample_datatype(record, *datatypes)
+
+
+def write_records_to_vcf(records: list, path: str, template=None):
+    """Write ``records`` to a VCF file at ``path``, overwriting it.
+
+    Args:
+        records: The records to write, in order. May be empty.
+        path: Destination file path.
+        template: The ``vcf.Reader`` whose header is copied to the output,
+            e.g. the result of ``get_template_from_vcf``.
+
+    Raises:
+        ValueError: If ``template`` is not given. ``vcf.Writer`` needs a
+            reader for the header; a record cannot stand in for one.
+    """
+    if template is None:
+        raise ValueError("a vcf.Reader template is required to write a VCF")
+    with open(path, "w") as vcf_stream:
+        writer = vcf.Writer(vcf_stream, template)
+        for record in records:
+            writer.write_record(record)
diff --git a/src/program/program.py b/src/program/program.py
deleted file mode 100644
index 2d992a2..0000000
--- a/src/program/program.py
+++ /dev/null
@@ -1 +0,0 @@
-# TODO Do stuff!
diff --git a/tests/modvcfsamples/test_sample.py b/tests/modvcfsamples/test_sample.py
new file mode 100644
index 0000000..ea96ff7
--- /dev/null
+++ b/tests/modvcfsamples/test_sample.py
@@ -0,0 +1,77 @@
+import os
+
+from modvcfsamples.sample import (
+    filter_all_sample_datatypes,
+    filter_sample_datatype,
+    get_records_from_vcf,
+    get_template_from_vcf,
+    write_records_to_vcf,
+)
+
+# Resolved from this file so the tests do not depend on the working directory.
+TEST_VCF = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)),
+    os.pardir,
+    "resources",
+    "test_files_shortened.vcf",
+)
+
+
+def test_get_records_from_vcf_not_none():
+    records = get_records_from_vcf(TEST_VCF)
+    assert len(records) > 0
+
+
+def test_filter_sample_datatype_not_none():
+    records = get_records_from_vcf(TEST_VCF)
+    modified_record = filter_sample_datatype(records[0], "GT")
+    assert modified_record is not None
+
+
+def test_filter_sample_datatype_only_filtered():
+    records = get_records_from_vcf(TEST_VCF)
+    modified_record = filter_sample_datatype(records[0], "GT")
+    assert modified_record.FORMAT == "GT"
+    for sample in modified_record.samples:
+        assert sample.data._fields == ("GT",)
+
+
+def test_filter_sample_datatype_multiple_datatypes():
+    records = get_records_from_vcf(TEST_VCF)
+    modified_record = filter_sample_datatype(records[0], "GT", "DP")
+    assert modified_record.FORMAT == "GT:DP"
+    for sample in modified_record.samples:
+        assert sample.data._fields == ("GT", "DP")
+
+
+def test_filter_sample_datatype_leaves_original_untouched():
+    records = get_records_from_vcf(TEST_VCF)
+    filter_sample_datatype(records[0], "GT")
+    assert records[0].FORMAT == "GT:GQ:DP"
+
+
+def test_filter_all_sample_datatypes_not_empty():
+    records = get_records_from_vcf(TEST_VCF)
+    modified_records = list(filter_all_sample_datatypes(records, "GT"))
+    assert len(modified_records) == 11
+
+
+def test_filter_all_sample_datatypes_filtered():
+    records = get_records_from_vcf(TEST_VCF)
+    for modified_record in filter_all_sample_datatypes(records, "GT"):
+        for sample in modified_record.samples:
+            assert sample.data._fields == ("GT",)
+
+
+def test_write_records_to_vcf_round_trip(tmp_path):
+    filtered = list(
+        filter_all_sample_datatypes(get_records_from_vcf(TEST_VCF), "GT")
+    )
+    output_path = str(tmp_path / "filtered.vcf")
+    write_records_to_vcf(
+        filtered, output_path, template=get_template_from_vcf(TEST_VCF)
+    )
+    written = get_records_from_vcf(output_path)
+    assert len(written) == len(filtered)
+    for record in written:
+        assert record.FORMAT == "GT"
diff --git a/tests/program/test_program.py b/tests/program/test_program.py
deleted file mode 100644
index aa247ae..0000000
--- a/tests/program/test_program.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# TODO Test program!
-
diff --git a/tests/resources/test_files_shortened.vcf b/tests/resources/test_files_shortened.vcf
new file mode 100644
index 0000000..c2ff70f
--- /dev/null
+++ b/tests/resources/test_files_shortened.vcf
@@ -0,0 +1,17 @@
+##fileformat=VCFv4.1
+##fileDate=10122015_22h01m13s
+##source=SHAPEIT2.v837
+##log_file=shapeit_10122015_22h01m13s_3f764d75-2fbb-42df-ab75-8c2dfd5731ce.log
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Phased Genotype">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Gambian Chinese French Brazilian Nigerian Pakistani English Colombian Indian Japanese
+1 846808 rs4475691 C T 100 . AC=1276;AN=5008;DP=20368 GT:GQ:DP 1|1:70:60 0|0:30:10 0|0:20:40 ./.:0:0 0|0:70:60 0|0:90:30 0|0:80:70 0|0:50:80 ./.:0:0 0|0:100:80
+1 846854 rs111957712 G A 100 . AC=114;AN=5008;DP=20538 GT:GQ:DP ./.:0:0 0|0:40:30 0|0:80:30 0|0:0:20 0|0:40:80 0|0:10:30 ./.:0:0 ./.:0:0 0|0:70:0 0|0:90:30
+1 846864 rs950122 G C 100 . AC=1116;AN=5008;DP=20582 GT:GQ:DP 1|1:60:0 0|0:20:0 0|0:90:70 0|0:80:50 0|0:80:30 0|0:10:80 0|0:100:20 0|0:30:40 ./.:0:0 0|0:30:90
+1 847228 rs3905286 C T 100 . AC=1215;AN=5008;DP=20731 GT:GQ:DP 1|1:70:80 0|0:90:80 0|0:60:50 0|0:60:90 0|0:30:40 0|0:70:10 0|0:100:80 0|0:100:50 0|1:70:100 0|0:40:40
+1 847297 rs11507768 G A 100 . AC=359;AN=5008;DP=20809 GT:GQ:DP 1|0:0:60 0|0:10:30 0|0:80:60 ./.:0:0 0|0:10:100 0|0:10:100 ./.:0:0 0|0:100:40 0|0:20:20 0|0:10:0
+1 847491 rs28407778 G A 100 . AC=1262;AN=5008;DP=16939 GT:GQ:DP 1|1:0:100 0|0:70:40 0|0:0:60 0|0:90:90 0|0:90:90 ./.:0:0 0|0:70:60 0|0:70:0 0|1:90:100 0|0:60:60
+1 848023 rs144407116 C A 100 . AC=52;AN=5008;DP=22562 GT:GQ:DP 0|1:10:60 0|0:20:80 0|0:20:50 0|0:20:90 0|0:90:10 0|0:70:60 0|0:40:30 0|0:60:0 0|0:40:40 0|0:10:10
+1 848090 rs4246505 G A 100 . AC=857;AN=5008;DP=19301 GT:GQ:DP 0|0:70:70 0|0:90:30 0|0:40:10 0|0:80:20 0|0:50:50 0|0:10:30 ./.:0:0 0|0:20:60 ./.:0:0 0|0:90:0
+1 848445 rs4626817 G A 100 . AC=1255;AN=5008;DP=18444 GT:GQ:DP 1|1:100:40 0|0:80:90 0|0:30:100 0|0:100:60 0|0:40:90 0|0:20:30 0|0:70:100 ./.:0:0 ./.:0:0 0|0:80:30
+1 848456 rs11507767 A G 100 . AC=1266;AN=5008;DP=18137 GT:GQ:DP 1|1:40:30 ./.:0:0 0|0:60:90 0|0:60:40 0|0:100:80 0|0:50:50 0|0:0:10 0|0:60:0 0|1:100:100 ./.:0:0
+1 848738 rs3829741 C T 100 . AC=855;AN=5008;DP=16663 GT:GQ:DP 0|0:50:90 0|0:50:50 0|0:50:30 0|0:60:60 0|0:80:40 0|0:50:80 0|0:0:80 0|0:0:30 0|1:10:0 0|0:70:30