From 7bae01b1af1d917f276be2dcb5dac4f5a3dc1f30 Mon Sep 17 00:00:00 2001
From: Harrison Deng <yunyangdeng@gmail.com>
Date: Tue, 27 Jun 2023 06:15:32 +0000
Subject: [PATCH] Implemented normalizing genotype

---
 Jenkinsfile                        |  2 +-
 environment.yml                    |  1 +
 src/modvcfsamples/cli.py           | 24 +++++++++++++++++++-----
 src/modvcfsamples/sample.py        | 25 ++++++++++++++++++++++++-
 tests/modvcfsamples/test_sample.py | 14 +++++++++++++-
 5 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 5583cfc..cbd4f12 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -9,7 +9,7 @@ pipeline {
         }
         stage("unit tests") {
             steps {
-                sh returnStatus: true, script: "python -m pytest --junitxml=unit_tests.xml --cov-report xml:test_coverage.xml --cov=program"
+                sh returnStatus: true, script: "python -m pytest --junitxml=unit_tests.xml --cov-report xml:test_coverage.xml --cov=modvcfsamples"
                 xunit checksName: '', tools: [JUnit(excludesPattern: '', pattern: 'test_results.xml', stopProcessingIfError: true)]
                 cobertura autoUpdateHealth: false, autoUpdateStability: false, coberturaReportFile: 'test_coverage.xml', failUnhealthy: false, failUnstable: false, maxNumberOfBuilds: 64, lineCoverageTargets: '50, 0, 0', methodCoverageTargets: '50, 0, 0', onlyStable: false, sourceEncoding: 'ASCII', zoomCoverageChart: false
             }
diff --git a/environment.yml b/environment.yml
index 77a374c..f64b6ac 100644
--- a/environment.yml
+++ b/environment.yml
@@ -6,6 +6,7 @@ dependencies:
   - pip
   - python-build
   - pytest
+  - pytest-cov
   - twine
   - sphinx
   - pyvcf
diff --git a/src/modvcfsamples/cli.py b/src/modvcfsamples/cli.py
index 065fa85..6eeacc2 100644
--- a/src/modvcfsamples/cli.py
+++ b/src/modvcfsamples/cli.py
@@ -1,12 +1,18 @@
 import argparse
 import os
+from typing import Union
 from modvcfsamples import sample
 
-def run(vcfs: list[str], only: list[str], output_dir: str):
+def run(vcfs: list[str], only: Union[list[str], None], gt: Union[int, None], output_dir: str):
     for vcf in vcfs:
         vcf_records, header = sample.get_records_from_vcf(vcf)
-        processed_vcfs, modified_header = sample.keep_specific_call_data(vcf_records, header, *only)
-        sample.write_records_to_vcf(processed_vcfs, modified_header, os.path.join(output_dir, os.path.basename(vcf)))
+        modified_vcfs = vcf_records  # start from the untouched records; each step below is optional
+        modified_header = header
+        if only:  # --only is None when the flag is omitted (argparse "append"); a single value must still filter
+            modified_vcfs, modified_header = sample.keep_specific_call_data(modified_vcfs, modified_header, *only)
+        if gt is not None:  # --gt-norm was supplied
+            modified_vcfs, modified_header = sample.normalize_gt_to_length(modified_vcfs, modified_header, gt)
+        sample.write_records_to_vcf(modified_vcfs, modified_header, os.path.join(output_dir, os.path.basename(vcf)))
 
 def main():
     parser = argparse.ArgumentParser()
@@ -23,16 +29,24 @@ def main():
         metavar="O",
         type=str
     )
+    parser.add_argument(
+        "--gt-norm",
+        "-g",
+        help="Resizes haploid genotypes to n-ploid by repeating it.",
+        type=int,
+        required=False
+    )
     parser.add_argument(
         "--only",
         "-n",
         help="Remove everything but the sample datatype",
         action="append",
-        type=str
+        type=str,
+        required=False
     )
 
     args = parser.parse_args()
-    run(args.vcfs, args.only, args.output_dir)
+    run(args.vcfs, args.only, args.gt_norm, args.output_dir)
 
 
 if __name__ == "__main__":
diff --git a/src/modvcfsamples/sample.py b/src/modvcfsamples/sample.py
index fc9fb65..daddc4e 100644
--- a/src/modvcfsamples/sample.py
+++ b/src/modvcfsamples/sample.py
@@ -46,9 +46,32 @@ def keep_specific_call_data(records: list[vcfpy.Record], header: vcfpy.Header, *
     return modified_records, modified_header
 
 def normalize_gt_to_length(records: list[vcfpy.Record], header: vcfpy.Header, num: int):
+    modified_records = []  # one rebuilt Record per input record
     for record in records:
+        modified_calls = []  # calls for the record being rebuilt
         for call in record.calls:
-            pass
+            gt_parts = call.data['GT'].replace("/", "|").split("|")  # alleles, split on either "/" or "|"
+            modified_call = deepcopy(call)  # changes below are made on a deep copy of the call
+            if len(gt_parts) > 1:
+                # More than one allele: the genotype is kept as-is. TODO: add logging/output for this case.
+                pass
+            else:
+                modified_call.data['GT'] = "|".join([gt_parts[0]]  * num)  # single allele repeated num times, "|"-joined
+            modified_calls.append(modified_call)
+        modified_record = vcfpy.Record(  # same field values as the input record, but with the new calls
+            record.CHROM,
+            record.POS,
+            record.ID,
+            record.REF,
+            record.ALT,
+            record.QUAL,
+            record.FILTER,
+            record.INFO,
+            record.FORMAT,
+            modified_calls,
+        )
+        modified_records.append(modified_record)
+    return modified_records, header  # the header is returned unchanged
 
 def write_records_to_vcf(records: Iterable[vcfpy.Record], header: vcfpy.Header, path: str):
     os.makedirs(os.path.dirname(path), exist_ok=True)
diff --git a/tests/modvcfsamples/test_sample.py b/tests/modvcfsamples/test_sample.py
index 22e4dbc..d579299 100644
--- a/tests/modvcfsamples/test_sample.py
+++ b/tests/modvcfsamples/test_sample.py
@@ -1,4 +1,4 @@
-from modvcfsamples.sample import keep_specific_call_data, get_records_from_vcf
+from modvcfsamples.sample import keep_specific_call_data, get_records_from_vcf, normalize_gt_to_length
 import os
 
 def test_filter_all_sample_datatypes_not_empty():
@@ -16,3 +16,15 @@ def test_filter_all_sample_datatypes_filtered():
             assert len(call.data.keys()) <= len(filter_for)
             for key, _ in call.data.items():
                 assert key in filter_for
+
+def test_normalize_gt_to_length_not_empty():
+    vcf_path = os.path.abspath("tests/resources/test_files_shortened_haploid.vcf")
+    normalized, _ = normalize_gt_to_length(*get_records_from_vcf(vcf_path), 4)
+    assert len(normalized) > 0  # normalizing the fixture must yield at least one record
+
+def test_normalize_gt_to_length_gt_normalized():
+    vcf_path = os.path.abspath("tests/resources/test_files_shortened_haploid.vcf")
+    normalized, _ = normalize_gt_to_length(*get_records_from_vcf(vcf_path), 4)
+    genotypes = (call.data["GT"] for record in normalized for call in record.calls)
+    # Each genotype must have 4 "|"-separated alleles; genotypes containing "/" are accepted at any length.
+    assert all(len(gt.split("|")) == 4 or "/" in gt for gt in genotypes)
\ No newline at end of file