mutation-case-controller/vgaat/variants.py

import csv
import re
import os
from typing import Iterable
import vcfpy
from collections import defaultdict
import logging


class VariantRecordsOrganizer:
    def __init__(self, on_change=None):
        self._on_change = on_change
        self._vcfs_dir = ""
        self._lineage_report_csv_path = ""
        self._vcf_filename_regex = ""

    def update(self, vcfs_dir, lineage_report_csv_path, vcf_filename_regex, **kwargs):
        self._vcfs_dir = vcfs_dir
        self._lineage_report_csv_path = lineage_report_csv_path
        self._vcf_filename_regex = vcf_filename_regex
        if self._on_change:
            self._on_change(self, **kwargs)

    def build(self):
        return CategorizedVariantRecords.from_files(
            self._vcfs_dir, self._lineage_report_csv_path, self._vcf_filename_regex
        )


class VariantRecordGroup:
    def __init__(self, patient_id, sample_type, sample_day, records: list):
        self._patient_id = patient_id
        self._sample_type = sample_type
        self._sample_day = sample_day
        self._variants = records

    @property
    def vcf_info(self):
        return self._patient_id, self._sample_type, self._sample_day

    @property
    def variants(self):
        return self._variants

    def __hash__(self):
        return hash(self.vcf_info)

    def __eq__(self, o):
        return self.vcf_info == o.vcf_info

    def __iter__(self):
        return iter(self.variants)


class CategorizedVariantRecords:
    def __init__(self, vcf_data, lineage_info):
        self._primitive_category_groups = {}
        self._primitive_category_groups["sample day"] = defaultdict(set)
        self._primitive_category_groups["patient id"] = defaultdict(set)
        self._primitive_category_groups["sample type"] = defaultdict(set)
        self._primitive_category_groups["viral lineage"] = defaultdict(set)
        self._all_vcf = vcf_data
        self._all_lineage_info = lineage_info
        for vcf_info, records in vcf_data.items():
            patient_id, sample_type, sample_day = vcf_info
            self._primitive_category_groups["sample day"][sample_day].add(records)
            self._primitive_category_groups["patient id"][patient_id].add(records)
            self._primitive_category_groups["sample type"][sample_type].add(records)
            if vcf_info in lineage_info:
                self._primitive_category_groups["viral lineage"][
                        lineage_info[vcf_info]
                    ].add(records)

    def get_category_groups(self, category: str):
        return self._primitive_category_groups[category]

    def get_all_vcf(self):
        return self._all_vcf

    def get_all_lineage_info(self):
        return self._all_lineage_info

    @staticmethod
    def intersect_each_with_one(each: dict, one: Iterable):
        result = {}
        for key, value in each.items():
            result[key] = value & one
        return result

    @staticmethod
    def get_sample_info_from_filename(
        vcf_filename, filename_regex=r"([BNE]{1,2})(\d+)(?:-D(\d+))?"
    ):
        filename_match = re.match(filename_regex, vcf_filename)
        if filename_match is None:
            logging.warn(
                f'The regex "{filename_regex}" did not match ' f'"{vcf_filename}"'
            )
            return None
        sample_type = filename_match.group(1)
        sample_day = (
            filename_match.group(3) or "1"
        )  # We assume it's day 1 in the case there were no annotations.
        sample_id = filename_match.group(2)
        return sample_id, sample_type, sample_day

    @staticmethod
    def read_lineage_csv(
        file_name: str, sample_info_column: str, viral_strand_column: str
    ):
        lineage_info = {}
        with open(file_name) as lineage_csv_fd:
            reader = csv.reader(lineage_csv_fd)
            header = None
            for row in reader:
                if header is None:
                    header = row
                    continue
                sample_info = CategorizedVariantRecords.get_sample_info_from_filename(
                    row[header.index("taxon")]
                )
                viral_strand_ver = row[header.index(viral_strand_column)]
                if viral_strand_ver:
                    lineage_info[sample_info] = viral_strand_ver
                else:
                    lineage_info[sample_info] = "unknown"
        return lineage_info

    @staticmethod
    def count_samples_in_dict_of_sets(dict_samples: dict[str, list]):
        result = {}
        for key, value in dict_samples.items():
            result[key] = len(value)
        return result

    @staticmethod
    def from_files(
        vcfs_path,
        lineage_report_csv_path,
        filename_regex=r"([BNE]{1,2})(\d+)(?:-D(\d+))?",
    ):
        lineage_dict = CategorizedVariantRecords.read_lineage_csv(
            lineage_report_csv_path, "taxon", "scorpio_call"
        )
        vcf_records = {}
        for vcf_filename in os.listdir(vcfs_path):
            vcf_full_path = os.path.abspath(os.path.join(vcfs_path, vcf_filename))
            if os.path.isdir(vcf_full_path):
                continue
            if os.path.splitext(vcf_full_path)[1] != ".vcf":
                continue
            sample_info = CategorizedVariantRecords.get_sample_info_from_filename(
                vcf_filename, filename_regex
            )
            if sample_info is None:
                continue

            patient_id, sample_type, sample_day = sample_info
            with open(vcf_full_path) as vcf_file_desc:
                curr_list = []
                for vcf_record in vcfpy.Reader(vcf_file_desc):
                    curr_list.append(vcf_record)
                vcf_records[sample_info] = VariantRecordGroup(*sample_info, curr_list)
        return CategorizedVariantRecords(vcf_records, lineage_dict)
Updated 'statistics_tests.py' to conform to mutations API 2023-03-23 21:24:25 +00:00			`import csv`
			`import re`
			`import os`
			`from typing import Iterable`
			`import vcfpy`
			`from collections import defaultdict`
			`import logging`


			`class VariantRecordsOrganizer:`
			`def __init__(self, on_change=None):`
			`self._on_change = on_change`
			`self._vcfs_dir = ""`
			`self._lineage_report_csv_path = ""`
			`self._vcf_filename_regex = ""`

			`def update(self, vcfs_dir, lineage_report_csv_path, vcf_filename_regex, **kwargs):`
			`self._vcfs_dir = vcfs_dir`
			`self._lineage_report_csv_path = lineage_report_csv_path`
			`self._vcf_filename_regex = vcf_filename_regex`
			`if self._on_change:`
			`self._on_change(self, **kwargs)`

			`def build(self):`
			`return CategorizedVariantRecords.from_files(`
			`self._vcfs_dir, self._lineage_report_csv_path, self._vcf_filename_regex`
			`)`


			`class VariantRecordGroup:`
			`def __init__(self, patient_id, sample_type, sample_day, records: list):`
			`self._patient_id = patient_id`
			`self._sample_type = sample_type`
			`self._sample_day = sample_day`
			`self._variants = records`

			`@property`
			`def vcf_info(self):`
			`return self._patient_id, self._sample_type, self._sample_day`

			`@property`
			`def variants(self):`
			`return self._variants`

			`def __hash__(self):`
			`return hash(self.vcf_info)`

			`def __eq__(self, o):`
			`return self.vcf_info == o.vcf_info`

			`def __iter__(self):`
			`return iter(self.variants)`


			`class CategorizedVariantRecords:`
			`def __init__(self, vcf_data, lineage_info):`
			`self._primitive_category_groups = {}`
			`self._primitive_category_groups["sample day"] = defaultdict(set)`
			`self._primitive_category_groups["patient id"] = defaultdict(set)`
			`self._primitive_category_groups["sample type"] = defaultdict(set)`
			`self._primitive_category_groups["viral lineage"] = defaultdict(set)`
			`self._all_vcf = vcf_data`
			`self._all_lineage_info = lineage_info`
			`for vcf_info, records in vcf_data.items():`
			`patient_id, sample_type, sample_day = vcf_info`
			`self._primitive_category_groups["sample day"][sample_day].add(records)`
			`self._primitive_category_groups["patient id"][patient_id].add(records)`
			`self._primitive_category_groups["sample type"][sample_type].add(records)`
			`if vcf_info in lineage_info:`
			`self._primitive_category_groups["viral lineage"][`
			`lineage_info[vcf_info]`
			`].add(records)`

			`def get_category_groups(self, category: str):`
			`return self._primitive_category_groups[category]`

			`def get_all_vcf(self):`
			`return self._all_vcf`

			`def get_all_lineage_info(self):`
			`return self._all_lineage_info`

			`@staticmethod`
			`def intersect_each_with_one(each: dict, one: Iterable):`
			`result = {}`
			`for key, value in each.items():`
			`result[key] = value & one`
			`return result`

			`@staticmethod`
			`def get_sample_info_from_filename(`
			`vcf_filename, filename_regex=r"([BNE]{1,2})(\d+)(?:-D(\d+))?"`
			`):`
			`filename_match = re.match(filename_regex, vcf_filename)`
			`if filename_match is None:`
			`logging.warn(`
			`f'The regex "{filename_regex}" did not match ' f'"{vcf_filename}"'`
			`)`
			`return None`
			`sample_type = filename_match.group(1)`
			`sample_day = (`
			`filename_match.group(3) or "1"`
			`) # We assume it's day 1 in the case there were no annotations.`
			`sample_id = filename_match.group(2)`
			`return sample_id, sample_type, sample_day`

			`@staticmethod`
			`def read_lineage_csv(`
			`file_name: str, sample_info_column: str, viral_strand_column: str`
			`):`
			`lineage_info = {}`
			`with open(file_name) as lineage_csv_fd:`
			`reader = csv.reader(lineage_csv_fd)`
			`header = None`
			`for row in reader:`
			`if header is None:`
			`header = row`
			`continue`
			`sample_info = CategorizedVariantRecords.get_sample_info_from_filename(`
			`row[header.index("taxon")]`
			`)`
			`viral_strand_ver = row[header.index(viral_strand_column)]`
			`if viral_strand_ver:`
			`lineage_info[sample_info] = viral_strand_ver`
			`else:`
			`lineage_info[sample_info] = "unknown"`
			`return lineage_info`

			`@staticmethod`
			`def count_samples_in_dict_of_sets(dict_samples: dict[str, list]):`
			`result = {}`
			`for key, value in dict_samples.items():`
			`result[key] = len(value)`
			`return result`

			`@staticmethod`
			`def from_files(`
			`vcfs_path,`
			`lineage_report_csv_path,`
			`filename_regex=r"([BNE]{1,2})(\d+)(?:-D(\d+))?",`
			`):`
			`lineage_dict = CategorizedVariantRecords.read_lineage_csv(`
			`lineage_report_csv_path, "taxon", "scorpio_call"`
			`)`
			`vcf_records = {}`
			`for vcf_filename in os.listdir(vcfs_path):`
			`vcf_full_path = os.path.abspath(os.path.join(vcfs_path, vcf_filename))`
			`if os.path.isdir(vcf_full_path):`
			`continue`
			`if os.path.splitext(vcf_full_path)[1] != ".vcf":`
			`continue`
			`sample_info = CategorizedVariantRecords.get_sample_info_from_filename(`
			`vcf_filename, filename_regex`
			`)`
			`if sample_info is None:`
			`continue`

			`patient_id, sample_type, sample_day = sample_info`
			`with open(vcf_full_path) as vcf_file_desc:`
			`curr_list = []`
			`for vcf_record in vcfpy.Reader(vcf_file_desc):`
			`curr_list.append(vcf_record)`
			`vcf_records[sample_info] = VariantRecordGroup(*sample_info, curr_list)`
			`return CategorizedVariantRecords(vcf_records, lineage_dict)`