mutation-case-controller/vgaat/statistics_tests.py

116 lines
4.0 KiB
Python

import pandas as pd
from mutations import CategorizedMutations
from variants import CategorizedVariantRecords
import utils
from collections import defaultdict
def mutation_site_count_vs_sample_type(
    cat_muts: CategorizedMutations,
    cat_vars: CategorizedVariantRecords,
    viral_strand,
    region,
    non_synonymous,
    allow_fishers,
):
    """Test whether the mutated-site count depends on the sample type.

    A chain of category groups is derived on ``cat_muts``: first the
    "sample type" group combined with the flattened "patient earliest"
    group, then optionally combined further with one viral lineage, one
    region and the "non-synonymous" category. Each step is stored under a
    name built by appending a " - <filter>" suffix to the previous name, so
    calling this function adds category groups to ``cat_muts``.

    The mutated-site counts of the final group and the complementary
    "reference" counts (reference sequence length minus mutated sites) form
    a two-column contingency table that is handed to
    ``utils.row_pairwise_dependency_test``.

    Args:
        cat_muts: Categorized mutations to query and extend.
        cat_vars: Not used by this test.
        viral_strand: Key into the "viral lineage" category group, or
            ``"all"`` to skip the lineage filter.
        region: Key into the "regions" category group, or ``"all"`` to skip
            the region filter.
        non_synonymous: When truthy, also filter by the "non-synonymous"
            category.
        allow_fishers: Forwarded unchanged to
            ``utils.row_pairwise_dependency_test``.

    Returns:
        A 3-tuple ``(title, description, published_result)`` where the last
        item is whatever ``utils.publish_results`` returns for the test
        result.
    """

    def narrow(source_name, suffix, categories):
        # Derive a new category group from ``source_name`` and return its
        # name so the next filter can build on it.
        target_name = f"{source_name}{suffix}"
        cat_muts.pcintersect_category_groups_with_categories(
            target_name,
            source_name,
            tuple(categories),
        )
        return target_name

    # Start from the earliest mutations, split by sample type.
    group_name = "earliest mutations"
    cat_muts.pcintersect_category_groups_with_categories(
        group_name,
        "sample type",
        tuple(cat_muts.flattened_pcategory_group("patient earliest")),
    )

    # Apply the optional filters in a fixed order; the order determines the
    # derived group names.
    if viral_strand != "all":
        group_name = narrow(
            group_name,
            f" - {viral_strand}",
            cat_muts.pget_category_group("viral lineage")[viral_strand],
        )
    if region != "all":
        group_name = narrow(
            group_name,
            f" - {region}",
            cat_muts.pget_category_group("regions")[region],
        )
    if non_synonymous:
        group_name = narrow(
            group_name,
            " - non-synonymous",
            cat_muts.pget_category("non-synonymous"),
        )

    mutated_sites = cat_muts.mutation_sites_for_pccategory_group(group_name)

    # Every site of the reference sequence that is not mutated counts as a
    # reference site for that sample type.
    reference_length = len(cat_muts.ref_sequence)
    reference_sites = {
        sample_type: reference_length - mutated_count
        for sample_type, mutated_count in dict(mutated_sites).items()
    }

    contingency_table = pd.DataFrame(
        {
            "Mutated": mutated_sites,
            "Reference": reference_sites,
        }
    )
    result = utils.row_pairwise_dependency_test(contingency_table, allow_fishers)

    # NOTE(review): "each type of mutation" below probably means "each
    # sample type" -- confirm the wording with the author before changing it.
    return (
        "Is mutation site count dependent on sample type?",
        "Here, we will count the sites of mutation in the "
        "reference sequence and compare that number for each "
        "type of mutation. The following is the table we will "
        "use: ",
        utils.publish_results(result),
    )
# TODO Write individual nucleotide test
def individual_mutation_count_vs_sample_type(
    cat_muts: CategorizedMutations,
    cat_vars: CategorizedVariantRecords,
    viral_strand,
    region,
    non_synonymous,
    allow_fishers,
):
    """Build one location-vs-sample-type contingency table per location.

    For every entry of the "locations" category group a table with two
    columns is assembled: ``"Mutations at <location>"`` and ``"Reference"``,
    each keyed by sample type. Every table is passed to
    ``utils.publish_results`` together with the shared ``results`` list.

    This test is still work in progress and is not registered in ``tests``.

    Args:
        cat_muts: Categorized mutations to query.
        cat_vars: Not used by this test.
        viral_strand: Not used by this test yet.
        region: Not used by this test yet.
        non_synonymous: Not used by this test yet.
        allow_fishers: Not used by this test yet.

    Returns:
        A 2-tuple ``(title, results)``.
    """

    def pairs(group):
        # Category groups are indexed by key elsewhere in this module, i.e.
        # they behave like mappings, and iterating a mapping directly yields
        # only its keys. Go through ``.items()`` when it exists so that
        # (name, mutations) pairs are unpacked; fall back to plain iteration
        # for objects that already yield pairs.
        items = getattr(group, "items", None)
        return items() if callable(items) else group

    results = []
    for location, location_mutations in pairs(
        cat_muts.pget_category_group("locations")
    ):
        table = defaultdict(dict)
        mutated_column = f"Mutations at {location}"
        for sample_type, sample_mutations in pairs(
            cat_muts.pget_category_group("sample type")
        ):
            table[mutated_column][sample_type] = len(
                cat_muts.intersect(location_mutations, sample_mutations)
            )
            # NOTE(review): this is (mutations at location) - (patients) and
            # does not depend on the sample type; it looks like the operands
            # may be reversed or the wrong totals are used. Left unchanged
            # until the intended definition of "Reference" is confirmed.
            table["Reference"][sample_type] = len(location_mutations) - len(
                cat_muts.pget_category("patients")
            )
        # NOTE(review): the other test in this module calls
        # utils.publish_results with a single argument and uses its return
        # value; confirm that the two-argument form used here is supported.
        utils.publish_results(table, results)
    return "Location based mutations vs Sample type", results
# TODO Write distribution difference test (?)
# Registry of the statistical tests defined in this module. Each entry is a
# (label, test function) pair.
tests = [
    (
        "Correlation between the number of mutation locations and sample types",
        mutation_site_count_vs_sample_type,
    ),
]