116 lines
4.0 KiB
Python
116 lines
4.0 KiB
Python
|
import pandas as pd
|
||
|
from mutations import CategorizedMutations
|
||
|
from variants import CategorizedVariantRecords
|
||
|
import utils
|
||
|
from collections import defaultdict
|
||
|
|
||
|
|
||
|
def mutation_site_count_vs_sample_type(
    cat_muts: CategorizedMutations,
    cat_vars: CategorizedVariantRecords,
    viral_strand,
    region,
    non_synonymous,
    allow_fishers,
):
    """Test whether the number of mutated sites depends on the sample type.

    A working category group is seeded from the "sample type" group combined
    with the flattened "patient earliest" group, then optionally narrowed by
    viral strand, region and non-synonymous mutations. The mutated-site count
    of every entry in the final group is paired with the remaining
    (non-mutated) reference sites to form a two-column contingency table,
    which is handed to ``utils.row_pairwise_dependency_test``.

    Args:
        cat_muts: Categorized mutations. Intermediate category groups are
            registered on this object as a side effect of the narrowing.
        cat_vars: Not used by this test.
        viral_strand: Key into the "viral lineage" category group, or
            ``"all"`` to skip this filter.
        region: Key into the "regions" category group, or ``"all"`` to skip
            this filter.
        non_synonymous: When truthy, additionally restrict to the
            "non-synonymous" category.
        allow_fishers: Forwarded unchanged to
            ``utils.row_pairwise_dependency_test``.

    Returns:
        A 3-tuple ``(title, description, published)`` where ``published`` is
        the value returned by ``utils.publish_results`` for the test result.
    """
    # Seed the working group with the earliest mutations per sample type.
    working_category_name = "earliest mutations"
    cat_muts.pcintersect_category_groups_with_categories(
        working_category_name,
        "sample type",
        tuple(cat_muts.flattened_pcategory_group("patient earliest")),
    )

    def narrow(suffix, members):
        """Register a group narrowed by *members* and make it the working one."""
        nonlocal working_category_name
        narrowed_name = working_category_name + suffix
        cat_muts.pcintersect_category_groups_with_categories(
            narrowed_name,
            working_category_name,
            tuple(members),
        )
        working_category_name = narrowed_name

    # Each active filter appends its own suffix, so the group name records
    # exactly which filters were applied (e.g. "earliest mutations - X - Y").
    if viral_strand != "all":
        narrow(
            f" - {viral_strand}",
            cat_muts.pget_category_group("viral lineage")[viral_strand],
        )
    if region != "all":
        narrow(f" - {region}", cat_muts.pget_category_group("regions")[region])
    if non_synonymous:
        narrow(" - non-synonymous", cat_muts.pget_category("non-synonymous"))

    mutated_site_counts = cat_muts.mutation_sites_for_pccategory_group(
        working_category_name
    )

    # Every site of the reference sequence that is not mutated counts as a
    # "reference" site for that sample type.
    reference_length = len(cat_muts.ref_sequence)
    reference_site_counts = {
        sample_type: reference_length - mutated_count
        for sample_type, mutated_count in dict(mutated_site_counts).items()
    }

    contingency_table = pd.DataFrame(
        {
            "Mutated": mutated_site_counts,
            "Reference": reference_site_counts,
        }
    )
    result = utils.row_pairwise_dependency_test(contingency_table, allow_fishers)

    return (
        "Is mutation site count dependent on sample type?",
        "Here, we will count the sites of mutation in the "
        "reference sequence and compare that number for each "
        "type of mutation. The following is the table we will "
        "use: ",
        utils.publish_results(result),
    )
|
||
|
|
||
|
|
||
|
# TODO Write individual nucleotide test
|
||
|
def individual_mutation_count_vs_sample_type(
    cat_muts: CategorizedMutations,
    cat_vars: CategorizedVariantRecords,
    viral_strand,
    region,
    non_synonymous,
    allow_fishers,
):
    """Build one contingency table per location and publish each of them.

    For every entry of the "locations" category group a table with a
    "Mutations at <location>" column and a "Reference" column is filled in
    per sample type and passed to ``utils.publish_results`` together with
    the shared ``results`` list.

    ``cat_vars``, ``viral_strand``, ``region``, ``non_synonymous`` and
    ``allow_fishers`` are accepted but not read by the current body.

    Returns:
        A 2-tuple ``(title, results)``.

    NOTE(review): this function is not registered in the module-level
    ``tests`` list and appears to be unfinished; the hedged notes below mark
    the spots that should be confirmed before it is enabled.
    """
    results = []
    # NOTE(review): the result of pget_category_group() is subscripted by key
    # in mutation_site_count_vs_sample_type; if it is a plain dict, iterating
    # it directly yields keys only and `.items()` would be needed here and in
    # the inner loop - confirm against CategorizedMutations.
    for location, location_based_mutations in cat_muts.pget_category_group("locations"):
        # A fresh table per location: column name -> {sample type -> count}.
        contingency_table_per_location = defaultdict(dict)
        for sample_type, sample_type_based_mutations in cat_muts.pget_category_group(
            "sample type"
        ):
            # Mutations observed at this location within this sample type.
            contingency_table_per_location[f"Mutations at {location}"][
                sample_type
            ] = len(
                cat_muts.intersect(
                    location_based_mutations, sample_type_based_mutations
                )
            )
            # NOTE(review): this value does not depend on sample_type and is
            # (mutations at location) - (number of patients), which can be
            # negative - confirm the intended "Reference" formula.
            contingency_table_per_location["Reference"][sample_type] = len(
                location_based_mutations
            ) - len(cat_muts.pget_category("patients"))
        # NOTE(review): publish_results() is called with a single argument in
        # mutation_site_count_vs_sample_type and its return value is used
        # there; here it receives the raw table plus `results` and the return
        # value is discarded - presumably it appends to `results`; verify.
        utils.publish_results(contingency_table_per_location, results)

    return "Location based mutations vs Sample type", results
|
||
|
|
||
|
|
||
|
# TODO Write distribution difference test (?)
|
||
|
|
||
|
# Registry of the tests defined in this module: each entry pairs a
# human-readable title with the function implementing that test.
tests = [
    ("Correlation between the number of mutation locations and sample types",
     mutation_site_count_vs_sample_type),
]
|