Added problem cataloging.

2023-03-22 15:13:37 -05:00
parent e5ead8f197
commit 89cde939b0
2 changed files with 70 additions and 16 deletions
--- a/README.md
+++ b/README.md
@@ -1,3 +1,28 @@
 # MSA Splitter

 Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file.
+
+## Features
+
+ - Split large fasta files that contain a multiple sequence alignment (MSA) into individual genes 
+ - Trim off stop codon
+ - Batch process multiple genes from one MSA
+ - Correct gene start and stop locations based on start and stop codon location
+ - Catalogues errors that occurred in human-readable CSV file
+ 
+## Planned
+ 
+ - Translate MSA into amino acids while maintaining alignment (shows type of mutation if frameshift)
+ - Simple to use GUI
+ - Run without system-wide python install
+
+## Use
+
+### Command Line
+
+1. Install python 3
+2. Install `biopython`
+    - Using `pip`: `pip install biopython` or `pip3 install biopython`
+    - Using `conda`: ` conda install -c conda-forge biopython`
+3. Download `msa_splitter.py` and run with `python3 msa_splitter.py`
+    - `python3 msa_splitter.py -h` for help 
--- a/msa_splitter.py
+++ b/msa_splitter.py
@@ -116,6 +116,7 @@ def trim(
    nt_no_stop_sequence_records = []
    aa_sequence_records = []
    aa_no_stop_sequence_records = []
+    problems = []

    debug(f"Beginning sequence trimming for {tru_start} - {end}.")
    for s_record in msa_records:
@@ -141,15 +142,12 @@ def trim(
                start_shift = -i
                break
        if not found_start_codon:
-            warning(
-                f"Could not find start codon for region {tru_start} - {end} "
-                f"with sequence ID {s_record.id}. Continuing without "
-                f"correction...")
+            problems.append([start, end, s_record.id,
+                             "Could not find start codon"])
        if found_start_codon and start_shift != 0:
-            warning(
-                "Start codon was not found at expected location for region "
-                f"{tru_start} - {end} with sequence ID {s_record.id}. "
-                f"Correcting start location to {tru_start + start_shift}")
+            problems.append([start, end, s_record.id,
+                             "Corrected start codon to "
+                             f"{tru_start + start_shift}"])

        end_shift = 0
        found_stop_codon = False
@@ -164,15 +162,13 @@ def trim(
                end_shift = -i
                break
        if not found_stop_codon:
-            warning(
-                f"Could not find stop codon for region {tru_start} - {end} "
-                f"with sequence ID {s_record.id}. Continuing without "
-                f"correction...")
+            problems.append([start, end, s_record.id,
+                             "Could not find stop codon"])
        if found_stop_codon and end_shift != 0:
-            warning(
-                f"Stop codon was not found at expected location for region "
-                f"{tru_start} - {end} with sequence ID {s_record.id}. "
-                f"Correcting end location to: {end + end_shift}.")
+            problems.append([start, end, s_record.id,
+                             "Corrected stop codon to "
+                             f"{end + end_shift}"])
+
        nt_sequence = s_record.seq[tru_start +
                                   start_shift: end + end_shift]  # Cropping
        nt_no_stop_sequence = s_record.seq[tru_start +
@@ -201,9 +197,23 @@ def trim(
        nt_no_stop_sequence_records,
        aa_sequence_records,
        aa_no_stop_sequence_records,
+        problems
    )


+def log_problems(gene: str, problems: list[list[str]]):
+    for start, end, id, desc in problems:
+        warning(f"{gene} ({start} - {end}) of {id}: {desc}")
+
+
+def output_as_csv(gene: str, problems: list[list[str]], output_path: str):
+    header = ["Start", "End", "Sequence ID", "Problem"]
+    with open(output_path, "w") as problem_csv_fd:
+        writer = csv.writer(problem_csv_fd)
+        writer.writerow(header)
+        writer.writerows(problems)
+
+
 def main(args):
    logging.basicConfig(level=args.log_level.upper())

@@ -233,8 +243,18 @@ def main(args):
            nt_no_stop_sequence_records,
            aa_sequence_records,
            aa_no_stop_sequence_records,
+            problems
        ) = trim(start, end, args.gen_cut_stop_codon,
                 msa_records, correction_range=args.correction_range)
+        if len(problems) > 0:
+            warning(f"There were {len(problems)} problems "
+                    f"during trimming {gene_name}!")
+        if args.catalogue_problems:
+            output_as_csv(
+                gene_name,
+                problems,
+                os.path.join(args.output_dir, f"{gene_name} - problems.csv")
+            )
        write_to_file(
            args.output_dir,
            gene_name,
@@ -377,4 +397,13 @@ if __name__ == "__main__":
        default="INFO"
    )

+    parser.add_argument(
+        "-C",
+        "--catalogue-problems",
+        help="Generates a CSV for each gene listing the problems that "
+        "occurred during trimming.",
+        required=False,
+        action="store_true"
+    )
+
    main(parser.parse_args())