Added problem cataloging.

2023-03-22 15:13:37 -05:00
parent e5ead8f197
commit 89cde939b0
2 changed files with 70 additions and 16 deletions
--- a/README.md
+++ b/README.md
@@ -1,3 +1,28 @@
 # MSA Splitter
 Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file.
 ## Features
 - Split large fasta files that contain a multiple sequence alignment (MSA) into individual genes 
 - Trim off stop codon
 - Batch process multiple genes from one MSA
 - Correct gene start and stop locations based on start and stop codon location
 - Catalogues errors that occurred in human-readable CSV file
 ## Planned
 - Translate MSA into amino acids while maintaining alignment (shows type of mutation if frameshift)
 - Simple to use GUI
 - Run without system-wide python install
 ## Use
 ### Command Line
 1. Install python 3
 2. Install `biopython`
    - Using `pip`: `pip install biopython` or `pip3 install biopython`
    - Using `conda`: ` conda install -c conda-forge biopython`
 3. Download `msa_splitter.py` and run with `python3 msa_splitter.py`
    - `python3 msa_splitter.py -h` for help 
--- a/msa_splitter.py
+++ b/msa_splitter.py
@@ -116,6 +116,7 @@ def trim(
    nt_no_stop_sequence_records = []
    aa_sequence_records = []
    aa_no_stop_sequence_records = []
    problems = []
    debug(f"Beginning sequence trimming for {tru_start} - {end}.")
    for s_record in msa_records:
@@ -141,15 +142,12 @@ def trim(
                start_shift = -i
                break
        if not found_start_codon:
-            warning(
+            problems.append([start, end, s_record.id,
-                f"Could not find start codon for region {tru_start} - {end} "
+                             "Could not find start codon"])
                f"with sequence ID {s_record.id}. Continuing without "
                f"correction...")
        if found_start_codon and start_shift != 0:
-            warning(
+            problems.append([start, end, s_record.id,
-                "Start codon was not found at expected location for region "
+                             "Corrected start codon to "
-                f"{tru_start} - {end} with sequence ID {s_record.id}. "
+                             f"{tru_start + start_shift}"])
                f"Correcting start location to {tru_start + start_shift}")
        end_shift = 0
        found_stop_codon = False
@@ -164,15 +162,13 @@ def trim(
                end_shift = -i
                break
        if not found_stop_codon:
-            warning(
+            problems.append([start, end, s_record.id,
-                f"Could not find stop codon for region {tru_start} - {end} "
+                             "Could not find stop codon"])
                f"with sequence ID {s_record.id}. Continuing without "
                f"correction...")
        if found_stop_codon and end_shift != 0:
-            warning(
+            problems.append([start, end, s_record.id,
-                f"Stop codon was not found at expected location for region "
+                             "Corrected stop codon to "
-                f"{tru_start} - {end} with sequence ID {s_record.id}. "
+                             f"{end + end_shift}"])
-                f"Correcting end location to: {end + end_shift}.")
+
        nt_sequence = s_record.seq[tru_start +
                                   start_shift: end + end_shift]  # Cropping
        nt_no_stop_sequence = s_record.seq[tru_start +
@@ -201,9 +197,23 @@ def trim(
        nt_no_stop_sequence_records,
        aa_sequence_records,
        aa_no_stop_sequence_records,
        problems
    )
 def log_problems(gene: str, problems: list[list[str]]):
    for start, end, id, desc in problems:
        warning(f"{gene} ({start} - {end}) of {id}: {desc}")
 def output_as_csv(gene: str, problems: list[list[str]], output_path: str):
    header = ["Start", "End", "Sequence ID", "Problem"]
    with open(output_path, "w") as problem_csv_fd:
        writer = csv.writer(problem_csv_fd)
        writer.writerow(header)
        writer.writerows(problems)
 def main(args):
    logging.basicConfig(level=args.log_level.upper())
@@ -233,8 +243,18 @@ def main(args):
            nt_no_stop_sequence_records,
            aa_sequence_records,
            aa_no_stop_sequence_records,
            problems
        ) = trim(start, end, args.gen_cut_stop_codon,
                 msa_records, correction_range=args.correction_range)
        if len(problems) > 0:
            warning(f"There were {len(problems)} problems "
                    f"during trimming {gene_name}!")
        if args.catalogue_problems:
            output_as_csv(
                gene_name,
                problems,
                os.path.join(args.output_dir, f"{gene_name} - problems.csv")
            )
        write_to_file(
            args.output_dir,
            gene_name,
@@ -377,4 +397,13 @@ if __name__ == "__main__":
        default="INFO"
    )
    parser.add_argument(
        "-C",
        "--catalogue-problems",
        help="Generates a CSV for each gene listing the problems that "
        "occurred during trimming.",
        required=False,
        action="store_true"
    )
    main(parser.parse_args())