Added problem cataloging.

Reduced verbosity caused by inherit off-by-one errors, cleared up logging.
2023-03-22 15:13:37 -05:00 · 2023-03-22 14:38:45 -05:00
2 changed files with 103 additions and 27 deletions
--- a/README.md
+++ b/README.md
@@ -1,3 +1,28 @@
 # MSA Splitter

 Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file.
+
+## Features
+
+ - Split large fasta files that contain a multiple sequence alignment (MSA) into individual genes 
+ - Trim off stop codon
+ - Batch process multiple genes from one MSA
+ - Correct gene start and stop locations based on start and stop codon location
+ - Catalogues errors that occurred in human-readable CSV file
+ 
+## Planned
+ 
+ - Translate MSA into amino acids while maintaining alignment (shows type of mutation if frameshift)
+ - Simple to use GUI
+ - Run without system-wide python install
+
+## Use
+
+### Command Line
+
+1. Install python 3
+2. Install `biopython`
+    - Using `pip`: `pip install biopython` or `pip3 install biopython`
+    - Using `conda`: ` conda install -c conda-forge biopython`
+3. Download `msa_splitter.py` and run with `python3 msa_splitter.py`
+    - `python3 msa_splitter.py -h` for help 
--- a/msa_splitter.py
+++ b/msa_splitter.py
@@ -25,7 +25,7 @@ def read_genes_from_csv(batch_genes_csv_path: str):
                csv_header = list(row)
            else:
                gene_name = row[csv_header.index("Gene")]
-                gene_start = int(row[csv_header.index("Start")]) - 1
+                gene_start = int(row[csv_header.index("Start")])
                gene_end = int(row[csv_header.index("End")])
                genes.append((gene_name, gene_start, gene_end))
    return genes
@@ -104,14 +104,21 @@ STOP_CODONS = {"TAG", "TAA", "TGA"}
 START_CODON = "ATG"


-def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction_range=16):
-
+def trim(
+    start: int,
+    end: int,
+    gen_cut_stop_codon: bool,
+    msa_records,
+    correction_range=16
+):
+    tru_start = start - 1
    nt_sequence_records = []
    nt_no_stop_sequence_records = []
    aa_sequence_records = []
    aa_no_stop_sequence_records = []
+    problems = []

-    debug(f"Beginning sequence trimming for {start} - {end}.")
+    debug(f"Beginning sequence trimming for {tru_start} - {end}.")
    for s_record in msa_records:
        record_metadata = (
            s_record.id,
@@ -125,20 +132,22 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
        found_start_codon = False
        start_shift = 0
        for i in range(correction_range):
-            if s_record.seq[start + i: start + i + 3] == START_CODON:
+            if s_record.seq[tru_start + i: tru_start + i + 3] == START_CODON:
                found_start_codon = True
                start_shift = i
                break
-            if s_record.seq[start - i: start - i + 3] == START_CODON:
+            if not found_start_codon and s_record.seq[
+                    tru_start - i: tru_start - i + 3] == START_CODON:
                found_start_codon = True
                start_shift = -i
                break
        if not found_start_codon:
-            warning(
-                f"Could not find start codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
-        if start_shift != 0:
-            warning(
-                f"Start codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting start location to {start + start_shift}")
+            problems.append([start, end, s_record.id,
+                             "Could not find start codon"])
+        if found_start_codon and start_shift != 0:
+            problems.append([start, end, s_record.id,
+                             "Corrected start codon to "
+                             f"{tru_start + start_shift}"])

        end_shift = 0
        found_stop_codon = False
@@ -147,19 +156,22 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
                found_stop_codon = True
                end_shift = i
                break
-            if s_record.seq[end - i - 3: end - i] in STOP_CODONS:
+            if not found_stop_codon and s_record.seq[
+                    end - i - 3: end - i] in STOP_CODONS:
                found_stop_codon = True
                end_shift = -i
                break
        if not found_stop_codon:
-            warning(
-                f"Could not find stop codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
-        if end_shift != 0:
-            warning(
-                f"Stop codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting end location to: {end + end_shift}.")
-        nt_sequence = s_record.seq[start +
-                                   start_shift: end + end_shift]  # Cropping step
-        nt_no_stop_sequence = s_record.seq[start +
+            problems.append([start, end, s_record.id,
+                             "Could not find stop codon"])
+        if found_stop_codon and end_shift != 0:
+            problems.append([start, end, s_record.id,
+                             "Corrected stop codon to "
+                             f"{end + end_shift}"])
+
+        nt_sequence = s_record.seq[tru_start +
+                                   start_shift: end + end_shift]  # Cropping
+        nt_no_stop_sequence = s_record.seq[tru_start +
                                           start_shift: end - 3 + end_shift]
        nt_sequence_records.append(
            SeqRecord.SeqRecord(nt_sequence, *record_metadata))
@@ -179,15 +191,29 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
                SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
            )
        debug(f"Trimming for {s_record.id} complete.")
-    debug(f"Sequence trimming for {start} - {end} complete.")
+    debug(f"Sequence trimming for {tru_start} - {end} complete.")
    return (
        nt_sequence_records,
        nt_no_stop_sequence_records,
        aa_sequence_records,
        aa_no_stop_sequence_records,
+        problems
    )


+def log_problems(gene: str, problems: list[list[str]]):
+    for start, end, id, desc in problems:
+        warning(f"{gene} ({start} - {end}) of {id}: {desc}")
+
+
+def output_as_csv(gene: str, problems: list[list[str]], output_path: str):
+    header = ["Start", "End", "Sequence ID", "Problem"]
+    with open(output_path, "w") as problem_csv_fd:
+        writer = csv.writer(problem_csv_fd)
+        writer.writerow(header)
+        writer.writerows(problems)
+
+
 def main(args):
    logging.basicConfig(level=args.log_level.upper())

@@ -197,23 +223,38 @@ def main(args):
    if args.gene_list:
        genes = read_genes_from_csv(args.gene_list)
        info(
-            f"Gene list read from {args.gene_list} resulted in {len(genes)} genes.")
+            f"Gene list read from {args.gene_list} resulted in {len(genes)} "
+            "genes.")
    else:
        if args.gene_name and args.start and args.end:
            genes.append([args.gene_name, args.start, args.end])
            info(
-                f"Extracting {args.gene_name} starting at {args.start} to {args.end}.")
+                f"Extracting {args.gene_name} starting at {args.start} to "
+                f"{args.end}.")
        else:
            raise Exception(
-                "Need either a gene list by --gene-list or a start and end via --start, and --end respectively."
+                "Need either a gene list by --gene-list or a start and end "
+                "via --start, and --end respectively."
            )
    for gene_name, start, end in genes:
+        info(f"Started on gene {gene_name} ({start} - {end})")
        (
            nt_sequence_records,
            nt_no_stop_sequence_records,
            aa_sequence_records,
            aa_no_stop_sequence_records,
-        ) = trim(start, end, args.gen_cut_stop_codon, msa_records, correction_range=args.correction_range)
+            problems
+        ) = trim(start, end, args.gen_cut_stop_codon,
+                 msa_records, correction_range=args.correction_range)
+        if len(problems) > 0:
+            warning(f"There were {len(problems)} problems "
+                    f"during trimming {gene_name}!")
+        if args.catalogue_problems:
+            output_as_csv(
+                gene_name,
+                problems,
+                os.path.join(args.output_dir, f"{gene_name} - problems.csv")
+            )
        write_to_file(
            args.output_dir,
            gene_name,
@@ -227,7 +268,7 @@ def main(args):
            aa_sequence_records,
            aa_no_stop_sequence_records,
        )
-        info(f"Extracted gene sequence for {gene_name} ({start} - {end})")
+        info(f"Completed gene {gene_name} ({start} - {end})")


 if __name__ == "__main__":
@@ -340,7 +381,8 @@ if __name__ == "__main__":
    parser.add_argument(
        "--correction-range",
        "-R",
-        help="The number of offsets in terms of nucleotides to try in both directions to correct start and stop codons before giving up.",
+        help="The number of offsets in terms of nucleotides to try in both "
+        "directions to correct start and stop codons before giving up.",
        type=int,
        required=False,
        default=9
@@ -355,4 +397,13 @@ if __name__ == "__main__":
        default="INFO"
    )

+    parser.add_argument(
+        "-C",
+        "--catalogue-problems",
+        help="Generates a CSV for each gene listing the problems that "
+        "occurred during trimming.",
+        required=False,
+        action="store_true"
+    )
+
    main(parser.parse_args())
Author	SHA1	Message	Date
Harrison Deng	89cde939b0	Added problem cataloging.	2023-03-22 15:13:37 -05:00
Harrison Deng	e5ead8f197	Reduced verbosity caused by inherit off-by-one errors, cleared up logging.	2023-03-22 14:38:45 -05:00