From e5ead8f197f4c0c5c25f4b6953083a38676cf266 Mon Sep 17 00:00:00 2001
From: Harrison Deng <yunyangdeng@gmail.com>
Date: Wed, 22 Mar 2023 14:38:45 -0500
Subject: [PATCH] Reduced verbosity caused by inherit off-by-one errors,
 cleared up logging.

---
 msa_splitter.py | 68 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 45 insertions(+), 23 deletions(-)

diff --git a/msa_splitter.py b/msa_splitter.py
index 232d429..5faf1b2 100755
--- a/msa_splitter.py
+++ b/msa_splitter.py
@@ -25,7 +25,7 @@ def read_genes_from_csv(batch_genes_csv_path: str):
                 csv_header = list(row)
             else:
                 gene_name = row[csv_header.index("Gene")]
-                gene_start = int(row[csv_header.index("Start")]) - 1
+                gene_start = int(row[csv_header.index("Start")])
                 gene_end = int(row[csv_header.index("End")])
                 genes.append((gene_name, gene_start, gene_end))
     return genes
@@ -104,14 +104,20 @@ STOP_CODONS = {"TAG", "TAA", "TGA"}
 START_CODON = "ATG"
 
 
-def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction_range=16):
-
+def trim(
+    start: int,
+    end: int,
+    gen_cut_stop_codon: bool,
+    msa_records,
+    correction_range=16
+):
+    tru_start = start - 1
     nt_sequence_records = []
     nt_no_stop_sequence_records = []
     aa_sequence_records = []
     aa_no_stop_sequence_records = []
 
-    debug(f"Beginning sequence trimming for {start} - {end}.")
+    debug(f"Beginning sequence trimming for {tru_start} - {end}.")
     for s_record in msa_records:
         record_metadata = (
             s_record.id,
@@ -125,20 +131,25 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
         found_start_codon = False
         start_shift = 0
         for i in range(correction_range):
-            if s_record.seq[start + i: start + i + 3] == START_CODON:
+            if s_record.seq[tru_start + i: tru_start + i + 3] == START_CODON:
                 found_start_codon = True
                 start_shift = i
                 break
-            if s_record.seq[start - i: start - i + 3] == START_CODON:
+            if not found_start_codon and s_record.seq[
+                    tru_start - i: tru_start - i + 3] == START_CODON:
                 found_start_codon = True
                 start_shift = -i
                 break
         if not found_start_codon:
             warning(
-                f"Could not find start codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
-        if start_shift != 0:
+                f"Could not find start codon for region {tru_start} - {end} "
+                f"with sequence ID {s_record.id}. Continuing without "
+                f"correction...")
+        if found_start_codon and start_shift != 0:
             warning(
-                f"Start codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting start location to {start + start_shift}")
+                "Start codon was not found at expected location for region "
+                f"{tru_start} - {end} with sequence ID {s_record.id}. "
+                f"Correcting start location to {tru_start + start_shift}")
 
         end_shift = 0
         found_stop_codon = False
@@ -147,19 +158,24 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
                 found_stop_codon = True
                 end_shift = i
                 break
-            if s_record.seq[end - i - 3: end - i] in STOP_CODONS:
+            if not found_stop_codon and s_record.seq[
+                    end - i - 3: end - i] in STOP_CODONS:
                 found_stop_codon = True
                 end_shift = -i
                 break
         if not found_stop_codon:
             warning(
-                f"Could not find stop codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
-        if end_shift != 0:
+                f"Could not find stop codon for region {tru_start} - {end} "
+                f"with sequence ID {s_record.id}. Continuing without "
+                f"correction...")
+        if found_stop_codon and end_shift != 0:
             warning(
-                f"Stop codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting end location to: {end + end_shift}.")
-        nt_sequence = s_record.seq[start +
-                                   start_shift: end + end_shift]  # Cropping step
-        nt_no_stop_sequence = s_record.seq[start +
+                f"Stop codon was not found at expected location for region "
+                f"{tru_start} - {end} with sequence ID {s_record.id}. "
+                f"Correcting end location to: {end + end_shift}.")
+        nt_sequence = s_record.seq[tru_start +
+                                   start_shift: end + end_shift]  # Cropping
+        nt_no_stop_sequence = s_record.seq[tru_start +
                                            start_shift: end - 3 + end_shift]
         nt_sequence_records.append(
             SeqRecord.SeqRecord(nt_sequence, *record_metadata))
@@ -179,7 +195,7 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
                 SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
             )
         debug(f"Trimming for {s_record.id} complete.")
-    debug(f"Sequence trimming for {start} - {end} complete.")
+    debug(f"Sequence trimming for {tru_start} - {end} complete.")
     return (
         nt_sequence_records,
         nt_no_stop_sequence_records,
@@ -197,23 +213,28 @@ def main(args):
     if args.gene_list:
         genes = read_genes_from_csv(args.gene_list)
         info(
-            f"Gene list read from {args.gene_list} resulted in {len(genes)} genes.")
+            f"Gene list read from {args.gene_list} resulted in {len(genes)} "
+            "genes.")
     else:
         if args.gene_name and args.start and args.end:
             genes.append([args.gene_name, args.start, args.end])
             info(
-                f"Extracting {args.gene_name} starting at {args.start} to {args.end}.")
+                f"Extracting {args.gene_name} starting at {args.start} to "
+                f"{args.end}.")
         else:
             raise Exception(
-                "Need either a gene list by --gene-list or a start and end via --start, and --end respectively."
+                "Need either a gene list by --gene-list or a start and end "
+                "via --start, and --end respectively."
             )
     for gene_name, start, end in genes:
+        info(f"Started on gene {gene_name} ({start} - {end})")
         (
             nt_sequence_records,
             nt_no_stop_sequence_records,
             aa_sequence_records,
             aa_no_stop_sequence_records,
-        ) = trim(start, end, args.gen_cut_stop_codon, msa_records, correction_range=args.correction_range)
+        ) = trim(start, end, args.gen_cut_stop_codon,
+                 msa_records, correction_range=args.correction_range)
         write_to_file(
             args.output_dir,
             gene_name,
@@ -227,7 +248,7 @@ def main(args):
             aa_sequence_records,
             aa_no_stop_sequence_records,
         )
-        info(f"Extracted gene sequence for {gene_name} ({start} - {end})")
+        info(f"Completed gene {gene_name} ({start} - {end})")
 
 
 if __name__ == "__main__":
@@ -340,7 +361,8 @@ if __name__ == "__main__":
     parser.add_argument(
         "--correction-range",
         "-R",
-        help="The number of offsets in terms of nucleotides to try in both directions to correct start and stop codons before giving up.",
+        help="The number of offsets in terms of nucleotides to try in both "
+        "directions to correct start and stop codons before giving up.",
         type=int,
         required=False,
         default=9