2 changed files with 27 additions and 103 deletions
--- a/README.md
+++ b/README.md
@ -1,28 +1,3 @@
 # MSA Splitter
 Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file.
 ## Features
 - Split large fasta files that contain a multiple sequence alignment (MSA) into individual genes 
 - Trim off stop codon
 - Batch process multiple genes from one MSA
 - Correct gene start and stop locations based on start and stop codon location
 - Catalogues errors that occurred in human-readable CSV file
 ## Planned
 - Translate MSA into amino acids while maintaining alignment (shows type of mutation if frameshift)
 - Simple to use GUI
 - Run without system-wide python install
 ## Use
 ### Command Line
 1. Install python 3
 2. Install `biopython`
    - Using `pip`: `pip install biopython` or `pip3 install biopython`
    - Using `conda`: ` conda install -c conda-forge biopython`
 3. Download `msa_splitter.py` and run with `python3 msa_splitter.py`
    - `python3 msa_splitter.py -h` for help 
--- a/msa_splitter.py
+++ b/msa_splitter.py
@ -25,7 +25,7 @@ def read_genes_from_csv(batch_genes_csv_path: str):
                csv_header = list(row)
            else:
                gene_name = row[csv_header.index("Gene")]
-                gene_start = int(row[csv_header.index("Start")])
+                gene_start = int(row[csv_header.index("Start")]) - 1
                gene_end = int(row[csv_header.index("End")])
                genes.append((gene_name, gene_start, gene_end))
    return genes
@ -104,21 +104,14 @@ STOP_CODONS = {"TAG", "TAA", "TGA"}
 START_CODON = "ATG"
-def trim(
+def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction_range=16):
-    start: int,
+
    end: int,
    gen_cut_stop_codon: bool,
    msa_records,
    correction_range=16
 ):
    tru_start = start - 1
    nt_sequence_records = []
    nt_no_stop_sequence_records = []
    aa_sequence_records = []
    aa_no_stop_sequence_records = []
    problems = []
-    debug(f"Beginning sequence trimming for {tru_start} - {end}.")
+    debug(f"Beginning sequence trimming for {start} - {end}.")
    for s_record in msa_records:
        record_metadata = (
            s_record.id,
@ -132,22 +125,20 @@ def trim(
        found_start_codon = False
        start_shift = 0
        for i in range(correction_range):
-            if s_record.seq[tru_start + i: tru_start + i + 3] == START_CODON:
+            if s_record.seq[start + i: start + i + 3] == START_CODON:
                found_start_codon = True
                start_shift = i
                break
-            if not found_start_codon and s_record.seq[
+            if s_record.seq[start - i: start - i + 3] == START_CODON:
                    tru_start - i: tru_start - i + 3] == START_CODON:
                found_start_codon = True
                start_shift = -i
                break
        if not found_start_codon:
-            problems.append([start, end, s_record.id,
+            warning(
-                             "Could not find start codon"])
+                f"Could not find start codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
-        if found_start_codon and start_shift != 0:
+        if start_shift != 0:
-            problems.append([start, end, s_record.id,
+            warning(
-                             "Corrected start codon to "
+                f"Start codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting start location to {start + start_shift}")
                             f"{tru_start + start_shift}"])
        end_shift = 0
        found_stop_codon = False
@ -156,22 +147,19 @@ def trim(
                found_stop_codon = True
                end_shift = i
                break
-            if not found_stop_codon and s_record.seq[
+            if s_record.seq[end - i - 3: end - i] in STOP_CODONS:
                    end - i - 3: end - i] in STOP_CODONS:
                found_stop_codon = True
                end_shift = -i
                break
        if not found_stop_codon:
-            problems.append([start, end, s_record.id,
+            warning(
-                             "Could not find stop codon"])
+                f"Could not find stop codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
-        if found_stop_codon and end_shift != 0:
+        if end_shift != 0:
-            problems.append([start, end, s_record.id,
+            warning(
-                             "Corrected stop codon to "
+                f"Stop codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting end location to: {end + end_shift}.")
-                             f"{end + end_shift}"])
+        nt_sequence = s_record.seq[start +
-
+                                   start_shift: end + end_shift]  # Cropping step
-        nt_sequence = s_record.seq[tru_start +
+        nt_no_stop_sequence = s_record.seq[start +
                                   start_shift: end + end_shift]  # Cropping
        nt_no_stop_sequence = s_record.seq[tru_start +
                                           start_shift: end - 3 + end_shift]
        nt_sequence_records.append(
            SeqRecord.SeqRecord(nt_sequence, *record_metadata))
@ -191,29 +179,15 @@ def trim(
                SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
            )
        debug(f"Trimming for {s_record.id} complete.")
-    debug(f"Sequence trimming for {tru_start} - {end} complete.")
+    debug(f"Sequence trimming for {start} - {end} complete.")
    return (
        nt_sequence_records,
        nt_no_stop_sequence_records,
        aa_sequence_records,
        aa_no_stop_sequence_records,
        problems
    )
 def log_problems(gene: str, problems: list[list[str]]):
    for start, end, id, desc in problems:
        warning(f"{gene} ({start} - {end}) of {id}: {desc}")
 def output_as_csv(gene: str, problems: list[list[str]], output_path: str):
    header = ["Start", "End", "Sequence ID", "Problem"]
    with open(output_path, "w") as problem_csv_fd:
        writer = csv.writer(problem_csv_fd)
        writer.writerow(header)
        writer.writerows(problems)
 def main(args):
    logging.basicConfig(level=args.log_level.upper())
@ -223,38 +197,23 @@ def main(args):
    if args.gene_list:
        genes = read_genes_from_csv(args.gene_list)
        info(
-            f"Gene list read from {args.gene_list} resulted in {len(genes)} "
+            f"Gene list read from {args.gene_list} resulted in {len(genes)} genes.")
            "genes.")
    else:
        if args.gene_name and args.start and args.end:
            genes.append([args.gene_name, args.start, args.end])
            info(
-                f"Extracting {args.gene_name} starting at {args.start} to "
+                f"Extracting {args.gene_name} starting at {args.start} to {args.end}.")
                f"{args.end}.")
        else:
            raise Exception(
-                "Need either a gene list by --gene-list or a start and end "
+                "Need either a gene list by --gene-list or a start and end via --start, and --end respectively."
                "via --start, and --end respectively."
            )
    for gene_name, start, end in genes:
        info(f"Started on gene {gene_name} ({start} - {end})")
        (
            nt_sequence_records,
            nt_no_stop_sequence_records,
            aa_sequence_records,
            aa_no_stop_sequence_records,
-            problems
+        ) = trim(start, end, args.gen_cut_stop_codon, msa_records, correction_range=args.correction_range)
        ) = trim(start, end, args.gen_cut_stop_codon,
                 msa_records, correction_range=args.correction_range)
        if len(problems) > 0:
            warning(f"There were {len(problems)} problems "
                    f"during trimming {gene_name}!")
        if args.catalogue_problems:
            output_as_csv(
                gene_name,
                problems,
                os.path.join(args.output_dir, f"{gene_name} - problems.csv")
            )
        write_to_file(
            args.output_dir,
            gene_name,
@ -268,7 +227,7 @@ def main(args):
            aa_sequence_records,
            aa_no_stop_sequence_records,
        )
-        info(f"Completed gene {gene_name} ({start} - {end})")
+        info(f"Extracted gene sequence for {gene_name} ({start} - {end})")
 if __name__ == "__main__":
@ -381,8 +340,7 @@ if __name__ == "__main__":
    parser.add_argument(
        "--correction-range",
        "-R",
-        help="The number of offsets in terms of nucleotides to try in both "
+        help="The number of offsets in terms of nucleotides to try in both directions to correct start and stop codons before giving up.",
        "directions to correct start and stop codons before giving up.",
        type=int,
        required=False,
        default=9
@ -397,13 +355,4 @@ if __name__ == "__main__":
        default="INFO"
    )
    parser.add_argument(
        "-C",
        "--catalogue-problems",
        help="Generates a CSV for each gene listing the problems that "
        "occurred during trimming.",
        required=False,
        action="store_true"
    )
    main(parser.parse_args())