Incomplete translation mechanism

2023-03-27 16:23:12 -05:00
parent 89cde939b0
commit 03d87a670f
3 changed files with 38345 additions and 61 deletions
--- a/msa_splitter.py
+++ b/msa_splitter.py
@@ -12,6 +12,7 @@ from logging import debug, error, info, warning
 import logging
 import os
 from Bio import SeqIO, SeqRecord
 from Bio.Seq import Seq
 import csv
@@ -49,10 +50,9 @@ def write_to_file(
    aa_suffix: str,
    nt_sequence_records: list,
    nt_no_stop_sequence_records: list,
-    aa_sequence_records: list = None,
+    aa_sequence_records: list = [],
-    aa_no_stop_sequence_records: list = None,
+    aa_no_stop_sequence_records: list = [],
 ):
    output_path = os.path.abspath(output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_path)
@@ -65,16 +65,17 @@ def write_to_file(
        ),
        "fasta",
    )
-    SeqIO.write(
+    if len(nt_no_stop_sequence_records):
-        nt_no_stop_sequence_records,
+        SeqIO.write(
-        os.path.join(
+            nt_no_stop_sequence_records,
-            output_path,
+            os.path.join(
-            f'{gene_name or f"{start} - {end - 3}"}' f" - {ns_suffix}.fasta",
+                output_path,
-        ),
+                f'{gene_name or f"{start} - {end - 3}"}' f" - {ns_suffix}.fasta",
-        "fasta",
+            ),
-    )
+            "fasta",
        )
-    if aa_sequence_records:
+    if len(aa_sequence_records):
        SeqIO.write(
            aa_sequence_records,
            os.path.join(
@@ -86,7 +87,7 @@ def write_to_file(
            "fasta",
        )
-    if aa_no_stop_sequence_records:
+    if len(aa_no_stop_sequence_records):
        SeqIO.write(
            aa_no_stop_sequence_records,
            os.path.join(
@@ -108,8 +109,9 @@ def trim(
    start: int,
    end: int,
    gen_cut_stop_codon: bool,
    perform_translation: bool,
    msa_records,
-    correction_range=16
+    correction_range=16,
 ):
    tru_start = start - 1
    nt_sequence_records = []
@@ -132,64 +134,79 @@ def trim(
        found_start_codon = False
        start_shift = 0
        for i in range(correction_range):
-            if s_record.seq[tru_start + i: tru_start + i + 3] == START_CODON:
+            if s_record.seq[tru_start + i : tru_start + i + 3] == START_CODON:
                found_start_codon = True
                start_shift = i
                break
-            if not found_start_codon and s_record.seq[
+            if (
-                    tru_start - i: tru_start - i + 3] == START_CODON:
+                not found_start_codon
                and s_record.seq[tru_start - i : tru_start - i + 3] == START_CODON
            ):
                found_start_codon = True
                start_shift = -i
                break
        if not found_start_codon:
-            problems.append([start, end, s_record.id,
+            problems.append([start, end, s_record.id, "Could not find start codon"])
                             "Could not find start codon"])
        if found_start_codon and start_shift != 0:
-            problems.append([start, end, s_record.id,
+            problems.append(
-                             "Corrected start codon to "
+                [
-                             f"{tru_start + start_shift}"])
+                    start,
                    end,
                    s_record.id,
                    "Corrected start codon to " f"{tru_start + start_shift}",
                ]
            )
        end_shift = 0
        found_stop_codon = False
        for i in range(correction_range):
-            if s_record.seq[end + i - 3: end + i] in STOP_CODONS:
+            if s_record.seq[end + i - 3 : end + i] in STOP_CODONS:
                found_stop_codon = True
                end_shift = i
                break
-            if not found_stop_codon and s_record.seq[
+            if (
-                    end - i - 3: end - i] in STOP_CODONS:
+                not found_stop_codon
                and s_record.seq[end - i - 3 : end - i] in STOP_CODONS
            ):
                found_stop_codon = True
                end_shift = -i
                break
        if not found_stop_codon:
-            problems.append([start, end, s_record.id,
+            problems.append([start, end, s_record.id, "Could not find stop codon"])
                             "Could not find stop codon"])
        if found_stop_codon and end_shift != 0:
-            problems.append([start, end, s_record.id,
+            problems.append(
-                             "Corrected stop codon to "
+                [
-                             f"{end + end_shift}"])
+                    start,
-
+                    end,
-        nt_sequence = s_record.seq[tru_start +
+                    s_record.id,
-                                   start_shift: end + end_shift]  # Cropping
+                    "Corrected stop codon to " f"{end + end_shift}",
-        nt_no_stop_sequence = s_record.seq[tru_start +
+                ]
-                                           start_shift: end - 3 + end_shift]
+            )
        nt_sequence_records.append(
            SeqRecord.SeqRecord(nt_sequence, *record_metadata))
        nt_no_stop_sequence_records.append(
            SeqRecord.SeqRecord(nt_no_stop_sequence, *record_metadata)
        )
        nt_sequence = s_record.seq[
            tru_start + start_shift : end + end_shift
        ]  # Cropping
        nt_sequence_records.append(SeqRecord.SeqRecord(nt_sequence, *record_metadata))
        if gen_cut_stop_codon:
-            aa_sequence = nt_sequence.translate()
+            nt_no_stop_sequence = s_record.seq[
-            aa_no_stop_sequence = nt_no_stop_sequence.translate()
+                tru_start + start_shift : end - 3 + end_shift
            ]
            nt_no_stop_sequence_records.append(
                SeqRecord.SeqRecord(nt_no_stop_sequence, *record_metadata)
            )
        if perform_translation:
            aa_sequence = Seq(nt_sequence.replace("-", "n")).translate()
            aa_sequence_records.append(
                SeqRecord.SeqRecord(aa_sequence, *record_metadata)
            )
-
+            if gen_cut_stop_codon:
-            aa_no_stop_sequence_records.append(
+                aa_no_stop_sequence = Seq(
-                SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
+                    nt_no_stop_sequence.replace("-", "n")
-            )
+                ).translate()
                aa_no_stop_sequence_records.append(
                    SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
                )
        debug(f"Trimming for {s_record.id} complete.")
    debug(f"Sequence trimming for {tru_start} - {end} complete.")
    return (
@@ -197,7 +214,7 @@ def trim(
        nt_no_stop_sequence_records,
        aa_sequence_records,
        aa_no_stop_sequence_records,
-        problems
+        problems,
    )
@@ -222,15 +239,14 @@ def main(args):
    genes = []
    if args.gene_list:
        genes = read_genes_from_csv(args.gene_list)
-        info(
+        info(f"Gene list read from {args.gene_list} resulted in {len(genes)} " "genes.")
            f"Gene list read from {args.gene_list} resulted in {len(genes)} "
            "genes.")
    else:
        if args.gene_name and args.start and args.end:
            genes.append([args.gene_name, args.start, args.end])
            info(
                f"Extracting {args.gene_name} starting at {args.start} to "
-                f"{args.end}.")
+                f"{args.end}."
            )
        else:
            raise Exception(
                "Need either a gene list by --gene-list or a start and end "
@@ -243,17 +259,24 @@ def main(args):
            nt_no_stop_sequence_records,
            aa_sequence_records,
            aa_no_stop_sequence_records,
-            problems
+            problems,
-        ) = trim(start, end, args.gen_cut_stop_codon,
+        ) = trim(
-                 msa_records, correction_range=args.correction_range)
+            start,
            end,
            args.gen_cut_stop_codon,
            args.do_translate,
            msa_records,
            correction_range=args.correction_range,
        )
        if len(problems) > 0:
-            warning(f"There were {len(problems)} problems "
+            warning(
-                    f"during trimming {gene_name}!")
+                f"There were {len(problems)} problems " f"during trimming {gene_name}!"
            )
        if args.catalogue_problems:
            output_as_csv(
                gene_name,
                problems,
-                os.path.join(args.output_dir, f"{gene_name} - problems.csv")
+                os.path.join(args.output_dir, f"{gene_name} - problems.csv"),
            )
        write_to_file(
            args.output_dir,
@@ -342,7 +365,7 @@ if __name__ == "__main__":
        """,
        required=False,
        default=False,
-        action="store_true"
+        action="store_true",
    )
    parser.add_argument(
@@ -378,6 +401,14 @@ if __name__ == "__main__":
        default="aa",
    )
    parser.add_argument(
        "-A",
        "--do-translate",
        help="Attempts to translate all nucleotide sequences to amino acid sequences.",
        required=False,
        action="store_true",
    )
    parser.add_argument(
        "--correction-range",
        "-R",
@@ -385,7 +416,7 @@ if __name__ == "__main__":
        "directions to correct start and stop codons before giving up.",
        type=int,
        required=False,
-        default=9
+        default=9,
    )
    parser.add_argument(
@@ -394,7 +425,7 @@ if __name__ == "__main__":
        help="The verbosity of the program.",
        type=str,
        required=False,
-        default="INFO"
+        default="INFO",
    )
    parser.add_argument(
@@ -403,7 +434,7 @@ if __name__ == "__main__":
        help="Generates a CSV for each gene listing the problems that "
        "occurred during trimming.",
        required=False,
-        action="store_true"
+        action="store_true",
    )
    main(parser.parse_args())
--- a/tests/resources/test_msa.fa
+++ b/tests/resources/test_msa.fa
--- a/tox.ini
+++ b/tox.ini
@@ -0,0 +1,3 @@
 [flake8]
 max-line-length = 88
 extend-ignore = E203