Incomplete translation mechanism

This commit is contained in:
Harrison Deng 2023-03-27 16:23:12 -05:00
parent 89cde939b0
commit 03d87a670f
3 changed files with 38345 additions and 61 deletions

View File

@ -12,6 +12,7 @@ from logging import debug, error, info, warning
import logging
import os
from Bio import SeqIO, SeqRecord
from Bio.Seq import Seq
import csv
@ -49,10 +50,9 @@ def write_to_file(
aa_suffix: str,
nt_sequence_records: list,
nt_no_stop_sequence_records: list,
aa_sequence_records: list = None,
aa_no_stop_sequence_records: list = None,
aa_sequence_records: list = [],
aa_no_stop_sequence_records: list = [],
):
output_path = os.path.abspath(output_dir)
if not os.path.isdir(output_dir):
os.makedirs(output_path)
@ -65,6 +65,7 @@ def write_to_file(
),
"fasta",
)
if len(nt_no_stop_sequence_records):
SeqIO.write(
nt_no_stop_sequence_records,
os.path.join(
@ -74,7 +75,7 @@ def write_to_file(
"fasta",
)
if aa_sequence_records:
if len(aa_sequence_records):
SeqIO.write(
aa_sequence_records,
os.path.join(
@ -86,7 +87,7 @@ def write_to_file(
"fasta",
)
if aa_no_stop_sequence_records:
if len(aa_no_stop_sequence_records):
SeqIO.write(
aa_no_stop_sequence_records,
os.path.join(
@ -108,8 +109,9 @@ def trim(
start: int,
end: int,
gen_cut_stop_codon: bool,
perform_translation: bool,
msa_records,
correction_range=16
correction_range=16,
):
tru_start = start - 1
nt_sequence_records = []
@ -136,18 +138,24 @@ def trim(
found_start_codon = True
start_shift = i
break
if not found_start_codon and s_record.seq[
tru_start - i: tru_start - i + 3] == START_CODON:
if (
not found_start_codon
and s_record.seq[tru_start - i : tru_start - i + 3] == START_CODON
):
found_start_codon = True
start_shift = -i
break
if not found_start_codon:
problems.append([start, end, s_record.id,
"Could not find start codon"])
problems.append([start, end, s_record.id, "Could not find start codon"])
if found_start_codon and start_shift != 0:
problems.append([start, end, s_record.id,
"Corrected start codon to "
f"{tru_start + start_shift}"])
problems.append(
[
start,
end,
s_record.id,
"Corrected start codon to " f"{tru_start + start_shift}",
]
)
end_shift = 0
found_stop_codon = False
@ -156,37 +164,46 @@ def trim(
found_stop_codon = True
end_shift = i
break
if not found_stop_codon and s_record.seq[
end - i - 3: end - i] in STOP_CODONS:
if (
not found_stop_codon
and s_record.seq[end - i - 3 : end - i] in STOP_CODONS
):
found_stop_codon = True
end_shift = -i
break
if not found_stop_codon:
problems.append([start, end, s_record.id,
"Could not find stop codon"])
problems.append([start, end, s_record.id, "Could not find stop codon"])
if found_stop_codon and end_shift != 0:
problems.append([start, end, s_record.id,
"Corrected stop codon to "
f"{end + end_shift}"])
problems.append(
[
start,
end,
s_record.id,
"Corrected stop codon to " f"{end + end_shift}",
]
)
nt_sequence = s_record.seq[tru_start +
start_shift: end + end_shift] # Cropping
nt_no_stop_sequence = s_record.seq[tru_start +
start_shift: end - 3 + end_shift]
nt_sequence_records.append(
SeqRecord.SeqRecord(nt_sequence, *record_metadata))
nt_sequence = s_record.seq[
tru_start + start_shift : end + end_shift
] # Cropping
nt_sequence_records.append(SeqRecord.SeqRecord(nt_sequence, *record_metadata))
if gen_cut_stop_codon:
nt_no_stop_sequence = s_record.seq[
tru_start + start_shift : end - 3 + end_shift
]
nt_no_stop_sequence_records.append(
SeqRecord.SeqRecord(nt_no_stop_sequence, *record_metadata)
)
if gen_cut_stop_codon:
aa_sequence = nt_sequence.translate()
aa_no_stop_sequence = nt_no_stop_sequence.translate()
if perform_translation:
aa_sequence = Seq(nt_sequence.replace("-", "n")).translate()
aa_sequence_records.append(
SeqRecord.SeqRecord(aa_sequence, *record_metadata)
)
if gen_cut_stop_codon:
aa_no_stop_sequence = Seq(
nt_no_stop_sequence.replace("-", "n")
).translate()
aa_no_stop_sequence_records.append(
SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
)
@ -197,7 +214,7 @@ def trim(
nt_no_stop_sequence_records,
aa_sequence_records,
aa_no_stop_sequence_records,
problems
problems,
)
@ -222,15 +239,14 @@ def main(args):
genes = []
if args.gene_list:
genes = read_genes_from_csv(args.gene_list)
info(
f"Gene list read from {args.gene_list} resulted in {len(genes)} "
"genes.")
info(f"Gene list read from {args.gene_list} resulted in {len(genes)} " "genes.")
else:
if args.gene_name and args.start and args.end:
genes.append([args.gene_name, args.start, args.end])
info(
f"Extracting {args.gene_name} starting at {args.start} to "
f"{args.end}.")
f"{args.end}."
)
else:
raise Exception(
"Need either a gene list by --gene-list or a start and end "
@ -243,17 +259,24 @@ def main(args):
nt_no_stop_sequence_records,
aa_sequence_records,
aa_no_stop_sequence_records,
problems
) = trim(start, end, args.gen_cut_stop_codon,
msa_records, correction_range=args.correction_range)
problems,
) = trim(
start,
end,
args.gen_cut_stop_codon,
args.do_translate,
msa_records,
correction_range=args.correction_range,
)
if len(problems) > 0:
warning(f"There were {len(problems)} problems "
f"during trimming {gene_name}!")
warning(
f"There were {len(problems)} problems " f"during trimming {gene_name}!"
)
if args.catalogue_problems:
output_as_csv(
gene_name,
problems,
os.path.join(args.output_dir, f"{gene_name} - problems.csv")
os.path.join(args.output_dir, f"{gene_name} - problems.csv"),
)
write_to_file(
args.output_dir,
@ -342,7 +365,7 @@ if __name__ == "__main__":
""",
required=False,
default=False,
action="store_true"
action="store_true",
)
parser.add_argument(
@ -378,6 +401,14 @@ if __name__ == "__main__":
default="aa",
)
parser.add_argument(
"-A",
"--do-translate",
help="Attempts to translate all nucleotide sequences to amino acid sequences.",
required=False,
action="store_true",
)
parser.add_argument(
"--correction-range",
"-R",
@ -385,7 +416,7 @@ if __name__ == "__main__":
"directions to correct start and stop codons before giving up.",
type=int,
required=False,
default=9
default=9,
)
parser.add_argument(
@ -394,7 +425,7 @@ if __name__ == "__main__":
help="The verbosity of the program.",
type=str,
required=False,
default="INFO"
default="INFO",
)
parser.add_argument(
@ -403,7 +434,7 @@ if __name__ == "__main__":
help="Generates a CSV for each gene listing the problems that "
"occurred during trimming.",
required=False,
action="store_true"
action="store_true",
)
main(parser.parse_args())

38250
tests/resources/test_msa.fa Normal file

File diff suppressed because it is too large Load Diff

3
tox.ini Normal file
View File

@ -0,0 +1,3 @@
[flake8]
# Lint settings for flake8. Both values below match the conventions of the
# Black formatter (assumed to be the formatter in use here -- TODO confirm).
# Allow lines up to 88 columns instead of flake8's default of 79.
max-line-length = 88
# E203 is pycodestyle's "whitespace before ':'" check; it is disabled because
# it flags slices written with spaces around the colon, e.g. seq[a : b].
extend-ignore = E203