Incomplete translation mechanism

This commit is contained in:
Harrison Deng 2023-03-27 16:23:12 -05:00
parent 89cde939b0
commit 03d87a670f
3 changed files with 38345 additions and 61 deletions

View File

@ -12,6 +12,7 @@ from logging import debug, error, info, warning
import logging import logging
import os import os
from Bio import SeqIO, SeqRecord from Bio import SeqIO, SeqRecord
from Bio.Seq import Seq
import csv import csv
@ -49,10 +50,9 @@ def write_to_file(
aa_suffix: str, aa_suffix: str,
nt_sequence_records: list, nt_sequence_records: list,
nt_no_stop_sequence_records: list, nt_no_stop_sequence_records: list,
aa_sequence_records: list = None, aa_sequence_records: list = [],
aa_no_stop_sequence_records: list = None, aa_no_stop_sequence_records: list = [],
): ):
output_path = os.path.abspath(output_dir) output_path = os.path.abspath(output_dir)
if not os.path.isdir(output_dir): if not os.path.isdir(output_dir):
os.makedirs(output_path) os.makedirs(output_path)
@ -65,16 +65,17 @@ def write_to_file(
), ),
"fasta", "fasta",
) )
SeqIO.write( if len(nt_no_stop_sequence_records):
nt_no_stop_sequence_records, SeqIO.write(
os.path.join( nt_no_stop_sequence_records,
output_path, os.path.join(
f'{gene_name or f"{start} - {end - 3}"}' f" - {ns_suffix}.fasta", output_path,
), f'{gene_name or f"{start} - {end - 3}"}' f" - {ns_suffix}.fasta",
"fasta", ),
) "fasta",
)
if aa_sequence_records: if len(aa_sequence_records):
SeqIO.write( SeqIO.write(
aa_sequence_records, aa_sequence_records,
os.path.join( os.path.join(
@ -86,7 +87,7 @@ def write_to_file(
"fasta", "fasta",
) )
if aa_no_stop_sequence_records: if len(aa_no_stop_sequence_records):
SeqIO.write( SeqIO.write(
aa_no_stop_sequence_records, aa_no_stop_sequence_records,
os.path.join( os.path.join(
@ -108,8 +109,9 @@ def trim(
start: int, start: int,
end: int, end: int,
gen_cut_stop_codon: bool, gen_cut_stop_codon: bool,
perform_translation: bool,
msa_records, msa_records,
correction_range=16 correction_range=16,
): ):
tru_start = start - 1 tru_start = start - 1
nt_sequence_records = [] nt_sequence_records = []
@ -132,64 +134,79 @@ def trim(
found_start_codon = False found_start_codon = False
start_shift = 0 start_shift = 0
for i in range(correction_range): for i in range(correction_range):
if s_record.seq[tru_start + i: tru_start + i + 3] == START_CODON: if s_record.seq[tru_start + i : tru_start + i + 3] == START_CODON:
found_start_codon = True found_start_codon = True
start_shift = i start_shift = i
break break
if not found_start_codon and s_record.seq[ if (
tru_start - i: tru_start - i + 3] == START_CODON: not found_start_codon
and s_record.seq[tru_start - i : tru_start - i + 3] == START_CODON
):
found_start_codon = True found_start_codon = True
start_shift = -i start_shift = -i
break break
if not found_start_codon: if not found_start_codon:
problems.append([start, end, s_record.id, problems.append([start, end, s_record.id, "Could not find start codon"])
"Could not find start codon"])
if found_start_codon and start_shift != 0: if found_start_codon and start_shift != 0:
problems.append([start, end, s_record.id, problems.append(
"Corrected start codon to " [
f"{tru_start + start_shift}"]) start,
end,
s_record.id,
"Corrected start codon to " f"{tru_start + start_shift}",
]
)
end_shift = 0 end_shift = 0
found_stop_codon = False found_stop_codon = False
for i in range(correction_range): for i in range(correction_range):
if s_record.seq[end + i - 3: end + i] in STOP_CODONS: if s_record.seq[end + i - 3 : end + i] in STOP_CODONS:
found_stop_codon = True found_stop_codon = True
end_shift = i end_shift = i
break break
if not found_stop_codon and s_record.seq[ if (
end - i - 3: end - i] in STOP_CODONS: not found_stop_codon
and s_record.seq[end - i - 3 : end - i] in STOP_CODONS
):
found_stop_codon = True found_stop_codon = True
end_shift = -i end_shift = -i
break break
if not found_stop_codon: if not found_stop_codon:
problems.append([start, end, s_record.id, problems.append([start, end, s_record.id, "Could not find stop codon"])
"Could not find stop codon"])
if found_stop_codon and end_shift != 0: if found_stop_codon and end_shift != 0:
problems.append([start, end, s_record.id, problems.append(
"Corrected stop codon to " [
f"{end + end_shift}"]) start,
end,
nt_sequence = s_record.seq[tru_start + s_record.id,
start_shift: end + end_shift] # Cropping "Corrected stop codon to " f"{end + end_shift}",
nt_no_stop_sequence = s_record.seq[tru_start + ]
start_shift: end - 3 + end_shift] )
nt_sequence_records.append(
SeqRecord.SeqRecord(nt_sequence, *record_metadata))
nt_no_stop_sequence_records.append(
SeqRecord.SeqRecord(nt_no_stop_sequence, *record_metadata)
)
nt_sequence = s_record.seq[
tru_start + start_shift : end + end_shift
] # Cropping
nt_sequence_records.append(SeqRecord.SeqRecord(nt_sequence, *record_metadata))
if gen_cut_stop_codon: if gen_cut_stop_codon:
aa_sequence = nt_sequence.translate() nt_no_stop_sequence = s_record.seq[
aa_no_stop_sequence = nt_no_stop_sequence.translate() tru_start + start_shift : end - 3 + end_shift
]
nt_no_stop_sequence_records.append(
SeqRecord.SeqRecord(nt_no_stop_sequence, *record_metadata)
)
if perform_translation:
aa_sequence = Seq(nt_sequence.replace("-", "n")).translate()
aa_sequence_records.append( aa_sequence_records.append(
SeqRecord.SeqRecord(aa_sequence, *record_metadata) SeqRecord.SeqRecord(aa_sequence, *record_metadata)
) )
if gen_cut_stop_codon:
aa_no_stop_sequence_records.append( aa_no_stop_sequence = Seq(
SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata) nt_no_stop_sequence.replace("-", "n")
) ).translate()
aa_no_stop_sequence_records.append(
SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
)
debug(f"Trimming for {s_record.id} complete.") debug(f"Trimming for {s_record.id} complete.")
debug(f"Sequence trimming for {tru_start} - {end} complete.") debug(f"Sequence trimming for {tru_start} - {end} complete.")
return ( return (
@ -197,7 +214,7 @@ def trim(
nt_no_stop_sequence_records, nt_no_stop_sequence_records,
aa_sequence_records, aa_sequence_records,
aa_no_stop_sequence_records, aa_no_stop_sequence_records,
problems problems,
) )
@ -222,15 +239,14 @@ def main(args):
genes = [] genes = []
if args.gene_list: if args.gene_list:
genes = read_genes_from_csv(args.gene_list) genes = read_genes_from_csv(args.gene_list)
info( info(f"Gene list read from {args.gene_list} resulted in {len(genes)} " "genes.")
f"Gene list read from {args.gene_list} resulted in {len(genes)} "
"genes.")
else: else:
if args.gene_name and args.start and args.end: if args.gene_name and args.start and args.end:
genes.append([args.gene_name, args.start, args.end]) genes.append([args.gene_name, args.start, args.end])
info( info(
f"Extracting {args.gene_name} starting at {args.start} to " f"Extracting {args.gene_name} starting at {args.start} to "
f"{args.end}.") f"{args.end}."
)
else: else:
raise Exception( raise Exception(
"Need either a gene list by --gene-list or a start and end " "Need either a gene list by --gene-list or a start and end "
@ -243,17 +259,24 @@ def main(args):
nt_no_stop_sequence_records, nt_no_stop_sequence_records,
aa_sequence_records, aa_sequence_records,
aa_no_stop_sequence_records, aa_no_stop_sequence_records,
problems problems,
) = trim(start, end, args.gen_cut_stop_codon, ) = trim(
msa_records, correction_range=args.correction_range) start,
end,
args.gen_cut_stop_codon,
args.do_translate,
msa_records,
correction_range=args.correction_range,
)
if len(problems) > 0: if len(problems) > 0:
warning(f"There were {len(problems)} problems " warning(
f"during trimming {gene_name}!") f"There were {len(problems)} problems " f"during trimming {gene_name}!"
)
if args.catalogue_problems: if args.catalogue_problems:
output_as_csv( output_as_csv(
gene_name, gene_name,
problems, problems,
os.path.join(args.output_dir, f"{gene_name} - problems.csv") os.path.join(args.output_dir, f"{gene_name} - problems.csv"),
) )
write_to_file( write_to_file(
args.output_dir, args.output_dir,
@ -342,7 +365,7 @@ if __name__ == "__main__":
""", """,
required=False, required=False,
default=False, default=False,
action="store_true" action="store_true",
) )
parser.add_argument( parser.add_argument(
@ -378,6 +401,14 @@ if __name__ == "__main__":
default="aa", default="aa",
) )
parser.add_argument(
"-A",
"--do-translate",
help="Attempts to translate all nucleotide sequences to amino acid sequences.",
required=False,
action="store_true",
)
parser.add_argument( parser.add_argument(
"--correction-range", "--correction-range",
"-R", "-R",
@ -385,7 +416,7 @@ if __name__ == "__main__":
"directions to correct start and stop codons before giving up.", "directions to correct start and stop codons before giving up.",
type=int, type=int,
required=False, required=False,
default=9 default=9,
) )
parser.add_argument( parser.add_argument(
@ -394,7 +425,7 @@ if __name__ == "__main__":
help="The verbosity of the program.", help="The verbosity of the program.",
type=str, type=str,
required=False, required=False,
default="INFO" default="INFO",
) )
parser.add_argument( parser.add_argument(
@ -403,7 +434,7 @@ if __name__ == "__main__":
help="Generates a CSV for each gene listing the problems that " help="Generates a CSV for each gene listing the problems that "
"occurred during trimming.", "occurred during trimming.",
required=False, required=False,
action="store_true" action="store_true",
) )
main(parser.parse_args()) main(parser.parse_args())

38250
tests/resources/test_msa.fa Normal file

File diff suppressed because it is too large Load Diff

3
tox.ini Normal file
View File

@ -0,0 +1,3 @@
# flake8 linter settings for this repository.
[flake8]
# Allow lines up to 88 characters (flake8's own default is 79).
# NOTE(review): 88 is presumably chosen to match an auto-formatter's wrap width - confirm.
max-line-length = 88
# E203 is pycodestyle's "whitespace before ':'" check; ignoring it permits
# spaced slice expressions such as seq[a + i : a + i + 3].
extend-ignore = E203