Incomplete translation mechanism
This commit is contained in:
parent
89cde939b0
commit
03d87a670f
153
msa_splitter.py
153
msa_splitter.py
@ -12,6 +12,7 @@ from logging import debug, error, info, warning
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from Bio import SeqIO, SeqRecord
|
from Bio import SeqIO, SeqRecord
|
||||||
|
from Bio.Seq import Seq
|
||||||
import csv
|
import csv
|
||||||
|
|
||||||
|
|
||||||
@ -49,10 +50,9 @@ def write_to_file(
|
|||||||
aa_suffix: str,
|
aa_suffix: str,
|
||||||
nt_sequence_records: list,
|
nt_sequence_records: list,
|
||||||
nt_no_stop_sequence_records: list,
|
nt_no_stop_sequence_records: list,
|
||||||
aa_sequence_records: list = None,
|
aa_sequence_records: list = [],
|
||||||
aa_no_stop_sequence_records: list = None,
|
aa_no_stop_sequence_records: list = [],
|
||||||
):
|
):
|
||||||
|
|
||||||
output_path = os.path.abspath(output_dir)
|
output_path = os.path.abspath(output_dir)
|
||||||
if not os.path.isdir(output_dir):
|
if not os.path.isdir(output_dir):
|
||||||
os.makedirs(output_path)
|
os.makedirs(output_path)
|
||||||
@ -65,16 +65,17 @@ def write_to_file(
|
|||||||
),
|
),
|
||||||
"fasta",
|
"fasta",
|
||||||
)
|
)
|
||||||
SeqIO.write(
|
if len(nt_no_stop_sequence_records):
|
||||||
nt_no_stop_sequence_records,
|
SeqIO.write(
|
||||||
os.path.join(
|
nt_no_stop_sequence_records,
|
||||||
output_path,
|
os.path.join(
|
||||||
f'{gene_name or f"{start} - {end - 3}"}' f" - {ns_suffix}.fasta",
|
output_path,
|
||||||
),
|
f'{gene_name or f"{start} - {end - 3}"}' f" - {ns_suffix}.fasta",
|
||||||
"fasta",
|
),
|
||||||
)
|
"fasta",
|
||||||
|
)
|
||||||
|
|
||||||
if aa_sequence_records:
|
if len(aa_sequence_records):
|
||||||
SeqIO.write(
|
SeqIO.write(
|
||||||
aa_sequence_records,
|
aa_sequence_records,
|
||||||
os.path.join(
|
os.path.join(
|
||||||
@ -86,7 +87,7 @@ def write_to_file(
|
|||||||
"fasta",
|
"fasta",
|
||||||
)
|
)
|
||||||
|
|
||||||
if aa_no_stop_sequence_records:
|
if len(aa_no_stop_sequence_records):
|
||||||
SeqIO.write(
|
SeqIO.write(
|
||||||
aa_no_stop_sequence_records,
|
aa_no_stop_sequence_records,
|
||||||
os.path.join(
|
os.path.join(
|
||||||
@ -108,8 +109,9 @@ def trim(
|
|||||||
start: int,
|
start: int,
|
||||||
end: int,
|
end: int,
|
||||||
gen_cut_stop_codon: bool,
|
gen_cut_stop_codon: bool,
|
||||||
|
perform_translation: bool,
|
||||||
msa_records,
|
msa_records,
|
||||||
correction_range=16
|
correction_range=16,
|
||||||
):
|
):
|
||||||
tru_start = start - 1
|
tru_start = start - 1
|
||||||
nt_sequence_records = []
|
nt_sequence_records = []
|
||||||
@ -132,64 +134,79 @@ def trim(
|
|||||||
found_start_codon = False
|
found_start_codon = False
|
||||||
start_shift = 0
|
start_shift = 0
|
||||||
for i in range(correction_range):
|
for i in range(correction_range):
|
||||||
if s_record.seq[tru_start + i: tru_start + i + 3] == START_CODON:
|
if s_record.seq[tru_start + i : tru_start + i + 3] == START_CODON:
|
||||||
found_start_codon = True
|
found_start_codon = True
|
||||||
start_shift = i
|
start_shift = i
|
||||||
break
|
break
|
||||||
if not found_start_codon and s_record.seq[
|
if (
|
||||||
tru_start - i: tru_start - i + 3] == START_CODON:
|
not found_start_codon
|
||||||
|
and s_record.seq[tru_start - i : tru_start - i + 3] == START_CODON
|
||||||
|
):
|
||||||
found_start_codon = True
|
found_start_codon = True
|
||||||
start_shift = -i
|
start_shift = -i
|
||||||
break
|
break
|
||||||
if not found_start_codon:
|
if not found_start_codon:
|
||||||
problems.append([start, end, s_record.id,
|
problems.append([start, end, s_record.id, "Could not find start codon"])
|
||||||
"Could not find start codon"])
|
|
||||||
if found_start_codon and start_shift != 0:
|
if found_start_codon and start_shift != 0:
|
||||||
problems.append([start, end, s_record.id,
|
problems.append(
|
||||||
"Corrected start codon to "
|
[
|
||||||
f"{tru_start + start_shift}"])
|
start,
|
||||||
|
end,
|
||||||
|
s_record.id,
|
||||||
|
"Corrected start codon to " f"{tru_start + start_shift}",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
end_shift = 0
|
end_shift = 0
|
||||||
found_stop_codon = False
|
found_stop_codon = False
|
||||||
for i in range(correction_range):
|
for i in range(correction_range):
|
||||||
if s_record.seq[end + i - 3: end + i] in STOP_CODONS:
|
if s_record.seq[end + i - 3 : end + i] in STOP_CODONS:
|
||||||
found_stop_codon = True
|
found_stop_codon = True
|
||||||
end_shift = i
|
end_shift = i
|
||||||
break
|
break
|
||||||
if not found_stop_codon and s_record.seq[
|
if (
|
||||||
end - i - 3: end - i] in STOP_CODONS:
|
not found_stop_codon
|
||||||
|
and s_record.seq[end - i - 3 : end - i] in STOP_CODONS
|
||||||
|
):
|
||||||
found_stop_codon = True
|
found_stop_codon = True
|
||||||
end_shift = -i
|
end_shift = -i
|
||||||
break
|
break
|
||||||
if not found_stop_codon:
|
if not found_stop_codon:
|
||||||
problems.append([start, end, s_record.id,
|
problems.append([start, end, s_record.id, "Could not find stop codon"])
|
||||||
"Could not find stop codon"])
|
|
||||||
if found_stop_codon and end_shift != 0:
|
if found_stop_codon and end_shift != 0:
|
||||||
problems.append([start, end, s_record.id,
|
problems.append(
|
||||||
"Corrected stop codon to "
|
[
|
||||||
f"{end + end_shift}"])
|
start,
|
||||||
|
end,
|
||||||
nt_sequence = s_record.seq[tru_start +
|
s_record.id,
|
||||||
start_shift: end + end_shift] # Cropping
|
"Corrected stop codon to " f"{end + end_shift}",
|
||||||
nt_no_stop_sequence = s_record.seq[tru_start +
|
]
|
||||||
start_shift: end - 3 + end_shift]
|
)
|
||||||
nt_sequence_records.append(
|
|
||||||
SeqRecord.SeqRecord(nt_sequence, *record_metadata))
|
|
||||||
nt_no_stop_sequence_records.append(
|
|
||||||
SeqRecord.SeqRecord(nt_no_stop_sequence, *record_metadata)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
nt_sequence = s_record.seq[
|
||||||
|
tru_start + start_shift : end + end_shift
|
||||||
|
] # Cropping
|
||||||
|
nt_sequence_records.append(SeqRecord.SeqRecord(nt_sequence, *record_metadata))
|
||||||
if gen_cut_stop_codon:
|
if gen_cut_stop_codon:
|
||||||
aa_sequence = nt_sequence.translate()
|
nt_no_stop_sequence = s_record.seq[
|
||||||
aa_no_stop_sequence = nt_no_stop_sequence.translate()
|
tru_start + start_shift : end - 3 + end_shift
|
||||||
|
]
|
||||||
|
nt_no_stop_sequence_records.append(
|
||||||
|
SeqRecord.SeqRecord(nt_no_stop_sequence, *record_metadata)
|
||||||
|
)
|
||||||
|
|
||||||
|
if perform_translation:
|
||||||
|
aa_sequence = Seq(nt_sequence.replace("-", "n")).translate()
|
||||||
aa_sequence_records.append(
|
aa_sequence_records.append(
|
||||||
SeqRecord.SeqRecord(aa_sequence, *record_metadata)
|
SeqRecord.SeqRecord(aa_sequence, *record_metadata)
|
||||||
)
|
)
|
||||||
|
if gen_cut_stop_codon:
|
||||||
aa_no_stop_sequence_records.append(
|
aa_no_stop_sequence = Seq(
|
||||||
SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
|
nt_no_stop_sequence.replace("-", "n")
|
||||||
)
|
).translate()
|
||||||
|
aa_no_stop_sequence_records.append(
|
||||||
|
SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
|
||||||
|
)
|
||||||
debug(f"Trimming for {s_record.id} complete.")
|
debug(f"Trimming for {s_record.id} complete.")
|
||||||
debug(f"Sequence trimming for {tru_start} - {end} complete.")
|
debug(f"Sequence trimming for {tru_start} - {end} complete.")
|
||||||
return (
|
return (
|
||||||
@ -197,7 +214,7 @@ def trim(
|
|||||||
nt_no_stop_sequence_records,
|
nt_no_stop_sequence_records,
|
||||||
aa_sequence_records,
|
aa_sequence_records,
|
||||||
aa_no_stop_sequence_records,
|
aa_no_stop_sequence_records,
|
||||||
problems
|
problems,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -222,15 +239,14 @@ def main(args):
|
|||||||
genes = []
|
genes = []
|
||||||
if args.gene_list:
|
if args.gene_list:
|
||||||
genes = read_genes_from_csv(args.gene_list)
|
genes = read_genes_from_csv(args.gene_list)
|
||||||
info(
|
info(f"Gene list read from {args.gene_list} resulted in {len(genes)} " "genes.")
|
||||||
f"Gene list read from {args.gene_list} resulted in {len(genes)} "
|
|
||||||
"genes.")
|
|
||||||
else:
|
else:
|
||||||
if args.gene_name and args.start and args.end:
|
if args.gene_name and args.start and args.end:
|
||||||
genes.append([args.gene_name, args.start, args.end])
|
genes.append([args.gene_name, args.start, args.end])
|
||||||
info(
|
info(
|
||||||
f"Extracting {args.gene_name} starting at {args.start} to "
|
f"Extracting {args.gene_name} starting at {args.start} to "
|
||||||
f"{args.end}.")
|
f"{args.end}."
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
"Need either a gene list by --gene-list or a start and end "
|
"Need either a gene list by --gene-list or a start and end "
|
||||||
@ -243,17 +259,24 @@ def main(args):
|
|||||||
nt_no_stop_sequence_records,
|
nt_no_stop_sequence_records,
|
||||||
aa_sequence_records,
|
aa_sequence_records,
|
||||||
aa_no_stop_sequence_records,
|
aa_no_stop_sequence_records,
|
||||||
problems
|
problems,
|
||||||
) = trim(start, end, args.gen_cut_stop_codon,
|
) = trim(
|
||||||
msa_records, correction_range=args.correction_range)
|
start,
|
||||||
|
end,
|
||||||
|
args.gen_cut_stop_codon,
|
||||||
|
args.do_translate,
|
||||||
|
msa_records,
|
||||||
|
correction_range=args.correction_range,
|
||||||
|
)
|
||||||
if len(problems) > 0:
|
if len(problems) > 0:
|
||||||
warning(f"There were {len(problems)} problems "
|
warning(
|
||||||
f"during trimming {gene_name}!")
|
f"There were {len(problems)} problems " f"during trimming {gene_name}!"
|
||||||
|
)
|
||||||
if args.catalogue_problems:
|
if args.catalogue_problems:
|
||||||
output_as_csv(
|
output_as_csv(
|
||||||
gene_name,
|
gene_name,
|
||||||
problems,
|
problems,
|
||||||
os.path.join(args.output_dir, f"{gene_name} - problems.csv")
|
os.path.join(args.output_dir, f"{gene_name} - problems.csv"),
|
||||||
)
|
)
|
||||||
write_to_file(
|
write_to_file(
|
||||||
args.output_dir,
|
args.output_dir,
|
||||||
@ -342,7 +365,7 @@ if __name__ == "__main__":
|
|||||||
""",
|
""",
|
||||||
required=False,
|
required=False,
|
||||||
default=False,
|
default=False,
|
||||||
action="store_true"
|
action="store_true",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -378,6 +401,14 @@ if __name__ == "__main__":
|
|||||||
default="aa",
|
default="aa",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"-A",
|
||||||
|
"--do-translate",
|
||||||
|
help="Attempts to translate all nucleotide sequences to amino acid sequences.",
|
||||||
|
required=False,
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--correction-range",
|
"--correction-range",
|
||||||
"-R",
|
"-R",
|
||||||
@ -385,7 +416,7 @@ if __name__ == "__main__":
|
|||||||
"directions to correct start and stop codons before giving up.",
|
"directions to correct start and stop codons before giving up.",
|
||||||
type=int,
|
type=int,
|
||||||
required=False,
|
required=False,
|
||||||
default=9
|
default=9,
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -394,7 +425,7 @@ if __name__ == "__main__":
|
|||||||
help="The verbosity of the program.",
|
help="The verbosity of the program.",
|
||||||
type=str,
|
type=str,
|
||||||
required=False,
|
required=False,
|
||||||
default="INFO"
|
default="INFO",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@ -403,7 +434,7 @@ if __name__ == "__main__":
|
|||||||
help="Generates a CSV for each gene listing the problems that "
|
help="Generates a CSV for each gene listing the problems that "
|
||||||
"occurred during trimming.",
|
"occurred during trimming.",
|
||||||
required=False,
|
required=False,
|
||||||
action="store_true"
|
action="store_true",
|
||||||
)
|
)
|
||||||
|
|
||||||
main(parser.parse_args())
|
main(parser.parse_args())
|
||||||
|
38250
tests/resources/test_msa.fa
Normal file
38250
tests/resources/test_msa.fa
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user