Reduced verbosity caused by inherent off-by-one errors, cleared up logging.

This commit is contained in:
Harrison Deng 2023-03-22 14:38:45 -05:00
parent 99f75942ac
commit e5ead8f197

View File

@ -25,7 +25,7 @@ def read_genes_from_csv(batch_genes_csv_path: str):
csv_header = list(row) csv_header = list(row)
else: else:
gene_name = row[csv_header.index("Gene")] gene_name = row[csv_header.index("Gene")]
gene_start = int(row[csv_header.index("Start")]) - 1 gene_start = int(row[csv_header.index("Start")])
gene_end = int(row[csv_header.index("End")]) gene_end = int(row[csv_header.index("End")])
genes.append((gene_name, gene_start, gene_end)) genes.append((gene_name, gene_start, gene_end))
return genes return genes
@ -104,14 +104,20 @@ STOP_CODONS = {"TAG", "TAA", "TGA"}
START_CODON = "ATG" START_CODON = "ATG"
def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction_range=16): def trim(
start: int,
end: int,
gen_cut_stop_codon: bool,
msa_records,
correction_range=16
):
tru_start = start - 1
nt_sequence_records = [] nt_sequence_records = []
nt_no_stop_sequence_records = [] nt_no_stop_sequence_records = []
aa_sequence_records = [] aa_sequence_records = []
aa_no_stop_sequence_records = [] aa_no_stop_sequence_records = []
debug(f"Beginning sequence trimming for {start} - {end}.") debug(f"Beginning sequence trimming for {tru_start} - {end}.")
for s_record in msa_records: for s_record in msa_records:
record_metadata = ( record_metadata = (
s_record.id, s_record.id,
@ -125,20 +131,25 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
found_start_codon = False found_start_codon = False
start_shift = 0 start_shift = 0
for i in range(correction_range): for i in range(correction_range):
if s_record.seq[start + i: start + i + 3] == START_CODON: if s_record.seq[tru_start + i: tru_start + i + 3] == START_CODON:
found_start_codon = True found_start_codon = True
start_shift = i start_shift = i
break break
if s_record.seq[start - i: start - i + 3] == START_CODON: if not found_start_codon and s_record.seq[
tru_start - i: tru_start - i + 3] == START_CODON:
found_start_codon = True found_start_codon = True
start_shift = -i start_shift = -i
break break
if not found_start_codon: if not found_start_codon:
warning( warning(
f"Could not find start codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...") f"Could not find start codon for region {tru_start} - {end} "
if start_shift != 0: f"with sequence ID {s_record.id}. Continuing without "
f"correction...")
if found_start_codon and start_shift != 0:
warning( warning(
f"Start codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting start location to {start + start_shift}") "Start codon was not found at expected location for region "
f"{tru_start} - {end} with sequence ID {s_record.id}. "
f"Correcting start location to {tru_start + start_shift}")
end_shift = 0 end_shift = 0
found_stop_codon = False found_stop_codon = False
@ -147,19 +158,24 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
found_stop_codon = True found_stop_codon = True
end_shift = i end_shift = i
break break
if s_record.seq[end - i - 3: end - i] in STOP_CODONS: if not found_stop_codon and s_record.seq[
end - i - 3: end - i] in STOP_CODONS:
found_stop_codon = True found_stop_codon = True
end_shift = -i end_shift = -i
break break
if not found_stop_codon: if not found_stop_codon:
warning( warning(
f"Could not find stop codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...") f"Could not find stop codon for region {tru_start} - {end} "
if end_shift != 0: f"with sequence ID {s_record.id}. Continuing without "
f"correction...")
if found_stop_codon and end_shift != 0:
warning( warning(
f"Stop codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting end location to: {end + end_shift}.") f"Stop codon was not found at expected location for region "
nt_sequence = s_record.seq[start + f"{tru_start} - {end} with sequence ID {s_record.id}. "
start_shift: end + end_shift] # Cropping step f"Correcting end location to: {end + end_shift}.")
nt_no_stop_sequence = s_record.seq[start + nt_sequence = s_record.seq[tru_start +
start_shift: end + end_shift] # Cropping
nt_no_stop_sequence = s_record.seq[tru_start +
start_shift: end - 3 + end_shift] start_shift: end - 3 + end_shift]
nt_sequence_records.append( nt_sequence_records.append(
SeqRecord.SeqRecord(nt_sequence, *record_metadata)) SeqRecord.SeqRecord(nt_sequence, *record_metadata))
@ -179,7 +195,7 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata) SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
) )
debug(f"Trimming for {s_record.id} complete.") debug(f"Trimming for {s_record.id} complete.")
debug(f"Sequence trimming for {start} - {end} complete.") debug(f"Sequence trimming for {tru_start} - {end} complete.")
return ( return (
nt_sequence_records, nt_sequence_records,
nt_no_stop_sequence_records, nt_no_stop_sequence_records,
@ -197,23 +213,28 @@ def main(args):
if args.gene_list: if args.gene_list:
genes = read_genes_from_csv(args.gene_list) genes = read_genes_from_csv(args.gene_list)
info( info(
f"Gene list read from {args.gene_list} resulted in {len(genes)} genes.") f"Gene list read from {args.gene_list} resulted in {len(genes)} "
"genes.")
else: else:
if args.gene_name and args.start and args.end: if args.gene_name and args.start and args.end:
genes.append([args.gene_name, args.start, args.end]) genes.append([args.gene_name, args.start, args.end])
info( info(
f"Extracting {args.gene_name} starting at {args.start} to {args.end}.") f"Extracting {args.gene_name} starting at {args.start} to "
f"{args.end}.")
else: else:
raise Exception( raise Exception(
"Need either a gene list by --gene-list or a start and end via --start, and --end respectively." "Need either a gene list by --gene-list or a start and end "
"via --start, and --end respectively."
) )
for gene_name, start, end in genes: for gene_name, start, end in genes:
info(f"Started on gene {gene_name} ({start} - {end})")
( (
nt_sequence_records, nt_sequence_records,
nt_no_stop_sequence_records, nt_no_stop_sequence_records,
aa_sequence_records, aa_sequence_records,
aa_no_stop_sequence_records, aa_no_stop_sequence_records,
) = trim(start, end, args.gen_cut_stop_codon, msa_records, correction_range=args.correction_range) ) = trim(start, end, args.gen_cut_stop_codon,
msa_records, correction_range=args.correction_range)
write_to_file( write_to_file(
args.output_dir, args.output_dir,
gene_name, gene_name,
@ -227,7 +248,7 @@ def main(args):
aa_sequence_records, aa_sequence_records,
aa_no_stop_sequence_records, aa_no_stop_sequence_records,
) )
info(f"Extracted gene sequence for {gene_name} ({start} - {end})") info(f"Completed gene {gene_name} ({start} - {end})")
if __name__ == "__main__": if __name__ == "__main__":
@ -340,7 +361,8 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--correction-range", "--correction-range",
"-R", "-R",
help="The number of offsets in terms of nucleotides to try in both directions to correct start and stop codons before giving up.", help="The number of offsets in terms of nucleotides to try in both "
"directions to correct start and stop codons before giving up.",
type=int, type=int,
required=False, required=False,
default=9 default=9