From e5ead8f197f4c0c5c25f4b6953083a38676cf266 Mon Sep 17 00:00:00 2001 From: Harrison Deng Date: Wed, 22 Mar 2023 14:38:45 -0500 Subject: [PATCH] Reduced verbosity caused by inherit off-by-one errors, cleared up logging. --- msa_splitter.py | 68 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/msa_splitter.py b/msa_splitter.py index 232d429..5faf1b2 100755 --- a/msa_splitter.py +++ b/msa_splitter.py @@ -25,7 +25,7 @@ def read_genes_from_csv(batch_genes_csv_path: str): csv_header = list(row) else: gene_name = row[csv_header.index("Gene")] - gene_start = int(row[csv_header.index("Start")]) - 1 + gene_start = int(row[csv_header.index("Start")]) gene_end = int(row[csv_header.index("End")]) genes.append((gene_name, gene_start, gene_end)) return genes @@ -104,14 +104,20 @@ STOP_CODONS = {"TAG", "TAA", "TGA"} START_CODON = "ATG" -def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction_range=16): - +def trim( + start: int, + end: int, + gen_cut_stop_codon: bool, + msa_records, + correction_range=16 +): + tru_start = start - 1 nt_sequence_records = [] nt_no_stop_sequence_records = [] aa_sequence_records = [] aa_no_stop_sequence_records = [] - debug(f"Beginning sequence trimming for {start} - {end}.") + debug(f"Beginning sequence trimming for {tru_start} - {end}.") for s_record in msa_records: record_metadata = ( s_record.id, @@ -125,20 +131,25 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction found_start_codon = False start_shift = 0 for i in range(correction_range): - if s_record.seq[start + i: start + i + 3] == START_CODON: + if s_record.seq[tru_start + i: tru_start + i + 3] == START_CODON: found_start_codon = True start_shift = i break - if s_record.seq[start - i: start - i + 3] == START_CODON: + if not found_start_codon and s_record.seq[ + tru_start - i: tru_start - i + 3] == START_CODON: found_start_codon = True start_shift = -i break if not found_start_codon: warning( - f"Could not find start codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...") - if start_shift != 0: + f"Could not find start codon for region {tru_start} - {end} " + f"with sequence ID {s_record.id}. Continuing without " + f"correction...") + if found_start_codon and start_shift != 0: warning( - f"Start codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting start location to {start + start_shift}") + "Start codon was not found at expected location for region " + f"{tru_start} - {end} with sequence ID {s_record.id}. " + f"Correcting start location to {tru_start + start_shift}") end_shift = 0 found_stop_codon = False @@ -147,19 +158,24 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction found_stop_codon = True end_shift = i break - if s_record.seq[end - i - 3: end - i] in STOP_CODONS: + if not found_stop_codon and s_record.seq[ + end - i - 3: end - i] in STOP_CODONS: found_stop_codon = True end_shift = -i break if not found_stop_codon: warning( - f"Could not find stop codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...") - if end_shift != 0: + f"Could not find stop codon for region {tru_start} - {end} " + f"with sequence ID {s_record.id}. Continuing without " + f"correction...") + if found_stop_codon and end_shift != 0: warning( - f"Stop codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting end location to: {end + end_shift}.") - nt_sequence = s_record.seq[start + - start_shift: end + end_shift] # Cropping step - nt_no_stop_sequence = s_record.seq[start + + f"Stop codon was not found at expected location for region " + f"{tru_start} - {end} with sequence ID {s_record.id}. " + f"Correcting end location to: {end + end_shift}.") + nt_sequence = s_record.seq[tru_start + + start_shift: end + end_shift] # Cropping + nt_no_stop_sequence = s_record.seq[tru_start + start_shift: end - 3 + end_shift] nt_sequence_records.append( SeqRecord.SeqRecord(nt_sequence, *record_metadata)) @@ -179,7 +195,7 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata) ) debug(f"Trimming for {s_record.id} complete.") - debug(f"Sequence trimming for {start} - {end} complete.") + debug(f"Sequence trimming for {tru_start} - {end} complete.") return ( nt_sequence_records, nt_no_stop_sequence_records, @@ -197,23 +213,28 @@ def main(args): if args.gene_list: genes = read_genes_from_csv(args.gene_list) info( - f"Gene list read from {args.gene_list} resulted in {len(genes)} genes.") + f"Gene list read from {args.gene_list} resulted in {len(genes)} " + "genes.") else: if args.gene_name and args.start and args.end: genes.append([args.gene_name, args.start, args.end]) info( - f"Extracting {args.gene_name} starting at {args.start} to {args.end}.") + f"Extracting {args.gene_name} starting at {args.start} to " + f"{args.end}.") else: raise Exception( - "Need either a gene list by --gene-list or a start and end via --start, and --end respectively." + "Need either a gene list by --gene-list or a start and end " + "via --start, and --end respectively." ) for gene_name, start, end in genes: + info(f"Started on gene {gene_name} ({start} - {end})") ( nt_sequence_records, nt_no_stop_sequence_records, aa_sequence_records, aa_no_stop_sequence_records, - ) = trim(start, end, args.gen_cut_stop_codon, msa_records, correction_range=args.correction_range) + ) = trim(start, end, args.gen_cut_stop_codon, + msa_records, correction_range=args.correction_range) write_to_file( args.output_dir, gene_name, @@ -227,7 +248,7 @@ def main(args): aa_sequence_records, aa_no_stop_sequence_records, ) - info(f"Extracted gene sequence for {gene_name} ({start} - {end})") + info(f"Completed gene {gene_name} ({start} - {end})") if __name__ == "__main__": @@ -340,7 +361,8 @@ if __name__ == "__main__": parser.add_argument( "--correction-range", "-R", - help="The number of offsets in terms of nucleotides to try in both directions to correct start and stop codons before giving up.", + help="The number of offsets in terms of nucleotides to try in both " + "directions to correct start and stop codons before giving up.", type=int, required=False, default=9