Reduced verbosity caused by inherit off-by-one errors, cleared up logging.
This commit is contained in:
		@@ -25,7 +25,7 @@ def read_genes_from_csv(batch_genes_csv_path: str):
 | 
				
			|||||||
                csv_header = list(row)
 | 
					                csv_header = list(row)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                gene_name = row[csv_header.index("Gene")]
 | 
					                gene_name = row[csv_header.index("Gene")]
 | 
				
			||||||
                gene_start = int(row[csv_header.index("Start")]) - 1
 | 
					                gene_start = int(row[csv_header.index("Start")])
 | 
				
			||||||
                gene_end = int(row[csv_header.index("End")])
 | 
					                gene_end = int(row[csv_header.index("End")])
 | 
				
			||||||
                genes.append((gene_name, gene_start, gene_end))
 | 
					                genes.append((gene_name, gene_start, gene_end))
 | 
				
			||||||
    return genes
 | 
					    return genes
 | 
				
			||||||
@@ -104,14 +104,20 @@ STOP_CODONS = {"TAG", "TAA", "TGA"}
 | 
				
			|||||||
START_CODON = "ATG"
 | 
					START_CODON = "ATG"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction_range=16):
 | 
					def trim(
 | 
				
			||||||
 | 
					    start: int,
 | 
				
			||||||
 | 
					    end: int,
 | 
				
			||||||
 | 
					    gen_cut_stop_codon: bool,
 | 
				
			||||||
 | 
					    msa_records,
 | 
				
			||||||
 | 
					    correction_range=16
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
 | 
					    tru_start = start - 1
 | 
				
			||||||
    nt_sequence_records = []
 | 
					    nt_sequence_records = []
 | 
				
			||||||
    nt_no_stop_sequence_records = []
 | 
					    nt_no_stop_sequence_records = []
 | 
				
			||||||
    aa_sequence_records = []
 | 
					    aa_sequence_records = []
 | 
				
			||||||
    aa_no_stop_sequence_records = []
 | 
					    aa_no_stop_sequence_records = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    debug(f"Beginning sequence trimming for {start} - {end}.")
 | 
					    debug(f"Beginning sequence trimming for {tru_start} - {end}.")
 | 
				
			||||||
    for s_record in msa_records:
 | 
					    for s_record in msa_records:
 | 
				
			||||||
        record_metadata = (
 | 
					        record_metadata = (
 | 
				
			||||||
            s_record.id,
 | 
					            s_record.id,
 | 
				
			||||||
@@ -125,20 +131,25 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
 | 
				
			|||||||
        found_start_codon = False
 | 
					        found_start_codon = False
 | 
				
			||||||
        start_shift = 0
 | 
					        start_shift = 0
 | 
				
			||||||
        for i in range(correction_range):
 | 
					        for i in range(correction_range):
 | 
				
			||||||
            if s_record.seq[start + i: start + i + 3] == START_CODON:
 | 
					            if s_record.seq[tru_start + i: tru_start + i + 3] == START_CODON:
 | 
				
			||||||
                found_start_codon = True
 | 
					                found_start_codon = True
 | 
				
			||||||
                start_shift = i
 | 
					                start_shift = i
 | 
				
			||||||
                break
 | 
					                break
 | 
				
			||||||
            if s_record.seq[start - i: start - i + 3] == START_CODON:
 | 
					            if not found_start_codon and s_record.seq[
 | 
				
			||||||
 | 
					                    tru_start - i: tru_start - i + 3] == START_CODON:
 | 
				
			||||||
                found_start_codon = True
 | 
					                found_start_codon = True
 | 
				
			||||||
                start_shift = -i
 | 
					                start_shift = -i
 | 
				
			||||||
                break
 | 
					                break
 | 
				
			||||||
        if not found_start_codon:
 | 
					        if not found_start_codon:
 | 
				
			||||||
            warning(
 | 
					            warning(
 | 
				
			||||||
                f"Could not find start codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
 | 
					                f"Could not find start codon for region {tru_start} - {end} "
 | 
				
			||||||
        if start_shift != 0:
 | 
					                f"with sequence ID {s_record.id}. Continuing without "
 | 
				
			||||||
 | 
					                f"correction...")
 | 
				
			||||||
 | 
					        if found_start_codon and start_shift != 0:
 | 
				
			||||||
            warning(
 | 
					            warning(
 | 
				
			||||||
                f"Start codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting start location to {start + start_shift}")
 | 
					                "Start codon was not found at expected location for region "
 | 
				
			||||||
 | 
					                f"{tru_start} - {end} with sequence ID {s_record.id}. "
 | 
				
			||||||
 | 
					                f"Correcting start location to {tru_start + start_shift}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        end_shift = 0
 | 
					        end_shift = 0
 | 
				
			||||||
        found_stop_codon = False
 | 
					        found_stop_codon = False
 | 
				
			||||||
@@ -147,19 +158,24 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
 | 
				
			|||||||
                found_stop_codon = True
 | 
					                found_stop_codon = True
 | 
				
			||||||
                end_shift = i
 | 
					                end_shift = i
 | 
				
			||||||
                break
 | 
					                break
 | 
				
			||||||
            if s_record.seq[end - i - 3: end - i] in STOP_CODONS:
 | 
					            if not found_stop_codon and s_record.seq[
 | 
				
			||||||
 | 
					                    end - i - 3: end - i] in STOP_CODONS:
 | 
				
			||||||
                found_stop_codon = True
 | 
					                found_stop_codon = True
 | 
				
			||||||
                end_shift = -i
 | 
					                end_shift = -i
 | 
				
			||||||
                break
 | 
					                break
 | 
				
			||||||
        if not found_stop_codon:
 | 
					        if not found_stop_codon:
 | 
				
			||||||
            warning(
 | 
					            warning(
 | 
				
			||||||
                f"Could not find stop codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
 | 
					                f"Could not find stop codon for region {tru_start} - {end} "
 | 
				
			||||||
        if end_shift != 0:
 | 
					                f"with sequence ID {s_record.id}. Continuing without "
 | 
				
			||||||
 | 
					                f"correction...")
 | 
				
			||||||
 | 
					        if found_stop_codon and end_shift != 0:
 | 
				
			||||||
            warning(
 | 
					            warning(
 | 
				
			||||||
                f"Stop codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting end location to: {end + end_shift}.")
 | 
					                f"Stop codon was not found at expected location for region "
 | 
				
			||||||
        nt_sequence = s_record.seq[start +
 | 
					                f"{tru_start} - {end} with sequence ID {s_record.id}. "
 | 
				
			||||||
                                   start_shift: end + end_shift]  # Cropping step
 | 
					                f"Correcting end location to: {end + end_shift}.")
 | 
				
			||||||
        nt_no_stop_sequence = s_record.seq[start +
 | 
					        nt_sequence = s_record.seq[tru_start +
 | 
				
			||||||
 | 
					                                   start_shift: end + end_shift]  # Cropping
 | 
				
			||||||
 | 
					        nt_no_stop_sequence = s_record.seq[tru_start +
 | 
				
			||||||
                                           start_shift: end - 3 + end_shift]
 | 
					                                           start_shift: end - 3 + end_shift]
 | 
				
			||||||
        nt_sequence_records.append(
 | 
					        nt_sequence_records.append(
 | 
				
			||||||
            SeqRecord.SeqRecord(nt_sequence, *record_metadata))
 | 
					            SeqRecord.SeqRecord(nt_sequence, *record_metadata))
 | 
				
			||||||
@@ -179,7 +195,7 @@ def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction
 | 
				
			|||||||
                SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
 | 
					                SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        debug(f"Trimming for {s_record.id} complete.")
 | 
					        debug(f"Trimming for {s_record.id} complete.")
 | 
				
			||||||
    debug(f"Sequence trimming for {start} - {end} complete.")
 | 
					    debug(f"Sequence trimming for {tru_start} - {end} complete.")
 | 
				
			||||||
    return (
 | 
					    return (
 | 
				
			||||||
        nt_sequence_records,
 | 
					        nt_sequence_records,
 | 
				
			||||||
        nt_no_stop_sequence_records,
 | 
					        nt_no_stop_sequence_records,
 | 
				
			||||||
@@ -197,23 +213,28 @@ def main(args):
 | 
				
			|||||||
    if args.gene_list:
 | 
					    if args.gene_list:
 | 
				
			||||||
        genes = read_genes_from_csv(args.gene_list)
 | 
					        genes = read_genes_from_csv(args.gene_list)
 | 
				
			||||||
        info(
 | 
					        info(
 | 
				
			||||||
            f"Gene list read from {args.gene_list} resulted in {len(genes)} genes.")
 | 
					            f"Gene list read from {args.gene_list} resulted in {len(genes)} "
 | 
				
			||||||
 | 
					            "genes.")
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        if args.gene_name and args.start and args.end:
 | 
					        if args.gene_name and args.start and args.end:
 | 
				
			||||||
            genes.append([args.gene_name, args.start, args.end])
 | 
					            genes.append([args.gene_name, args.start, args.end])
 | 
				
			||||||
            info(
 | 
					            info(
 | 
				
			||||||
                f"Extracting {args.gene_name} starting at {args.start} to {args.end}.")
 | 
					                f"Extracting {args.gene_name} starting at {args.start} to "
 | 
				
			||||||
 | 
					                f"{args.end}.")
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            raise Exception(
 | 
					            raise Exception(
 | 
				
			||||||
                "Need either a gene list by --gene-list or a start and end via --start, and --end respectively."
 | 
					                "Need either a gene list by --gene-list or a start and end "
 | 
				
			||||||
 | 
					                "via --start, and --end respectively."
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
    for gene_name, start, end in genes:
 | 
					    for gene_name, start, end in genes:
 | 
				
			||||||
 | 
					        info(f"Started on gene {gene_name} ({start} - {end})")
 | 
				
			||||||
        (
 | 
					        (
 | 
				
			||||||
            nt_sequence_records,
 | 
					            nt_sequence_records,
 | 
				
			||||||
            nt_no_stop_sequence_records,
 | 
					            nt_no_stop_sequence_records,
 | 
				
			||||||
            aa_sequence_records,
 | 
					            aa_sequence_records,
 | 
				
			||||||
            aa_no_stop_sequence_records,
 | 
					            aa_no_stop_sequence_records,
 | 
				
			||||||
        ) = trim(start, end, args.gen_cut_stop_codon, msa_records, correction_range=args.correction_range)
 | 
					        ) = trim(start, end, args.gen_cut_stop_codon,
 | 
				
			||||||
 | 
					                 msa_records, correction_range=args.correction_range)
 | 
				
			||||||
        write_to_file(
 | 
					        write_to_file(
 | 
				
			||||||
            args.output_dir,
 | 
					            args.output_dir,
 | 
				
			||||||
            gene_name,
 | 
					            gene_name,
 | 
				
			||||||
@@ -227,7 +248,7 @@ def main(args):
 | 
				
			|||||||
            aa_sequence_records,
 | 
					            aa_sequence_records,
 | 
				
			||||||
            aa_no_stop_sequence_records,
 | 
					            aa_no_stop_sequence_records,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        info(f"Extracted gene sequence for {gene_name} ({start} - {end})")
 | 
					        info(f"Completed gene {gene_name} ({start} - {end})")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
@@ -340,7 +361,8 @@ if __name__ == "__main__":
 | 
				
			|||||||
    parser.add_argument(
 | 
					    parser.add_argument(
 | 
				
			||||||
        "--correction-range",
 | 
					        "--correction-range",
 | 
				
			||||||
        "-R",
 | 
					        "-R",
 | 
				
			||||||
        help="The number of offsets in terms of nucleotides to try in both directions to correct start and stop codons before giving up.",
 | 
					        help="The number of offsets in terms of nucleotides to try in both "
 | 
				
			||||||
 | 
					        "directions to correct start and stop codons before giving up.",
 | 
				
			||||||
        type=int,
 | 
					        type=int,
 | 
				
			||||||
        required=False,
 | 
					        required=False,
 | 
				
			||||||
        default=9
 | 
					        default=9
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user