diff --git a/README.md b/README.md index 96efe71..d560a84 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,28 @@ # MSA Splitter Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file. + +## Features + + - Split large fasta files that contain a multiple sequence alignment (MSA) into individual genes + - Trim off stop codon + - Batch process multiple genes from one MSA + - Correct gene start and stop locations based on start and stop codon location + - Catalogues errors that occurred in human-readable CSV file + +## Planned + + - Translate MSA into amino acids while maintaining alignment (shows type of mutation if frameshift) + - Simple to use GUI + - Run without system-wide python install + +## Use + +### Command Line + +1. Install python 3 +2. Install `biopython` + - Using `pip`: `pip install biopython` or `pip3 install biopython` + - Using `conda`: ` conda install -c conda-forge biopython` +3. Download `msa_splitter.py` and run with `python3 msa_splitter.py` + - `python3 msa_splitter.py -h` for help \ No newline at end of file diff --git a/msa_splitter.py b/msa_splitter.py index 5faf1b2..0dc05b4 100755 --- a/msa_splitter.py +++ b/msa_splitter.py @@ -116,6 +116,7 @@ def trim( nt_no_stop_sequence_records = [] aa_sequence_records = [] aa_no_stop_sequence_records = [] + problems = [] debug(f"Beginning sequence trimming for {tru_start} - {end}.") for s_record in msa_records: @@ -141,15 +142,12 @@ def trim( start_shift = -i break if not found_start_codon: - warning( - f"Could not find start codon for region {tru_start} - {end} " - f"with sequence ID {s_record.id}. Continuing without " - f"correction...") + problems.append([start, end, s_record.id, + "Could not find start codon"]) if found_start_codon and start_shift != 0: - warning( - "Start codon was not found at expected location for region " - f"{tru_start} - {end} with sequence ID {s_record.id}. " - f"Correcting start location to {tru_start + start_shift}") + problems.append([start, end, s_record.id, + "Corrected start codon to " + f"{tru_start + start_shift}"]) end_shift = 0 found_stop_codon = False @@ -164,15 +162,13 @@ def trim( end_shift = -i break if not found_stop_codon: - warning( - f"Could not find stop codon for region {tru_start} - {end} " - f"with sequence ID {s_record.id}. Continuing without " - f"correction...") + problems.append([start, end, s_record.id, + "Could not find stop codon"]) if found_stop_codon and end_shift != 0: - warning( - f"Stop codon was not found at expected location for region " - f"{tru_start} - {end} with sequence ID {s_record.id}. " - f"Correcting end location to: {end + end_shift}.") + problems.append([start, end, s_record.id, + "Corrected stop codon to " + f"{end + end_shift}"]) + nt_sequence = s_record.seq[tru_start + start_shift: end + end_shift] # Cropping nt_no_stop_sequence = s_record.seq[tru_start + @@ -201,9 +197,23 @@ def trim( nt_no_stop_sequence_records, aa_sequence_records, aa_no_stop_sequence_records, + problems ) +def log_problems(gene: str, problems: list[list[str]]): + for start, end, id, desc in problems: + warning(f"{gene} ({start} - {end}) of {id}: {desc}") + + +def output_as_csv(gene: str, problems: list[list[str]], output_path: str): + header = ["Start", "End", "Sequence ID", "Problem"] + with open(output_path, "w") as problem_csv_fd: + writer = csv.writer(problem_csv_fd) + writer.writerow(header) + writer.writerows(problems) + + def main(args): logging.basicConfig(level=args.log_level.upper()) @@ -233,8 +243,18 @@ def main(args): nt_no_stop_sequence_records, aa_sequence_records, aa_no_stop_sequence_records, + problems ) = trim(start, end, args.gen_cut_stop_codon, msa_records, correction_range=args.correction_range) + if len(problems) > 0: + warning(f"There were {len(problems)} problems " + f"during trimming {gene_name}!") + if args.catalogue_problems: + output_as_csv( + gene_name, + problems, + os.path.join(args.output_dir, f"{gene_name} - problems.csv") + ) write_to_file( args.output_dir, gene_name, @@ -377,4 +397,13 @@ if __name__ == "__main__": default="INFO" ) + parser.add_argument( + "-C", + "--catalogue-problems", + help="Generates a CSV for each gene listing the problems that " + "occurred during trimming.", + required=False, + action="store_true" + ) + main(parser.parse_args())