Added problem cataloging.

This commit is contained in:
Harrison Deng 2023-03-22 15:13:37 -05:00
parent e5ead8f197
commit 89cde939b0
2 changed files with 70 additions and 16 deletions

View File

@ -1,3 +1,28 @@
# MSA Splitter # MSA Splitter
Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file. Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file.
## Features
- Split large fasta files that contain a multiple sequence alignment (MSA) into individual genes
- Trim off stop codon
- Batch process multiple genes from one MSA
- Correct gene start and stop locations based on start and stop codon location
- Catalogues errors that occurred in human-readable CSV file
## Planned
- Translate MSA into amino acids while maintaining alignment (shows type of mutation if frameshift)
- Simple to use GUI
- Run without system-wide python install
## Use
### Command Line
1. Install python 3
2. Install `biopython`
- Using `pip`: `pip install biopython` or `pip3 install biopython`
- Using `conda`: ` conda install -c conda-forge biopython`
3. Download `msa_splitter.py` and run with `python3 msa_splitter.py`
- `python3 msa_splitter.py -h` for help

View File

@ -116,6 +116,7 @@ def trim(
nt_no_stop_sequence_records = [] nt_no_stop_sequence_records = []
aa_sequence_records = [] aa_sequence_records = []
aa_no_stop_sequence_records = [] aa_no_stop_sequence_records = []
problems = []
debug(f"Beginning sequence trimming for {tru_start} - {end}.") debug(f"Beginning sequence trimming for {tru_start} - {end}.")
for s_record in msa_records: for s_record in msa_records:
@ -141,15 +142,12 @@ def trim(
start_shift = -i start_shift = -i
break break
if not found_start_codon: if not found_start_codon:
warning( problems.append([start, end, s_record.id,
f"Could not find start codon for region {tru_start} - {end} " "Could not find start codon"])
f"with sequence ID {s_record.id}. Continuing without "
f"correction...")
if found_start_codon and start_shift != 0: if found_start_codon and start_shift != 0:
warning( problems.append([start, end, s_record.id,
"Start codon was not found at expected location for region " "Corrected start codon to "
f"{tru_start} - {end} with sequence ID {s_record.id}. " f"{tru_start + start_shift}"])
f"Correcting start location to {tru_start + start_shift}")
end_shift = 0 end_shift = 0
found_stop_codon = False found_stop_codon = False
@ -164,15 +162,13 @@ def trim(
end_shift = -i end_shift = -i
break break
if not found_stop_codon: if not found_stop_codon:
warning( problems.append([start, end, s_record.id,
f"Could not find stop codon for region {tru_start} - {end} " "Could not find stop codon"])
f"with sequence ID {s_record.id}. Continuing without "
f"correction...")
if found_stop_codon and end_shift != 0: if found_stop_codon and end_shift != 0:
warning( problems.append([start, end, s_record.id,
f"Stop codon was not found at expected location for region " "Corrected stop codon to "
f"{tru_start} - {end} with sequence ID {s_record.id}. " f"{end + end_shift}"])
f"Correcting end location to: {end + end_shift}.")
nt_sequence = s_record.seq[tru_start + nt_sequence = s_record.seq[tru_start +
start_shift: end + end_shift] # Cropping start_shift: end + end_shift] # Cropping
nt_no_stop_sequence = s_record.seq[tru_start + nt_no_stop_sequence = s_record.seq[tru_start +
@ -201,9 +197,23 @@ def trim(
nt_no_stop_sequence_records, nt_no_stop_sequence_records,
aa_sequence_records, aa_sequence_records,
aa_no_stop_sequence_records, aa_no_stop_sequence_records,
problems
) )
def log_problems(gene: str, problems: list[list[str]]):
for start, end, id, desc in problems:
warning(f"{gene} ({start} - {end}) of {id}: {desc}")
def output_as_csv(gene: str, problems: list[list[str]], output_path: str):
header = ["Start", "End", "Sequence ID", "Problem"]
with open(output_path, "w") as problem_csv_fd:
writer = csv.writer(problem_csv_fd)
writer.writerow(header)
writer.writerows(problems)
def main(args): def main(args):
logging.basicConfig(level=args.log_level.upper()) logging.basicConfig(level=args.log_level.upper())
@ -233,8 +243,18 @@ def main(args):
nt_no_stop_sequence_records, nt_no_stop_sequence_records,
aa_sequence_records, aa_sequence_records,
aa_no_stop_sequence_records, aa_no_stop_sequence_records,
problems
) = trim(start, end, args.gen_cut_stop_codon, ) = trim(start, end, args.gen_cut_stop_codon,
msa_records, correction_range=args.correction_range) msa_records, correction_range=args.correction_range)
if len(problems) > 0:
warning(f"There were {len(problems)} problems "
f"during trimming {gene_name}!")
if args.catalogue_problems:
output_as_csv(
gene_name,
problems,
os.path.join(args.output_dir, f"{gene_name} - problems.csv")
)
write_to_file( write_to_file(
args.output_dir, args.output_dir,
gene_name, gene_name,
@ -377,4 +397,13 @@ if __name__ == "__main__":
default="INFO" default="INFO"
) )
parser.add_argument(
"-C",
"--catalogue-problems",
help="Generates a CSV for each gene listing the problems that "
"occurred during trimming.",
required=False,
action="store_true"
)
main(parser.parse_args()) main(parser.parse_args())