Compare commits

..

No commits in common. "89cde939b091a89a32120662a9b5d74ff6e3956a" and "99f75942ac5758ceaf686fb6d0babd6a93388376" have entirely different histories.

2 changed files with 27 additions and 103 deletions

View File

@ -1,28 +1,3 @@
# MSA Splitter # MSA Splitter
Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file. Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file.
## Features
- Split large fasta files that contain a multiple sequence alignment (MSA) into individual genes
- Trim off stop codon
- Batch process multiple genes from one MSA
- Correct gene start and stop locations based on start and stop codon location
- Catalogues errors that occurred in human-readable CSV file
## Planned
- Translate MSA into amino acids while maintaining alignment (shows type of mutation if frameshift)
- Simple to use GUI
- Run without system-wide python install
## Use
### Command Line
1. Install python 3
2. Install `biopython`
- Using `pip`: `pip install biopython` or `pip3 install biopython`
- Using `conda`: ` conda install -c conda-forge biopython`
3. Download `msa_splitter.py` and run with `python3 msa_splitter.py`
- `python3 msa_splitter.py -h` for help

View File

@ -25,7 +25,7 @@ def read_genes_from_csv(batch_genes_csv_path: str):
csv_header = list(row) csv_header = list(row)
else: else:
gene_name = row[csv_header.index("Gene")] gene_name = row[csv_header.index("Gene")]
gene_start = int(row[csv_header.index("Start")]) gene_start = int(row[csv_header.index("Start")]) - 1
gene_end = int(row[csv_header.index("End")]) gene_end = int(row[csv_header.index("End")])
genes.append((gene_name, gene_start, gene_end)) genes.append((gene_name, gene_start, gene_end))
return genes return genes
@ -104,21 +104,14 @@ STOP_CODONS = {"TAG", "TAA", "TGA"}
START_CODON = "ATG" START_CODON = "ATG"
def trim( def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction_range=16):
start: int,
end: int,
gen_cut_stop_codon: bool,
msa_records,
correction_range=16
):
tru_start = start - 1
nt_sequence_records = [] nt_sequence_records = []
nt_no_stop_sequence_records = [] nt_no_stop_sequence_records = []
aa_sequence_records = [] aa_sequence_records = []
aa_no_stop_sequence_records = [] aa_no_stop_sequence_records = []
problems = []
debug(f"Beginning sequence trimming for {tru_start} - {end}.") debug(f"Beginning sequence trimming for {start} - {end}.")
for s_record in msa_records: for s_record in msa_records:
record_metadata = ( record_metadata = (
s_record.id, s_record.id,
@ -132,22 +125,20 @@ def trim(
found_start_codon = False found_start_codon = False
start_shift = 0 start_shift = 0
for i in range(correction_range): for i in range(correction_range):
if s_record.seq[tru_start + i: tru_start + i + 3] == START_CODON: if s_record.seq[start + i: start + i + 3] == START_CODON:
found_start_codon = True found_start_codon = True
start_shift = i start_shift = i
break break
if not found_start_codon and s_record.seq[ if s_record.seq[start - i: start - i + 3] == START_CODON:
tru_start - i: tru_start - i + 3] == START_CODON:
found_start_codon = True found_start_codon = True
start_shift = -i start_shift = -i
break break
if not found_start_codon: if not found_start_codon:
problems.append([start, end, s_record.id, warning(
"Could not find start codon"]) f"Could not find start codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
if found_start_codon and start_shift != 0: if start_shift != 0:
problems.append([start, end, s_record.id, warning(
"Corrected start codon to " f"Start codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting start location to {start + start_shift}")
f"{tru_start + start_shift}"])
end_shift = 0 end_shift = 0
found_stop_codon = False found_stop_codon = False
@ -156,22 +147,19 @@ def trim(
found_stop_codon = True found_stop_codon = True
end_shift = i end_shift = i
break break
if not found_stop_codon and s_record.seq[ if s_record.seq[end - i - 3: end - i] in STOP_CODONS:
end - i - 3: end - i] in STOP_CODONS:
found_stop_codon = True found_stop_codon = True
end_shift = -i end_shift = -i
break break
if not found_stop_codon: if not found_stop_codon:
problems.append([start, end, s_record.id, warning(
"Could not find stop codon"]) f"Could not find stop codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
if found_stop_codon and end_shift != 0: if end_shift != 0:
problems.append([start, end, s_record.id, warning(
"Corrected stop codon to " f"Stop codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting end location to: {end + end_shift}.")
f"{end + end_shift}"]) nt_sequence = s_record.seq[start +
start_shift: end + end_shift] # Cropping step
nt_sequence = s_record.seq[tru_start + nt_no_stop_sequence = s_record.seq[start +
start_shift: end + end_shift] # Cropping
nt_no_stop_sequence = s_record.seq[tru_start +
start_shift: end - 3 + end_shift] start_shift: end - 3 + end_shift]
nt_sequence_records.append( nt_sequence_records.append(
SeqRecord.SeqRecord(nt_sequence, *record_metadata)) SeqRecord.SeqRecord(nt_sequence, *record_metadata))
@ -191,29 +179,15 @@ def trim(
SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata) SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
) )
debug(f"Trimming for {s_record.id} complete.") debug(f"Trimming for {s_record.id} complete.")
debug(f"Sequence trimming for {tru_start} - {end} complete.") debug(f"Sequence trimming for {start} - {end} complete.")
return ( return (
nt_sequence_records, nt_sequence_records,
nt_no_stop_sequence_records, nt_no_stop_sequence_records,
aa_sequence_records, aa_sequence_records,
aa_no_stop_sequence_records, aa_no_stop_sequence_records,
problems
) )
def log_problems(gene: str, problems: list[list[str]]):
for start, end, id, desc in problems:
warning(f"{gene} ({start} - {end}) of {id}: {desc}")
def output_as_csv(gene: str, problems: list[list[str]], output_path: str):
header = ["Start", "End", "Sequence ID", "Problem"]
with open(output_path, "w") as problem_csv_fd:
writer = csv.writer(problem_csv_fd)
writer.writerow(header)
writer.writerows(problems)
def main(args): def main(args):
logging.basicConfig(level=args.log_level.upper()) logging.basicConfig(level=args.log_level.upper())
@ -223,38 +197,23 @@ def main(args):
if args.gene_list: if args.gene_list:
genes = read_genes_from_csv(args.gene_list) genes = read_genes_from_csv(args.gene_list)
info( info(
f"Gene list read from {args.gene_list} resulted in {len(genes)} " f"Gene list read from {args.gene_list} resulted in {len(genes)} genes.")
"genes.")
else: else:
if args.gene_name and args.start and args.end: if args.gene_name and args.start and args.end:
genes.append([args.gene_name, args.start, args.end]) genes.append([args.gene_name, args.start, args.end])
info( info(
f"Extracting {args.gene_name} starting at {args.start} to " f"Extracting {args.gene_name} starting at {args.start} to {args.end}.")
f"{args.end}.")
else: else:
raise Exception( raise Exception(
"Need either a gene list by --gene-list or a start and end " "Need either a gene list by --gene-list or a start and end via --start, and --end respectively."
"via --start, and --end respectively."
) )
for gene_name, start, end in genes: for gene_name, start, end in genes:
info(f"Started on gene {gene_name} ({start} - {end})")
( (
nt_sequence_records, nt_sequence_records,
nt_no_stop_sequence_records, nt_no_stop_sequence_records,
aa_sequence_records, aa_sequence_records,
aa_no_stop_sequence_records, aa_no_stop_sequence_records,
problems ) = trim(start, end, args.gen_cut_stop_codon, msa_records, correction_range=args.correction_range)
) = trim(start, end, args.gen_cut_stop_codon,
msa_records, correction_range=args.correction_range)
if len(problems) > 0:
warning(f"There were {len(problems)} problems "
f"during trimming {gene_name}!")
if args.catalogue_problems:
output_as_csv(
gene_name,
problems,
os.path.join(args.output_dir, f"{gene_name} - problems.csv")
)
write_to_file( write_to_file(
args.output_dir, args.output_dir,
gene_name, gene_name,
@ -268,7 +227,7 @@ def main(args):
aa_sequence_records, aa_sequence_records,
aa_no_stop_sequence_records, aa_no_stop_sequence_records,
) )
info(f"Completed gene {gene_name} ({start} - {end})") info(f"Extracted gene sequence for {gene_name} ({start} - {end})")
if __name__ == "__main__": if __name__ == "__main__":
@ -381,8 +340,7 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--correction-range", "--correction-range",
"-R", "-R",
help="The number of offsets in terms of nucleotides to try in both " help="The number of offsets in terms of nucleotides to try in both directions to correct start and stop codons before giving up.",
"directions to correct start and stop codons before giving up.",
type=int, type=int,
required=False, required=False,
default=9 default=9
@ -397,13 +355,4 @@ if __name__ == "__main__":
default="INFO" default="INFO"
) )
parser.add_argument(
"-C",
"--catalogue-problems",
help="Generates a CSV for each gene listing the problems that "
"occurred during trimming.",
required=False,
action="store_true"
)
main(parser.parse_args()) main(parser.parse_args())