Compare commits
No commits in common. "89cde939b091a89a32120662a9b5d74ff6e3956a" and "99f75942ac5758ceaf686fb6d0babd6a93388376" have entirely different histories.
89cde939b0
...
99f75942ac
25
README.md
25
README.md
@ -1,28 +1,3 @@
|
|||||||
# MSA Splitter
|
# MSA Splitter
|
||||||
|
|
||||||
Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file.
|
Simple FASTA file splitter. Capable of batch trimming a large amount of sequences in the form of a MSA in a FASTA file.
|
||||||
|
|
||||||
## Features
|
|
||||||
|
|
||||||
- Split large fasta files that contain a multiple sequence alignment (MSA) into individual genes
|
|
||||||
- Trim off stop codon
|
|
||||||
- Batch process multiple genes from one MSA
|
|
||||||
- Correct gene start and stop locations based on start and stop codon location
|
|
||||||
- Catalogues errors that occurred in human-readable CSV file
|
|
||||||
|
|
||||||
## Planned
|
|
||||||
|
|
||||||
- Translate MSA into amino acids while maintaining alignment (shows type of mutation if frameshift)
|
|
||||||
- Simple to use GUI
|
|
||||||
- Run without system-wide python install
|
|
||||||
|
|
||||||
## Use
|
|
||||||
|
|
||||||
### Command Line
|
|
||||||
|
|
||||||
1. Install python 3
|
|
||||||
2. Install `biopython`
|
|
||||||
- Using `pip`: `pip install biopython` or `pip3 install biopython`
|
|
||||||
- Using `conda`: ` conda install -c conda-forge biopython`
|
|
||||||
3. Download `msa_splitter.py` and run with `python3 msa_splitter.py`
|
|
||||||
- `python3 msa_splitter.py -h` for help
|
|
105
msa_splitter.py
105
msa_splitter.py
@ -25,7 +25,7 @@ def read_genes_from_csv(batch_genes_csv_path: str):
|
|||||||
csv_header = list(row)
|
csv_header = list(row)
|
||||||
else:
|
else:
|
||||||
gene_name = row[csv_header.index("Gene")]
|
gene_name = row[csv_header.index("Gene")]
|
||||||
gene_start = int(row[csv_header.index("Start")])
|
gene_start = int(row[csv_header.index("Start")]) - 1
|
||||||
gene_end = int(row[csv_header.index("End")])
|
gene_end = int(row[csv_header.index("End")])
|
||||||
genes.append((gene_name, gene_start, gene_end))
|
genes.append((gene_name, gene_start, gene_end))
|
||||||
return genes
|
return genes
|
||||||
@ -104,21 +104,14 @@ STOP_CODONS = {"TAG", "TAA", "TGA"}
|
|||||||
START_CODON = "ATG"
|
START_CODON = "ATG"
|
||||||
|
|
||||||
|
|
||||||
def trim(
|
def trim(start: int, end: int, gen_cut_stop_codon: bool, msa_records, correction_range=16):
|
||||||
start: int,
|
|
||||||
end: int,
|
|
||||||
gen_cut_stop_codon: bool,
|
|
||||||
msa_records,
|
|
||||||
correction_range=16
|
|
||||||
):
|
|
||||||
tru_start = start - 1
|
|
||||||
nt_sequence_records = []
|
nt_sequence_records = []
|
||||||
nt_no_stop_sequence_records = []
|
nt_no_stop_sequence_records = []
|
||||||
aa_sequence_records = []
|
aa_sequence_records = []
|
||||||
aa_no_stop_sequence_records = []
|
aa_no_stop_sequence_records = []
|
||||||
problems = []
|
|
||||||
|
|
||||||
debug(f"Beginning sequence trimming for {tru_start} - {end}.")
|
debug(f"Beginning sequence trimming for {start} - {end}.")
|
||||||
for s_record in msa_records:
|
for s_record in msa_records:
|
||||||
record_metadata = (
|
record_metadata = (
|
||||||
s_record.id,
|
s_record.id,
|
||||||
@ -132,22 +125,20 @@ def trim(
|
|||||||
found_start_codon = False
|
found_start_codon = False
|
||||||
start_shift = 0
|
start_shift = 0
|
||||||
for i in range(correction_range):
|
for i in range(correction_range):
|
||||||
if s_record.seq[tru_start + i: tru_start + i + 3] == START_CODON:
|
if s_record.seq[start + i: start + i + 3] == START_CODON:
|
||||||
found_start_codon = True
|
found_start_codon = True
|
||||||
start_shift = i
|
start_shift = i
|
||||||
break
|
break
|
||||||
if not found_start_codon and s_record.seq[
|
if s_record.seq[start - i: start - i + 3] == START_CODON:
|
||||||
tru_start - i: tru_start - i + 3] == START_CODON:
|
|
||||||
found_start_codon = True
|
found_start_codon = True
|
||||||
start_shift = -i
|
start_shift = -i
|
||||||
break
|
break
|
||||||
if not found_start_codon:
|
if not found_start_codon:
|
||||||
problems.append([start, end, s_record.id,
|
warning(
|
||||||
"Could not find start codon"])
|
f"Could not find start codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
|
||||||
if found_start_codon and start_shift != 0:
|
if start_shift != 0:
|
||||||
problems.append([start, end, s_record.id,
|
warning(
|
||||||
"Corrected start codon to "
|
f"Start codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting start location to {start + start_shift}")
|
||||||
f"{tru_start + start_shift}"])
|
|
||||||
|
|
||||||
end_shift = 0
|
end_shift = 0
|
||||||
found_stop_codon = False
|
found_stop_codon = False
|
||||||
@ -156,22 +147,19 @@ def trim(
|
|||||||
found_stop_codon = True
|
found_stop_codon = True
|
||||||
end_shift = i
|
end_shift = i
|
||||||
break
|
break
|
||||||
if not found_stop_codon and s_record.seq[
|
if s_record.seq[end - i - 3: end - i] in STOP_CODONS:
|
||||||
end - i - 3: end - i] in STOP_CODONS:
|
|
||||||
found_stop_codon = True
|
found_stop_codon = True
|
||||||
end_shift = -i
|
end_shift = -i
|
||||||
break
|
break
|
||||||
if not found_stop_codon:
|
if not found_stop_codon:
|
||||||
problems.append([start, end, s_record.id,
|
warning(
|
||||||
"Could not find stop codon"])
|
f"Could not find stop codon for region {start} - {end} with sequence ID {s_record.id}. Continuing without shift...")
|
||||||
if found_stop_codon and end_shift != 0:
|
if end_shift != 0:
|
||||||
problems.append([start, end, s_record.id,
|
warning(
|
||||||
"Corrected stop codon to "
|
f"Stop codon was not found at expected location for region {start} - {end} with sequence ID {s_record.id}. Correcting end location to: {end + end_shift}.")
|
||||||
f"{end + end_shift}"])
|
nt_sequence = s_record.seq[start +
|
||||||
|
start_shift: end + end_shift] # Cropping step
|
||||||
nt_sequence = s_record.seq[tru_start +
|
nt_no_stop_sequence = s_record.seq[start +
|
||||||
start_shift: end + end_shift] # Cropping
|
|
||||||
nt_no_stop_sequence = s_record.seq[tru_start +
|
|
||||||
start_shift: end - 3 + end_shift]
|
start_shift: end - 3 + end_shift]
|
||||||
nt_sequence_records.append(
|
nt_sequence_records.append(
|
||||||
SeqRecord.SeqRecord(nt_sequence, *record_metadata))
|
SeqRecord.SeqRecord(nt_sequence, *record_metadata))
|
||||||
@ -191,29 +179,15 @@ def trim(
|
|||||||
SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
|
SeqRecord.SeqRecord(aa_no_stop_sequence, *record_metadata)
|
||||||
)
|
)
|
||||||
debug(f"Trimming for {s_record.id} complete.")
|
debug(f"Trimming for {s_record.id} complete.")
|
||||||
debug(f"Sequence trimming for {tru_start} - {end} complete.")
|
debug(f"Sequence trimming for {start} - {end} complete.")
|
||||||
return (
|
return (
|
||||||
nt_sequence_records,
|
nt_sequence_records,
|
||||||
nt_no_stop_sequence_records,
|
nt_no_stop_sequence_records,
|
||||||
aa_sequence_records,
|
aa_sequence_records,
|
||||||
aa_no_stop_sequence_records,
|
aa_no_stop_sequence_records,
|
||||||
problems
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def log_problems(gene: str, problems: list[list[str]]):
|
|
||||||
for start, end, id, desc in problems:
|
|
||||||
warning(f"{gene} ({start} - {end}) of {id}: {desc}")
|
|
||||||
|
|
||||||
|
|
||||||
def output_as_csv(gene: str, problems: list[list[str]], output_path: str):
|
|
||||||
header = ["Start", "End", "Sequence ID", "Problem"]
|
|
||||||
with open(output_path, "w") as problem_csv_fd:
|
|
||||||
writer = csv.writer(problem_csv_fd)
|
|
||||||
writer.writerow(header)
|
|
||||||
writer.writerows(problems)
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
logging.basicConfig(level=args.log_level.upper())
|
logging.basicConfig(level=args.log_level.upper())
|
||||||
|
|
||||||
@ -223,38 +197,23 @@ def main(args):
|
|||||||
if args.gene_list:
|
if args.gene_list:
|
||||||
genes = read_genes_from_csv(args.gene_list)
|
genes = read_genes_from_csv(args.gene_list)
|
||||||
info(
|
info(
|
||||||
f"Gene list read from {args.gene_list} resulted in {len(genes)} "
|
f"Gene list read from {args.gene_list} resulted in {len(genes)} genes.")
|
||||||
"genes.")
|
|
||||||
else:
|
else:
|
||||||
if args.gene_name and args.start and args.end:
|
if args.gene_name and args.start and args.end:
|
||||||
genes.append([args.gene_name, args.start, args.end])
|
genes.append([args.gene_name, args.start, args.end])
|
||||||
info(
|
info(
|
||||||
f"Extracting {args.gene_name} starting at {args.start} to "
|
f"Extracting {args.gene_name} starting at {args.start} to {args.end}.")
|
||||||
f"{args.end}.")
|
|
||||||
else:
|
else:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
"Need either a gene list by --gene-list or a start and end "
|
"Need either a gene list by --gene-list or a start and end via --start, and --end respectively."
|
||||||
"via --start, and --end respectively."
|
|
||||||
)
|
)
|
||||||
for gene_name, start, end in genes:
|
for gene_name, start, end in genes:
|
||||||
info(f"Started on gene {gene_name} ({start} - {end})")
|
|
||||||
(
|
(
|
||||||
nt_sequence_records,
|
nt_sequence_records,
|
||||||
nt_no_stop_sequence_records,
|
nt_no_stop_sequence_records,
|
||||||
aa_sequence_records,
|
aa_sequence_records,
|
||||||
aa_no_stop_sequence_records,
|
aa_no_stop_sequence_records,
|
||||||
problems
|
) = trim(start, end, args.gen_cut_stop_codon, msa_records, correction_range=args.correction_range)
|
||||||
) = trim(start, end, args.gen_cut_stop_codon,
|
|
||||||
msa_records, correction_range=args.correction_range)
|
|
||||||
if len(problems) > 0:
|
|
||||||
warning(f"There were {len(problems)} problems "
|
|
||||||
f"during trimming {gene_name}!")
|
|
||||||
if args.catalogue_problems:
|
|
||||||
output_as_csv(
|
|
||||||
gene_name,
|
|
||||||
problems,
|
|
||||||
os.path.join(args.output_dir, f"{gene_name} - problems.csv")
|
|
||||||
)
|
|
||||||
write_to_file(
|
write_to_file(
|
||||||
args.output_dir,
|
args.output_dir,
|
||||||
gene_name,
|
gene_name,
|
||||||
@ -268,7 +227,7 @@ def main(args):
|
|||||||
aa_sequence_records,
|
aa_sequence_records,
|
||||||
aa_no_stop_sequence_records,
|
aa_no_stop_sequence_records,
|
||||||
)
|
)
|
||||||
info(f"Completed gene {gene_name} ({start} - {end})")
|
info(f"Extracted gene sequence for {gene_name} ({start} - {end})")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -381,8 +340,7 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--correction-range",
|
"--correction-range",
|
||||||
"-R",
|
"-R",
|
||||||
help="The number of offsets in terms of nucleotides to try in both "
|
help="The number of offsets in terms of nucleotides to try in both directions to correct start and stop codons before giving up.",
|
||||||
"directions to correct start and stop codons before giving up.",
|
|
||||||
type=int,
|
type=int,
|
||||||
required=False,
|
required=False,
|
||||||
default=9
|
default=9
|
||||||
@ -397,13 +355,4 @@ if __name__ == "__main__":
|
|||||||
default="INFO"
|
default="INFO"
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"-C",
|
|
||||||
"--catalogue-problems",
|
|
||||||
help="Generates a CSV for each gene listing the problems that "
|
|
||||||
"occurred during trimming.",
|
|
||||||
required=False,
|
|
||||||
action="store_true"
|
|
||||||
)
|
|
||||||
|
|
||||||
main(parser.parse_args())
|
main(parser.parse_args())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user