191 lines
5.4 KiB
Python
191 lines
5.4 KiB
Python
import os
|
|
import argparse
|
|
from Bio import SeqIO
|
|
import logging
|
|
from bmlsa.aligner import align_many_to_one_ssw
|
|
|
|
from bmlsa.io import read_annotations_from_csv, save_alignments_to_csv
|
|
|
|
logger = logging.getLogger(__name__)

# Built-in scoring presets selectable with "--behave-as". Each entry maps a
# preset name to the keyword arguments forwarded to the aligner.
DEFAULT_ALIGNMENT_PARAMETERS = {
    "BLASTp": {
        "extend_gap_score": -1,
        "open_gap_score": -11,
        "substitution_matrix": "BLOSUM62",
        "alignment_mode": "local",
    }
}

# Substitution matrix used when neither the command line nor a preset names one.
_FALLBACK_SUBSTITUTION_MATRIX = "BLOSUM62"

# Every key that must be resolved before an alignment can be started.
_SCORING_KEYS = (
    "extend_gap_score",
    "open_gap_score",
    "alignment_mode",
    "substitution_matrix",
)


def _resolve_scoring_parameters(args):
    """Merge the "--behave-as" preset with explicitly given scoring options.

    Explicit options win over preset values. Options are treated as "not
    given" only when they are ``None``, so a gap score of ``0`` is valid.

    Args:
        args: Parsed arguments providing ``behave_as``, ``extend_gap_score``,
            ``open_gap_score``, ``alignment_mode`` and ``substitution_matrix``.

    Returns:
        A new dict holding every key in ``_SCORING_KEYS``, or ``None`` when
        the preset and explicit options together leave a key unresolved.
    """
    scoring = {}

    preset_name = args.behave_as
    if preset_name is not None:
        if preset_name in DEFAULT_ALIGNMENT_PARAMETERS:
            # Copy the values so the shared preset table is never mutated.
            scoring.update(DEFAULT_ALIGNMENT_PARAMETERS[preset_name])
        else:
            logger.warning(
                'Unknown "--behave-as" preset %r; available presets: %s.',
                preset_name,
                ", ".join(sorted(DEFAULT_ALIGNMENT_PARAMETERS)),
            )

    for key in _SCORING_KEYS:
        value = getattr(args, key)
        if value is not None:
            scoring[key] = value

    scoring.setdefault("substitution_matrix", _FALLBACK_SUBSTITUTION_MATRIX)

    if any(key not in scoring for key in _SCORING_KEYS):
        return None
    return scoring


def run(args):
    """Align the annotated query sequences against each reference sequence.

    The scoring parameters are resolved first, so an incomplete command line
    fails before any file is read. The annotations CSV is then loaded once,
    and for every record in the FASTA file at ``args.sequence`` the queries
    are aligned and the result is written to
    ``<args.output>/<record id>.csv``. In the record id, ``|`` is replaced by
    ``+`` and ``.`` by ``_``.

    Args:
        args: Parsed command-line arguments as produced by the parser in
            ``main`` (``annotations``, ``sequence``, ``output``, the
            ``*_header`` column names and the scoring options).

    Raises:
        SystemExit: With status 1 when neither a known "--behave-as" preset
            nor a complete set of explicit scoring options was supplied.
    """
    scoring_parameter = _resolve_scoring_parameters(args)
    if scoring_parameter is None:
        logger.error(
            'Must either specify all of "--extend-gap-score", "--open-gap-score", '
            '"--alignment-mode", or, at least "--behave-as". See help (-h) for '
            "more information."
        )
        # The ``exit`` builtin comes from ``site`` and may be missing (e.g.
        # under ``python -S``); raising SystemExit is always available.
        raise SystemExit(1)

    queries = read_annotations_from_csv(
        args.annotations,
        args.id_header,
        args.seq_header,
        args.name_header,
        args.desc_header,
        args.start_header,
        args.end_header,
    )

    with open(args.sequence, "r") as sequence_fd:
        for sequence in SeqIO.parse(sequence_fd, "fasta"):
            aligned_annotations = align_many_to_one_ssw(
                str(sequence.seq), queries, **scoring_parameter
            )
            # Replace characters from the record id that are awkward in file names.
            output_name = sequence.id.replace("|", "+").replace(".", "_") + ".csv"
            save_alignments_to_csv(
                aligned_annotations,
                os.path.join(args.output, output_name),
            )
|
|
|
|
|
|
def _build_arg_parser():
    """Create the ``bmlsa`` command-line parser with all of its arguments."""
    arg_parser = argparse.ArgumentParser("bmlsa")

    # Positional arguments: input CSV, reference FASTA and output location.
    arg_parser.add_argument(
        "annotations",
        type=str,
        help=(
            "Path to CSV containing the sequences to align as well as the "
            "annotations for the respective sequences."
        ),
        metavar="a",
    )
    arg_parser.add_argument(
        "sequence",
        type=str,
        help=(
            "Path to the sequence to use as reference in FASTA format. "
            "If multiple sequences are present in the same FASTA file, "
            "each will be used as a separate reference sequence for separate "
            "runs automatically."
        ),
        metavar="s",
    )
    arg_parser.add_argument(
        "output", type=str, help="Path to output location", metavar="o"
    )

    # Column headers used to read the annotations CSV.
    arg_parser.add_argument(
        "-I",
        "--id-header",
        type=str,
        help=(
            "The header of the column for the ID of the sequence to align to "
            "the reference sequence."
        ),
        required=True,
    )
    arg_parser.add_argument(
        "-N",
        "--name-header",
        type=str,
        help=(
            "The header of the column for the name of the sequence to align to "
            "the reference sequence."
        ),
        required=False,
    )
    arg_parser.add_argument(
        "-D",
        "--desc-header",
        type=str,
        help=(
            "The header of the column for the description of the sequence to "
            "align to the reference sequence."
        ),
        required=False,
    )
    arg_parser.add_argument(
        "-T",
        "--start-header",
        type=str,
        help=(
            "The header of the column for the start position of the sequence to "
            "align to the reference sequence."
        ),
        required=False,
    )
    arg_parser.add_argument(
        "-E",
        "--end-header",
        type=str,
        help=(
            "The header of the column for end position of the sequence to "
            "align to the reference sequence."
        ),
        required=False,
    )
    arg_parser.add_argument(
        "-S",
        "--seq-header",
        type=str,
        help=(
            "The header of the column for the actual sequence to align to the "
            "reference sequence."
        ),
        required=True,
    )

    # Alignment scoring options; all default to None so that "not given" can
    # be told apart from an explicit value.
    arg_parser.add_argument(
        "-e",
        "--extend-gap-score",
        type=int,
        help="The scoring for extending a gap.",
        required=False,
        default=None,
    )
    arg_parser.add_argument(
        "-o",
        "--open-gap-score",
        type=int,
        help="The scoring for opening a gap.",
        required=False,
        default=None,
    )
    arg_parser.add_argument(
        "-M",
        "--alignment-mode",
        type=str,
        help="The alignment mode.",
        choices=["local", "global"],
        required=False,
    )
    arg_parser.add_argument(
        "-m",
        "--substitution-matrix",
        type=str,
        help="The name of the substitution matrix.",
        required=False,
        default=None,
    )
    arg_parser.add_argument(
        "-B",
        "--behave-as",
        type=str,
        help=(
            "Use built-in parameters for alignment scoring. If this is specified "
            "along either of the scoring arguments, the alignment parameter arguments "
            '("--open-gap-score" and/or "--extend-gap-score" and "--alignment-mode") '
            "will override any defaults set by this argument."
        ),
        required=False,
        default=None,
    )
    return arg_parser


def main(argv=None):
    """Parse the command line and run the alignment.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. The default ``None`` keeps the original
            behaviour of reading the process arguments.

    Raises:
        SystemExit: Raised by ``argparse`` for ``-h`` (status 0) or invalid
            arguments (status 2).
    """
    args = _build_arg_parser().parse_args(argv)
    run(args)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|