Renamed project to "SplitMSA" and added pipeline file
	
		
			
	
		
	
	
		
	
		
			Some checks failed
		
		
	
	
		
			
				
	
				ydeng/splitmsa/pipeline/head There was a failure building this commit
				
			
		
		
	
	
				
					
				
			
		
			Some checks failed
		
		
	
	ydeng/splitmsa/pipeline/head There was a failure building this commit
				
			This commit is contained in:
		
							
								
								
									
										26
									
								
								Jenkinsfile
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								Jenkinsfile
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,26 @@
 | 
			
		||||
// Declarative pipeline: set up the conda environment, build the distribution,
// and (on master branches only) upload it to the package registry.
pipeline {
    agent any
    stages {
        stage("install") {
            steps {
                // Create or update the "splitmsa" environment from environment.yml.
                sh 'conda env update --file environment.yml'
                // Packaging tools needed by the build and publish stages; installed
                // explicitly so the pipeline does not rely on the agent's own Python.
                sh 'conda run -n splitmsa python -m pip install --upgrade build twine'
            }
        }
        stage("build") {
            steps {
                // Each `sh` step runs in its own shell, so the environment is selected
                // per command with `conda run` instead of appending an activate line
                // to ~/.bashrc on every build.
                sh 'conda run -n splitmsa python -m build'
            }
        }
        stage("publish") {
            when {
                branch '**/master'
            }
            steps {
                // Bind the credentials straight to the variables twine reads, so the
                // password never appears on the command line.
                withCredentials([usernamePassword(credentialsId: 'rs-git-package-registry-ydeng', passwordVariable: 'TWINE_PASSWORD', usernameVariable: 'TWINE_USERNAME')]) {
                    // Single quotes: the shell, not Groovy, expands the variable, so the
                    // secret is never interpolated into the script text.
                    sh 'conda run -n splitmsa python -m twine upload --repository-url "https://git.reslate.systems/api/packages/$TWINE_USERNAME/pypi" --non-interactive --disable-progress-bar --verbose dist/*'
                }
            }
        }
    }
}
 | 
			
		||||
@@ -1,4 +1,4 @@
 | 
			
		||||
# MSA Splitter
 | 
			
		||||
# SplitMSA
 | 
			
		||||
 | 
			
		||||
Simple FASTA file splitter. Capable of batch trimming a large number of sequences provided as an MSA in a FASTA file.
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,46 +1,5 @@
 | 
			
		||||
name: /home/ydeng/msa-splitter/envs
 | 
			
		||||
name: splitmsa
 | 
			
		||||
channels:
 | 
			
		||||
  - conda-forge
 | 
			
		||||
dependencies:
 | 
			
		||||
  - _libgcc_mutex=0.1=conda_forge
 | 
			
		||||
  - _openmp_mutex=4.5=2_gnu
 | 
			
		||||
  - biopython=1.81=py311h2582759_0
 | 
			
		||||
  - black=23.3.0=py311h38be061_0
 | 
			
		||||
  - bzip2=1.0.8=h7f98852_4
 | 
			
		||||
  - ca-certificates=2022.12.7=ha878542_0
 | 
			
		||||
  - click=8.1.3=unix_pyhd8ed1ab_2
 | 
			
		||||
  - ld_impl_linux-64=2.40=h41732ed_0
 | 
			
		||||
  - libblas=3.9.0=16_linux64_openblas
 | 
			
		||||
  - libcblas=3.9.0=16_linux64_openblas
 | 
			
		||||
  - libexpat=2.5.0=hcb278e6_1
 | 
			
		||||
  - libffi=3.4.2=h7f98852_5
 | 
			
		||||
  - libgcc-ng=12.2.0=h65d4601_19
 | 
			
		||||
  - libgfortran-ng=12.2.0=h69a702a_19
 | 
			
		||||
  - libgfortran5=12.2.0=h337968e_19
 | 
			
		||||
  - libgomp=12.2.0=h65d4601_19
 | 
			
		||||
  - liblapack=3.9.0=16_linux64_openblas
 | 
			
		||||
  - libnsl=2.0.0=h7f98852_0
 | 
			
		||||
  - libopenblas=0.3.21=pthreads_h78a6416_3
 | 
			
		||||
  - libsqlite=3.40.0=h753d276_0
 | 
			
		||||
  - libstdcxx-ng=12.2.0=h46fd767_19
 | 
			
		||||
  - libuuid=2.38.1=h0b41bf4_0
 | 
			
		||||
  - libzlib=1.2.13=h166bdaf_4
 | 
			
		||||
  - mypy_extensions=1.0.0=pyha770c72_0
 | 
			
		||||
  - ncurses=6.3=h27087fc_1
 | 
			
		||||
  - numpy=1.24.2=py311h8e6699e_0
 | 
			
		||||
  - openssl=3.1.0=h0b41bf4_0
 | 
			
		||||
  - packaging=23.0=pyhd8ed1ab_0
 | 
			
		||||
  - pathspec=0.11.1=pyhd8ed1ab_0
 | 
			
		||||
  - pip=23.0.1=pyhd8ed1ab_0
 | 
			
		||||
  - platformdirs=3.2.0=pyhd8ed1ab_0
 | 
			
		||||
  - python=3.11.1=h2755cc3_0_cpython
 | 
			
		||||
  - python_abi=3.11=3_cp311
 | 
			
		||||
  - readline=8.2=h8228510_1
 | 
			
		||||
  - setuptools=67.6.1=pyhd8ed1ab_0
 | 
			
		||||
  - tk=8.6.12=h27826a3_0
 | 
			
		||||
  - typing-extensions=4.5.0=hd8ed1ab_0
 | 
			
		||||
  - typing_extensions=4.5.0=pyha770c72_0
 | 
			
		||||
  - tzdata=2023c=h71feb2d_0
 | 
			
		||||
  - wheel=0.40.0=pyhd8ed1ab_0
 | 
			
		||||
  - xz=5.2.6=h166bdaf_0
 | 
			
		||||
prefix: /home/ydeng/msa-splitter/envs
 | 
			
		||||
  - biopython=1.81
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										3
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								pyproject.toml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,3 @@
 | 
			
		||||
[build-system]
 | 
			
		||||
build-backend = "setuptools.build_meta"
 | 
			
		||||
requires = ["setuptools", "wheel"]
 | 
			
		||||
							
								
								
									
										12
									
								
								setup.cfg
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								setup.cfg
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,12 @@
 | 
			
		||||
[metadata]
 | 
			
		||||
name = splitmsa
 | 
			
		||||
version = 0.0.1
 | 
			
		||||
 | 
			
		||||
[options]
packages = splitmsa
# Biopython is imported as "Bio", but its PyPI distribution is named "biopython".
install_requires =
    biopython
 | 
			
		||||
 | 
			
		||||
[options.entry_points]
 | 
			
		||||
console_scripts =
 | 
			
		||||
    splitmsa = splitmsa.splitmsa:main
 | 
			
		||||
@@ -203,7 +203,7 @@ def trim(
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        if perform_translation and not skip_translation:
 | 
			
		||||
            if '-' in nt_sequence:
 | 
			
		||||
            if "-" in nt_sequence:
 | 
			
		||||
                sequence_with_ambiguity = []
 | 
			
		||||
                for codon_in_sequence in range(0, len(nt_sequence), 3):
 | 
			
		||||
                    codon = nt_sequence[codon_in_sequence : codon_in_sequence + 3]
 | 
			
		||||
@@ -247,72 +247,9 @@ def output_as_csv(gene: str, problems: list[list[str]], output_path: str):
 | 
			
		||||
        writer.writerows(problems)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def main(args):
    """Trim each requested gene out of the input MSA and write the results.

    ``args`` is read as an attribute namespace. Genes are taken from
    ``args.gene_list`` when it is set; otherwise a single gene is built from
    ``args.gene_name``, ``args.start`` and ``args.end``.

    Raises:
        Exception: if ``args.gene_list`` is falsy and any of ``args.gene_name``,
            ``args.start`` or ``args.end`` is falsy.
    """
    logging.basicConfig(level=args.log_level.upper())

    records = list(read_msa_file(args.input))
    info(f"MSA records read complete. Found {len(records)} records.")

    if args.gene_list:
        targets = read_genes_from_csv(args.gene_list)
        info(f"Gene list read from {args.gene_list} resulted in {len(targets)} genes.")
    elif args.gene_name and args.start and args.end:
        targets = [[args.gene_name, args.start, args.end]]
        info(f"Extracting {args.gene_name} starting at {args.start} to {args.end}.")
    else:
        raise Exception(
            "Need either a gene list by --gene-list or a start and end "
            "via --start, and --end respectively."
        )

    for gene, begin, finish in targets:
        info(f"Started on gene {gene} ({begin} - {finish})")

        nt_full, nt_no_stop, aa_full, aa_no_stop, issues = trim(
            begin,
            finish,
            args.gen_cut_stop_codon,
            args.do_translate,
            records,
            correction_range=args.correction_range,
        )

        issue_count = len(issues)
        if issue_count > 0:
            warning(f"There were {issue_count} problems during trimming {gene}!")

        # The problems file is written whenever requested, even if it is empty.
        if args.catalogue_problems:
            problems_path = os.path.join(args.output_dir, f"{gene} - problems.csv")
            output_as_csv(gene, issues, problems_path)

        write_to_file(
            args.output_dir,
            gene,
            begin,
            finish,
            args.full_suffix,
            args.ns_suffix,
            args.aa_suffix,
            nt_full,
            nt_no_stop,
            aa_full,
            aa_no_stop,
        )
        info(f"Completed gene {gene} ({begin} - {finish})")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
def main():
 | 
			
		||||
    parser = argparse.ArgumentParser(
 | 
			
		||||
        prog="msa_splitter",
 | 
			
		||||
        prog="splitmsa",
 | 
			
		||||
        description="""
 | 
			
		||||
            The MSA splitter is a simple program that takes in two positions
 | 
			
		||||
            and a MSA file and produces two separate FASTA files
 | 
			
		||||
@@ -453,4 +390,70 @@ if __name__ == "__main__":
 | 
			
		||||
        action="store_true",
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    main(parser.parse_args())
 | 
			
		||||
    args = parser.parse_args()
 | 
			
		||||
 | 
			
		||||
    logging.basicConfig(level=args.log_level.upper())
 | 
			
		||||
 | 
			
		||||
    msa_records = list(read_msa_file(args.input))
 | 
			
		||||
    info(f"MSA records read complete. Found {len(msa_records)} records.")
 | 
			
		||||
    genes = []
 | 
			
		||||
    if args.gene_list:
 | 
			
		||||
        genes = read_genes_from_csv(args.gene_list)
 | 
			
		||||
        info(f"Gene list read from {args.gene_list} resulted in {len(genes)} " "genes.")
 | 
			
		||||
    else:
 | 
			
		||||
        if args.gene_name and args.start and args.end:
 | 
			
		||||
            genes.append([args.gene_name, args.start, args.end])
 | 
			
		||||
            info(
 | 
			
		||||
                f"Extracting {args.gene_name} starting at {args.start} to "
 | 
			
		||||
                f"{args.end}."
 | 
			
		||||
            )
 | 
			
		||||
        else:
 | 
			
		||||
            raise Exception(
 | 
			
		||||
                "Need either a gene list by --gene-list or a start and end "
 | 
			
		||||
                "via --start, and --end respectively."
 | 
			
		||||
            )
 | 
			
		||||
    for gene_name, start, end in genes:
 | 
			
		||||
        info(f"Started on gene {gene_name} ({start} - {end})")
 | 
			
		||||
        (
 | 
			
		||||
            nt_sequence_records,
 | 
			
		||||
            nt_no_stop_sequence_records,
 | 
			
		||||
            aa_sequence_records,
 | 
			
		||||
            aa_no_stop_sequence_records,
 | 
			
		||||
            problems,
 | 
			
		||||
        ) = trim(
 | 
			
		||||
            start,
 | 
			
		||||
            end,
 | 
			
		||||
            args.gen_cut_stop_codon,
 | 
			
		||||
            args.do_translate,
 | 
			
		||||
            msa_records,
 | 
			
		||||
            correction_range=args.correction_range,
 | 
			
		||||
        )
 | 
			
		||||
        if len(problems) > 0:
 | 
			
		||||
            warning(
 | 
			
		||||
                f"There were {len(problems)} problems " f"during trimming {gene_name}!"
 | 
			
		||||
            )
 | 
			
		||||
        if args.catalogue_problems:
 | 
			
		||||
            output_as_csv(
 | 
			
		||||
                gene_name,
 | 
			
		||||
                problems,
 | 
			
		||||
                os.path.join(args.output_dir, f"{gene_name} - problems.csv"),
 | 
			
		||||
            )
 | 
			
		||||
        write_to_file(
 | 
			
		||||
            args.output_dir,
 | 
			
		||||
            gene_name,
 | 
			
		||||
            start,
 | 
			
		||||
            end,
 | 
			
		||||
            args.full_suffix,
 | 
			
		||||
            args.ns_suffix,
 | 
			
		||||
            args.aa_suffix,
 | 
			
		||||
            nt_sequence_records,
 | 
			
		||||
            nt_no_stop_sequence_records,
 | 
			
		||||
            aa_sequence_records,
 | 
			
		||||
            aa_no_stop_sequence_records,
 | 
			
		||||
        )
 | 
			
		||||
        info(f"Completed gene {gene_name} ({start} - {end})")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
 | 
			
		||||
    main()
 | 
			
		||||
		Reference in New Issue
	
	Block a user