#Toxin
import glob
import os

PROJECT = config["project"]
INPUT = config["input_file"]


# HMM scan
# Runs the tool named by config["hmmscan_tool"] on one split FASTA file
# against the HMM file from config["hmm_file"]. The tabular result
# (--tblout) is written to {output}; stdout and stderr go to {log}.
rule run_HMM:
    input:
        hmm=config["hmm_file"],
        renamed="{OUTDIR}/{project}/splitted/{input_file}/{file_i}.faa"
    output:
        "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmmscan"
    log:
        "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.log"
    message:
        "Run HMM scan on {input.renamed} to generate {output}"
    params:
        outdir="{OUTDIR}"
    conda:
        "../../envs/HMMER.yaml"
    threads: 1
    shell:
        """
        {config[hmmscan_tool]} --cpu {threads} --noali --notextw --tblout {output} {input.hmm} {input.renamed} &> {log}
        """


# Adjust HMM results to correct format
# Drops every line starting with '#' and replaces each run of spaces with a
# single tab, producing a tab-separated file.
rule HMM_correct_format:
    input:
        "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmmscan"
    output:
        "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmm.csv"
    message:
        "Adjust {input} to correct format: {output}"
    params:
        outdir="{OUTDIR}"
    # Backslashes are doubled because this is a regular Python string literal:
    # the shell still receives `s/ \+/\t/g`, but Python no longer sees the
    # invalid escape sequence `\+`.
    shell:
        """
        sed '/^#/ d' {input} | sed 's/ \\+/\\t/g' > {output}
        """


def aggregate_hmm(wildcards):
    """Input function: list the per-split ``.hmm.csv`` files for one input file.

    Looks up the ``splits`` output of the ``splitting`` checkpoint for the
    current wildcards, globs it for ``{i}.faa`` files, and returns one
    ``.hmm.csv`` path per split index found.
    """
    # Accessing the checkpoint output here makes the set of inputs depend on
    # what the checkpoint actually produced.
    checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
    return expand(
        "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmm.csv",
        OUTDIR=wildcards.OUTDIR,
        project=wildcards.project,
        input_file=wildcards.input_file,
        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
    )


# Concatenates tab-separated fields 1, 3, 5 and 6 of all per-split CSV files
# into one temporary file. `uniq` only collapses *adjacent* identical lines.
rule HMM_correct_format_2:
    input:
        aggregate_hmm
    output:
        temp("{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R_temp.csv")
    params:
        outdir="{OUTDIR}"
    shell:
        """
        cut -f 1,3,5,6 {input} | uniq > {output}
        """


# Builds the final CSV: a "#Toxin" line is prepended to the data via a
# scratch "<input_file>_header" file (removed afterwards), then a
# tab-separated column-name line is inserted as line 1 with `sed -i`.
# NOTE(review): the column names assume fields 1/3 of the scan table are
# query/HMM in that order - confirm against the tool set in
# config["hmmscan_tool"].
rule HMM_correct_format_3:
    input:
        "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R_temp.csv"
    output:
        "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R.csv"
    params:
        outdir="{OUTDIR}"
    # The sed script uses the one-line `i\\ text` form inside bash $'...'
    # quoting (presumably GNU sed - verify on other platforms). Backslashes
    # are doubled so that Python passes `1 i\\\\ ...\\t...` through unchanged
    # without relying on the invalid escape sequence backslash-space.
    shell:
        """
        echo "#Toxin" > {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header
        cat {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header {input} > {output}
        rm -rf {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header
        sed -i $'1 i\\\\ Query_sequence\\tHMM_Name\\tSignificance_Evalue\\tScore' {output}
        """