# Toxin.smk
#Toxin

import glob
import os

# HMM scan
rule run_HMM_tox:
    # Search one split FASTA chunk ({file_i}) of a sample against the toxin
    # HMM library given by config["pathofact"]["tox_hmm"].
    #
    # hmmsearch flags used below (see the HMMER user guide):
    #   --noali    omit the alignment section from the main output
    #   --notextw  do not limit the width of output lines
    #   --tblout   write the per-target hit table to {output}
    # The main hmmsearch output (stdout and stderr) is redirected to {log}.
    input:
        hmm=config["pathofact"]["tox_hmm"],
        renamed="{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
    output:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan"
    log:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.log"
    message:
        "Run HMM scan on {input.renamed} to generate {output}"
    params:
        # None of these params is referenced by the shell command.
        # NOTE(review): presumably consumed by the cluster submission
        # configuration -- confirm before removing.
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["long"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    conda:
        "../../envs/HMMER.yaml"
    threads:
        1
    shell:
        """
        hmmsearch --cpu {threads} --noali --notextw --tblout {output} {input.hmm} {input.renamed} &> {log}
        """

# Adjust HMM results to correct format
rule HMM_correct_format:
    # Convert one hmmsearch table into a tab-separated file:
    #   1. drop every line that starts with '#',
    #   2. replace each run of one or more spaces with a single tab.
    input:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan"
    output:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv"
    message:
        "Adjust {input} to correct format: {output}"
    params:
        # Not referenced by the shell command.
        # NOTE(review): presumably consumed by the cluster submission
        # configuration -- confirm before removing.
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    shell:
        # Backslashes are doubled because this is a regular (non-raw) Python
        # string: sed receives  s/ \+/\t/g .  A single sed process applies
        # both expressions; lines deleted by the first never reach the second.
        """
        sed -e '/^#/d' -e 's/ \\+/\\t/g' {input} > {output}
        """

def aggregate_hmm(wildcards):
    """Return the per-chunk ``.hmm.csv`` paths for one sample.

    Looks up the ``splits`` output of the ``splitting`` checkpoint for the
    given wildcards, globs it for ``<i>.fasta`` files, and builds one
    ``TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv`` path per chunk id found.

    Args:
        wildcards: Snakemake wildcards of the requesting job; ``datadir``,
            ``project`` and ``sample`` are read from it, and all of them are
            forwarded to the checkpoint lookup.

    Returns:
        The list of paths produced by ``expand``.
    """
    # Directory (per the checkpoint's ``splits`` output) holding the chunks.
    checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
    # Chunk ids are whatever matches "{i}.fasta" inside that directory.
    chunk_ids = glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
    return expand(
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv",
        datadir=wildcards.datadir,
        project=wildcards.project,
        sample=wildcards.sample,
        file_i=chunk_ids,
    )

rule HMM_correct_format_2:
    # Merge all per-chunk tables of one sample into a single temporary table,
    # keeping only tab-separated fields 1, 3, 5 and 6.
    input:
        # One .hmm.csv per chunk, resolved by the aggregate_hmm input function.
        aggregate_hmm
    output:
        temp("{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R_temp.csv")
    params:
        # Not referenced by the shell command.
        # NOTE(review): presumably consumed by the cluster submission
        # configuration -- confirm before removing.
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    shell:
        # cut reads every input file in turn, so the output is their
        # concatenation.
        # NOTE(review): uniq only collapses *adjacent* identical lines; rows
        # duplicated across different chunks are kept -- confirm this is
        # intended.
        """
        cut -f 1,3,5,6 {input} | uniq > {output}
        """

rule HMM_correct_format_3:
    # Produce the final per-sample table by prepending two lines to the
    # merged hits: a tab-separated column header, followed by "#Toxin".
    input:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R_temp.csv"
    output:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R.csv"
    params:
        # Not referenced by the shell command.
        # NOTE(review): presumably consumed by the cluster submission
        # configuration -- confirm before removing.
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    shell:
        # Steps:
        #   1. write "#Toxin" to a scratch <sample>_header file,
        #   2. concatenate that file and the input into the output,
        #   3. delete the scratch file,
        #   4. insert the column header before line 1 of the output in place.
        # NOTE(review): the "#Toxin" line presumably guarantees that line 1
        # exists for sed's `1 i` even when the input is empty -- confirm.
        # Backslashes are doubled because this is a regular (non-raw) Python
        # string; bash's $'...' quoting then turns \t into a real tab.
        """
        echo "#Toxin" > {wildcards.datadir}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.sample}_header
        cat {wildcards.datadir}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.sample}_header {input} > {output}
        rm -rf {wildcards.datadir}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.sample}_header
        sed -i $'1 i\\\\ Query_sequence\\tHMM_Name\\tSignificance_Evalue\\tScore' {output}
        """