Toxin.smk 2.88 KB
Newer Older
Laura Denies's avatar
Laura Denies committed
1
2
3
4
5
#Toxin

import glob
import os

6
# Search one split FASTA chunk against the HMM file configured under
# config["pathofact"]["tox_hmm"].
# NOTE: the output extension and the message say "hmmscan"/"HMM scan", but the
# program actually executed is `hmmsearch`.
rule run_HMM_tox:
    message:
        "Run HMM scan on {input.renamed} to generate {output}"
    input:
        hmm=config["pathofact"]["tox_hmm"],
        renamed="{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
    output:
        temp("{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan")
    log:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.log"
    threads: 1
    conda:
        "../../envs/HMMER.yaml"
    # These params are not referenced by the shell command below; presumably
    # they are consumed by the cluster/submission profile -- TODO confirm.
    params:
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["long"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    # The tabular hit table is written to {output} via --tblout; everything
    # hmmsearch prints on stdout/stderr is redirected into the log file.
    shell:
        "hmmsearch --cpu {threads} --noali --notextw --tblout {output} {input.hmm} {input.renamed} &> {log}"

# Adjust HMM results to correct format:
# drop the '#' comment lines of the hmmsearch table and turn every run of
# spaces into a single tab, producing a tab-separated file.
rule HMM_correct_format:
    input:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan"
    output:
        temp("{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv")
    message:
        "Adjust {input} to correct format: {output}"
    # These params are not referenced by the shell command below; presumably
    # they are consumed by the cluster/submission profile -- TODO confirm.
    params:
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    # Backslashes are doubled because this is an ordinary (non-raw) Python
    # string: the shell must receive  s/ \+/\t/g . A single "\+" is an invalid
    # Python escape sequence and triggers a warning on recent Python versions.
    # Both expressions run in one sed process: lines starting with '#' are
    # deleted first, the substitution is applied to the remaining lines.
    # `\+` and `\t` rely on GNU sed.
    shell:
        """
        sed -e '/^#/d' -e 's/ \\+/\\t/g' {input} > {output}
        """

def aggregate_hmm(wildcards):
    """Return the per-chunk ``.hmm.csv`` paths for the requested sample.

    Looks up the ``splits`` output of the ``splitting`` checkpoint for the
    given wildcards, globs it for ``<i>.fasta`` files, and expands the
    formatted-HMM-table path pattern once per discovered ``i``.

    Args:
        wildcards: Snakemake wildcards of the requesting job; ``datadir``,
            ``project`` and ``sample`` are read from it, and all of its
            values are forwarded to the checkpoint lookup.

    Returns:
        The list produced by ``expand`` -- one path per split FASTA file.
    """
    split_dir = checkpoints.splitting.get(**wildcards).output.splits
    fasta_pattern = os.path.join(split_dir, "{i}.fasta")
    chunk_ids = glob_wildcards(fasta_pattern).i

    csv_pattern = "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv"
    return expand(
        csv_pattern,
        datadir=wildcards.datadir,
        project=wildcards.project,
        sample=wildcards.sample,
        file_i=chunk_ids,
    )
Laura Denies's avatar
Laura Denies committed
56
57

# Merge all per-chunk tables of a sample (as listed by aggregate_hmm) and keep
# only tab-separated fields 1, 3, 5 and 6.
rule HMM_correct_format_2:
    input:
        aggregate_hmm
    output:
        temp("{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R_temp.csv")
    # These params are not referenced by the shell command below; presumably
    # they are consumed by the cluster/submission profile -- TODO confirm.
    params:
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    # `cut` reads every input file in turn, so the chunks are concatenated.
    # `uniq` collapses only *adjacent* identical lines.
    shell:
        "cut -f 1,3,5,6 {input} | uniq > {output}"
Laura Denies's avatar
Laura Denies committed
70
71

# Build the final per-sample table: a column-header line, then a "#Toxin"
# line, then the rows of the temporary merged table.
rule HMM_correct_format_3:
    input:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R_temp.csv"
    output:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R.csv"
    # These params are not referenced by the shell command below; presumably
    # they are consumed by the cluster/submission profile -- TODO confirm.
    params:
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    # Step 1: write "#Toxin" followed by the input rows straight into the
    #         output (a subshell replaces the former throw-away
    #         "<sample>_header" file, which was created, concatenated and
    #         then deleted again).
    # Step 2: insert the column names as the new first line with sed's
    #         `i\` command (in-place edit, `sed -i`).
    # Backslashes are doubled because this is an ordinary (non-raw) Python
    # string: the shell must receive  $'1 i\\ Query_sequence\tHMM_Name...' ,
    # which bash's $'...' quoting turns into `1 i\ Query_sequence<TAB>...`.
    # The previous spelling used "\ " which is an invalid Python escape
    # sequence and triggers a warning on recent Python versions.
    shell:
        """
        ( echo "#Toxin"; cat {input} ) > {output}
        sed -i $'1 i\\\\ Query_sequence\\tHMM_Name\\tSignificance_Evalue\\tScore' {output}
        """