Virulence.smk 11.1 KB
Newer Older
Laura Denies's avatar
Laura Denies committed
1
2
3
4
5
#Virulence

import glob
import os

6
# HMM scan
Laura Denies's avatar
Laura Denies committed
7
8
rule run_HMM_vir:
    input:
9
        hmm=config["pathofact"]["vir_hmm"],
10
        renamed="{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
11
    output:
12
        "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}/{file_i}.hmmscan"
13
    log:
14
        "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}/{file_i}.log"
15
    message:
16
        "Run HMM scan on {input.renamed} to generate {output}"
17
    params:
18
19
20
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["long"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
21
22
    conda:
        "../../envs/HMMER.yaml"
23
24
25
26
    threads:
        12
    shell:
        """
27
        hmmsearch --cpu {threads} --noali --notextw --tblout {output} {input.hmm} {input.renamed} &> {log}
Laura Denies's avatar
Laura Denies committed
28
29
30
31
        """

# Adjust HMM results to correct format
rule HMM_correct_format_vir:
32
    input:
33
        "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}/{file_i}.hmmscan"
34
    output:
35
        "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}/{file_i}.hmm.csv"
36
37
38
    message:
        "Adjust {input} to correct format: {output}"
    params:
39
40
41
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
42
43
    shell:
        """
44
        sed '/^#/ d' {input} | sed 's/ \+/\\t/g' > {output}
Laura Denies's avatar
Laura Denies committed
45
46
47
        """

def aggregate_hmm(wildcards):
48
49
    checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
    return expand(
50
51
        "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}/{file_i}.hmm.csv",
        datadir=wildcards.datadir,
52
        project=wildcards.project,
53
        sample=wildcards.sample,
54
        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
55
    )
Laura Denies's avatar
Laura Denies committed
56
57

rule HMM_correct_format_2_vir:
58
59
60
    input:
        aggregate_hmm
    output:
61
        "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}.Input_HMM_R.csv"
62
    params:
63
64
65
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
66
67
    shell:
        """
Laura Denies's avatar
Laura Denies committed
68
69
70
71
        cut -f 1,3,5,6 {input} |uniq > {output}
        """

rule HMM_R_VIR:
72
    input:
73
74
75
76
77
        hmmresults="{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}.Input_HMM_R.csv",
        positive=config["pathofact"]["vir_domains"] + "/positive_domains.tsv",
        negative=config["pathofact"]["vir_domains"] + "/negative_domains.tsv",
        shared=config["pathofact"]["vir_domains"] + "/shared_domains.tsv",
        ID="{datadir}/{project}/renamed/{sample}_translation.tsv"
78
    output:
79
        "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}.hmm_results.csv"
80
    log:
81
82
83
84
        "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}.hmm_results.log"
    params:
        runtime=config["pathofact"]["runtime"]["medium"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
85
86
87
88
89
90
    conda:
        "../../envs/R.yaml"
    script:
        "../../scripts/hmm.R"

# Give pathogenicity prediction
91
rule HMM_VIR_classification:
92
    input:
93
        "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}.hmm_results.csv"
94
    output:
95
96
97
        non_path=temp("{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}_hmm_non_path.txt"),
        pathogenic=temp("{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}_hmm_pathogenic.txt"),
        unclassified=temp("{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}_hmm_unclassified.txt")
98
99
100
    params:
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
101
102
    shell:
        """
103
104
105
        awk '$2 == "True" && $3 == "False"  && $4 == "False"' {input} | awk '$5 = "negative"' | sed 's/ /\\t/g' > {output.non_path}
        awk '$3 == "True"' {input} | awk '$5 = "pathogenic"' | sed 's/ /\\t/g' > {output.pathogenic}
        awk '$3 == "False"  && $4 == "True"' {input} | awk '$5 = "unclassified"' | sed 's/ /\\t/g' > {output.unclassified}
106
        """
Laura Denies's avatar
Laura Denies committed
107

108

Laura Denies's avatar
Laura Denies committed
109
rule HMM_VIR_report:
110
    input:
111
112
113
        non_pathogenic= "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}_hmm_non_path.txt",
        pathogenic= "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}_hmm_pathogenic.txt",
        unclassified= "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}_hmm_unclassified.txt"
114
    output:
115
116
117
118
        temp("{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}_hmm_prediction_intermediate.tsv")
    params:
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
119
120
    shell:
        """
121
        cat {input} | sort > {output}
122
        """
Laura Denies's avatar
Laura Denies committed
123
124

rule HMM_VIR_finalformat:
125
    input:
126
        "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}_hmm_prediction_intermediate.tsv"
127
    output:
128
129
130
131
        "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}_hmm_prediction.tsv"
    params:
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
132
133
    shell:
        "cut -f 1,5 {input}  > {output}"
Laura Denies's avatar
Laura Denies committed
134
135
136
137
138

##########################
#  Virulence classifier  #
##########################
rule AAC:
139
    input:
140
        "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
141
    output:
142
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_AAC.txt"
143
    log:
144
145
146
147
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_AAC.log"
    params:
        runtime=config["pathofact"]["runtime"]["medium"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
148
149
150
    conda:
        "../../envs/Biopython.yaml"
    shell:
151
        "python {config[pathofact][scripts]}/AAC.py --file {input} --out {output} &> {log}"
Laura Denies's avatar
Laura Denies committed
152
153

rule DPC:
154
    input:
155
        "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
156
    output:
157
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_DPC.txt"
158
    log:
159
160
161
162
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_DPC.log"
    params:
        runtime=config["pathofact"]["runtime"]["medium"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
163
164
165
    conda:
        "../../envs/Biopython.yaml"
    shell:
166
        "python {config[pathofact][scripts]}/DPC.py --file {input} --out {output} &> {log}"
Laura Denies's avatar
Laura Denies committed
167
168

rule CTDC:
169
    input:
170
        "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
171
    output:
172
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDC.txt"
173
    log:
174
175
176
177
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDC.log"
    params:
        runtime=config["pathofact"]["runtime"]["medium"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
178
179
180
    conda:
        "../../envs/Biopython.yaml"
    shell:
181
        "python {config[pathofact][scripts]}/CTDC.py --file {input} --out {output} &> {log}"
Laura Denies's avatar
Laura Denies committed
182
183

rule CTDT:
184
    input:
185
        "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
186
    output:
187
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDT.txt"
188
    log:
189
190
191
192
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDT.log"
    params:
        runtime=config["pathofact"]["runtime"]["medium"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
193
194
195
    conda:
        "../../envs/Biopython.yaml"
    shell:
196
        "python {config[pathofact][scripts]}/CTDT.py --file {input} --out {output} &> {log}"
Laura Denies's avatar
Laura Denies committed
197
198

rule CTDD:
199
    input:
200
        "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
201
    output:
202
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDD.txt"
203
    log:
204
205
206
207
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDD.log"
    params:
        runtime=config["pathofact"]["runtime"]["medium"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
208
209
210
    conda:
        "../../envs/Biopython.yaml"
    shell:
211
        "python {config[pathofact][scripts]}/CTDD.py --file {input} --out {output} &> {log}"
Laura Denies's avatar
Laura Denies committed
212
213

rule join_matrix:
214
    input:
215
216
217
218
219
        AAC="{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_AAC.txt",
        DPC="{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_DPC.txt",
        CTDC="{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDC.txt",
        CTDT="{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDT.txt",
        CTDD="{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDD.txt"
220
    output:
221
222
223
224
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_matrix.tsv"
    params:
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
225
226
227
228
229
230
231
232
233
234
    shell:
        """
        xjoin() {{
            local f
                local srt="sort -k 1b,1"

                if [ "$#" -lt 2 ]; then
                        echo "xjoin: need at least 2 files" >&2
                        return 1
                elif [ "$#" -lt 3 ]; then
235
                        join -t $'\\t' <($srt "$1") <($srt "$2")
236
237
238
                else
                        f=$1
                        shift
239
                        join -t $'\\t' <($srt "$f") <(xjoin "$@")
240
241
242
                fi
        }}

243
        xjoin {input} | sort -k 1n,1 > {output}
244
        """
Laura Denies's avatar
Laura Denies committed
245
246

rule classifier:
247
    input:
248
249
        input="{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_matrix.tsv",
        model=config["pathofact"]["scripts"] + "/Virulence_factor_model.sav"
250
    output:
251
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_classifier_prediction.tsv"
252
    log:
253
254
255
256
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_classifier_prediction.log"
    params:
        runtime=config["pathofact"]["runtime"]["medium"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
257
258
259
    conda:
        "../../envs/Biopython.yaml"
    shell:
260
        "python {config[pathofact][scripts]}/virulence_prediction.py {input.input} {output} {input.model} &> {log}"
Laura Denies's avatar
Laura Denies committed
261
262

def aggregate_classifier(wildcards):
263
264
    checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
    return expand(
265
266
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_classifier_prediction.tsv",
        datadir=wildcards.datadir,
267
        project=wildcards.project,
268
        sample=wildcards.sample,
269
        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
270
    )
Laura Denies's avatar
Laura Denies committed
271
272

rule format_classifier:
273
274
275
    input:
        aggregate_classifier 
    output:
276
277
278
279
        temp("{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}_classifier_results_format.tsv")
    params:
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
280
281
282
283
    shell:
        """
        sed 's/"//g' {input} | cut -f2,3 >{output}
        """
Laura Denies's avatar
Laura Denies committed
284
285

rule format_classifier_2:
286
    input:
287
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}_classifier_results_format.tsv"
288
    output:
289
290
291
292
        "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}_classifier_results_formatted.tsv"
    params:
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
293
294
    shell:
        """
295
        awk '{{$1=sprintf("%010d", $1)}}1' {input} | sed 's/ /\\t/g' > {output}
296
        """