# Toxin (Toxin.smk): HMM-based toxin prediction rules

import glob
import os

# HMM scan
rule run_HMM_tox:
    # Runs hmmsearch with the toxin HMM library (input.hmm) against one
    # split FASTA chunk (input.renamed) and writes the tabular hit report.
    input:
        hmm=config["pathofact"]["tox_hmm"],
        renamed="{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
    output:
        # temp(): Snakemake deletes this file once no remaining job needs it.
        temp("{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan")
    log:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.log"
    params:
        # None of these params is referenced by the shell command below;
        # presumably they are read by the cluster/profile configuration
        # -- TODO confirm.
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["long"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    conda:
        "../../envs/HMMER.yaml"
    threads:
        1
    # "(s)" is written with parentheses on purpose: braces in a message are
    # placeholders, so "{s}" would be looked up as an (undefined) variable.
    message: "Executing toxin prediction with {threads} threads on the following sample(s): {wildcards.project} - {wildcards.sample}"
    # --tblout writes the per-target table to {output}; everything hmmsearch
    # prints to stdout/stderr is redirected to the log file.
    shell:
        """
        hmmsearch --cpu {threads} --noali --notextw --tblout {output} {input.hmm} {input.renamed} &> {log}
        """

# Adjust HMM results to correct format
rule HMM_correct_format:
    # Converts one hmmsearch table into a tab-separated file: comment lines
    # (starting with '#') are dropped and every run of spaces becomes a tab.
    input:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan"
    output:
        # temp(): Snakemake deletes this file once no remaining job needs it.
        temp("{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv")
    params:
        # Not referenced by the shell command; presumably read by the
        # cluster/profile configuration -- TODO confirm.
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    message: "Adjust output format of toxin predictions: {wildcards.project} - {wildcards.sample}"
    # Backslashes are doubled because this is a regular (non-raw) Python
    # string: sed receives '\+' and '\t'. Both expressions run in a single
    # sed process, in the same order as before (delete comments, then
    # replace space runs).
    shell:
        """
        sed -e '/^#/ d' -e 's/ \\+/\\t/g' {input} > {output}
        """

def aggregate_hmm(wildcards):
    """Return the per-chunk ``.hmm.csv`` paths for one sample.

    Used as an input function: the ``splits`` output of the ``splitting``
    checkpoint is looked up for the given wildcards, every ``{i}.fasta``
    file found in it is collected, and one
    ``TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv`` path is produced per chunk.

    Args:
        wildcards: Snakemake wildcards of the requesting job; ``datadir``,
            ``project`` and ``sample`` are read explicitly, and all
            wildcards are forwarded to the checkpoint lookup.

    Returns:
        The list of expanded ``.hmm.csv`` paths, one per chunk found.
    """
    checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
    # The chunk ids are whatever matches "{i}.fasta" inside the split output.
    chunk_ids = glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
    return expand(
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv",
        datadir=wildcards.datadir,
        project=wildcards.project,
        sample=wildcards.sample,
        file_i=chunk_ids
    )

rule HMM_correct_format_2:
    # Merges all per-chunk tab-separated hit tables of a sample into one
    # file, keeping only columns 1, 3, 5 and 6.
    input:
        aggregate_hmm
    output:
        # temp(): Snakemake deletes this file once no remaining job needs it.
        temp("{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R_temp.csv")
    params:
        # Not referenced by the shell command; presumably read by the
        # cluster/profile configuration -- TODO confirm.
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    # "(s)" is written with parentheses on purpose: braces in a message are
    # placeholders, so "{s}" would be looked up as an (undefined) variable.
    message: "Aggregate toxin prediction of the following sample(s): {wildcards.project} - {wildcards.sample}"
    # NOTE(review): uniq only collapses *adjacent* identical lines, so
    # duplicates that are not next to each other are kept -- confirm this is
    # the intended behavior.
    shell:
        """
        cut -f 1,3,5,6 {input} | uniq > {output}
        """

rule HMM_correct_format_3:
    # Produces the final per-sample table by prepending header lines to the
    # aggregated hits.
    input:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R_temp.csv"
    output:
        "{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R.csv"
    params:
        # Not referenced by the shell command; presumably read by the
        # cluster/profile configuration -- TODO confirm.
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    # Steps of the shell command:
    #   1. write a "#Toxin" line to a scratch "<sample>_header" file,
    #   2. concatenate that file and the aggregated hits into {output},
    #   3. remove the scratch file,
    #   4. insert the tab-separated column-name row before line 1 of
    #      {output} in place (so it ends up above the "#Toxin" line).
    # The backslash sequences are deliberately left exactly as they were:
    # they pass through Python string escaping, bash $'...' quoting and sed.
    shell:
        """
        echo "#Toxin" > {wildcards.datadir}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.sample}_header
        cat {wildcards.datadir}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.sample}_header {input} > {output}
        rm -rf {wildcards.datadir}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.sample}_header
        sed -i $'1 i\\\ Query_sequence\\tHMM_Name\\tSignificance_Evalue\\tScore' {output}    
        """