# Preprocessing_contig.smk
# Prepare the input FASTA: give every contig a unique numeric ID, record the
# mapping between new IDs and original headers, and split the renamed FASTA
# (via split.py) for downstream processing.

import glob
import os

##############################
#     Modify fasta input     #
##############################

# Generate unique ID number for each sequence.
#
# Reads {datadir}/{sample}.fna and writes a copy in which every header line
# (first field starting with ">") is replaced by ">" followed by a running
# counter that starts at 1 and is left-padded with zeros to 10 digits
# (>0000000001, >0000000002, ...). All other lines are copied unchanged.
rule generate_contigID:
    input:
        "{datadir}/{sample}.fna"
    output:
        "{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
    message:
        "Replace fasta headers with unique ID number: {wildcards.project} - {wildcards.sample}"
    # None of these params are referenced by the shell command below;
    # presumably they are consumed by the cluster submission settings
    # -- TODO confirm against the cluster/profile configuration.
    params:
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    # Braces inside the awk program are doubled so that Snakemake's string
    # formatting passes literal { } through to awk.
    shell:
        """
        awk 'BEGIN{{zeros="0000000000"}}{{if(substr($1,1,1)==">"){{i+=1;print">"substr(zeros,1,10-length(i))""i}}else{{print$0}}}}' {input}  > {output}
        """

# Generate translation file combining original header with unique ID.
#
# The renamed and the original FASTA are joined line by line with `paste`
# (tab-separated). awk then keeps only the lines on which a leading ">" could
# be stripped, i.e. the header lines, so each output row is:
#     <unique ID> <TAB> ><original header>
# Only the first ">" of the pasted line is removed, so the original header in
# the second column keeps its own leading ">".
rule generate_ContigTranslation:
    input:
        renamed="{datadir}/{project}/renamed/{sample}_Contig_ID.fna",
        original="{datadir}/{sample}.fna"
    output:
        "{datadir}/{project}/renamed/{sample}_Contig_translation.tsv"
    message:
        "Generate {output} containing original fasta header with corresponding ID number: {wildcards.project} - {wildcards.sample}"
    # None of these params are referenced by the shell command below;
    # presumably they are consumed by the cluster submission settings
    # -- TODO confirm against the cluster/profile configuration.
    params:
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    # NOTE(review): OFS is set to a tab, but no field is reassigned, so the
    # record is printed with the separator produced by `paste`; the OFS
    # assignment looks like a no-op -- confirm before removing it.
    shell:
        """
        paste {input.renamed} {input.original} | awk 'sub(/^>/,"")' OFS='\\t' > {output}
        """

# Split the renamed FASTA into several files inside one output directory.
#
# Declared as a checkpoint, so rules that depend on its output can only be
# resolved after it has run. The splitting itself is delegated to split.py,
# which is called with three positional arguments: the renamed FASTA, the
# value of config["pathofact"]["size_fasta"], and the output directory.
# NOTE(review): size_fasta is presumably the number of sequences per split
# file -- confirm in split.py.
checkpoint splitcontig:
    input:
        "{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
    output:
        split=directory("{datadir}/{project}/contig_splitted/{sample}/")
    # outdir, runtime and mem are not referenced by the shell command below;
    # presumably they are consumed by the cluster submission settings
    # -- TODO confirm. Only `split` is passed on to split.py.
    params:
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
        split=config["pathofact"]["size_fasta"]
    conda:
        "../../envs/Biopython.yaml"
    log:
        "{datadir}/{project}/logs/split_contig_{sample}.log"
    # The output directory is rebuilt from the wildcards (same location as
    # output.split, without the trailing slash). Both stdout and stderr of
    # split.py are redirected to the log file.
    shell:
        """
         python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/contig_splitted/{wildcards.sample} &> {log}
        """