# Prepare fasta

import glob
import os

##############################
#     Modify fasta input     #
##############################

# Give every sequence a unique, fixed-width numeric ID.
# Lines whose first field begins with ">" are replaced by ">" followed by a
# running counter left-padded with zeros to 10 digits; every other line is
# copied through unchanged, so the output has the same number of lines as
# the input.
rule generate_contigID:
    message:
        "Replace fasta headers with unique ID number: {wildcards.project} - {wildcards.sample}"
    input:
        "{datadir}/{sample}.fna"
    output:
        "{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
    params:
        # None of these are referenced by the shell command below;
        # presumably they are consumed by the cluster/submission profile
        # -- TODO confirm.
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    shell:
        """
        awk 'BEGIN{{zeros="0000000000"}}{{if(substr($1,1,1)==">"){{i+=1;print">"substr(zeros,1,10-length(i))""i}}else{{print$0}}}}' {input} > {output}
        """


# Build the lookup table that maps each generated ID to its original header.
# `paste` joins the renamed and original fasta line by line; the awk pattern
# only prints lines on which the leading ">" could be stripped, i.e. the
# header lines, yielding "<ID><TAB><original header>".
# NOTE(review): only the ">" at the start of the joined line is removed, so
# the original-header column still begins with ">" -- confirm downstream
# consumers expect that.
# NOTE(review): the OFS assignment looks like a no-op here because no field
# is modified and the tab comes from `paste` itself -- verify before removing.
rule generate_ContigTranslation:
    message:
        "Generate {output} containing original fasta header with corresponding ID number: {wildcards.project} - {wildcards.sample}"
    input:
        renamed="{datadir}/{project}/renamed/{sample}_Contig_ID.fna",
        original="{datadir}/{sample}.fna"
    output:
        "{datadir}/{project}/renamed/{sample}_Contig_translation.tsv"
    params:
        # Not used by the shell command; presumably read by the
        # cluster/submission profile -- TODO confirm.
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
    shell:
        """
        paste {input.renamed} {input.original} | awk 'sub(/^>/,"")' OFS='\\t' > {output}
        """


# Checkpoint: hand the renamed fasta to split.py together with the configured
# `size_fasta` value and the target directory. The set of files written into
# that directory is not known before the job has run, which is why this is a
# checkpoint with a directory() output rather than a plain rule.
# Presumably split.py writes chunks of `size_fasta` sequences each -- verify
# against the script.
checkpoint splitcontig:
    input:
        "{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
    output:
        split=directory("{datadir}/{project}/contig_splitted/{sample}/")
    log:
        "{datadir}/{project}/logs/split_contig_{sample}.log"
    conda:
        "../../envs/Biopython.yaml"
    params:
        # outdir/runtime/mem are not referenced by the command below;
        # only `split` is passed to the script.
        outdir="{datadir}",
        runtime=config["pathofact"]["runtime"]["short"],
        mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
        split=config["pathofact"]["size_fasta"]
    shell:
        # The destination is rebuilt from the wildcards (without the trailing
        # slash of the declared output); stdout and stderr both go to the log.
        """
        python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/contig_splitted/{wildcards.sample} &> {log}
        """