Commit a37ed46c authored by Valentina Galata

format fix in rules/Universal (issue #26)

parent 02d34030
@@ -4,17 +4,17 @@ import glob
import os
configfile: "config.yaml"
PROJECT=config["project"]
INPUT=config["input_file"]
PROJECT = config["project"]
INPUT = config["input_file"]
rule combine_PathoFact:
input:
Virulence_factor= "{OUTDIR}/{project}/Virulence_prediction_{input_file}_report.csv",
Toxins="{OUTDIR}/{project}/Toxin_prediction_{input_file}_report.csv",
AMR_MGE="{OUTDIR}/{project}/AMR_MGE_prediction_{input_file}_report.tsv"
output: "{OUTDIR}/{project}/PathoFact_{input_file}_predictions.csv"
conda: "../../envs/R.yaml"
script: "../../scripts/PathoFact.R"
input:
Virulence_factor= "{OUTDIR}/{project}/Virulence_prediction_{input_file}_report.csv",
Toxins="{OUTDIR}/{project}/Toxin_prediction_{input_file}_report.csv",
AMR_MGE="{OUTDIR}/{project}/AMR_MGE_prediction_{input_file}_report.tsv"
output:
"{OUTDIR}/{project}/PathoFact_{input_file}_predictions.csv"
conda:
"../../envs/R.yaml"
script:
"../../scripts/PathoFact.R"
@@ -4,8 +4,8 @@ import glob
import os
configfile: "config.yaml"
PROJECT=config["project"]
INPUT=config["input_file"]
PROJECT = config["project"]
INPUT = config["input_file"]
##############################
# Modify fasta input #
@@ -13,10 +13,14 @@ INPUT=config["input_file"]
# Generate unique ID number for each sequence
rule generate_ID:
input: "{OUTDIR}/{input_file}.faa"
output: "{OUTDIR}/{project}/renamed/{input_file}_ID.faa"
message: "Replace fasta headers with unique ID number - {wildcards.project}"
params: outdir="{OUTDIR}"
input:
"{OUTDIR}/{input_file}.faa"
output:
"{OUTDIR}/{project}/renamed/{input_file}_ID.faa"
message:
"Replace fasta headers with unique ID number - {wildcards.project}"
params:
outdir="{OUTDIR}"
shell:
"""
awk 'BEGIN{{zeros="0000000000"}}{{if(substr($1,1,1)==">"){{i+=1;print">"substr(zeros,1,10-length(i))""i}}else{{print$0}}}}' {input} > {output}
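A note on the awk one-liner above (the doubled braces are Snakemake's escaping and become single braces in the executed command): every FASTA header is replaced by a sequential, zero-padded 10-digit ID, while sequence lines pass through unchanged. A plain-Python sketch of the same transformation, using hypothetical file names:

    # Hypothetical file names; mirrors the header renaming done in rule generate_ID.
    with open("sample.faa") as fin, open("sample_ID.faa", "w") as fout:
        i = 0
        for line in fin:
            if line.startswith(">"):
                i += 1
                fout.write(f">{i:010d}\n")   # e.g. >0000000001, >0000000002, ...
            else:
                fout.write(line)             # sequence lines are copied as-is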
@@ -27,9 +31,12 @@ rule generate_translation:
input:
renamed="{OUTDIR}/{project}/renamed/{input_file}_ID.faa",
original="{OUTDIR}/{input_file}.faa"
output: "{OUTDIR}/{project}/renamed/{input_file}_translation.tsv"
message: "Generate {output} containing original fasta header with corresponding ID number - {wildcards.project}"
params: outdir="{OUTDIR}"
output:
"{OUTDIR}/{project}/renamed/{input_file}_translation.tsv"
message:
"Generate {output} containing original fasta header with corresponding ID number - {wildcards.project}"
params:
outdir="{OUTDIR}"
shell:
"""
paste {input[0]} {input[1]} | awk 'sub(/^>/,"")' OFS='\t' > {output}
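A note on the paste/awk step above: the renamed and the original FASTA files are pasted line by line, and awk's sub(/^>/,"") both strips the leading ">" from the new ID and, since it only succeeds on header lines, acts as the filter, so the resulting TSV pairs each ID with its original header. A plain-Python sketch with hypothetical file names:

    # Hypothetical file names; mirrors the translation table built in rule generate_translation.
    with open("sample_ID.faa") as renamed, open("sample.faa") as original, \
         open("sample_translation.tsv", "w") as out:
        for new_line, old_line in zip(renamed, original):
            if new_line.startswith(">"):                         # keep header pairs only
                out.write(f"{new_line[1:].strip()}\t{old_line.strip()}\n")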
@@ -41,14 +48,15 @@ rule generate_translation:
# Split fasta file
checkpoint splitting:
input: "{OUTDIR}/{project}/renamed/{input_file}_ID.faa"
output:
splits=directory("{OUTDIR}/{project}/splitted/{input_file}/")
params: outdir="{OUTDIR}"
conda: "../../envs/SeqKit.yaml"
shell: """
seqkit split2 -s 10000 {input} -O {wildcards.OUTDIR}/{wildcards.project}/splitted/{wildcards.input_file}
"""
input:
"{OUTDIR}/{project}/renamed/{input_file}_ID.faa"
output:
splits=directory("{OUTDIR}/{project}/splitted/{input_file}/")
params:
outdir="{OUTDIR}"
conda:
"../../envs/SeqKit.yaml"
shell:
"""
seqkit split2 -s 10000 {input} -O {wildcards.OUTDIR}/{wildcards.project}/splitted/{wildcards.input_file}
"""
@@ -4,8 +4,8 @@ import glob
import os
configfile: "config.yaml"
PROJECT=config["project"]
INPUT=config["input_file"]
PROJECT = config["project"]
INPUT = config["input_file"]
##############################
# Modify fasta input #
@@ -13,10 +13,14 @@ INPUT=config["input_file"]
# Generate unique ID number for each sequence
rule generate_contigID:
input: "{OUTDIR}/{input_file}.fna"
output: "{OUTDIR}/{project}/renamed/{input_file}_Contig_ID.fna"
message: "Replace fasta headers with unique ID number - {wildcards.project}"
params: outdir="{OUTDIR}"
input:
"{OUTDIR}/{input_file}.fna"
output:
"{OUTDIR}/{project}/renamed/{input_file}_Contig_ID.fna"
message:
"Replace fasta headers with unique ID number - {wildcards.project}"
params:
outdir="{OUTDIR}"
shell:
"""
awk 'BEGIN{{zeros="0000000000"}}{{if(substr($1,1,1)==">"){{i+=1;print">"substr(zeros,1,10-length(i))""i}}else{{print$0}}}}' {input} > {output}
@@ -27,14 +31,13 @@ rule generate_ContigTranslation:
input:
renamed="{OUTDIR}/{project}/renamed/{input_file}_Contig_ID.fna",
original="{OUTDIR}/{input_file}.fna"
output: "{OUTDIR}/{project}/renamed/{input_file}_Contig_translation.tsv"
message: "Generate {output} containing original fasta header with corresponding ID number - {wildcards.project}"
params: outdir="{OUTDIR}"
output:
"{OUTDIR}/{project}/renamed/{input_file}_Contig_translation.tsv"
message:
"Generate {output} containing original fasta header with corresponding ID number - {wildcards.project}"
params:
outdir="{OUTDIR}"
shell:
"""
paste {input[0]} {input[1]} | awk 'sub(/^>/,"")' OFS='\t' > {output}
"""
@@ -4,49 +4,68 @@ import glob
import os
configfile: "config.yaml"
PROJECT=config["project"]
INPUT=config["input_file"]
PROJECT = config["project"]
INPUT = config["input_file"]
#Run SignalP on split sequence files
rule signalp:
input: "{OUTDIR}/{project}/splitted/{input_file}/{file_i}.faa"
output: "{OUTDIR}/{project}/SignalP/{input_file}/{file_i}.txt"
message: "Running SignalP analysis on {input} resulting in {output}."
params: outdir="{OUTDIR}"
shell: "{config[signalp]} -t gram+ {input} > {output}"
input:
"{OUTDIR}/{project}/splitted/{input_file}/{file_i}.faa"
output:
"{OUTDIR}/{project}/SignalP/{input_file}/{file_i}.txt"
message:
"Running SignalP analysis on {input} resulting in {output}."
params:
outdir="{OUTDIR}"
shell:
"{config[signalp]} -t gram+ {input} > {output}"
#adjust format of signalP files
rule SignalP_format:
input: "{OUTDIR}/{project}/SignalP/{input_file}/{file_i}.txt"
output: "{OUTDIR}/{project}/SignalP/format/{input_file}/{file_i}.txt"
message: "Apply correct format to {input} to create {output}"
params: outdir="{OUTDIR}"
shell: """
sed '1,2d' {input} >{output}
"""
input:
"{OUTDIR}/{project}/SignalP/{input_file}/{file_i}.txt"
output:
"{OUTDIR}/{project}/SignalP/format/{input_file}/{file_i}.txt"
message:
"Apply correct format to {input} to create {output}"
params:
outdir="{OUTDIR}"
shell:
"""
sed '1,2d' {input} >{output}
"""
rule signalP_modified:
input: "{OUTDIR}/{project}/SignalP/format/{input_file}/{file_i}.txt"
output: "{OUTDIR}/{project}/SignalP/modified/{input_file}/{file_i}.txt"
params: outdir="{OUTDIR}"
shell: """
awk '{{print $1"\t"$10}}' {input} > {output}
"""
input:
"{OUTDIR}/{project}/SignalP/format/{input_file}/{file_i}.txt"
output:
"{OUTDIR}/{project}/SignalP/modified/{input_file}/{file_i}.txt"
params:
outdir="{OUTDIR}"
shell:
"""
awk '{{print $1"\t"$10}}' {input} > {output}
"""
def aggregate_input(wildcards):
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand("{OUTDIR}/{project}/SignalP/modified/{input_file}/{file_i}.txt",
OUTDIR=wildcards.OUTDIR,
project=wildcards.project,
input_file=wildcards.input_file,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i)
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand(
"{OUTDIR}/{project}/SignalP/modified/{input_file}/{file_i}.txt",
OUTDIR=wildcards.OUTDIR,
project=wildcards.project,
input_file=wildcards.input_file,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
)
# Join multiple SignalP files in a single file
rule aggregate:
input: aggregate_input
output: "{OUTDIR}/{project}/SignalP/aggregated/{input_file}_SignalP_results.txt"
message: "concatenate multiple split signalP files in a single joined file: {output}"
params: outdir="{OUTDIR}"
shell: "cat {input} > {output}"
input:
aggregate_input
output:
"{OUTDIR}/{project}/SignalP/aggregated/{input_file}_SignalP_results.txt"
message:
"concatenate multiple split signalP files in a single joined file: {output}"
params:
outdir="{OUTDIR}"
shell:
"cat {input} > {output}"