Commit 46961fc8 authored by Valentina Galata's avatar Valentina Galata
Browse files

replaced index by name when accessing snakemake variables (issue #29)

parent 0ba12560
......@@ -132,10 +132,10 @@ rule combine_confidence:
output:
"{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Confidence_HMM_SignalP_ensembled.csv"
message:
"The different confidence files ({input[0]}, {input[1]}, {input[2]}) are combined in a single output file: {output}"
"The different confidence files ({input}) are combined in a single output file: {output}"
params:
outdir="{OUTDIR}"
shell:
"""
cat {input[0]} {input[1]} {input[2]} | sort > {output}
cat {input} | sort > {output}
"""
......@@ -16,7 +16,7 @@ rule run_HMM:
log:
"{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.log"
message:
"Run HMM scan on {input[1]} to generate {output}"
"Run HMM scan on {input.renamed} to generate {output}"
params:
outdir="{OUTDIR}"
conda:
......@@ -25,7 +25,7 @@ rule run_HMM:
1
shell:
"""
{config[hmmscan_tool]} --cpu {threads} --noali --notextw --tblout {output} {input[0]} {input[1]} &> {log}
{config[hmmscan_tool]} --cpu {threads} --noali --notextw --tblout {output} {input.hmm} {input.renamed} &> {log}
"""
# Adjust HMM results to correct format
......
......@@ -21,7 +21,7 @@ rule merge_final:
outdir="{OUTDIR}"
shell:
"""
join -t $'\\t' <(sort {input[0]}) <(sort {input[1]}) > {output}
join -t $'\\t' <(sort {input.translation}) <(sort {input.prediction}) > {output}
"""
rule toxin_report:
......
......@@ -38,7 +38,7 @@ rule generate_translation:
outdir="{OUTDIR}"
shell:
"""
paste {input[0]} {input[1]} | awk 'sub(/^>/,"")' OFS='\\t' > {output}
paste {input.renamed} {input.original} | awk 'sub(/^>/,"")' OFS='\\t' > {output}
"""
###############################
......
......@@ -38,5 +38,5 @@ rule generate_ContigTranslation:
outdir="{OUTDIR}"
shell:
"""
paste {input[0]} {input[1]} | awk 'sub(/^>/,"")' OFS='\\t' > {output}
paste {input.renamed} {input.original} | awk 'sub(/^>/,"")' OFS='\\t' > {output}
"""
......@@ -18,7 +18,7 @@ rule merge_SignalPVir:
"{OUTDIR}/{project}/VIRULENCE/virulence_merged/{input_file}_virulence_SignalP_prediction.tsv"
shell:
"""
join -t $'\\t' <(sort {input[0]}) <(sort {input[1]}) > {output[0]}
join -t $'\\t' <(sort {input.Vir}) <(sort {input.SignalP}) > {output}
"""
###########################################################
......@@ -90,4 +90,4 @@ rule SignalVir_virulence_prediction:
output:
"{OUTDIR}/{project}/VIRULENCE/Virulence_prediction/{input_file}_Confidence_Virulence_ensembled.csv"
shell:
"cat {input[0]} {input[1]} {input[2]} {input[3]} {input[4]} > {output[0]}"
"cat {input} > {output}"
......@@ -16,7 +16,7 @@ rule run_HMM_vir:
log:
"{OUTDIR}/{project}/VIRULENCE/HMM_virulence/{input_file}/{file_i}.log"
message:
"Run HMM scan on {input[1]} to generate {output}"
"Run HMM scan on {input.renamed} to generate {output}"
params:
outdir="{OUTDIR}"
conda:
......@@ -25,7 +25,7 @@ rule run_HMM_vir:
12
shell:
"""
{config[hmmscan_tool]} --cpu {threads} --noali --notextw --tblout {output} {input[0]} {input[1]} &> {log}
{config[hmmscan_tool]} --cpu {threads} --noali --notextw --tblout {output} {input.hmm} {input.renamed} &> {log}
"""
# Adjust HMM results to correct format
......@@ -121,7 +121,7 @@ rule HMM_VIR_report:
temp("{OUTDIR}/{project}/VIRULENCE/HMM_virulence/{input_file}_hmm_prediction_intermediate.tsv")
shell:
"""
cat {input[0]} {input[1]} {input[2]} | sort > {output}
cat {input} | sort > {output}
"""
rule HMM_VIR_finalformat:
......@@ -222,7 +222,7 @@ rule join_matrix:
fi
}}
xjoin {input[0]} {input[1]} {input[2]} {input[3]} {input[4]} | sort -k 1n,1 > {output}
xjoin {input} | sort -k 1n,1 > {output}
"""
rule classifier:
......@@ -277,7 +277,7 @@ rule merge_virulence:
output:
"{OUTDIR}/{project}/VIRULENCE/HMM_classifier_virulence/{input_file}_Virulence_prediction.tsv"
shell:
"join -t $'\\t' <(sort {input[0]}) <(sort {input[1]}) > {output};"
"join -t $'\\t' <(sort {input.hmm}) <(sort {input.classifier}) > {output};"
"sed -i $'1 i\\\ ID\\tHMM_prediction\\tmodel_prediction' {output}"
rule virulence_nonpath:
......@@ -351,4 +351,4 @@ rule merge_combined:
output:
"{OUTDIR}/{project}/VIRULENCE/HMM_classifier_virulence/{input_file}_virulence_final_prediction.tsv"
shell:
"cat {input[0]} {input[1]} {input[2]} {input[3]} {input[4]} {input[5]} > {output}"
"cat {input} > {output}"
......@@ -22,7 +22,7 @@ rule Virulence_merge_final:
outdir="{OUTDIR}"
shell:
"""
join -t $'\\t' <(sort {input[0]}) <(sort {input[1]}) > {output}
join -t $'\\t' <(sort {input.translation}) <(sort {input.prediction}) > {output}
"""
rule virulence_report:
......
......@@ -6,28 +6,28 @@ sink(file=file(snakemake@log[[1]], open="wt"), type="message")
library(tidyverse)
# AMR prediction
AMR <- read.delim(file = snakemake@input[[1]], header=FALSE, comment.char="#")
AMR <- read.delim(file = snakemake@input[["AMR"]], header=FALSE, comment.char="#")
AMR <- AMR %>% select(1,4,5)
colnames(AMR) <- c("ARG","GeneID","Category")
AMR_translation<-read.delim(file = snakemake@input[[2]], header=FALSE)
AMR_translation<-read.delim(file = snakemake@input[["AMR_translation"]], header=FALSE)
colnames(AMR_translation) <- c("GeneID", "Gene_Name")
AMR_translation$Gene_Name<- sub('>', '', AMR_translation$Gene_Name)
# Plasmid Prediction
Plasmid <- read.delim(file = snakemake@input[[3]], header=TRUE)
Plasmid <- read.delim(file = snakemake@input[["Plasmid"]], header=TRUE)
Plasmid <- separate(data= Plasmid, col=label, into=c("left", "right"), sep="\\.")
Plasmid <- Plasmid %>% select(1,2)
colnames(Plasmid) <-c("ContigID","Plasmid_prediction")
Plasmid <- Plasmid %>% filter(ContigID != "contig_name")
Plasmid$ContigID <- sprintf("%010d", Plasmid$ContigID)
Plasmid_translation <- read.delim(file = snakemake@input[[4]], header=FALSE)
Plasmid_translation <- read.delim(file = snakemake@input[["Contig_translation"]], header=FALSE)
colnames(Plasmid_translation) <- c("ContigID", "Contig_Name")
Plasmid_translation$Contig_Name <- sub('>', '', Plasmid_translation$Contig_Name)
# Combine AMR and Plasmid prediction
Contig_gene_list <- read.delim(file = snakemake@input[[5]], header=FALSE)
Contig_gene_list <- read.delim(file = snakemake@input[["Contig_gene_list"]], header=FALSE)
colnames(Contig_gene_list) <- c("Contig_Name", "Gene_Name")
Contig_gene_list <- merge(Contig_gene_list, AMR_translation, by="Gene_Name", all=TRUE)
Contig_gene_list <- merge(Contig_gene_list, Plasmid_translation, by="Contig_Name", all=TRUE)
......@@ -42,7 +42,7 @@ AMR_Plasmid$Plasmid_prediction <- fct_explicit_na(AMR_Plasmid$Plasmid_prediction
# Phage prediction
## VirFinder
VirFinder <- read.delim(file = snakemake@input[[6]])
VirFinder <- read.delim(file = snakemake@input[["VirFinder"]])
VirFinder <- VirFinder %>% select(1,3,4)
colnames(VirFinder)<- c("ContigID","VirFinder_score","VirFinder_pvalue" )
VirFinder <- VirFinder %>% filter(ContigID != "name")
......@@ -52,7 +52,7 @@ VirFinder$VirFinder_prediction <- ifelse(VirFinder$VirFinder_pvalue <=0.05 & Vir
VirFinder$ContigID <- sprintf("%010d", VirFinder$ContigID)
## VirSorter
vs.pred <- read.csv(file = snakemake@input[[7]], quote="", header=FALSE)
vs.pred <- read.csv(file = snakemake@input[["VirSorter"]], quote="", header=FALSE)
vs.head <- setNames(data.frame(matrix(ncol=12, nrow=0)), c("Contig_id","Nb.genes.contigs","Fragment","Nb.genes","Category","Nb.phage.hallmark.genes","Phage.gene.enrichment.sig","Non-Caudovirales.phage.gene.enrichment.sig","Pfam.depletion.sig","Uncharacterized.enrichment.sig","Strand.switch.depletion.sig","Short.genes.enrichment.sig"))
colnames(vs.pred)<-colnames(vs.head)
colnames(vs.pred)[1]<- "vs.id"
......@@ -84,7 +84,7 @@ AMR_MGE <- AMR_MGE %>% mutate_if(is.factor, fct_explicit_na, na_level = "-")
AMR_MGE <- AMR_MGE %>% select(2,4,3,5,6,7,10,11)
AMR_MGE$GeneID <- sprintf("%010d", AMR_MGE$GeneID)
write.csv(AMR_MGE, file=snakemake@output[[1]], row.names=FALSE)
write.csv(AMR_MGE, file=snakemake@output[["Report_1"]], row.names=FALSE)
AMR_MGE_report <- AMR_MGE
AMR_MGE_report$origin <- ifelse(AMR_MGE_report$Plasmid_prediction == "chromosome" & AMR_MGE_report$VirFinder_prediction == "-" & AMR_MGE_report$VirSorter_prediction =="-", "chromosome",
......@@ -93,6 +93,6 @@ AMR_MGE_report$origin <- ifelse(AMR_MGE_report$Plasmid_prediction == "chromosome
ifelse(AMR_MGE_report$Plasmid_prediction == "unclassified" & AMR_MGE_report$VirFinder_prediction == "-" & AMR_MGE_report$VirSorter_prediction =="-", "unclassified", "ambiguous"))))
AMR_MGE_report <- AMR_MGE_report %>% select(1,2,3,4,5,9)
write.csv(AMR_MGE_report, file=snakemake@output[[2]], row.names=FALSE)
write.csv(AMR_MGE_report, file=snakemake@output[["Report_2"]], row.names=FALSE)
......@@ -6,21 +6,21 @@ sink(file=file(snakemake@log[[1]], open="wt"), type="message")
library(tidyverse)
# load virulence prediction
Virulence_factors <- read.delim(file=snakemake@input[[1]])
Virulence_factors <- read.delim(file=snakemake@input[["Virulence_factor"]])
Virulence_factors <- Virulence_factors %>% select(1,5,7)
colnames(Virulence_factors) <- c("GeneID","Virulence_factor_prediction","Virulence_confidence_level")
Virulence_factors$GeneID <- as.numeric(as.character(Virulence_factors$GeneID))
Virulence_factors$GeneID <- sprintf("%010d", Virulence_factors$GeneID)
# load toxin prediction
Toxins <- read.delim(file=snakemake@input[[2]])
Toxins <- read.delim(file=snakemake@input[["Toxins"]])
Toxins <- Toxins %>% select(1,4,5,6)
colnames(Toxins) <- c("GeneID","Toxin_prediction", "SignalP", "Toxin_confidence_level")
Toxins$GeneID <- as.numeric(as.character(Toxins$GeneID))
Toxins$GeneID <- sprintf("%010d", Toxins$GeneID)
# load AMR prediction
AMR_MGE <- read.delim(file=snakemake@input[[3]])
AMR_MGE <- read.delim(file=snakemake@input[["AMR_MGE"]])
colnames(AMR_MGE) <- c("GeneID","Gene_Name","Contig_Name","ARG_prediction","AMR_Category","MGE_prediction")
# Combine files
......
......@@ -6,13 +6,13 @@ sink(file=file(snakemake@log[[1]], open="wt"), type="message")
library(tidyverse)
library(reshape2)
HMM_results<-read.delim(file = snakemake@input[[1]], header = FALSE)
ID <- read.delim(file = snakemake@input[[5]], header = FALSE)
HMM_results<-read.delim(file = snakemake@input[["hmmresults"]], header = FALSE)
ID <- read.delim(file = snakemake@input[["ID"]], header = FALSE)
HMM_results<- merge(HMM_results, ID, by.x = "V1", by.y = "V1", all.y = TRUE)
positive <- read.table(file = snakemake@input[[2]], quote="\"", comment.char="")
negative <- read.table(file = snakemake@input[[3]], quote="\"", comment.char="")
shared <- read.table(file = snakemake@input[[4]], quote="\"", comment.char="")
positive <- read.table(file = snakemake@input[["positive"]], quote="\"", comment.char="")
negative <- read.table(file = snakemake@input[["negative"]], quote="\"", comment.char="")
shared <- read.table(file = snakemake@input[["shared"]], quote="\"", comment.char="")
positive$class <- "positive"
negative$class <- "negative"
shared$class <- "shared"
......
......@@ -9,9 +9,9 @@ sink(file=file(snakemake@log[[1]], open="wt"), type="message")
### Input of the different files ###
####################################
gene_table_HMM<-read.delim(file = snakemake@input[[1]], comment.char="#")
translation<-read.delim(file = snakemake@input[[2]], header=FALSE)
signalP<-read.table(file = snakemake@input[[3]], sep= "\t")
gene_table_HMM<-read.delim(file = snakemake@input[["input_HMM"]], comment.char="#")
translation<-read.delim(file = snakemake@input[["translation"]], header=FALSE)
signalP<-read.table(file = snakemake@input[["signalP"]], sep= "\t")
gene_table_HMM<-gene_table_HMM[,c(1,2,4,3)]
......@@ -21,7 +21,7 @@ colnames(translation)<-c("ID","Query_sequence")
translation<-translation[,c(2,1)]
#Description file of the HMM
library_HMM<-read.table(file = snakemake@input[[4]], sep = ";", header = T, quote = '#')
library_HMM<-read.table(file = snakemake@input[["library"]], sep = ";", header = T, quote = '#')
colnames(library_HMM)<-c("HMM_ID","NAME","Alternative_name", "Database", "Description")
gene_table_library<-merge.data.frame(gene_table_HMM, library_HMM, by.x= "HMM_Name", by.y= "HMM_ID")
gene_table_library<-gene_table_library[,c(2,1,3,4,5,6,7,8)]
......@@ -29,7 +29,7 @@ gene_table_library<-gene_table_library[,c(2,1,3,4,5,6,7,8)]
gene_table_library<-merge.data.frame(translation, gene_table_library, by.x = "ID", by.y = "Query_sequence")
gene_table_library$ID<-sprintf("%010d",as.numeric(gene_table_library$ID))
write.csv(gene_table_library,file = snakemake@output[[1]])
write.csv(gene_table_library,file = snakemake@output[["gene_library"]])
#Toxin/Non-Toxin file
frequency<-data.frame(table(gene_table_HMM[1]))
......@@ -55,4 +55,4 @@ gene_table_Toxic<-as.data.frame(gene_table_Toxic)
gene_table_Toxic<-merge.data.frame(gene_table_Toxic,signalP,by="Query_sequence")
gene_table_Toxic$Query_sequence<-sprintf("%010d",as.numeric(gene_table_Toxic$Query_sequence))
write.csv(gene_table_Toxic,file = snakemake@output[[2]])
write.csv(gene_table_Toxic,file = snakemake@output[["gene_toxic"]])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment