Gitlab is now using https://gitlab.lcsb.uni.lu as it's primary address. Please update your bookmarks. FAQ.

Commit 21195b19 authored by Laura Denies's avatar Laura Denies
Browse files

Updated AMR workflow: combine results

parent 96415bbc
......@@ -9,16 +9,17 @@ import os
rule combine_AMR_plasmid:
input:
AMR="{datadir}/{project}/AMR/deepARG_results/{sample}.out.mapping.ARG",
AMR_translation="{datadir}/{project}/renamed/{sample}_translation.tsv",
Plasmid="{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}_plasflow_prediction_final.tsv",
ORF_translation="{datadir}/{project}/renamed/{sample}_translation.tsv",
Contig_ORFs="{datadir}/{project}/Prodigal/{sample}.contig",
Contig_translation="{datadir}/{project}/renamed/{sample}_Contig_translation.tsv",
Contig_gene_list="{datadir}/{project}/Prodigal/{sample}.contig",
VirFinder="{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv",
AMR="{datadir}/{project}/AMR/{sample}_AMR_prediction.tsv",
PlasFlow="{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}_plasflow_prediction_final.tsv",
MOB_suite="{datadir}/{project}/MGE/plasmid/MOB_suite/{sample}_MOB_suite_aggregated.tsv",
DeepVirFinder="{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv",
VirSorter="{datadir}/{project}/MGE/phage/{sample}_VIRSorter_aggregated.csv"
output:
Report_1=temp("{datadir}/{project}/AMR/{sample}_MGE_AMR_prediction_detail_temp.csv"),
Report_2=temp("{datadir}/{project}/AMR/{sample}_MGE_AMR_prediction_report_temp.csv")
Report_1="{datadir}/{project}/AMR/{sample}_AMR_MGE_prediction_detailed.tsv",
Report_2="{datadir}/{project}/AMR_MGE_prediction_{sample}_report.tsv"
log:
"{datadir}/{project}/AMR/{sample}_MGE_AMR_prediction_detail_temp.log"
params:
......@@ -30,30 +31,3 @@ rule combine_AMR_plasmid:
script:
"../../scripts/AMR_MGE.R"
rule modify_details:
input:
"{datadir}/{project}/AMR/{sample}_MGE_AMR_prediction_detail_temp.csv"
output:
"{datadir}/{project}/AMR/{sample}_AMR_MGE_prediction_detailed.tsv"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
sed 's/"//g' {input} | sed 's/,/\\t/g' > {output}
"""
rule modify_report:
input:
"{datadir}/{project}/AMR/{sample}_MGE_AMR_prediction_report_temp.csv"
output:
"{datadir}/{project}/AMR_MGE_prediction_{sample}_report.tsv"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
sed 's/"//g' {input} | sed 's/,/\\t/g' > {output}
"""
......@@ -5,51 +5,70 @@ sink(file=file(snakemake@log[[1]], open="wt"), type="message")
library(tidyverse)
# AMR prediction
AMR <- read.delim(file = snakemake@input[["AMR"]], header=FALSE, comment.char="#")
AMR <- AMR %>% select(1,4,5)
colnames(AMR) <- c("ARG","GeneID","Category")
# ORF Translation
ORF_translation<-read.delim(file = snakemake@input[["ORF_translation"]], header=FALSE)
colnames(ORF_translation) <- c("ORF_ID", "ORF")
ORF_translation$ORF<- sub('>', '', ORF_translation$ORF)
ORF_translation$ORF_ID <- sprintf("%010d", ORF_translation$ORF_ID)
# Mapping File
Contig_ORFs <- Contig_ORFs <- read.delim(file = snakemake@input[["Contig_ORFs"]], header=FALSE)
colnames(Contig_ORFs) <- c("Contig","ORF")
AMR_translation<-read.delim(file = snakemake@input[["AMR_translation"]], header=FALSE)
colnames(AMR_translation) <- c("GeneID", "Gene_Name")
AMR_translation$Gene_Name<- sub('>', '', AMR_translation$Gene_Name)
# AMR prediction
AMR <- read.delim(file = snakemake@input[["AMR"]])
AMR$ORF_ID <- sprintf("%010d", AMR$ORF_ID)
AMR <- merge(AMR, ORF_translation, by="ORF_ID", all=TRUE)
AMR <- AMR %>% mutate_all(list(~replace(as.character(.), is.na(.), "-")))
AMR <- merge(AMR, Contig_ORFs, by="ORF", all=TRUE)
AMR <- AMR %>% select(9,1,2,3,4,5,6,7,8)
# Contig Translation
Contig_translation <- read.delim(file = snakemake@input[["Contig_translation"]], header=FALSE)
colnames(Contig_translation) <- c("Contig_ID","Contig")
Contig_translation$Contig<- sub('>', '', Contig_translation$Contig)
Contig_translation$Contig_ID <- sprintf("%010d", Contig_translation$Contig_ID)
# Plasmid Prediction
Plasmid <- read.delim(file = snakemake@input[["Plasmid"]], header=TRUE)
Plasmid <- separate(data= Plasmid, col=label, into=c("left", "right"), sep="\\.")
Plasmid <- Plasmid %>% select(1,2)
colnames(Plasmid) <-c("ContigID","Plasmid_prediction")
Plasmid <- Plasmid %>% filter(ContigID != "contig_name")
Plasmid$ContigID <- sprintf("%010d", Plasmid$ContigID)
Plasmid_translation <- read.delim(file = snakemake@input[["Contig_translation"]], header=FALSE)
colnames(Plasmid_translation) <- c("ContigID", "Contig_Name")
Plasmid_translation$Contig_Name <- sub('>', '', Plasmid_translation$Contig_Name)
# Combine AMR and Plasmid prediction
Contig_gene_list <- read.delim(file = snakemake@input[["Contig_gene_list"]], header=FALSE)
colnames(Contig_gene_list) <- c("Contig_Name", "Gene_Name")
Contig_gene_list <- merge(Contig_gene_list, AMR_translation, by="Gene_Name", all=TRUE)
Contig_gene_list <- merge(Contig_gene_list, Plasmid_translation, by="Contig_Name", all=TRUE)
Contig_gene_list$ContigID <- as.numeric(as.character(Contig_gene_list$ContigID))
Contig_gene_list$ContigID <- sprintf("%010d", Contig_gene_list$ContigID)
AMR_Plasmid <- merge(Contig_gene_list, AMR, by="GeneID", all = TRUE)
AMR_Plasmid <- merge(AMR_Plasmid, Plasmid, by="ContigID", all = TRUE)
AMR_Plasmid$ARG <- fct_explicit_na(AMR_Plasmid$ARG, na_level = "-")
AMR_Plasmid$Category <- fct_explicit_na(AMR_Plasmid$Category, na_level = "-")
AMR_Plasmid$Plasmid_prediction <- fct_explicit_na(AMR_Plasmid$Plasmid_prediction, na_level = "unclassified")
# Phage prediction
## PlasFlow
PlasFlow <- read.delim(file = snakemake@input[["PlasFlow"]], header=TRUE)
PlasFlow <- separate(data= PlasFlow, col=label, into=c("left", "right"), sep="\\.")
PlasFlow <- PlasFlow %>% select(1,2)
colnames(PlasFlow) <-c("Contig_ID","PlasFlow_prediction")
PlasFlow <- PlasFlow %>% filter(Contig_ID != "contig_name")
#PlasFlow$Contig_ID <- sprintf("%010d", PlasFlow$Contig_ID)
## MOBsuite
MOB_suite <- read.delim(file = snakemake@input[["MOB_suite"]], header=TRUE)
MOB_suite <- MOB_suite %>% filter(sample_id != "sample_id")
MOB_suite$mash_neighbor_distance <- as.numeric(as.character(MOB_suite$mash_neighbor_distance))
MOB_suite$MOB_suite_prediction <- if_else(MOB_suite$mash_neighbor_distance <= 0.06 , "plasmid","-")
MOB_suite <- MOB_suite %>% select(1,27) %>% filter(MOB_suite_prediction == "plasmid")
colnames(MOB_suite)<- c("Contig_ID","MOB_suite_prediction")
#MOB_suite$Contig_ID <- sprintf("%010d", MOB_suite$Contig_ID)
## Combine PlasFlow and MOB_suite
Plasmid <- merge(PlasFlow, MOB_suite, by="Contig_ID", all = TRUE)
Plasmid <- merge(Plasmid, Contig_translation, by= "Contig_ID", all= TRUE)
Plasmid$PlasFlow_prediction <- fct_explicit_na(Plasmid$PlasFlow_prediction, na_level = "unclassified")
Plasmid$MOB_suite_prediction <- fct_explicit_na(Plasmid$MOB_suite_prediction, na_level = "unclassified")
Plasmid$Plasmid_prediction <- ifelse(Plasmid$PlasFlow_prediction == "plasmid" | Plasmid$MOB_suite_prediction == "plasmid", "plasmid", "-")
Plasmid$Plasmid_database <- ifelse(Plasmid$Plasmid_prediction == "plasmid" & Plasmid$PlasFlow_prediction == "plasmid" & Plasmid$MOB_suite_prediction == "plasmid", "PlasFlow/MOB_suite",
ifelse(Plasmid$Plasmid_prediction == "plasmid" & Plasmid$PlasFlow_prediction == "plasmid" & Plasmid$MOB_suite_prediction != "plasmid", "PlasFlow",
ifelse(Plasmid$Plasmid_prediction == "plasmid" & Plasmid$PlasFlow_prediction != "plasmid" & Plasmid$MOB_suite_prediction == "plasmid", "MOB_suite", "-")))
Plasmid$Chromosome_prediction <- ifelse(Plasmid$PlasFlow_prediction == "chromosome" & Plasmid$MOB_suite_prediction != "plasmid", "chromosome","-")
Plasmid <- Plasmid %>% select(4,1,7,5,6)
# Phage
## VirFinder
VirFinder <- read.delim(file = snakemake@input[["VirFinder"]])
VirFinder <- read.delim(file = snakemake@input[["DeepVirFinder"]])
VirFinder <- VirFinder %>% select(1,3,4)
colnames(VirFinder)<- c("ContigID","VirFinder_score","VirFinder_pvalue" )
VirFinder <- VirFinder %>% filter(ContigID != "name")
colnames(VirFinder)<- c("Contig_ID","VirFinder_score","VirFinder_pvalue" )
VirFinder <- VirFinder %>% filter(Contig_ID != "name")
VirFinder$VirFinder_pvalue <- as.numeric(as.character(VirFinder$VirFinder_pvalue))
VirFinder$VirFinder_score <- as.numeric(as.character(VirFinder$VirFinder_score))
VirFinder$VirFinder_prediction <- ifelse(VirFinder$VirFinder_pvalue <=0.05 & VirFinder$VirFinder_score >=0.70, "Phage", "-")
VirFinder$ContigID <- sprintf("%010d", VirFinder$ContigID)
VirFinder$VirFinder_prediction <- ifelse(VirFinder$VirFinder_pvalue <=0.05 & VirFinder$VirFinder_score >=0.70, "phage", "-")
#VirFinder$Contig_ID <- sprintf("%010d", VirFinder$Contig_ID)
## VirSorter
vs.pred <- read.csv(file = snakemake@input[["VirSorter"]], quote="", header=FALSE)
......@@ -68,36 +87,39 @@ VirSorter <- vs.pred %>% select(5,13)
VirSorter$node <- gsub("(^|[^0-9])0+", "\\1", VirSorter$node, perl = TRUE)
VirSorter$Virsorter_prediction <- ifelse(VirSorter$Category == "Prophages 3", "-",
ifelse(VirSorter$Category == "Complete phage contigs 3", "-",
ifelse(VirSorter$Category == "-", "-", "Phage")))
ifelse(VirSorter$Category == "-", "-", "phage")))
VirSorter <- VirSorter %>% select(2,3)
colnames(VirSorter)<- c("ContigID","VirSorter_prediction")
VirSorter$ContigID <- as.numeric(as.character(VirSorter$ContigID))
VirSorter$ContigID <- sprintf("%010d", VirSorter$ContigID)
colnames(VirSorter)<- c("Contig_ID","VirSorter_prediction")
VirSorter$Contig_ID <- as.numeric(as.character(VirSorter$Contig_ID))
#VirSorter$Contig_ID <- sprintf("%010d", VirSorter$Contig_ID)
## Combine Phage prediction
Phage_prediction <- merge(VirFinder, VirSorter, by="ContigID", all = TRUE)
if(all(is.na(Phage_prediction$VirSorter_prediction))){
# issue #39
Phage_prediction$VirSorter_prediction <- factor(rep("-", nrow(Phage_prediction)))
Phage <- merge(VirFinder, VirSorter, by="Contig_ID", all = TRUE)
if(all(is.na(Phage$VirSorter_prediction))){
Phage$VirSorter_prediction <- factor(rep("-", nrow(Phage)))
} else {
Phage_prediction$VirSorter_prediction <- fct_explicit_na(Phage_prediction$VirSorter_prediction, na_level = "-")
Phage$VirSorter_prediction <- fct_explicit_na(Phage$VirSorter_prediction, na_level = "-")
}
# Combine AMR/MGE with Phage prediction
AMR_MGE <- merge(AMR_Plasmid, Phage_prediction, by="ContigID", all = TRUE)
AMR_MGE <- AMR_MGE %>% mutate_if(is.factor, fct_explicit_na, na_level = "-")
AMR_MGE <- AMR_MGE %>% select(2,4,3,5,6,7,10,11)
AMR_MGE$GeneID <- sprintf("%010d", AMR_MGE$GeneID)
write.csv(AMR_MGE, file=snakemake@output[["Report_1"]], row.names=FALSE)
AMR_MGE_report <- AMR_MGE
AMR_MGE_report$origin <- ifelse(AMR_MGE_report$Plasmid_prediction == "chromosome" & AMR_MGE_report$VirFinder_prediction == "-" & AMR_MGE_report$VirSorter_prediction =="-", "chromosome",
ifelse(AMR_MGE_report$Plasmid_prediction == "plasmid" & AMR_MGE_report$VirFinder_prediction == "-" & AMR_MGE_report$VirSorter_prediction =="-", "plasmid",
ifelse(AMR_MGE_report$Plasmid_prediction == "unclassified" & AMR_MGE_report$VirFinder_prediction == "Phage" | AMR_MGE_report$Plasmid_prediction == "unclassified" & AMR_MGE_report$VirSorter_prediction =="Phage", "phage",
ifelse(AMR_MGE_report$Plasmid_prediction == "unclassified" & AMR_MGE_report$VirFinder_prediction == "-" & AMR_MGE_report$VirSorter_prediction =="-", "unclassified", "ambiguous"))))
AMR_MGE_report <- AMR_MGE_report %>% select(1,2,3,4,5,9)
write.csv(AMR_MGE_report, file=snakemake@output[["Report_2"]], row.names=FALSE)
Phage$Phage_prediction <- ifelse(Phage$VirFinder_prediction == "phage" | Phage$VirSorter_prediction == "phage", "phage", "-")
Phage$Phage_database <- ifelse(Phage$Phage_prediction == "phage" & Phage$VirFinder_prediction == "phage" & Phage$VirSorter_prediction == "phage", "VIRSorter/DeepVirFinder",
ifelse(Phage$Phage_prediction == "phage" & Phage$VirFinder_prediction == "phage" & Phage$VirSorter_prediction != "phage", "DeepVirFinder",
ifelse(Phage$Phage_prediction == "phage" & Phage$VirFinder_prediction != "phage" & Phage$VirSorter_prediction == "phage", "VIRSorter","-")))
Phage <- Phage %>% select(1,6,7)
# Combine MGEs
MGEs <- merge(Plasmid, Phage, by = "Contig_ID", all=TRUE)
MGEs$MGE_prediction <- ifelse(MGEs$Plasmid_prediction == "plasmid" & MGEs$Phage_prediction == "-" & MGEs$Chromosome_prediction == "-", "plasmid",
ifelse(MGEs$Plasmid_prediction == "-" & MGEs$Phage_prediction == "phage" & MGEs$Chromosome_prediction == "-", "phage",
ifelse(MGEs$Plasmid_prediction == "-" & MGEs$Phage_prediction == "-" & MGEs$Chromosome_prediction == "chromosome", "chromosome",
ifelse(MGEs$Plasmid_prediction == "plasmid" & MGEs$Phage_prediction == "-" & MGEs$Chromosome_prediction == "chromosome", "ambiguous (plasmid/chromosome)",
ifelse(MGEs$Plasmid_prediction == "-" & MGEs$Phage_prediction == "phage" & MGEs$Chromosome_prediction == "chromosome", "ambiguous (phage/chromosome)",
ifelse(MGEs$Plasmid_prediction == "plasmid" & MGEs$Phage_prediction == "phage" & MGEs$Chromosome_prediction == "-", "ambiguous (plasmid/phage)","unclassified"))))))
# Combine AMR MGEs
AMR_MGE <- merge(AMR, MGEs, by="Contig", all.x = TRUE)
AMR_MGE <- AMR_MGE %>% select(1,10,2,3,4,5,6,7,8,9,11,12,13,14,15,16)
write.table(AMR_MGE, file = snakemake@output[["Report_1"]], sep="\t", row.names=FALSE, quote=FALSE)
AMR_MGE_final <- AMR_MGE %>% select(1:10,16)
write.table(AMR_MGE_final, file = snakemake@output[["Report_2"]], sep="\t", row.names=FALSE, quote=FALSE)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment