#!/usr/bin/env Rscript ### source: https://github.com/nf-core/chipseq/blob/master/bin/featurecounts_deseq2.r ### own changes and adjustments for snakemake-workflow chipseq are marked with "# AVI: " in comment #MIT License # #Copyright (c) Philip Ewels # #Permission is hereby granted, free of charge, to any person obtaining a copy #of this software and associated documentation files (the "Software"), to deal #in the Software without restriction, including without limitation the rights #to use, copy, modify, merge, publish, distribute, sublicense, and/or sell #copies of the Software, and to permit persons to whom the Software is #furnished to do so, subject to the following conditions: # #The above copyright notice and this permission notice shall be included in all #copies or substantial portions of the Software. # #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR #IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, #FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE #AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER #LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, #OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE #SOFTWARE. ################################################ ################################################ ## REQUIREMENTS ## ################################################ ################################################ ## DIFFERENTIAL ANALYSIS, SCATTERPLOTS AND PCA FOR SAMPLES IN FEATURECOUNTS FILE ## - FIRST SIX COLUMNS OF FEATURECOUNTS_FILE SHOULD BE INTERVAL INFO. REMAINDER OF COLUMNS SHOULD BE SAMPLES-SPECIFIC COUNTS. ## - SAMPLE NAMES HAVE TO END IN "_R1" REPRESENTING REPLICATE ID. LAST 3 CHARACTERS OF SAMPLE NAME WILL BE TRIMMED TO OBTAIN GROUP ID FOR DESEQ2 COMPARISONS. ## - BAM_SUFFIX IS PORTION OF FILENAME AFTER SAMPLE NAME IN FEATURECOUNTS COLUMN SAMPLE NAMES E.G. ".rmDup.bam" if "DRUG_R1.rmDup.bam" ## - PACKAGES BELOW NEED TO BE AVAILABLE TO LOAD WHEN RUNNING R ################################################ ################################################ ## LOAD LIBRARIES ## ################################################ ################################################ #library(optparse) library(DESeq2) library(vsn) library(ggplot2) library(RColorBrewer) library(pheatmap) library(lattice) library(BiocParallel) ################################################ ################################################ ## PARSE COMMAND-LINE PARAMETERS ## ################################################ ################################################ #option_list <- list(make_option(c("-i", "--featurecount_file"), type="character", default=NULL, help="Feature count file generated by the SubRead featureCounts command.", metavar="path"), # make_option(c("-b", "--bam_suffix"), type="character", default=NULL, help="Portion of filename after sample name in featurecount file header e.g. '.rmDup.bam' if 'DRUG_R1.rmDup.bam'", metavar="string"), # make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"), # make_option(c("-p", "--outprefix"), type="character", default='differential', help="Output prefix", metavar="string"), # make_option(c("-s", "--outsuffix"), type="character", default='', help="Output suffix for comparison-level results", metavar="string"), # make_option(c("-v", "--vst"), type="logical", default=FALSE, help="Run vst transform instead of rlog", metavar="boolean"), # make_option(c("-c", "--cores"), type="integer", default=1, help="Number of cores", metavar="integer")) # #opt_parser <- OptionParser(option_list=option_list) #opt <- parse_args(opt_parser) # #if (is.null(opt$featurecount_file)){ # print_help(opt_parser) # stop("Please provide featurecount file.", call.=FALSE) #} #if (is.null(opt$bam_suffix)){ # print_help(opt_parser) # stop("Please provide bam suffix in header of featurecount file.", call.=FALSE) #} ################################################ ################################################ ## READ IN COUNTS FILE ## ################################################ ################################################ featurecount_file <- snakemake@input[[1]] # AVI: adapted to snakemake # AVI: suffix and prefix are already removed in rule featurecounts_modified_colnames #bam_suffix <- ".bam" # AVI #if (snakemake@params[["singleend"]]) { # AVI # bam_suffix <- ".mLb.clN.sorted.bam" # AVI #} count.table <- read.delim(file=featurecount_file,header=TRUE) # AVI: removed 'skip=1', this was already done in rule featurecounts_modified_colnames #colnames(count.table) <- gsub(bam_suffix,"",colnames(count.table)) # AVI #colnames(count.table) <- as.character(lapply(colnames(count.table), function (x) tail(strsplit(x,'.',fixed=TRUE)[[1]],1))) # AVI rownames(count.table) <- count.table$Geneid interval.table <- count.table[,1:6] count.table <- count.table[,7:ncol(count.table),drop=FALSE] ################################################ ################################################ ## RUN DESEQ2 ## ################################################ ################################################ # AVI: this is handled by snakemake #if (file.exists(opt$outdir) == FALSE) { # dir.create(opt$outdir,recursive=TRUE) #} #setwd(opt$outdir) samples.vec <- sort(colnames(count.table)) groups <- sub("_[^_]+$", "", samples.vec) #FIXME needs to be better, as in rna-seq print(unique(groups)) if (length(unique(groups)) == 1) { quit(save = "no", status = 0, runLast = FALSE) } DDSFile <- snakemake@output[["dds"]] # AVI: adapted to snakemake if (file.exists(DDSFile) == FALSE) { counts <- count.table[,samples.vec,drop=FALSE] print(head(counts)) coldata <- data.frame(row.names=colnames(counts),condition=groups) print(head(coldata)) # AVI: set threads limit to prevent the "'bplapply' receive data failed" error # see also https://github.com/kdkorthauer/dmrseq/issues/7 threads <- floor(snakemake@threads[[1]] * 0.75) dds <- DESeqDataSetFromMatrix(countData = round(counts), colData = coldata, design = ~ condition) dds <- DESeq(dds, parallel=TRUE, BPPARAM=MulticoreParam(ifelse(threads>0, threads, 1))) # AVI: set threads limit if (!snakemake@params[["vst"]]) { rld <- rlog(dds) } else { rld <- vst(dds) } save(dds,rld,file=DDSFile) } ################################################# ################################################# ### PLOT QC ## ################################################# ################################################# PlotPCAFile <- snakemake@output[["plot_pca"]] # AVI: adapted to snakemake PlotHeatmapFile <- snakemake@output[["plot_heatmap"]] # AVI: adapted to snakemake if (file.exists(PlotPCAFile) == FALSE) { # pdf(file=PlotFile,onefile=TRUE,width=7,height=7) # AVI: splitted in separate pdf files ## PCA pdf(file=PlotPCAFile,onefile=TRUE,width=7,height=7) # AVI: added to create separate pdf files pca.data <- DESeq2::plotPCA(rld,intgroup=c("condition"),returnData=TRUE) percentVar <- round(100 * attr(pca.data, "percentVar")) plot <- ggplot(pca.data, aes(PC1, PC2, color=condition)) + geom_point(size=3) + xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance")) + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), panel.border = element_rect(colour = "black", fill=NA, size=1)) print(plot) dev.off() # AVI: added to create separate pdf files } ## WRITE PC1 vs PC2 VALUES TO FILE if (file.exists(PlotHeatmapFile) == FALSE) { # AVI: added for splitted pdf files pca.vals <- pca.data[,1:2] colnames(pca.vals) <- paste(colnames(pca.vals),paste(percentVar,'% variance',sep=""), sep=": ") pca.vals <- cbind(sample = rownames(pca.vals), pca.vals) write.table(pca.vals,file=snakemake@output[["pca_data"]],row.names=FALSE,col.names=TRUE,sep="\t",quote=TRUE) # AVI: adapted to snakemake ## SAMPLE CORRELATION HEATMAP pdf(file=PlotHeatmapFile,onefile=TRUE,width=7,height=7) # AVI: added to create separate pdf files sampleDists <- dist(t(assay(rld))) sampleDistMatrix <- as.matrix(sampleDists) colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors) dev.off() # AVI: added to create separate pdf files ## WRITE SAMPLE DISTANCES TO FILE # AVI: adapted to snakemake write.table(cbind(sample = rownames(sampleDistMatrix), sampleDistMatrix),file=snakemake@output[["dist_data"]],row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE) } ################################################# ################################################# ### SAVE SIZE FACTORS ## ################################################# ################################################# #SizeFactorsDir <- "sizeFactors/" #if (file.exists(SizeFactorsDir) == FALSE) { # dir.create(SizeFactorsDir,recursive=TRUE) #} NormFactorsFile <- snakemake@output[["size_factors_rdata"]] # AVI: adapted to snakemake if (file.exists(NormFactorsFile) == FALSE) { normFactors <- sizeFactors(dds) save(normFactors,file=NormFactorsFile) for (name in names(sizeFactors(dds))) { sizeFactorFile <- snakemake@output[["size_factors_res"]] # AVI: adapted to snakemake if (file.exists(sizeFactorFile) == FALSE) { write(as.numeric(sizeFactors(dds)[name]),file=sizeFactorFile) } } } ################################################# ################################################# ### WRITE LOG FILE ## ################################################# ################################################# LogFile <- snakemake@log[[1]] # AVI: adapted to snakemake if (file.exists(LogFile) == FALSE) { cat("\nSamples =",samples.vec,"\n\n",file=LogFile,append=TRUE,sep=', ') cat("Groups =",groups,"\n\n",file=LogFile,append=TRUE,sep=', ') cat("Dimensions of count matrix =",dim(counts),"\n\n",file=LogFile,append=FALSE,sep=' ') cat("\n",file=LogFile,append=TRUE,sep='') } ################################################# ################################################# ### LOOP THROUGH COMPARISONS ## ################################################# ################################################# ResultsFile <- snakemake@output[["results"]] # AVI: adapted to snakemake if (file.exists(ResultsFile) == FALSE) { raw.counts <- counts(dds,normalized=FALSE) colnames(raw.counts) <- paste(colnames(raw.counts),'raw',sep='.') pseudo.counts <- counts(dds,normalized=TRUE) colnames(pseudo.counts) <- paste(colnames(pseudo.counts),'pseudo',sep='.') deseq2_results_list <- list() comparisons <- combn(unique(groups),2) for (idx in 1:ncol(comparisons)) { control.group <- comparisons[1,idx] treat.group <- comparisons[2,idx] CompPrefix <- paste(control.group,treat.group,sep="vs") cat("Saving results for ",CompPrefix," ...\n",sep="") # AVI: this is handled by snakemake #CompOutDir <- paste(CompPrefix,'/',sep="") #if (file.exists(CompOutDir) == FALSE) { # dir.create(CompOutDir,recursive=TRUE) #} control.samples <- samples.vec[which(groups == control.group)] treat.samples <- samples.vec[which(groups == treat.group)] comp.samples <- c(control.samples,treat.samples) comp.results <- results(dds,contrast=c("condition",c(control.group,treat.group))) comp.df <- as.data.frame(comp.results) comp.table <- cbind(interval.table, as.data.frame(comp.df), raw.counts[,paste(comp.samples,'raw',sep='.')], pseudo.counts[,paste(comp.samples,'pseudo',sep='.')]) ## WRITE RESULTS FILE CompResultsFile <- snakemake@output[["results"]] write.table(comp.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE) ## FILTER RESULTS BY FDR & LOGFC AND WRITE RESULTS FILE # pdf(file=snakemake@output[["deseq2_plots"]],width=10,height=8) # AVI: splitted in separate pdf files if (length(comp.samples) > 2) { for (MIN_FDR in c(0.01,0.05)) { ## SUBSET RESULTS BY FDR pass.fdr.table <- subset(comp.table, padj < MIN_FDR) pass.fdr.up.table <- subset(comp.table, padj < MIN_FDR & log2FoldChange > 0) pass.fdr.down.table <- subset(comp.table, padj < MIN_FDR & log2FoldChange < 0) ## SUBSET RESULTS BY FDR AND LOGFC pass.fdr.logFC.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1) pass.fdr.logFC.up.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1 & log2FoldChange > 0) pass.fdr.logFC.down.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1 & log2FoldChange < 0) ## WRITE RESULTS FILE if (MIN_FDR == 0.01) { # AVI: dynamically generated file name extensions added dirs <- c(snakemake@output[["FDR_1_perc_res"]],snakemake@output[["FDR_1_perc_bed"]],snakemake@output[["plot_FDR_1_perc_MA"]], snakemake@output[["plot_FDR_1_perc_volcano"]],snakemake@output[["FDR_5_perc_res"]],snakemake@output[["FDR_5_perc_bed"]], snakemake@output[["plot_FDR_5_perc_MA"]],snakemake@output[["plot_FDR_5_perc_volcano"]],snakemake@output[["plot_sample_corr_heatmap"]], snakemake@output[["plot_scatter"]]) for (dir in dirs) { if (file.exists(dir) == FALSE) { dir.create(dir,recursive=TRUE) } } CompResultsFile <- file.path(snakemake@output[["FDR_1_perc_res"]], paste(snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.txt", sep="")) CompBEDFile <- file.path(snakemake@output[["FDR_1_perc_bed"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.bed", sep="")) MAplotFile <- file.path(snakemake@output[["plot_FDR_1_perc_MA"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.01.pdf", sep="")) # AVI: added to create separate pdf files VolcanoPlotFile <- file.path(snakemake@output[["plot_FDR_1_perc_volcano"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.01.pdf", sep="")) # AVI: added to create separate pdf files } if (MIN_FDR == 0.05) { # AVI: dynamically file name extensions added CompResultsFile <- file.path(snakemake@output[["FDR_5_perc_res"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.txt", sep="")) CompBEDFile <- file.path(snakemake@output[["FDR_5_perc_bed"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.bed", sep="")) # AVI: write paths of FDR 0.05 bed files to igv file cat(paste0(CompBEDFile, "\t255,0,0\n"),file=snakemake@output[["igv_FDR_5_bed"]],append=TRUE) MAplotFile <- file.path(snakemake@output[["plot_FDR_5_perc_MA"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.05.pdf", sep="")) # AVI: added to create separate pdf files VolcanoPlotFile <- file.path(snakemake@output[["plot_FDR_5_perc_volcano"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.05.pdf", sep="")) # AVI: added to create separate pdf files } write.table(pass.fdr.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE) write.table(pass.fdr.table[,c("Chr","Start","End","Geneid","log2FoldChange","Strand")], file=CompBEDFile, col.names=FALSE, row.names=FALSE, sep='\t', quote=FALSE) ## MA PLOT & VOLCANO PLOT pdf(file=MAplotFile,width=10,height=8) # AVI: added to create separate pdf files DESeq2::plotMA(comp.results, main=paste("MA plot FDR <= ",MIN_FDR,sep=""), ylim=c(-2,2),alpha=MIN_FDR) dev.off() # AVI: added to create separate pdf files pdf(file=VolcanoPlotFile,width=10,height=8) # AVI: added to create separate pdf files plot(comp.table$log2FoldChange, -1*log10(comp.table$padj), col=ifelse(comp.table$padj<=MIN_FDR, "red", "black"), xlab="logFC", ylab="-1*log10(FDR)", main=paste("Volcano plot FDR <=",MIN_FDR,sep=" "), pch=20) dev.off() # AVI: added to create separate pdf files ## ADD COUNTS TO LOGFILE cat(CompPrefix," genes with FDR <= ",MIN_FDR,": ",nrow(pass.fdr.table)," (up=",nrow(pass.fdr.up.table),", down=",nrow(pass.fdr.down.table),")","\n",file=LogFile,append=TRUE,sep="") cat(CompPrefix," genes with FDR <= ",MIN_FDR," & FC > 2: ",nrow(pass.fdr.logFC.table)," (up=",nrow(pass.fdr.logFC.up.table),", down=",nrow(pass.fdr.logFC.down.table),")","\n",file=LogFile,append=TRUE,sep="") } cat("\n",file=LogFile,append=TRUE,sep="") # AVI: creates required output files with message } else { warning("More than 2 samples treated with the same antibody are needed to calculate the FDR & LOGFC.") } ## SAMPLE CORRELATION HEATMAP CorrHeatmapFile <- file.path(snakemake@output[["plot_sample_corr_heatmap"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".correlation_heatmap.pdf", sep="")) # AVI: dynamically file name extensions added pdf(file=CorrHeatmapFile,width=10,height=8) # AVI: added to create separate pdf files rld.subset <- assay(rld)[,comp.samples] sampleDists <- dist(t(rld.subset)) sampleDistMatrix <- as.matrix(sampleDists) colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors) dev.off() # AVI: added to create separate pdf files ## SCATTER PLOT FOR RLOG COUNTS ScatterPlotFile <- file.path(snakemake@output[["plot_scatter"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".scatter_plots.pdf", sep="")) # AVI: dynamically file name extensions added pdf(file=ScatterPlotFile,width=10,height=8) # AVI: added to create separate pdf files combs <- combn(comp.samples,2,simplify=FALSE) clabels <- sapply(combs,function(x){paste(x,collapse=' & ')}) plotdat <- data.frame(x=unlist(lapply(combs, function(x){rld.subset[, x[1] ]})),y=unlist(lapply(combs, function(y){rld.subset[, y[2] ]})),comp=rep(clabels, each=nrow(rld.subset))) plot <- xyplot(y~x|comp,plotdat, panel=function(...){ panel.xyplot(...) panel.abline(0,1,col="red") }, par.strip.text=list(cex=0.5)) print(plot) dev.off() # AVI: added to create separate pdf files colnames(comp.df) <- paste(CompPrefix,".",colnames(comp.df),sep="") deseq2_results_list[[idx]] <- comp.df } ## WRITE RESULTS FROM ALL COMPARISONS TO FILE deseq2_results_table <- cbind(interval.table,do.call(cbind, deseq2_results_list),raw.counts,pseudo.counts) write.table(deseq2_results_table, file=ResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE) } ################################################# ################################################# ### R SESSION INFO ## ################################################# ################################################# cat(unlist(sessionInfo()),file=LogFile,append=TRUE,sep='') ################################################# ################################################# ################################################# #################################################