#!/usr/bin/env Rscript # ================================================================================================ # Libraries # ================================================================================================ library("yaml") library("readxl") library("tidyverse") library("ggplot2") source("../libs/conf/confR.R") source("../libs/utils/utils.R") message(paste0("[", Sys.time(), "] Libraries loaded.")) # ================================================================================================ # Configuration # ================================================================================================ options(bitmapType = "cairo") config <- read_config(config_dirs = c("../Confs/", "./")) output_data_dir <- paste0(config$global_data_dir, config$local_data_dir) input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir) message(paste0("[", Sys.time(), "] Configuration done.")) # ================================================================================================ # Functions # ================================================================================================ #' @title Compute the specificity scores of enriched pathways. #' #' @description This functions takes two dataframes that contain the enriched pathways of a #' reference and a control experiments. The functions then merge the two dataframes, computes #' a Delta score that represents the specificity of the enrichemnt in the reference experiment #' with respect to the control experiment. It then returs the pathways and their delta scores, #' possibly filtered to retain only the most significant pathways. #' #' @param DB_ref The reference data-frame, contains at least the pathway identifiers ("ID), #' descriptions ("Description") and adjusted P values ("p.adjust"). #' @param DB_ctrl The control data-frame, contains at least the pathway identifiers ("ID), #' descriptions ("Description") and adjusted P values ("p.adjust"). #' @pval_thres The threshold to use to return only the pathways that reach that level of #' significance either in the control or in the reference experiments. #' @return A dataframe that contains the original data augmented with the Delta scores. compute_delta <- function(DB_ref, DB_ctrl, pval_thres = 0.1) { # Merging both dataframes, keeping onlyh the common functional terms. DB <- merge(x = DB_ref, y = DB_ctrl, by = "ID") %>% mutate(Description = Description.x, pval_ref = p.adjust.x, pval_ctrl = p.adjust.y) %>% select(ID, Description, pval_ref, pval_ctrl) # We compute the delta (-log P based delta). # Delta = -log10(P value reference) + log10( P value control). DB$logP_ref <- -log10(DB$pval_ref) DB$logP_ctrl <- -log10(DB$pval_ctrl) DB$Delta <- DB$logP_ref - DB$logP_ctrl # We keep only pathways that are significant or almost significant at least once (either # in ref or in control). res <- DB %>% filter(pval_ref < pval_thres | pval_ctrl < pval_thres) %>% arrange(Delta) # We clean and return the selected pathways. rm(DB) return(res) } # ================================================================================================ # Main # ================================================================================================ # This script aims at merging the results of functional enrichment for male and female DEGs in # order to select the pathways that are only enriched for one of the two genders (gender specific # pathways). # # We do all integration schemes one by one. for (i in seq_len(length(config$integrations))) { # We get the integration name for that scheme. integration <- config$integrations[[i]] # We create a structure for all results. gsp_data <- c() male_prefix <- paste0(integration$name, "_PDVsControl_males_") female_prefix <- paste0(integration$name, "_PDVsControl_females_") # # KEGG by CP # M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_KEGG.tsv") F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_KEGG.tsv") M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>% mutate(tool = "CP", source = "KEGG")) rm(M_filename, F_filename, M_data, F_data) # # MSIGDB by CP # M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_MSIGDB.tsv") F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_MSIGDB.tsv") M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>% mutate(tool = "CP", source = "MSIGDB")) rm(M_filename, F_filename, M_data, F_data) # # REACTOME by CP # M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_REACTOME.tsv") F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_REACTOME.tsv") M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>% mutate(tool = "CP", source = "REACTOME")) rm(M_filename, F_filename, M_data, F_data) # # GO-BP by CP # M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOBP.tsv") F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOBP.tsv") M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>% mutate(tool = "CP", source = "GOBP")) rm(M_filename, F_filename, M_data, F_data) # # GO-CellularComponent by CP # M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOCC.tsv") F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOCC.tsv") M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>% mutate(tool = "CP", source = "GOCC")) rm(M_filename, F_filename, M_data, F_data) # # GO-MF by CP # M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOMF.tsv") F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOMF.tsv") M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>% select(ID, Description, p.adjust) gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>% mutate(tool = "CP", source = "GOMF")) rm(M_filename, F_filename, M_data, F_data) # We do not run it yet for the ROT and PF tools since there is no obvious p-value to # use (there are several). We can later adapt this script for these two tools as well. # Save file with all specific pathways. ofile <- paste0(output_data_dir, integration$name, "_gender_specific_pathways.tsv") write.table(gsp_data, file = ofile, sep = "\t", quote = FALSE, col.names = NA) rm(ofile, gsp_data) message(paste0("[", Sys.time(), "] [", integration$name, "] Gender specific pathways identified.")) rm(integration, male_prefix, female_prefix) } # End for each integration. rm(i) # Final cleaning. rm(config, input_data_dir, output_data_dir) sessionInfo()