Commit 243303b3 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Cosmetic changes on the enrichment procedure (mostly refactoring file names).

parent cebef2d7
......@@ -3,7 +3,7 @@
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 20
#SBATCH -n 25
#SBATCH --time=0-06:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
......
......@@ -15,7 +15,7 @@ enrich:
@sbatch ${CODE_FOLDER}ROT_enrich.sh
pfenrich:
@/bin/bash ${CODE_FOLDER}PF_enrich.sh
merge:
@sbatch ${CODE_FOLDER}merge.sh
#merge:
# @sbatch ${CODE_FOLDER}merge.sh
gsp:
@sbatch ${CODE_FOLDER}gender_specific_pathways.sh
......@@ -79,14 +79,14 @@ for (i in seq_len(length(config$integrations))) {
# We create a structure for all results.
gsp_data <- c()
male_prefix <- paste0(integration$name, "_PDVsControl_males_")
female_prefix <- paste0(integration$name, "_PDVsControl_females_")
male_prefix <- paste0(integration$name, "_VSN_PDVsControl_males_")
female_prefix <- paste0(integration$name, "_VSN_PDVsControl_females_")
#
# KEGG by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "gsea_KEGG.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "gsea_KEGG.tsv")
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_KEGG.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_KEGG.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
......@@ -98,8 +98,8 @@ for (i in seq_len(length(config$integrations))) {
#
# MSIGDB by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "gsea_MSIGDB.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "gsea_MSIGDB.tsv")
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_MSIGDB.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_MSIGDB.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
......@@ -111,8 +111,8 @@ for (i in seq_len(length(config$integrations))) {
#
# REACTOME by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "gsea_REACTOME.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "gsea_REACTOME.tsv")
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_REACTOME.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_REACTOME.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
......@@ -124,8 +124,8 @@ for (i in seq_len(length(config$integrations))) {
#
# GO-BP by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "gsea_GOBP.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "gsea_GOBP.tsv")
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOBP.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOBP.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
......@@ -137,8 +137,8 @@ for (i in seq_len(length(config$integrations))) {
#
# GO-CC by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "gsea_GOCC.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "gsea_GOCC.tsv")
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOCC.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOCC.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
......@@ -150,8 +150,8 @@ for (i in seq_len(length(config$integrations))) {
#
# GO-MF by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "gsea_GOMF.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "gsea_GOMF.tsv")
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOMF.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOMF.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
......@@ -163,8 +163,8 @@ for (i in seq_len(length(config$integrations))) {
#
# MSIGDB by G2P
#
M_filename <- paste0(output_data_dir, "G2P_", male_prefix, "MSIGDB.tsv")
F_filename <- paste0(output_data_dir, "G2P_", female_prefix, "MSIGDB.tsv")
M_filename <- paste0(output_data_dir, "G2P_", male_prefix, "max-avg_gs_pivalue_MSIGDB.tsv")
F_filename <- paste0(output_data_dir, "G2P_", female_prefix, "max-avg_gs_pivalue_MSIGDB.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE)
M_data$adj_pv = M_data$pv * dim(M_data)[1]
M_data$adj_pv[M_data$adj_pv > 1] <- 1
......
......@@ -24,6 +24,7 @@ module load lang/R/3.6.0-foss-2019a-bare
# Actual jobs
Rscript --vanilla ${CODE_FOLDER}/gender_specific_pathways.R > ${OUTPUT_FOLDER}/gsp_log.out 2> ${OUTPUT_FOLDER}/gsp_log.err
Rscript --vanilla ${CODE_FOLDER}/pathways_with_gender_specific_genes.R > ${OUTPUT_FOLDER}/pgsg_log.out 2> ${OUTPUT_FOLDER}/pgsg_log.err
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("readxl")
library("tidyverse")
library("ggplot2")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
get_overlap <- function(row) {
genes_ref <- str_split(row[5], "/")[[1]]
genes_ctrl <- str_split(row[6], "/")[[1]]
overlap <- length(intersect(genes_ref, genes_ctrl)) / min(length(genes_ref), length(genes_ctrl))
return(overlap)
}
#' @title Compute the specificity scores of enriched pathways.
#'
#' @description This functions takes two dataframes that contain the enriched pathways of a
#' reference and a control experiments. The functions then merge the two dataframes, computes
#' a Delta score that represents the specificity of the enrichemnt in the reference experiment
#' with respect to the control experiment. It then returs the pathways and their delta scores,
#' possibly filtered to retain only the most significant pathways.
#'
#' @param DB_ref The reference data-frame, contains at least the pathway identifiers ("ID),
#' descriptions ("Description") and adjusted P values ("p.adjust").
#' @param DB_ctrl The control data-frame, contains at least the pathway identifiers ("ID),
#' descriptions ("Description") and adjusted P values ("p.adjust").
#' @pval_thres The threshold to use to return only the pathways that reach that level of
#' significance either in the control or in the reference experiments.
#' @return A dataframe that contains the original data augmented with the Delta scores.
compute_overlap <- function(DB_ref, DB_ctrl, pval_thres = 0.1) {
# Merging both dataframes, keeping onlyh the common functional terms.
DB <- merge(x = DB_ref, y = DB_ctrl, by = "ID") %>%
mutate (Description = Description.x,
pval_ref = p.adjust.x,
pval_ctrl = p.adjust.y,
genes_ref = core_enrichment.x,
genes_ctrl = core_enrichment.y
) %>% select(ID, Description, pval_ref, pval_ctrl, genes_ref, genes_ctrl)
# We compute the overlap.
DB$Overlap <- apply(DB, 1, get_overlap)
# We keep only pathways that are significant or almost significant at least once (either
# in ref or in control).
res <- DB %>% filter(pval_ref < pval_thres | pval_ctrl < pval_thres) %>%
arrange(Overlap)
# We clean and return the selected pathways.
rm(DB)
return(res)
}
# ================================================================================================
# Main
# ================================================================================================
# This script aims at merging the results of functional enrichment for male and female DEGs in
# order to select the pathways that are only enriched for one of the two genders (gender specific
# pathways).
#
# We do all integration schemes one by one.
for (i in seq_len(length(config$integrations))) {
# We get the integration name for that scheme.
integration <- config$integrations[[i]]
# We create a structure for all results.
gsp_data <- c()
male_prefix <- paste0(integration$name, "_VSN_PDVsControl_males_")
female_prefix <- paste0(integration$name, "_VSN_PDVsControl_females_")
#
# KEGG by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_all_pivalue_gsea_KEGG.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_all_pivalue_gsea_KEGG.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
gsp_data <- rbind(gsp_data, compute_overlap(M_data, F_data) %>%
mutate(tool = "CP", source = "KEGG"))
rm(M_filename, F_filename, M_data, F_data)
#
# MSIGDB by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_all_pivalue_gsea_MSIGDB.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_all_pivalue_gsea_MSIGDB.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
gsp_data <- rbind(gsp_data, compute_overlap(M_data, F_data) %>%
mutate(tool = "CP", source = "MSIGDB"))
rm(M_filename, F_filename, M_data, F_data)
#
# REACTOME by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_all_pivalue_gsea_REACTOME.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_all_pivalue_gsea_REACTOME.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
gsp_data <- rbind(gsp_data, compute_overlap(M_data, F_data) %>%
mutate(tool = "CP", source = "REACTOME"))
rm(M_filename, F_filename, M_data, F_data)
#
# GO-BP by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_all_pivalue_gsea_GOBP.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_all_pivalue_gsea_GOBP.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
gsp_data <- rbind(gsp_data, compute_overlap(M_data, F_data) %>%
mutate(tool = "CP", source = "GOBP"))
rm(M_filename, F_filename, M_data, F_data)
#
# GO-CC by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_all_pivalue_gsea_GOCC.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_all_pivalue_gsea_GOCC.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
gsp_data <- rbind(gsp_data, compute_overlap(M_data, F_data) %>%
mutate(tool = "CP", source = "GOCC"))
rm(M_filename, F_filename, M_data, F_data)
#
# GO-MF by CP
#
M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_all_pivalue_gsea_GOMF.tsv")
F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_all_pivalue_gsea_GOMF.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust, core_enrichment)
gsp_data <- rbind(gsp_data, compute_overlap(M_data, F_data) %>%
mutate(tool = "CP", source = "GOMF"))
rm(M_filename, F_filename, M_data, F_data)
# Save file with all specific pathways.
ofile <- paste0(output_data_dir, integration$name, "_pathways_with_gender_specific_genes.tsv")
write.table(gsp_data, file = ofile, sep = "\t", quote = FALSE, col.names = NA)
rm(ofile, gsp_data)
message(paste0("[", Sys.time(), "] [", integration$name, "] Pathways with gender specific genes identified."))
rm(integration, male_prefix, female_prefix)
} # End for each integration.
rm(i)
# Final cleaning.
rm(config, input_data_dir, output_data_dir)
sessionInfo()
......@@ -36,7 +36,7 @@ limma_analyses:
coefficient: "PD - Control"
name: "PDVsControl"
clinical_factor: "Disease_status"
use_for_network: "TRUE"
use_for_network: "FALSE"
-
factor: gender_disease_status
coefficient: "F.PD - M.PD"
......@@ -66,7 +66,7 @@ limma_analyses:
coefficient: "(F.PD - F.Control) - (M.PD - M.Control)"
name: "Gender_disease_status"
clinical_factor: "Gender_disease_status"
use_for_network: "TRUE"
use_for_network: "FALSE"
# Integration schemes
nb_min_pval: 2
perc_min_pval: 0.33334
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment