Commit 1d7210ba authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

GeneGO and DoRothEA analyses added (step 08).

parent 746ca8cf
......@@ -192,7 +192,7 @@ for (i in seq_len(length(config$datasets))) {
"][", k, "] Limma fit done."))
# We extract the DEGs for the coefficient of interest (for that limma analysis).
ArrayUtils::extract_DEGs(fit, limma_parameters$name, 1,
ArrayUtils::extract_degs(fit, limma_parameters$name, 1,
output_data_dir = output_data_dir,
file_prefix = dataset_name,
file_suffix = vsn$name)
......
......@@ -17,5 +17,3 @@ check:
@sbatch ${CODE_FOLDER}check.sh
doc:
@sbatch ${CODE_FOLDER}doc.sh
#xp:
# @sbatch ${CODE_FOLDER}extra_plots.sh
......@@ -24,7 +24,7 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# Functions
# ================================================================================================
#' @title Function to save the lists of significnat genes.
#' @title Function to save the lists of significant genes.
#'
#' @description This function saves the significant genes from a data-frame. It saves three
#' files, one with all significant genes, one with only up-regulated genes and one with
......@@ -37,8 +37,8 @@ save_sign_genes <- function(data, file_prefix) {
# We filter the data.
sign_genes <- data %>%
dplyr::mutate(diff_exp = ifelse(sign(logFC_median) == 1, "up", "down")) %>% # nolint
dplyr::select(X, diff_exp)
dplyr::mutate(diff_exp = ifelse(sign(wavg_csss_logFC) == 1, "up", "down")) %>% # nolint
dplyr::select(SYMBOL, diff_exp)
sign_genes_up <- sign_genes %>% dplyr::filter(diff_exp == "up") # nolint
sign_genes_down <- sign_genes %>% dplyr::filter(diff_exp == "down") # nolint
......@@ -54,13 +54,13 @@ save_sign_genes <- function(data, file_prefix) {
quote = FALSE,
col.names = FALSE,
row.names = FALSE)
write.table(sign_genes_up$X,
write.table(sign_genes_up$SYMBOL,
file = sign_genes_up_fn,
sep = "\t",
quote = FALSE,
col.names = FALSE,
row.names = FALSE)
write.table(sign_genes_down$X,
write.table(sign_genes_down$SYMBOL,
file = sign_genes_down_fn,
sep = "\t",
quote = FALSE,
......@@ -135,17 +135,24 @@ for (i in seq_len(length(config$integrations))) {
integration_results <- read.table(int_results_fn,
header = TRUE,
sep = "\t",
row.names = NULL,
row.names = 1,
as.is = TRUE)
rm(int_results_fn)
# We create the data structures to later plot the use counts.
results_core_matrix_full <- integration_results %>%
dplyr::select(-starts_with("sum_of")) %>%
dplyr::select(-starts_with("int_")) %>%
dplyr::select(-starts_with("P_")) %>%
dplyr::select(-starts_with("confidence")) %>%
dplyr::select(-starts_with("wavg")) %>%
dplyr::select(-starts_with("nb_")) %>%
dplyr::select(-starts_with("logFC_")) %>%
dplyr::select(-starts_with("avg_fc_")) %>%
dplyr::select(-starts_with("X"))
dplyr::select(-starts_with("SYMBOL"))
# For the use count plots only: mask (set to NA) per-dataset values whose sign disagrees with the consensus log fold change.
results_csss_lfc <- rep(integration_results$wavg_csss_logFC, dim(results_core_matrix_full)[2])
dim(results_csss_lfc) <- dim(results_core_matrix_full)
results_core_matrix_full[sign(results_core_matrix_full) != sign(results_csss_lfc)] <- NA
nb_nas <- c(0, sapply(results_core_matrix_full,
function(y) sum(is.na(y))))
use_count_full <- dim(results_core_matrix_full)[1] - nb_nas
......@@ -175,7 +182,7 @@ for (i in seq_len(length(config$integrations))) {
# Clean the df for further use (now we only need the global P values).
integration_results <- integration_results %>%
dplyr::select(X, logFC_median, starts_with("int_adj_pval_")) #nolint
dplyr::select(SYMBOL, wavg_csss_logFC, starts_with("int_csss_adj_pval_")) #nolint
# We then study the various P value combination methods one by one.
all_local_counts <- vector()
......@@ -185,7 +192,7 @@ for (i in seq_len(length(config$integrations))) {
# List the significant genes if any.
sign_genes <- integration_results %>%
dplyr::filter(.data[[paste0("int_adj_pval_",
dplyr::filter(.data[[paste0("int_csss_adj_pval_",
pval_comb$name)]] < config$p_val_sign_thres)
# Update the global structures (for comparison).
......@@ -197,11 +204,11 @@ for (i in seq_len(length(config$integrations))) {
# We plot the eCDFs of the P values (if we have at least one gene).
if (dim(sign_genes)[1] > 0) {
sign_genes_4ecdf <- sign_genes %>%
dplyr::mutate(diff_exp = ifelse(sign(logFC_median) == 1, "up", "down")) #nolint
dplyr::mutate(diff_exp = ifelse(sign(wavg_csss_logFC) == 1, "up", "down")) #nolint
ecdf_fn <- paste0(file_prefix, pval_comb$name, "_eCDFs.png")
png(ecdf_fn)
myplot <- ggplot(sign_genes_4ecdf,
aes_string(x = paste0("int_adj_pval_", pval_comb$name),
aes_string(x = paste0("int_csss_adj_pval_", pval_comb$name),
colour = "diff_exp")) +
stat_ecdf(pad = FALSE, geom = "smooth") +
theme(legend.position = "bottom",
......@@ -235,7 +242,7 @@ for (i in seq_len(length(config$integrations))) {
} # End of for each gene-probe selection.
rm(l)
# At this stage, we ant to compare the gene-probe selection / P value combination methods.
# At this stage, we want to compare the gene-probe selection / P value combination methods.
# We start by looking at the overlaps (raw counts and percentages).
# n covers the gene-probe selection methods and the P value combination methods.
# Here n = 2 * 3 = 6.
......@@ -246,8 +253,8 @@ for (i in seq_len(length(config$integrations))) {
rm(n)
for (m in seq_len(length(sign_genes_all) - 1)) {
for (n in (m + 1):length(sign_genes_all)) {
set1 <- sign_genes_all[[m]]$X
set2 <- sign_genes_all[[n]]$X
set1 <- sign_genes_all[[m]]$SYMBOL
set2 <- sign_genes_all[[n]]$SYMBOL
inter_set1_set2 <- intersect(set1, set2)
perc_over <- 100 * length(inter_set1_set2) / min(length(set1), length(set2))
overlap[m, n] <- length(inter_set1_set2)
......
......@@ -86,9 +86,9 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
# We identify the top hits.
top_hits <- results_data_df %>%
dplyr::arrange(!! sym(paste0("int_adj_pval_", pval_comb$name)),
!! sym(paste0("int_pval_", pval_comb$name))) %>%
dplyr::select(X, !! sym(paste0("int_adj_pval_", pval_comb$name))) %>%
dplyr::arrange(!! sym(paste0("int_csss_adj_pval_", pval_comb$name)),
!! sym(paste0("int_csss_pval_", pval_comb$name))) %>%
dplyr::select(SYMBOL, !! sym(paste0("int_csss_adj_pval_", pval_comb$name))) %>%
head(nb_hits)
# We plot each gene sequentially.
......@@ -97,7 +97,7 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
# We get the relevant probe identifiers.
gene <- top_hits[p, ]
matching_data_gene <- matching_data %>%
dplyr::filter(genes == gene$X) %>%
dplyr::filter(genes == gene$SYMBOL) %>%
dplyr::select(ends_with(str_replace(selection_name, "-", ".")))
# We do all datasets one by one and collect the clinical and expression data.
......@@ -176,13 +176,13 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
full_profiles <- rbind(full_profiles, full_zprofiles)
# We can create the plot.
output_figure_file <- paste0(file_prefix, "geneplot_", p, "_", gene$X, ".png")
output_figure_file <- paste0(file_prefix, "geneplot_", p, "_", gene$SYMBOL, ".png")
png(output_figure_file)
message(paste0("[", Sys.time(), "] Plotting ", gene$X, " with clinical factor ",
message(paste0("[", Sys.time(), "] Plotting ", gene$SYMBOL, " with clinical factor ",
clinical_factor))
myplot <- ggplot(full_profiles, aes(x = !!ensym(clinical_factor),
y = Expression,
colour = !!ensym(clinical_factor))) +
y = Expression,
colour = !!ensym(clinical_factor))) +
geom_boxplot(lwd = 1.2, outlier.shape = NA) +
geom_jitter(size = 3, width = 0.2, height = 0.1) +
facet_wrap(~ dataset, scales = "free") +
......@@ -206,11 +206,11 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
legend.text = element_text(size = 16,
face = "bold")) +
scale_color_manual(values = palette) +
ggtitle(paste0(gene$X, " (P = ", formatC(gene[[2]], format = "e", digits = 2), ")"))
ggtitle(paste0(gene$SYMBOL, " (P = ", formatC(gene[[2]], format = "e", digits = 2), ")"))
print(myplot)
dev.off()
rm(myplot, output_figure_file, clinical_factor)
message(paste0("[", Sys.time(), "] Gene ", gene$X, " done."))
message(paste0("[", Sys.time(), "] Gene ", gene$SYMBOL, " done."))
rm(gene, matching_data_gene, full_profiles, full_zprofiles)
} # End for each top hit.
rm(p, pval_comb, file_prefix, top_hits)
......
......@@ -88,15 +88,15 @@ for (i in seq_len(length(config$integrations))) {
# We define the I/Os.
pval_comb <- config$p_val_combinations[[o]]
pval_meth <- paste0("int_adj_pval_", pval_comb$name)
pval_meth <- paste0("int_csss_adj_pval_", pval_comb$name)
# We filter the results for that particular P value combination method.
diff_exp_results <- integration_results %>%
dplyr::select(X, !! sym(pval_meth))
dplyr::select(SYMBOL, !! sym(pval_meth))
# We filter the list of PD associated genes as well to keep only the genes that
# are in both lists.
pd_genes_loc <- merge(pd_genes, diff_exp_results, by.x = "V1", by.y = "X")
pd_genes_loc <- merge(pd_genes, diff_exp_results, by.x = "V1", by.y = "SYMBOL")
# We need to compute the contingency table.
nb_tt <- dim(pd_genes_loc %>%
......
......@@ -11,4 +11,9 @@ enrich:
@sbatch ${CODE_FOLDER}/CP_enrich.sh
@sbatch ${CODE_FOLDER}/G2P_enrich.sh
@sbatch ${CODE_FOLDER}/ROT_enrich.sh
##@sbatch ${CODE_FOLDER}/PF_enrich.sh
pfenrich:
@/bin/bash ${CODE_FOLDER}/PF_enrich.sh
merge:
@sbatch ${CODE_FOLDER}/merge.sh
gsp:
@sbatch ${CODE_FOLDER}/gender_specific_pathways.sh
......@@ -50,11 +50,13 @@ duplicate_row_first_cell <- function(row, sep = "|") {
# We loop over the different files to analyse.
for (i in seq_len(length(config$ranking_files))) {
# We do not do the gender specific genes since we do not have P values (currently).
if (i > 1) {
# We need a P value to start with (PF requirement) so we only run PF on the purely
# limma derived cases and ignore the gender specific cases since the deltas we computed
# ourselves are not associated with any P value.
if (i > 3) {
next
}
# We define the I/Os.
ranking_file <- paste0(output_data_dir, config$ranking_files[[i]])
file_prefix <- paste0("PF_", strsplit(config$ranking_files[[i]], split = "rankings.tsv")[[1]])
......@@ -85,12 +87,13 @@ for (i in seq_len(length(config$ranking_files))) {
# We run the Network based ORA.
output_dir <- paste0(output_data_dir, file_prefix, "tmp")
output_df <- pathfindR::run_pathfindR(input = ranking_final,
gene_sets = func_source$pf_name,
enrichment_threshold = 1,
iterations = 10,
output_dir = output_dir,
pin_name_path = "Biogrid")
output_df <- pathfindR::run_pathfindR(input = ranking_final,
gene_sets = func_source$pf_name,
enrichment_threshold = 1,
iterations = 10,
output_dir = output_dir,
visualize_enriched_terms = FALSE,
pin_name_path = "Biogrid")
otable_fn <- paste0(output_data_dir, file_prefix, func_source$pf_name, ".tsv")
write.table(output_df,
......@@ -102,6 +105,7 @@ for (i in seq_len(length(config$ranking_files))) {
rm(output_dir, otable_fn, output_df)
message(paste0("[", Sys.time(), "][", file_prefix, "] PF GSEA enrichment (",
func_source$pf_name, ") done."))
rm(func_source)
} # End for each functional source.
rm(j, file_prefix, ranking_final)
} # End for each file to analyse.
......
#!/bin/bash -l
#SBATCH -J geneder:07:pf_enrich
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 8
#SBATCH --time=0-24:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/07/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/07-Enrichment/
# Loading modules.
mu
module load lang/R/3.6.0-foss-2018a-X11-20180131-bare
# Actual jobs
Rscript --vanilla ${CODE_FOLDER}/PF_enrich.R > ${OUTPUT_FOLDER}/PF_enrich_log.out 2> ${OUTPUT_FOLDER}/PF_enrich_log.err
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
xvfb-run Rscript --vanilla ${CODE_FOLDER}/PF_enrich.R > ${OUTPUT_FOLDER}/PF_enrich_log.out 2> ${OUTPUT_FOLDER}/PF_enrich_log.err
......@@ -70,93 +70,116 @@ compute_delta <- function(DB_ref, DB_ctrl, pval_thres = 0.1) {
# order to select the pathways that are only enriched for one of the two genders (gender specific
# pathways).
#
# NOTE: the XLSX files the script is reading have been manually built.
# Additional config.
res_folder <- "~/Projects/GeneDER/Results/Enrich_results_v04.02/"
res <- c()
gsp_data <- c()
#
# KEGG
# KEGG by CP
#
DB_M <- read_excel(paste0(res_folder, "07_enrich_M.xlsx"), sheet = "CP_KEGG") %>%
M_filename <- paste0(output_data_dir, "CP_Male_gsea_KEGG.tsv")
F_filename <- paste0(output_data_dir, "CP_Female_gsea_KEGG.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
DB_F <- read_excel(paste0(res_folder, "07_enrich_F.xlsx"), sheet = "CP_KEGG") %>%
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
res <- rbind(res, compute_delta(DB_M, DB_F) %>% mutate(source = "KEGG"))
rm(DB_M, DB_F)
gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
mutate(tool = "CP", source = "KEGG"))
rm(M_filename, F_filename, M_data, F_data)
#
# MSIGDB
# MSIGDB by CP
#
DB_M <- read_excel(paste0(res_folder, "07_enrich_M.xlsx"), sheet = "CP_MSIGDB") %>%
M_filename <- paste0(output_data_dir, "CP_Male_gsea_MSIGDB.tsv")
F_filename <- paste0(output_data_dir, "CP_Female_gsea_MSIGDB.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
DB_F <- read_excel(paste0(res_folder, "07_enrich_F.xlsx"), sheet = "CP_MSIGDB") %>%
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
res <- rbind(res, compute_delta(DB_M, DB_F) %>% mutate(source = "MSIGDB"))
rm(DB_M, DB_F)
gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
mutate(tool = "CP", source = "MSIGDB"))
rm(M_filename, F_filename, M_data, F_data)
#
# REACTOME
# REACTOME by CP
#
DB_M <- read_excel(paste0(res_folder, "07_enrich_M.xlsx"), sheet = "CP_REACTOME") %>%
M_filename <- paste0(output_data_dir, "CP_Male_gsea_REACTOME.tsv")
F_filename <- paste0(output_data_dir, "CP_Female_gsea_REACTOME.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
DB_F <- read_excel(paste0(res_folder, "07_enrich_F.xlsx"), sheet = "CP_REACTOME") %>%
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
res <- rbind(res, compute_delta(DB_M, DB_F) %>% mutate(source = "REACTOME"))
rm(DB_M, DB_F)
gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
mutate(tool = "CP", source = "REACTOME"))
rm(M_filename, F_filename, M_data, F_data)
#
# GO-BP
# GO-BP by CP
#
DB_M <- read_excel(paste0(res_folder, "07_enrich_M.xlsx"), sheet = "CP_GOBP") %>%
M_filename <- paste0(output_data_dir, "CP_Male_gsea_GOBP.tsv")
F_filename <- paste0(output_data_dir, "CP_Female_gsea_GOBP.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
DB_F <- read_excel(paste0(res_folder, "07_enrich_F.xlsx"), sheet = "CP_GOBP") %>%
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
res <- rbind(res, compute_delta(DB_M, DB_F) %>% mutate(source = "GOBP"))
rm(DB_M, DB_F)
gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
mutate(tool = "CP", source = "GOBP"))
rm(M_filename, F_filename, M_data, F_data)
#
# GO-CC
# GO-CC by CP
#
DB_M <- read_excel(paste0(res_folder, "07_enrich_M.xlsx"), sheet = "CP_GOCC") %>%
M_filename <- paste0(output_data_dir, "CP_Male_gsea_GOCC.tsv")
F_filename <- paste0(output_data_dir, "CP_Female_gsea_GOCC.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
DB_F <- read_excel(paste0(res_folder, "07_enrich_F.xlsx"), sheet = "CP_GOCC") %>%
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
res <- rbind(res, compute_delta(DB_M, DB_F) %>% mutate(source = "GOCC"))
rm(DB_M, DB_F)
gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
mutate(tool = "CP", source = "GOCC"))
rm(M_filename, F_filename, M_data, F_data)
#
# GO-MF
# GO-MF by CP
#
DB_M <- read_excel(paste0(res_folder, "07_enrich_M.xlsx"), sheet = "CP_GOMF") %>%
M_filename <- paste0(output_data_dir, "CP_Male_gsea_GOMF.tsv")
F_filename <- paste0(output_data_dir, "CP_Female_gsea_GOMF.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
DB_F <- read_excel(paste0(res_folder, "07_enrich_F.xlsx"), sheet = "CP_GOMF") %>%
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
select(ID, Description, p.adjust)
res <- rbind(res, compute_delta(DB_M, DB_F) %>% mutate(source = "GOMF"))
rm(DB_M, DB_F)
gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
mutate(tool = "CP", source = "GOMF"))
rm(M_filename, F_filename, M_data, F_data)
#
# [G2P] MSIGDB
# MSIGDB by G2P
#
DB_M <- read_excel(paste0(res_folder, "07_enrich_M.xlsx"), sheet = "G2P_MSIGDB") %>%
M_filename <- paste0(output_data_dir, "G2P_Male_MSIGDB.tsv")
F_filename <- paste0(output_data_dir, "G2P_Female_MSIGDB.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE)
M_data$adj_pv = M_data$pv * dim(M_data)[1]
M_data$adj_pv[M_data$adj_pv > 1] <- 1
M_data <- M_data %>%
select(pid, pname, adj_pv) %>% filter(!is.na(adj_pv)) %>%
rename(ID = pid, Description = pname, p.adjust = adj_pv)
DB_F <- read_excel(paste0(res_folder, "07_enrich_F.xlsx"), sheet = "G2P_MSIGDB") %>%
minp_M <- min(M_data$p.adjust[M_data$p.adjust > 0])
M_data$p.adjust[M_data$p.adjust == 0] <- minp_M / 100
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE)
F_data$adj_pv = F_data$pv * dim(F_data)[1]
F_data$adj_pv[F_data$adj_pv > 1] <- 1
F_data <- F_data %>%
select(pid, pname, adj_pv) %>% filter(!is.na(adj_pv)) %>%
rename(ID = pid, Description = pname, p.adjust = adj_pv)
# Correcting for P == 0.
minp_M <- min(DB_M$p.adjust[DB_M$p.adjust > 0])
minp_F <- min(DB_F$p.adjust[DB_F$p.adjust > 0])
DB_M$p.adjust[DB_M$p.adjust == 0] <- minp_M / 100
DB_F$p.adjust[DB_F$p.adjust == 0] <- minp_F / 100
res <- rbind(res, compute_delta(DB_M, DB_F) %>% mutate(source = "G2P_MSIGDB"))
rm(DB_M, DB_F, minp_M, minp_F)
minp_F <- min(F_data$p.adjust[F_data$p.adjust > 0])
F_data$p.adjust[F_data$p.adjust == 0] <- minp_F / 100
gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
mutate(tool = "G2P", source = "MSIGDB"))
rm(M_filename, F_filename, M_data, F_data, minp_M, minp_F)
# Save file with all specific pathways.
ofile <- paste0(output_data_dir, "Gender_specific_pathways.tsv")
write.table(res, file = ofile, sep = "\t", quote = FALSE, col.names = NA)
rm(ofile, res, res_folder)
write.table(gsp_data, file = ofile, sep = "\t", quote = FALSE, col.names = NA)
rm(ofile, gsp_data)
message(paste0("[", Sys.time(), "] Gender specific pathways identified."))
# Final cleaning.
rm(config, input_data_dir, output_data_dir)
......
#!/bin/bash -l
# SLURM submission script for step 07: runs the gender-specific pathways
# R analysis and collects its logs into the output folder.
#SBATCH -J geneder:07:gsp
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 8
#SBATCH --time=0-00:10:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
# Log basic job information for traceability.
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/07/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/07-Enrichment/
# Loading modules.
# NOTE(review): 'mu' is presumably a site-specific alias that sets up the
# module environment — confirm it is available in non-interactive shells.
mu
module load lang/R/3.6.0-foss-2018a-X11-20180131-bare
# Actual jobs
Rscript --vanilla ${CODE_FOLDER}/gender_specific_pathways.R > ${OUTPUT_FOLDER}/gsp_log.out 2> ${OUTPUT_FOLDER}/gsp_log.err
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
#!/usr/bin/env Rscript
# Script preamble: loads libraries and configuration before merging the
# enrichment results produced by the different tools (CP / G2P / PF).
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("stringr")
library("tidyverse")
# In-house helpers: configuration reader (read_config) and generic utilities.
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
# Use the cairo bitmap backend — presumably so PNG devices work on headless
# cluster nodes; confirm against the execution environment.
options(bitmapType = "cairo")
# Build the I/O directories from the global + local config entries.
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
# (No local functions — this script only uses library and in-house helpers.)
# ================================================================================================
# Main
# ================================================================================================
# We loop over the different files to analyse.
for (i in seq_len(length(config$ranking_files))) {
# We define the I/Os.
file_corename <- strsplit(config$ranking_files[[i]], split = "rankings.tsv")[[1]]
# I- We merge the MSIGDB results by CP and G2P.
cp_filename <- paste0(output_data_dir, "CP_", file_corename, "gsea_MSIGDB.tsv")
g2p_filename <- paste0(output_data_dir, "G2P_", file_corename, "MSIGDB.tsv")
combined_fn <- paste0(output_data_dir, "COMB_", file_corename, "MSIGDB.tsv")
cp_data <- read.delim(cp_filename, row.names = 1, stringsAsFactors = FALSE)
g2p_data <- read.delim(g2p_filename, stringsAsFactors = FALSE)
combined <- merge(x = cp_data, y = g2p_data, by.x = "ID", by.y = "pname")
write.table(combined, file = combined_fn, sep = "\t", quote = FALSE)
rm(cp_filename, g2p_filename, combined_fn, cp_data, g2p_data, combined)
# We only have PF results for the first three files.
if (i <= 3) {
# II- We merge the GO-BP results by CP and PF.
cp_filename <- paste0(output_data_dir, "CP_", file_corename, "gsea_GOBP.tsv")
pf_filename <- paste0(output_data_dir, "PF_", file_corename, "GO-BP.tsv")
combined_fn <- paste0(output_data_dir, "COMB_", file_corename, "GO-BP.tsv")
cp_data <- read.delim(cp_filename, row.names = 1, stringsAsFactors = FALSE)
pf_data <- read.delim(pf_filename, row.names = 1, stringsAsFactors = FALSE)
combined <- merge(x = cp_data, y = pf_data, by = "ID")
write.table(combined, file = combined_fn, sep = "\t", quote = FALSE)
rm(cp_filename, pf_filename, combined_fn, cp_data, pf_data, combined)
# III- We merge the GO-CC results by CP and PF.
cp_filename <- paste0(output_data_dir, "CP_", file_corename, "gsea_GOCC.tsv")
pf_filename <- paste0(output_data_dir, "PF_", file_corename, "GO-CC.tsv")
combined_fn <- paste0(output_data_dir, "COMB_", file_corename, "GO-CC.tsv")
cp_data <- read.delim(cp_filename, row.names = 1, stringsAsFactors = FALSE)
pf_data <- read.delim(pf_filename, row.names = 1, stringsAsFactors = FALSE)
combined <- merge(x = cp_data, y = pf_data, by = "ID")