Commit 67e7de53 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

New combined scores to integrate both the pi values and the gender-specificity scores.

parent fdf6b966
......@@ -363,10 +363,8 @@ rm(m)
#for (gene in config$genes_to_plot) {#}
# DEBUG
gene <- "RFC5"
##gene_datasets <- c("GSE20159", "GSE20163", "GSE20292", "GSE26927", "GSE49036", "GSE8397", "NBB")
gene_datasets <- c("GSE20163")
gene <- "XIST"
gene_datasets <- c("GSE20159", "GSE20163", "GSE20292", "GSE26927", "GSE49036", "GSE8397", "NBB")
plot_gene(gene = gene,
gene_datasets = gene_datasets,
......
......@@ -302,6 +302,78 @@ plot_top_gsea_hits <- function(gsea,
}
}
#' @title Reading a file with LFC and P values and computes the pi values.
#'
#' @description This functions reads a results files given an integration scheme, a limma
#' tag and a file tag. It then compute the pi values, mapped the official symbols to
#' EntrezGene identifiers.
#'
#' @param integration The integration scheme to use (SN, DA, SNage).
#' @param limma_tag The name of the limma analysis to use.
#' @param file_tag The name of the file tag to use.
#' @param negative A boolean indicating whether the sign of the pi values should be inversed.
#' @return A dataframe with the gene official symbols, the EntrezGene identifiers (can be NA),
#' the log fold-change, P values and pi values.
get_pi_values <- function(integration, limma_tag, file_tag, negative = FALSE) {
# We define the I/Os.
analysis_prefix <- paste0(integration$name, "_VSN_", limma_tag, "_max-avg_", file_tag)
ofile_prefix <- paste0("CP_", analysis_prefix, "_")
ranking_file <- paste0(output_data_dir, analysis_prefix, "_rankings.tsv")
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE)
rm(ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
# Only if necessary (that is if we have at least one pipe).
ranking_clean <- NULL
if (sum(grepl("\\|", ranking$Gene)) > 0) {
tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
ranking_clean$log_fold_change <- as.numeric(ranking_clean$log_fold_change)
ranking_clean$P_value <- as.numeric(ranking_clean$P_value)
rm(tmp_res)
} else {
ranking_clean <- ranking
ranking_clean$P_value <- as.numeric(ranking_clean$P_value)
ranking_clean$log_fold_change <- as.numeric(ranking_clean$log_fold_change)
}
rm(ranking)
# We then add the pi values, retracting a small epsilon to the P values when they are equal
# to 1, as to avoid ties (which are therefore separated via the associated log fold-change).
ranking_clean <- ranking_clean %>%
mutate(pi_value = -log10(P_value) * abs(log_fold_change))
pseudo_pval_ref <- 0.1 * (9 + max(ranking_clean$P_value[ranking_clean$P_value < 1]))
pseudo_pi_ref <- -log10(pseudo_pval_ref) * abs(ranking_clean$log_fold_change[ranking_clean$pi_value == 0])
ranking_clean$pi_value[ranking_clean$pi_value == 0] <- pseudo_pi_ref
if (negative) {
ranking_clean$pi_value <- -ranking_clean$pi_value
}
rm(pseudo_pval_ref, pseudo_pi_ref)
# We map the gene symbols to the EntrezGene identifiers.
mapped_egenes <- bitr(ranking_clean$Gene,
fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db) #nolint
ranking_final <- merge(x = ranking_clean,
y = mapped_egenes,
by.x = "Gene",
by.y = "SYMBOL",
all.x = TRUE) %>%
mutate(EGene = ENTREZID) %>% select(Gene, EGene, log_fold_change, P_value, pi_value)
ranking_final$log_fold_change <- as.numeric(ranking_final$log_fold_change)
ranking_final$P_value <- as.numeric(ranking_final$P_value)
ranking_final$pi_value <- as.numeric(ranking_final$pi_value)
ranking_final <- ranking_final %>% arrange(desc(pi_value))
rm(mapped_egenes, ranking_clean)
return(ranking_final)
}
# ================================================================================================
# Main
# ================================================================================================
......@@ -316,139 +388,91 @@ for (i in seq_len(length(config$integrations))) {
if (integration$use_for_enrichment == "FALSE") {
next
}
# We do all limma comparisons one by one.
for (j in seq_len(length(config$limma_analyses))) {
# We extract the Limma parameters.
limma_parameters <- config$limma_analyses[[j]]
# We only do the enrichment if it is necessary.
if (limma_parameters$use_for_enrichment == "FALSE") {
next
# For each defined strategy
for (j in seq_len(length(config$enrichment_strategies))) {
# We get the integration name for that scheme.
enrichment_strategy <- config$enrichment_strategies[[j]]
# I/Os
analysis_prefix <- paste0(integration$name, "_VSN_", enrichment_strategy$limmas[1],
"_max-avg_", enrichment_strategy$file_tags[1])
ofile_prefix <- paste0("CP_", analysis_prefix, "_")
# We get the pi values of the two files
ranking_final <- get_pi_values(integration, enrichment_strategy$limmas[1],
enrichment_strategy$file_tags[1], negative = FALSE)
if (!is.na(enrichment_strategy$limmas[2])) {
ranking_final_neg <- get_pi_values(integration, enrichment_strategy$limmas[2],
enrichment_strategy$file_tags[2], negative = TRUE)
ranking_final <- rbind(ranking_final, ranking_final_neg)
}
for (tag in limma_parameters$enrichment_file_tags) {
# We define the I/Os.
analysis_prefix <- paste0(integration$name, "_VSN_", limma_parameters$name, "_max-avg_", tag)
ofile_prefix <- paste0("CP_", analysis_prefix, "_")
ranking_file <- paste0(output_data_dir, analysis_prefix, "_rankings.tsv")
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE)
rm(ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
# Only if necessary (that is if we have at least one pipe).
ranking_clean <- NULL
if (sum(grepl("\\|", ranking$Gene)) > 0) {
tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
ranking_clean$log_fold_change <- as.numeric(ranking_clean$log_fold_change)
ranking_clean$P_value <- as.numeric(ranking_clean$P_value)
rm(tmp_res)
} else {
ranking_clean <- ranking
ranking_clean$P_value <- as.numeric(ranking_clean$P_value)
ranking_clean$log_fold_change <- as.numeric(ranking_clean$log_fold_change)
# We prepare the gene id specific rankings (using either gene names or entrezgene ids).
ranking_final_eg <- ranking_final %>% filter(!is.na(EGene))
ranking_eg <- ranking_final_eg$pi_value
ranking_sy <- ranking_final$pi_value
logFC_eg <- ranking_final_eg$log_fold_change
logFC_sy <- ranking_final$log_fold_change
names(ranking_eg) <- ranking_final_eg$EGene
names(ranking_sy) <- ranking_final$Gene
names(logFC_eg) <- ranking_final_eg$EGene
names(logFC_sy) <- ranking_final$Gene
ranking_eg <- ranking_eg[!duplicated(names(ranking_eg))]
ranking_sy <- ranking_sy[!duplicated(names(ranking_sy))]
logFC_eg <- logFC_eg[!duplicated(names(logFC_eg))]
logFC_sy <- logFC_sy[!duplicated(names(logFC_sy))]
rm(ranking_final, ranking_final_eg)
# We loop over the functional sources.
for (k in seq_len(length(config$functional_sources))) {
# We define the current source.
func_source <- config$functional_sources[[k]]
if (is.null(func_source$cp_name) || func_source$cp_name == "") {
next
}
rm(ranking)
# We then add the pi values, retracting a small epsilon to the P values when they are equal
# to 1, as to avoid ties (which are therefore separated via the associated log fold-change).
ranking_clean <- ranking_clean %>%
mutate(pi_value = -log10(P_value) * abs(log_fold_change))
pseudo_pval_ref <- 0.1 * (9 + max(ranking_clean$P_value[ranking_clean$P_value < 1]))
pseudo_pi_ref <- -log10(pseudo_pval_ref) * abs(ranking_clean$log_fold_change[ranking_clean$pi_value == 0])
ranking_clean$pi_value[ranking_clean$pi_value == 0] <- pseudo_pi_ref
rm(pseudo_pval_ref, pseudo_pi_ref)
# We map the gene symbols to the EntrezGene identifiers.
mapped_egenes <- bitr(ranking_clean$Gene,
fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db) #nolint
ranking_final <- merge(x = ranking_clean,
y = mapped_egenes,
by.x = "Gene",
by.y = "SYMBOL",
all.x = TRUE) %>%
mutate(EGene = ENTREZID) %>% select(Gene, EGene, log_fold_change, P_value, pi_value)
ranking_final$log_fold_change <- as.numeric(ranking_final$log_fold_change)
ranking_final$P_value <- as.numeric(ranking_final$P_value)
ranking_final$pi_value <- as.numeric(ranking_final$pi_value)
ranking_final <- ranking_final %>% arrange(desc(pi_value))
ranking_final_eg <- ranking_final %>% filter(!is.na(EGene))
rm(mapped_egenes, ranking_clean)
# We prepare the gene id specific rankings (using either gene names or entrezgene ids).
ranking_eg <- ranking_final_eg$pi_value
ranking_sy <- ranking_final$pi_value
logFC_eg <- ranking_final_eg$log_fold_change
logFC_sy <- ranking_final$log_fold_change
names(ranking_eg) <- ranking_final_eg$EGene
names(ranking_sy) <- ranking_final$Gene
names(logFC_eg) <- ranking_final_eg$EGene
names(logFC_sy) <- ranking_final$Gene
ranking_eg <- ranking_eg[!duplicated(names(ranking_eg))]
ranking_sy <- ranking_sy[!duplicated(names(ranking_sy))]
logFC_eg <- logFC_eg[!duplicated(names(logFC_eg))]
logFC_sy <- logFC_sy[!duplicated(names(logFC_sy))]
rm(ranking_final, ranking_final_eg)
# We loop over the functional sources.
for (k in seq_len(length(config$functional_sources))) {
# We define the current source.
func_source <- config$functional_sources[[k]]
if (is.null(func_source$cp_name) || func_source$cp_name == "") {
next
}
# We define the reference genes (depending on the source).
ranking <- NULL
logFC <- NULL
if (func_source$cp_gene_id_type == "entrezgene") {
ranking <- ranking_eg
logFC <- logFC_eg
} else if (func_source$cp_gene_id_type == "symbol") {
ranking <- ranking_sy
logFC <- logFC_sy
}
# We run the GSEA itself.
gsea_pi <- perform_gsea(ranking,
type = func_source$cp_name,
file_dir = output_data_dir,
file_prefix = ofile_prefix,
simplify = FALSE)
# We plot all figures for the gsea (PVAL * FC).
plot_enrichment(gsea_pi,
sign(logFC),
ont_tag = func_source$cp_name,
type_tag = "gsea",
file_dir = output_data_dir,
file_prefix = ofile_prefix)
# We define the reference genes (depending on the source).
ranking <- NULL
logFC <- NULL
if (func_source$cp_gene_id_type == "entrezgene") {
ranking <- ranking_eg
logFC <- logFC_eg
} else if (func_source$cp_gene_id_type == "symbol") {
ranking <- ranking_sy
logFC <- logFC_sy
}
# For GSEA, in addition, we plot the top 10 hits individually.
plot_top_gsea_hits(gsea_pi,
n = 10,
ont_tag = func_source$cp_name,
file_dir = output_data_dir,
file_prefix = ofile_prefix,
file_tag = ofile_prefix)
message(paste0("[", Sys.time(), "][", analysis_prefix, "] CP GSEA enrichment (",
func_source$cp_name, ") done."))
rm(func_source, gsea_pi, logFC, ranking)
} # End for each functional source.
rm(k, ranking_eg, ranking_sy, logFC_eg, logFC_sy, analysis_prefix, ofile_prefix)
} # End for each enrichment file tag.
rm(tag, limma_parameters)
} # End for each limma comparison.
# We run the GSEA itself.
gsea_pi <- perform_gsea(ranking,
type = func_source$cp_name,
file_dir = output_data_dir,
file_prefix = ofile_prefix,
simplify = FALSE)
# We plot all figures for the gsea (PVAL and FC).
plot_enrichment(gsea_pi,
sign(logFC),
ont_tag = func_source$cp_name,
type_tag = "gsea",
file_dir = output_data_dir,
file_prefix = ofile_prefix)
# For GSEA, in addition, we plot the top 10 hits individually.
plot_top_gsea_hits(gsea_pi,
n = 10,
ont_tag = func_source$cp_name,
file_dir = output_data_dir,
file_prefix = ofile_prefix,
file_tag = ofile_prefix)
message(paste0("[", Sys.time(), "][", analysis_prefix, "] CP GSEA enrichment (",
func_source$cp_name, ") done."))
rm(func_source, gsea_pi, logFC, ranking)
} # End for each functional source.
rm(k, ranking_eg, ranking_sy, logFC_eg, logFC_sy, analysis_prefix, ofile_prefix)
} # End for each enrichment strategy.
rm(j, integration)
} # End for each integration.
rm(i)
......
......@@ -30,56 +30,42 @@ limma_analyses:
coefficient: "F - M"
name: "FemaleVsMale"
clinical_factor: "Gender"
use_for_enrichment: "FALSE"
enrichment_file_tags: []
use_for_network: "FALSE"
-
factor: Disease.status
coefficient: "PD - Control"
name: "PDVsControl"
clinical_factor: "Disease_status"
use_for_enrichment: "FALSE"
enrichment_file_tags: []
use_for_network: "TRUE"
-
factor: gender_disease_status
coefficient: "F.PD - M.PD"
name: "FemaleVsMale_PD"
clinical_factor: "Gender_PD"
use_for_enrichment: "FALSE"
enrichment_file_tags: []
use_for_network: "FALSE"
-
factor: gender_disease_status
coefficient: "F.Control - M.Control"
name: "FemaleVsMale_control"
clinical_factor: "Gender_Control"
use_for_enrichment: "FALSE"
enrichment_file_tags: []
use_for_network: "FALSE"
-
factor: gender_disease_status
coefficient: "F.PD - F.Control"
name: "PDVsControl_females"
clinical_factor: "Disease_status_females"
use_for_enrichment: "TRUE"
enrichment_file_tags: ["genderspecific", "genderspecific_genderdimorphic"]
use_for_network: "TRUE"
-
factor: gender_disease_status
coefficient: "M.PD - M.Control"
name: "PDVsControl_males"
clinical_factor: "Disease_status_males"
use_for_enrichment: "TRUE"
enrichment_file_tags: ["genderspecific", "genderspecific_genderdimorphic"]
use_for_network: "TRUE"
-
factor: gender_disease_status
coefficient: "(F.PD - F.Control) - (M.PD - M.Control)"
name: "Gender_disease_status"
clinical_factor: "Gender_disease_status"
use_for_enrichment: "TRUE"
enrichment_file_tags: ["genderdimorphic"]
use_for_network: "TRUE"
# Integration schemes
nb_min_pval: 2
......@@ -89,12 +75,12 @@ integrations:
-
name: SN
criteria: tissue;SN
use_for_enrichment: "FALSE"
use_for_enrichment: "TRUE"
use_for_network: "FALSE"
-
name: DA
criteria: tissue;DA
use_for_enrichment: "FALSE"
use_for_enrichment: "TRUE"
use_for_network: "FALSE"
-
name: SNage
......@@ -128,6 +114,22 @@ p_val_combinations:
short_name: zt
method: "z.transform"
# Enrichments parameters
enrichment_strategies:
-
limmas: ["PDVsControl_females", "PDVsControl_males"]
file_tags: ["genderspecific", "genderspecific"]
-
limmas: ["PDVsControl_males", "PDVsControl_females"]
file_tags: ["genderspecific", "genderspecific"]
-
limmas: ["PDVsControl_females", "PDVsControl_males"]
file_tags: ["genderspecific_genderdimorphic", "genderspecific"]
-
limmas: ["PDVsControl_males", "PDVsControl_females"]
file_tags: ["genderspecific_genderdimorphic", "genderspecific"]
-
limmas: ["Gender_disease_status"]
file_tags: ["genderdimorphic"]
nb_permutations: 150000
min_gs_size: 10
max_gs_size: 300
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment