Commit 0b393707 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Adapted the enrichment procedure to the new ranking format.

parent 6bfe3fb1
......@@ -208,45 +208,109 @@ GD <- resGDS_potGDsel_only %>% filter(!is.na(log_fold_change))
#potGD_notinGDS <- resGDS_potGDsel_only %>% filter(is.na(log_fold_change))
rm(resGDS_potGDsel_only, potGDsel, potGD)
# BETTER SAVE FOR FIURTHER ANALA?SIOS
# We save the files we have generated, starting with the female gender-specific genes.
GS_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_none_", selection$name, "_genelist_genderspecific_females.tsv")
k <- 5
limma_analysis <- config$limma_analyses[[k]]
GS_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
"_", selection$name, "_genderspecific.tsv")
write.table(GS_females, GS_females_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
GS_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
"_", selection$name, "_genderspecific_rankings.tsv")
write.table(GS_females %>%
mutate(log_fold_change = log_fold_change.F) %>%
mutate(P_value = P_value.F) %>%
select(Gene, log_fold_change, P_value),
GS_females_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(GS_females_fn,GS_females)
# And the male gender-specific genes.
GS_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_none_", selection$name, "_genelist_genderspecific_males.tsv")
write.table(GS_males, GS_males_fn,
# The complete female list (gender-specific + gender-dimorphic).
GSGD_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
"_", selection$name, "_genderspecific_genderdimorphic.tsv")
write.table(GSGD_females, GSGD_females_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(GS_males_fn, GS_males)
# The gender-dimorphic genes.
GD_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_none_", selection$name, "_genelist_genderdimorphic.tsv")
write.table(GD, GD_fn,
GSGD_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
"_", selection$name, "_genderspecific_genderdimorphic_rankings.tsv")
write.table(GSGD_females %>%
mutate(log_fold_change = log_fold_change.F) %>%
mutate(P_value = P_value.F) %>%
select(Gene, log_fold_change, P_value),
GSGD_females_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(GD_fn, GD)
rm(GSGD_females_fn, GSGD_females)
rm(k, limma_analysis)
# The complete female list (gender-specific + gender-dimorphic).
GSGD_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_none_", selection$name, "_genelist_genderspecific_genderdimorphic_females.tsv")
write.table(GSGD_females, GSGD_females_fn,
# And the male gender-specific genes.
k <- 6
limma_analysis <- config$limma_analyses[[k]]
GS_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
"_", selection$name, "_genderspecific.tsv")
write.table(GS_males, GS_males_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(GSGD_females_fn, GSGD_females)
GS_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
"_", selection$name, "_genderspecific_rankings.tsv")
write.table(GS_males %>%
mutate(log_fold_change = log_fold_change.M) %>%
mutate(P_value = P_value.M) %>%
select(Gene, log_fold_change, P_value),
GS_males_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(GS_males_fn, GS_males)
# The complete male list (gender-specific + gender-dimorphic).
GSGD_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_none_", selection$name, "_genelist_genderspecific_genderdimorphic_males.tsv")
GSGD_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
"_", selection$name, "_genderspecific_genderdimorphic.tsv")
write.table(GSGD_males, GSGD_males_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
GSGD_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
"_", selection$name, "_genderspecific_genderdimorphic_rankings.tsv")
write.table(GSGD_males %>%
mutate(log_fold_change = log_fold_change.M) %>%
mutate(P_value = P_value.M) %>%
select(Gene, log_fold_change, P_value),
GSGD_males_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(GSGD_males_fn, GSGD_males)
rm(k, limma_analysis)
# The gender-dimorphic genes.
k <- 7
limma_analysis <- config$limma_analyses[[k]]
GD_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
"_", selection$name, "_genderdimorphic.tsv")
write.table(GD, GD_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
GD_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
"_", selection$name, "_genderdimorphic_rankings.tsv")
write.table(GD %>%
select(Gene, log_fold_change, P_value),
GD_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(GD_fn, GD)
rm(k, limma_analysis)
# We clean the workspace and log the session details for reproducibility.
rm(l, selection, vsn, j, integration, int_criteria, i)
......
......@@ -45,7 +45,7 @@ duplicate_row_first_cell <- function(row, sep = "|") {
keys <- strsplit(row[1], sep, fixed = TRUE)[[1]]
val_1 <- rep(row[2], length(keys))
val_2 <- rep(row[3], length(keys))
to_ret <- cbind(Gene = keys, LFC = val_1, ranking_value = val_2)
to_ret <- cbind(Gene = keys, log_fold_change = val_1, P_value = val_2)
return(to_ret)
}
......@@ -214,7 +214,7 @@ plot_enrichment <- function(enrich,
type_tag = "enrich",
file_dir = "",
file_prefix = "") {
# We start by defining the text size of the y labels wrt
# to the length of these labels. Default size is 9.
# This was empirically defined and it is used to circumvent a bug that prevents
......@@ -228,7 +228,7 @@ plot_enrichment <- function(enrich,
y_text_elmt <- ggplot2::element_blank()
}
}
# Dotplot of the the top enriched terms.
enrich_dotplot_fn <- paste0(file_dir, file_prefix, type_tag, "_", ont_tag,
"_dotplot.png")
......@@ -240,7 +240,7 @@ plot_enrichment <- function(enrich,
plot.title = element_text(size = 10))
print(p)
dev.off()
# The heatplot between genes and top enriched terms.
enrich_heatplot_fn <- paste0(file_dir, file_prefix, type_tag, "_", ont_tag,
"_heatplot.png")
......@@ -253,7 +253,7 @@ plot_enrichment <- function(enrich,
axis.text.x = element_blank())
print(p)
dev.off()
# Network of top enriched terms (emapplot).
enrich_emapplot_fn <- paste0(file_dir, file_prefix, type_tag, "_", ont_tag,
"_emapplot.png")
......@@ -308,9 +308,14 @@ plot_top_gsea_hits <- function(gsea,
# We do all integration schemes one by one.
for (i in seq_len(length(config$integrations))) {
# We get the integration name for that scheme.
integration <- config$integrations[[i]]
# We only do the enrichment if it is necessary.
if (integration$use_for_enrichment == "FALSE") {
next
}
# We do all limma comparisons one by one.
for (j in seq_len(length(config$limma_analyses))) {
......@@ -323,126 +328,15 @@ for (i in seq_len(length(config$integrations))) {
next
}
# We define the I/Os.
analysis_prefix <- paste0(integration$name, "_", limma_parameters$name)
ofile_prefix <- paste0("CP_", analysis_prefix, "_")
ranking_file <- paste0(output_data_dir, analysis_prefix, "_rankings.tsv")
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
select(Gene, ref_logFC, ranking_value)
rm(ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
# Only if necessary (that is if we have at least one pipe).
ranking_clean <- NULL
if (sum(grepl("\\|", ranking$Gene)) > 0) {
tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
rm(tmp_res)
} else {
ranking_clean <- ranking
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$ref_logFC)
ranking_clean <- ranking_clean %>% select(Gene, LFC, ranking_value)
}
rm(ranking)
# We map the gene symbols to the EntrezGene identifiers.
mapped_egenes <- bitr(ranking_clean$Gene,
fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db) #nolint
ranking_final <- merge(x = ranking_clean,
y = mapped_egenes,
by.x = "Gene",
by.y = "SYMBOL",
all.x = TRUE) %>%
mutate(EGene = ENTREZID) %>% select(Gene, EGene, LFC, ranking_value)
ranking_final$ranking_value <- as.numeric(ranking_final$ranking_value)
ranking_final$LFC <- as.numeric(ranking_final$LFC)
ranking_final <- ranking_final %>% arrange(desc(ranking_value))
rm(mapped_egenes, ranking_clean)
# We prepare the gene id specific rankings (using either gene names or entrezgene ids).
ranking_eg <- ranking_final$ranking_value
ranking_sy <- ranking_final$ranking_value
logFC_eg <- ranking_final$LFC
logFC_sy <- ranking_final$LFC
names(ranking_eg) <- ranking_final$EGene
names(ranking_sy) <- ranking_final$Gene
names(logFC_eg) <- ranking_final$EGene
names(logFC_sy) <- ranking_final$Gene
ranking_eg <- ranking_eg[!duplicated(names(ranking_eg))]
ranking_sy <- ranking_sy[!duplicated(names(ranking_sy))]
logFC_eg <- logFC_eg[!duplicated(names(logFC_eg))]
logFC_sy <- logFC_sy[!duplicated(names(logFC_sy))]
rm(ranking_final)
# We loop over the functional sources.
for (k in seq_len(length(config$functional_sources))) {
# We define the current source.
func_source <- config$functional_sources[[k]]
if (is.null(func_source$cp_name) || func_source$cp_name == "") {
next
}
# We define the reference genes (depending on the source).
ranking <- NULL
logFC <- NULL
if (func_source$cp_gene_id_type == "entrezgene") {
ranking <- ranking_eg
logFC <- logFC_eg
} else if (func_source$cp_gene_id_type == "symbol") {
ranking <- ranking_sy
logFC <- logFC_sy
}
# We run the GSEA itself.
gsea_pi <- perform_gsea(ranking,
type = func_source$cp_name,
file_dir = output_data_dir,
file_prefix = ofile_prefix,
simplify = FALSE)
# We plot all figures for the gsea (PVAL * FC).
plot_enrichment(gsea_pi,
sign(logFC),
ont_tag = func_source$cp_name,
type_tag = "gsea",
file_dir = output_data_dir,
file_prefix = ofile_prefix)
# For GSEA, in addition, we plot the top 10 hits individually.
plot_top_gsea_hits(gsea_pi,
n = 10,
ont_tag = func_source$cp_name,
file_dir = output_data_dir,
file_prefix = ofile_prefix,
file_tag = ofile_prefix)
message(paste0("[", Sys.time(), "][", analysis_prefix, "] CP GSEA enrichment (",
func_source$cp_name, ") done."))
rm(func_source, gsea_pi, logFC, ranking)
} # End for each functional source.
rm(k, ranking_eg, ranking_sy, logFC_eg, logFC_sy, analysis_prefix, ofile_prefix)
# Second pass, if we have a specific file for that limma comparison, we do an extra round
# of analysis (e.g., PDVsControl_male -> PDVsControl_male_specific).
if (limma_parameters$can_be_specific == "TRUE") {
# We re-define the I/Os.
analysis_prefix <- paste0(integration$name, "_", limma_parameters$name, "_specific")
for (tag in limma_parameters$enrichment_file_tags) {
# We define the I/Os.
analysis_prefix <- paste0(integration$name, "_VSN_", limma_parameters$name, "_max-avg_", tag)
ofile_prefix <- paste0("CP_", analysis_prefix, "_")
ranking_file <- paste0(output_data_dir, analysis_prefix, "_rankings.tsv")
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
select(Gene, ref_logFC, ranking_value)
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE)
rm(ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
......@@ -453,17 +347,25 @@ for (i in seq_len(length(config$integrations))) {
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
ranking_clean$log_fold_change <- as.numeric(ranking_clean$log_fold_change)
ranking_clean$P_value <- as.numeric(ranking_clean$P_value)
rm(tmp_res)
} else {
ranking_clean <- ranking
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$ref_logFC)
ranking_clean <- ranking_clean %>% select(Gene, LFC, ranking_value)
ranking_clean$P_value <- as.numeric(ranking_clean$P_value)
ranking_clean$log_fold_change <- as.numeric(ranking_clean$log_fold_change)
}
rm(ranking)
# We then add the pi values, retracting a small epsilon to the P values when they are equal
# to 1, as to avoid ties (which are therefore separated via the associated log fold-change).
ranking_clean <- ranking_clean %>%
mutate(pi_value = -log10(P_value) * abs(log_fold_change))
pseudo_pval_ref <- 0.1 * (9 + max(ranking_clean$P_value[ranking_clean$P_value < 1]))
pseudo_pi_ref <- -log10(pseudo_pval_ref) * abs(ranking_clean$log_fold_change[ranking_clean$pi_value == 0])
ranking_clean$pi_value[ranking_clean$pi_value == 0] <- pseudo_pi_ref
rm(pseudo_pval_ref, pseudo_pi_ref)
# We map the gene symbols to the EntrezGene identifiers.
mapped_egenes <- bitr(ranking_clean$Gene,
fromType = "SYMBOL",
......@@ -474,36 +376,38 @@ for (i in seq_len(length(config$integrations))) {
by.x = "Gene",
by.y = "SYMBOL",
all.x = TRUE) %>%
mutate(EGene = ENTREZID) %>% select(Gene, EGene, LFC, ranking_value)
ranking_final$ranking_value <- as.numeric(ranking_final$ranking_value)
ranking_final$LFC <- as.numeric(ranking_final$LFC)
ranking_final <- ranking_final %>% arrange(desc(ranking_value))
mutate(EGene = ENTREZID) %>% select(Gene, EGene, log_fold_change, P_value, pi_value)
ranking_final$log_fold_change <- as.numeric(ranking_final$log_fold_change)
ranking_final$P_value <- as.numeric(ranking_final$P_value)
ranking_final$pi_value <- as.numeric(ranking_final$pi_value)
ranking_final <- ranking_final %>% arrange(desc(pi_value))
ranking_final_eg <- ranking_final %>% filter(!is.na(EGene))
rm(mapped_egenes, ranking_clean)
# We prepare the gene id specific rankings (using either gene names or entrezgene ids).
ranking_eg <- ranking_final$ranking_value
ranking_sy <- ranking_final$ranking_value
logFC_eg <- ranking_final$LFC
logFC_sy <- ranking_final$LFC
names(ranking_eg) <- ranking_final$EGene
ranking_eg <- ranking_final_eg$pi_value
ranking_sy <- ranking_final$pi_value
logFC_eg <- ranking_final_eg$log_fold_change
logFC_sy <- ranking_final$log_fold_change
names(ranking_eg) <- ranking_final_eg$EGene
names(ranking_sy) <- ranking_final$Gene
names(logFC_eg) <- ranking_final$EGene
names(logFC_eg) <- ranking_final_eg$EGene
names(logFC_sy) <- ranking_final$Gene
ranking_eg <- ranking_eg[!duplicated(names(ranking_eg))]
ranking_sy <- ranking_sy[!duplicated(names(ranking_sy))]
logFC_eg <- logFC_eg[!duplicated(names(logFC_eg))]
logFC_sy <- logFC_sy[!duplicated(names(logFC_sy))]
rm(ranking_final)
rm(ranking_final, ranking_final_eg)
# We loop over the functional sources.
for (k in seq_len(length(config$functional_sources))) {
# We define the current source.
func_source <- config$functional_sources[[k]]
if (is.null(func_source$cp_name) || func_source$cp_name == "") {
next
}
# We define the reference genes (depending on the source).
ranking <- NULL
logFC <- NULL
......@@ -514,14 +418,14 @@ for (i in seq_len(length(config$integrations))) {
ranking <- ranking_sy
logFC <- logFC_sy
}
# We run the GSEA itself.
gsea_pi <- perform_gsea(ranking,
type = func_source$cp_name,
file_dir = output_data_dir,
file_prefix = ofile_prefix,
simplify = FALSE)
# We plot all figures for the gsea (PVAL * FC).
plot_enrichment(gsea_pi,
sign(logFC),
......@@ -529,7 +433,7 @@ for (i in seq_len(length(config$integrations))) {
type_tag = "gsea",
file_dir = output_data_dir,
file_prefix = ofile_prefix)
# For GSEA, in addition, we plot the top 10 hits individually.
plot_top_gsea_hits(gsea_pi,
n = 10,
......@@ -541,10 +445,9 @@ for (i in seq_len(length(config$integrations))) {
func_source$cp_name, ") done."))
rm(func_source, gsea_pi, logFC, ranking)
} # End for each functional source.
rm(k, ranking_eg, ranking_sy, logFC_eg, logFC_sy, ofile_prefix, analysis_prefix)
}
rm(limma_parameters)
rm(k, ranking_eg, ranking_sy, logFC_eg, logFC_sy, analysis_prefix, ofile_prefix)
} # End for each enrichment file tag.
rm(tag, limma_parameters)
} # End for each limma comparison.
rm(j, integration)
} # End for each integration.
......
......@@ -40,7 +40,7 @@ duplicate_row_first_cell <- function(row, sep = "|") {
keys <- strsplit(row[1], sep, fixed = TRUE)[[1]]
val_1 <- rep(row[2], length(keys))
val_2 <- rep(row[3], length(keys))
to_ret <- cbind(Gene = keys, LFC = val_1, ranking_value = val_2)
to_ret <- cbind(Gene = keys, log_fold_change = val_1, P_value = val_2)
return(to_ret)
}
......@@ -61,6 +61,11 @@ for (i in seq_len(length(config$integrations))) {
# We get the integration name for that scheme.
integration <- config$integrations[[i]]
# We only do the enrichment if it is necessary.
if (integration$use_for_enrichment == "FALSE") {
next
}
# We do all limma comparisons one by one.
for (j in seq_len(length(config$limma_analyses))) {
......@@ -72,98 +77,7 @@ for (i in seq_len(length(config$integrations))) {
next
}
# Create the repository (for reusability).
repoRoot <- file.path(db_dir, "gep2pep_data_tmp")
unlink(repoRoot, recursive = TRUE)
rp <- createRepository(repoRoot, db)
# We define the I/Os.
analysis_prefix <- paste0(integration$name, "_", limma_parameters$name)
ofile_prefix <- paste0("G2P_", analysis_prefix, "_")
ranking_file <- paste0(output_data_dir, analysis_prefix, "_rankings.tsv")
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
select(Gene, ref_logFC, ranking_value)
rm(ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
# Only if necessary (that is if we have at least one pipe).
ranking_clean <- NULL
if (sum(grepl("\\|", ranking$Gene)) > 0) {
tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
ranking_clean <- ranking_clean %>% arrange(desc(ranking_value))
rm(tmp_res)
} else {
ranking_clean <- ranking
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$ref_logFC)
ranking_clean <- ranking_clean %>% arrange(desc(ranking_value)) %>%
select(Gene, LFC, ranking_value)
}
rm(ranking)
# We prepare the gene id specific rankings (using gene names).
ranking_final <- ranking_clean %>% mutate(ranking = base::rank(-ranking_value,
na.last = TRUE,
ties.method = "first")) %>%
select(ranking)
rownames(ranking_final) <- ranking_clean$Gene
rm(ranking_clean)
# We build the PEPs (~differentially expressed pathways).
buildPEPs(rp, ranking_final)
rm(ranking_final)
# We then need to extract the results for each collection.
collections <- getCollections(rp)
peps_msigdb_res <- list()
for (k in seq_len(length(collections))) {
# Load the results.
message("[", k, " started]")
collec <- collections[k]
rawES <- loadPVmatrix(rp, collec)
rawPV <- loadPVmatrix(rp, collec)
resES <- data.frame(pid = rownames(rawES), es = rawES[, 1])
resPV <- data.frame(pid = rownames(rawPV), pv = rawPV[, 1])
res <- merge(x = resES, y = resPV, by = "pid") %>% mutate(collection = collec)
# Get id to name mapping, same for description (takes time).
collec_sets <- loadCollection(rp, collec)
res$pname <- setId2setName(collec_sets, res$pid)
res$pdesc <- apply(data.frame(res$pname), 1, function(x) description(collec_sets[[x]]))
peps_msigdb_res[[k]] <- res
rm(collec, rawES, rawPV, resES, resPV, res, collec_sets)
message("[", k, " done]")
} # End for each collection.
rm(k, collections)
# We format the results and save them.
peps_msigdb_res <- data.frame(do.call(rbind, c(peps_msigdb_res)),
row.names = NULL,
stringsAsFactors = FALSE)
peps_msigdb_res_fn <- paste0(output_data_dir, ofile_prefix, "MSIGDB.tsv")
write.table(peps_msigdb_res,
file = peps_msigdb_res_fn,
sep = "\t",
quote = FALSE,
col.names = NA)
rm(peps_msigdb_res, peps_msigdb_res_fn)
# Cleaning
unlink(repoRoot, recursive = TRUE)
rm(rp, repoRoot)
message(paste0("[", Sys.time(), "][", analysis_prefix, "] G2P GSEA enrichment (MsigDB) done."))
rm(ofile_prefix, analysis_prefix)
# Second pass, if we have a specific file for that limma comparison, we do an extra round
# of analysis (e.g., PDVsControl_male -> PDVsControl_male_specific).
if (limma_parameters$can_be_specific == "TRUE") {
for (tag in limma_parameters$enrichment_file_tags) {
# Create the repository (for reusability).
repoRoot <- file.path(db_dir, "gep2pep_data_tmp")
......@@ -171,13 +85,12 @@ for (i in seq_len(length(config$integrations))) {
rp <- createRepository(repoRoot, db)
# We define the I/Os.
analysis_prefix <- paste0(integration$name, "_", limma_parameters$name, "_specific")
analysis_prefix <- paste0(integration$name, "_VSN_", limma_parameters$name, "_max-avg_", tag)
ofile_prefix <- paste0("G2P_", analysis_prefix, "_")
ranking_file <- paste0(output_data_dir, analysis_prefix, "_rankings.tsv")
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
select(Gene, ref_logFC, ranking_value)