Commit c9b5b26b authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Improved DoRothEA enrichment procedure (enriched outputs and figure generation).

parent 62ce0a0b
......@@ -44,7 +44,8 @@ duplicate_row_first_cell <- function(row, sep = "|") {
keys <- strsplit(row[1], sep, fixed = TRUE)[[1]]
val_1 <- rep(row[2], length(keys))
val_2 <- rep(row[3], length(keys))
to_ret <- cbind(Gene = keys, LFC = val_1, ranking_value = val_2)
val_3 <- rep(row[4], length(keys))
to_ret <- cbind(Gene = keys, LFC = val_1, P_value = val_2, adj_P_value = val_3)
return(to_ret)
}
......@@ -75,6 +76,36 @@ for (ii in seq_len(dim(update_info)[1])) {
names(viper_regulon) <- paste(vp_names, vp_fullnames[2, ], sep = "_")
rm(update_info, vp_fullnames, vp_names, ii)
# We compute the number of percentages of activations / inhibitions
# from the pre-defined regulons as to better understand the actual meaning
# of the enrichment scores (NES).
# An inhibitor whose targets are depleted is associated with a positive score.
# An activator whose targets are depleted is associated with a negative score.
# An inhibitor whose targets are enriched is associated with a negaive score.
# An activator whose targets are enriched is associated with a positive score.
all_npos <- array(dim = length(names(viper_regulon)))
all_nneg <- array(dim = length(names(viper_regulon)))
all_ppos <- array(dim = length(names(viper_regulon)))
all_pneg <- array(dim = length(names(viper_regulon)))
i <- 1
for (gene in names(viper_regulon)) {
data <- viper_regulon[[gene]]$tfmode
n <- length(data)
all_npos[i] <- length(data[data == 1])
all_nneg[i] <- length(data[data == -1])
all_ppos[i] <- 100 * all_npos[i] / n
all_pneg[i] <- 100 * all_nneg[i] / n
i <- i + 1
rm(data, n)
}
rm(gene, i)
viper_regulon_infos <- data.frame(nb_pos = all_npos,
nb_neg = all_nneg,
pc_pos = all_ppos,
pc_neg = all_pneg)
rownames(viper_regulon_infos) <- names(viper_regulon)
rm(all_npos, all_nneg, all_ppos, all_pneg)
# We do all integration schemes one by one.
for (i in seq_len(length(config$integrations))) {
......@@ -83,6 +114,7 @@ for (i in seq_len(length(config$integrations))) {
# We only do the enrichment if it is necessary.
if (integration$use_for_network == "FALSE") {
rm(integration)
next
}
......@@ -94,16 +126,17 @@ for (i in seq_len(length(config$integrations))) {
# We only do the enrichment if it is necessary.
if (limma_parameters$use_for_network == "FALSE") {
rm(limma_parameters)
next
}
# We define the I/Os.
analysis_prefix <- paste0(integration$name, "_", limma_parameters$name)
ranking_file <- paste0(input_data_dir, analysis_prefix, "_rankings.tsv")
analysis_prefix <- paste0(integration$name, "_VSN_", limma_parameters$name, "_max-avg")
ranking_file <- paste0(input_data_dir, analysis_prefix, "_all_pivalue_rankings.tsv")
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
select(Gene, ref_logFC, ranking_value)
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE) %>%
select(Gene, log_fold_change, P_value, adj_P_value)
rm(ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
......@@ -112,28 +145,28 @@ for (i in seq_len(length(config$integrations))) {
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
ranking_clean <- ranking_clean %>% arrange(desc(ranking_value))
ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
ranking_clean$P_value <- as.numeric(ranking_clean$P_value)
ranking_clean$adj_P_value <- as.numeric(ranking_clean$adj_P_value)
ranking_clean <- ranking_clean %>% arrange(adj_P_value)
rm(tmp_res)
} else {
ranking_clean <- ranking
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$ref_logFC)
ranking_clean <- ranking_clean %>% select(Gene, LFC, ranking_value) %>% arrange(desc(ranking_value))
ranking_clean$LFC <- as.numeric(ranking_clean$ref_logFC)
ranking_clean$P_value <- as.numeric(ranking_clean$P_value)
ranking_clean$adj_P_value <- as.numeric(ranking_clean$adj_P_value)
ranking_clean <- ranking_clean %>% arrange(adj_P_value)
}
rm(ranking)
# Estimate Z-score values for the GES (As in DoRothEA manual, cheeck VIPER manual for details).
# In our case, we compute the pseudo P values based on the pi values with 10 ^ -pi.
# In addition, we correct through the median logFC (since pi = -log(p) * logFC).
# Estimate Z-score values for the GES (As in DoRothEA manual, check VIPER manual for details).
myStatistics <- matrix(ranking_clean$LFC,
dimnames = list(ranking_clean$Gene, "logFC") )
mypPvalue <- matrix(10 ^ (-ranking_clean$ranking_value * median(abs(ranking_clean$LFC))),
dimnames = list(ranking_clean$Gene, "pP.Value") )
mySignature <- (qnorm(mypPvalue / 2, lower.tail = FALSE) * sign(myStatistics))[, 1]
dimnames = list(ranking_clean$Gene, "logFC"))
myPvalue <- matrix(ranking_clean$adj_P_value,
dimnames = list(ranking_clean$Gene, "P.Value"))
mySignature <- (qnorm(myPvalue / 2, lower.tail = FALSE) * sign(myStatistics))[, 1]
mySignature <- mySignature[order(mySignature, decreasing = T)]
rm(ranking_clean, mypPvalue, myStatistics)
rm(ranking_clean, myPvalue, myStatistics)
# Estimate TF activities via Viper.
mrs <- msviper(ges = mySignature, regulon = viper_regulon, minsize = 4, ges.filter = F)
......@@ -146,20 +179,34 @@ for (i in seq_len(length(config$integrations))) {
Size = mrs$es$size[ names(mrs$es$nes) ],
NES = mrs$es$nes,
p.value = mrs$es$p.value)
TF_activities <- merge(viper_regulon_infos, TF_activities, by = 0, all.y = TRUE)
TF_activities <- TF_activities[ order(TF_activities$p.value), ] %>%
filter(evdc_lvl %in% c("A", "B", "C", "D")) %>%
mutate(FDR = p.adjust(p.value, method = "fdr"))
rm(mrs, mySignature, tf_name, evidence_level)
rownames(TF_activities) <- TF_activities$Row.names
TF_activities$Row.names <- NULL
rm(mrs, tf_name, evidence_level)
# Save results
TF_act_fn <- paste0(output_data_dir, analysis_prefix, "_TFactivities.tsv")
write.table(TF_activities, file = TF_act_fn, sep = "\t", quote = FALSE, row.names = FALSE)
rm(TF_act_fn, TF_activities, analysis_prefix, limma_parameters)
# We create pseudo-GSEA images to check the enrichment visually as well.
for (gene in names(viper_regulon)) {
targets <- names(viper_regulon[[gene]]$tfmode)
targets_idx <- match(targets, names(mySignature))
png(paste0(output_data_dir, analysis_prefix, "_", gene, ".png"))
plot(mySignature, main = paste(gene, "=", round(TF_activities[gene, ]$NES, 2)))
abline(v = targets_idx, col = "red")
points(targets_idx, mySignature[targets], col = "red", bg = "red", pch = 25)
dev.off()
rm(targets, targets_idx)
}
rm(gene, TF_act_fn, TF_activities, analysis_prefix, limma_parameters, mySignature)
} # End for each limma comparison.
rm(j, integration)
} # End for each integration.
rm(i, viper_regulon)
rm(i, viper_regulon, viper_regulon_infos)
# We log the session details for reproducibility.
rm(config, output_data_dir, input_data_dir)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment