Commit 62ce0a0b authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Another attempt to get Carnival to run (bioc package version).

parent ae497b09
......@@ -18,6 +18,7 @@ echo ""
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
ALT_INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
DATA_FOLDER=/home/users/ltranchevent/Data/GeneDER/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
# We load the module (R is used i part II).
......@@ -43,8 +44,7 @@ cut -f 1,2,4 ${OUTPUT_FOLDER}Omnipath_network.tsv | awk -v FS=$'\t' -v OFS=$'\t'
#echo "*************************************************************************"
# Part II - we now take care of the PROGENy linear model (useful to create the CARNIVAL weight matrix).
# We obtain the latest file from the CARNIVAL R package.
Rscript --vanilla ${CODE_FOLDER}extract_progeny_model.R > ${OUTPUT_FOLDER}extract_progeny_model_log.out 2> ${OUTPUT_FOLDER}extract_progeny_model_log.err
cp -rf ${DATA_FOLDER}/Original/Else/Progeny/Progeny_model.csv ${OUTPUT_FOLDER}Progeny_model.csv
# We process the file further to extract the full list of gene ids used by PROGENy.
cut -f 1 -d ',' ${OUTPUT_FOLDER}Progeny_model.csv | sed -r "s/\"//g" | grep -v "^$" | sort -u > ${OUTPUT_FOLDER}Progeny_geneids.tsv
......
......@@ -8,6 +8,7 @@ library("stringr")
library("tidyverse")
library("viper")
library("CARNIVAL")
library("progeny")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
......@@ -40,7 +41,8 @@ duplicate_row_first_cell <- function(row, sep = "|") {
keys <- strsplit(row[1], sep, fixed = TRUE)[[1]]
val_1 <- rep(row[2], length(keys))
val_2 <- rep(row[3], length(keys))
to_ret <- cbind(Gene = keys, LFC = val_1, ranking_value = val_2)
val_3 <- rep(row[4], length(keys))
to_ret <- cbind(Gene = keys, LFC = val_1, P_value = val_2, adj_P_value = val_3)
return(to_ret)
}
......@@ -101,51 +103,88 @@ map_genenames_uniprots <- read.delim(map_genenames_uniprots_fn,
stringsAsFactors = FALSE,
header = TRUE)
# We loop over the different files to analyse.
for (i in seq_len(length(config$ranking_files))) {
# We only do the first three cases because then we do not have P values so
# we can not really create the "mySignature" object.
if (i > 4) {
next
}
# We define the I/Os.
ranking_file <- paste0(input_data_dir, config$ranking_files[[i]])
file_prefix <- strsplit(config$ranking_files[[i]], split = "rankings.tsv")[[1]]
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
select(Gene, ref_logFC, ranking_value)
rm(ranking_file)
# We do all integration schemes one by one.
for (i in seq_len(length(config$integrations))) {
# We clean genes (no duplicate entries with pipe separated gene symbols).
tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
ranking_clean <- ranking_clean %>% arrange(desc(ranking_value))
rm(tmp_res, ranking)
# We get the integration name for that scheme.
integration <- config$integrations[[i]]
# Estimate Z-score values for the GES (As in DoRothEA manual, check VIPER manual for details).
# In our case, we compute the pseudo P values based on the pi values with 10 ^ -pi.
# In addition, we correct through the median logFC (since pi = -log(p) * logFC).
myStatistics <- matrix(ranking_clean$LFC,
dimnames = list(ranking_clean$Gene, "logFC") )
mypPvalue <- matrix(10 ^ (-ranking_clean$ranking_value * median(abs(ranking_clean$LFC))),
dimnames = list(ranking_clean$Gene, "pP.Value") )
mySignature <- (qnorm(mypPvalue / 2, lower.tail = FALSE) * sign(myStatistics))[, 1]
mySignature <- mySignature[order(mySignature, decreasing = T)]
rm(ranking_clean, mypPvalue, myStatistics)
# We only do the enrichment if it is necessary.
if (integration$use_for_network == "FALSE") {
rm(integration)
next
}
# Estimate TF activities via DoRothEA.
TF_activities <- runDoRothEA(data.frame(EGF = mySignature, row.names = names(mySignature)),
regulon = viper_regulon,
confidence_level = c('A', 'B', 'C', 'D'))
TF_activities$SYMBOL <- row.names(TF_activities)
# We do all limma comparisons one by one.
for (j in seq_len(length(config$limma_analyses))) {
# We extract the Limma parameters.
limma_parameters <- config$limma_analyses[[j]]
# We only do the enrichment if it is necessary.
if (limma_parameters$use_for_network == "FALSE") {
rm(limma_parameters)
next
}
# We define the I/Os.
file_prefix <- paste0(integration$name, "_VSN_", limma_parameters$name,
"_max-avg_all_pivalue_")
ranking_file <- paste0(input_data_dir, file_prefix, "rankings.tsv")
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE) %>%
select(Gene, log_fold_change, P_value, adj_P_value)
rm(ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
ranking_clean$P_value <- as.numeric(ranking_clean$P_value)
ranking_clean$adj_P_value <- as.numeric(ranking_clean$adj_P_value)
ranking_clean <- ranking_clean %>% arrange(P_value)
rm(tmp_res, ranking)
# Estimate Z-score values for the GES (as in DoRothEA manual, check VIPER manual for details).
# In our case, we compute the pseudo P values based on the pi values with 10 ^ -pi.
# In addition, we correct through the median logFC (since pi = -log(p) * logFC).
myStatistics <- matrix(ranking_clean$LFC,
dimnames = list(ranking_clean$Gene, "logFC"))
myPvalue <- matrix(ranking_clean$adj_P_value,
dimnames = list(ranking_clean$Gene, "P.Value"))
mySignature <- (qnorm(myPvalue / 2, lower.tail = FALSE) * sign(myStatistics))[, 1]
mySignature <- mySignature[order(mySignature, decreasing = T)]
rm(ranking_clean, myPvalue, myStatistics)
# Estimate TF activities via DoRothEA.
TF_activities <- runDoRothEA(data.frame(EGF = mySignature, row.names = names(mySignature)),
regulon = viper_regulon,
confidence_level = c('A', 'B', 'C', 'D'))
TF_activities$SYMBOL <- row.names(TF_activities)
# Estimate TF activities via Viper.
mrs <- msviper(ges = mySignature, regulon = viper_regulon, minsize = 4, ges.filter = F)
# Format the results.
tf_name <- sapply(strsplit(names(mrs$es$nes), split = "_"), head, 1)
evidence_level <- sapply(strsplit(names(mrs$es$nes), split = "_"), tail, 1)
TF_activities <- data.frame(Regulon = tf_name,
evdc_lvl = evidence_level,
Size = mrs$es$size[ names(mrs$es$nes) ],
NES = mrs$es$nes,
p.value = mrs$es$p.value)
TF_activities <- TF_activities[ order(TF_activities$p.value), ] %>%
filter(evdc_lvl %in% c("A", "B", "C", "D")) %>%
mutate(FDR = p.adjust(p.value, method = "fdr")) %>%
rename(SYMBOL = Regulon)
row.names(TF_activities) <- TF_activities$SYMBOL
##rm(mrs, mySignature, tf_name, evidence_level)
# We then convert the gene names to Uniprot Ids since CARNIVAL requires Uniprot ids.
TF_activities_uniprotids <- merge(x = TF_activities,
y = map_genenames_uniprots,
......@@ -186,6 +225,7 @@ for (i in seq_len(length(config$ranking_files))) {
# We then move the results to the data folder.
system(paste0("mv ", result_dir, " ", output_data_dir))
}
} # End for each file to analyse.
rm(i, viper_regulon)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment