Commit 3e4e2db5 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Refined the gender specific pathway figures.

parent 84aa8519
......@@ -303,14 +303,6 @@ i <- 3
integration <- config$integrations[[i]]
int_criteria <- str_split(integration$criteria, ",")[[1]]
# We rely on the last limma analyses (gender_disease_status) but this is only to get the
# 4 categories of interest.
##k <- 5
##limma_analysis <- config$limma_analyses[[k]]
# We use the first gene-probe selection method (max-avg).
l <- 1
selection <- config$selections[[l]]
......@@ -446,6 +438,20 @@ for (gene in top_genes) {
message(paste0("[",gene, "][", dataset, "] plotted."))
}
}
# Manual figures via rstudio (for manuscript).
# dataset <- "GSE49036"
# gene <- "EFNA1"
# plot_gene(gene = gene,
# gene_datasets = c(dataset),
# output_data_dir = output_data_dir,
# integration_name = integration$name,
# limma_analysis = config$limma_analyses[[5]],
# selection_name = selection$name,
# palette = gds_plt,
# clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config,
# plot_all = FALSE, plot_datasets = TRUE)
rm(gene, top_genes, common_datasets)
rm(selection, exp_data_all, zexp_data_all, clin_data_all, m_indx, m_indx_map)
rm(l, gds_plt, vsn)
......
......@@ -29,7 +29,7 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# create a single GD ranking based on both by summing up the male and female signals.
# We focus on one configuration, our main configuration (SNage, VSN, max-avg) but we also
# run the other integration schemes (SN and DA) with the same default parameters.
# run the other integration schemes (SN / DA / iPSC-DA) with the same default parameters.
for (i in seq_len(length(config$integrations))) {
# We get the integration name for that scheme.
......
......@@ -25,34 +25,15 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# Functions
# ================================================================================================
#' @title Duplicate rows by splitting on the first cell.
#'
#' @description This function takes one row and split its first cell.
#' It then creates the necessary extra rows while copying the second cell.
#' Ex: "a|b", "whatever" --> "a", "whatever"
#' "b", "whatever"
#'
#' @param row A vector of two values consisting of a set of values (possibly only one) and
#' some other value on the second cell (Not to be splitted).
#' @param sep The string separator for the split.
#' @return A matrix of rows (possibly only one row).
duplicate_row_first_cell <- function(row, sep = "|") {
keys <- strsplit(row[1], sep, fixed = TRUE)[[1]]
val_1 <- rep(row[2], length(keys))
val_2 <- rep(row[3], length(keys))
to_ret <- cbind(Gene = keys, logFC = val_1, adj_P_value = val_2)
return(to_ret)
}
# ================================================================================================
# Main
# ================================================================================================
# We define the I/Os.
F_ranking_file <- paste0(output_data_dir,
"SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings.tsv")
"SNage_PDVsControl_females_max-avg_all_pivalue_rankings.tsv")
M_ranking_file <- paste0(output_data_dir,
"SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings.tsv")
"SNage_PDVsControl_males_max-avg_all_pivalue_rankings.tsv")
# We read the prepared ranking and keep only the value used to rank.
F_ranking <- read.delim(F_ranking_file, stringsAsFactors = FALSE) %>%
......@@ -61,49 +42,38 @@ M_ranking <- read.delim(M_ranking_file, stringsAsFactors = FALSE) %>%
select(Gene, log_fold_change, adj_P_value)
rm(F_ranking_file, M_ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
F_tmp_res <- t(apply(F_ranking, 1, duplicate_row_first_cell))
M_tmp_res <- t(apply(M_ranking, 1, duplicate_row_first_cell))
F_ranking_clean <- data.frame(do.call(rbind, c(F_tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
M_ranking_clean <- data.frame(do.call(rbind, c(M_tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
rm(F_tmp_res, F_ranking, M_tmp_res, M_ranking)
# We map the gene symbols to the EntrezGene identifiers.
F_mapped_egenes <- bitr(F_ranking_clean$Gene,
F_mapped_egenes <- bitr(F_ranking$Gene,
fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db) #nolint
F_ranking_final <- merge(x = F_ranking_clean,
F_ranking_final <- merge(x = F_ranking,
y = F_mapped_egenes,
by.x = "Gene",
by.y = "SYMBOL",
all.x = TRUE) %>%
mutate(EGene = ENTREZID) %>% select(Gene, EGene, logFC, adj_P_value)
M_mapped_egenes <- bitr(M_ranking_clean$Gene,
mutate(EGene = ENTREZID) %>% select(Gene, EGene, log_fold_change, adj_P_value)
M_mapped_egenes <- bitr(M_ranking$Gene,
fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db) #nolint
M_ranking_final <- merge(x = M_ranking_clean,
M_ranking_final <- merge(x = M_ranking,
y = M_mapped_egenes,
by.x = "Gene",
by.y = "SYMBOL",
all.x = TRUE) %>%
mutate(EGene = ENTREZID) %>% select(Gene, EGene, logFC, adj_P_value)
F_ranking_final$logFC <- as.numeric(F_ranking_final$logFC)
mutate(EGene = ENTREZID) %>% select(Gene, EGene, log_fold_change, adj_P_value)
F_ranking_final$logFC <- as.numeric(F_ranking_final$log_fold_change)
F_ranking_final$adj_P_value <- as.numeric(F_ranking_final$adj_P_value)
M_ranking_final$logFC <- as.numeric(M_ranking_final$logFC)
M_ranking_final$logFC <- as.numeric(M_ranking_final$log_fold_change)
M_ranking_final$adj_P_value <- as.numeric(M_ranking_final$adj_P_value)
rm(F_mapped_egenes, F_ranking_clean, M_mapped_egenes, M_ranking_clean)
rm(F_mapped_egenes, M_mapped_egenes)
# We also save the data for GeneGo (manual work on the pathway maps).
FM <- merge(x = F_ranking_final, y = M_ranking_final,
by = "Gene", all = TRUE, suffixes = c("_F", "_M")) %>%
select(Gene, logFC_F, adj_P_value_F, logFC_M, adj_P_value_M,)
FM_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_both_max-avg_all_merged_rankings_4GeneGo.tsv")
FM_fn <- paste0(output_data_dir, "SNage_PDVsControl_both_max-avg_all_merged_rankings_4GeneGo.tsv")
FM$logFC_F [is.na(FM$logFC_F)] <- 0
FM$adj_P_value_F[is.na(FM$adj_P_value_F)] <- 1
FM$logFC_M [is.na(FM$logFC_M)] <- 0
......@@ -113,27 +83,27 @@ rm(FM, FM_fn)
# We prepare the gene id specific rankings (using entrezgene ids for KEGG).
# max P = 0.05
F_logFC_eg <- F_ranking_final$logFC[F_ranking_final$adj_P_value < 0.05]
names(F_logFC_eg) <- F_ranking_final$EGene[F_ranking_final$adj_P_value < 0.05]
F_logFC_eg <- F_logFC_eg[!duplicated(names(F_logFC_eg))]
F_logFC_eg <- F_logFC_eg[!is.na(names(F_logFC_eg))]
M_logFC_eg <- M_ranking_final$logFC[M_ranking_final$adj_P_value < 0.05]
names(M_logFC_eg) <- M_ranking_final$EGene[M_ranking_final$adj_P_value < 0.05]
M_logFC_eg <- M_logFC_eg[!duplicated(names(M_logFC_eg))]
M_logFC_eg <- M_logFC_eg[!is.na(names(M_logFC_eg))]
rm(F_ranking_final, M_ranking_final)
# Alternatively, max P = 1
# F_logFC_eg <- F_ranking_final$logFC
# names(F_logFC_eg) <- F_ranking_final$EGene
# F_logFC_eg <- F_ranking_final$logFC[F_ranking_final$adj_P_value < 0.05]
# names(F_logFC_eg) <- F_ranking_final$EGene[F_ranking_final$adj_P_value < 0.05]
# F_logFC_eg <- F_logFC_eg[!duplicated(names(F_logFC_eg))]
# F_logFC_eg <- F_logFC_eg[!is.na(names(F_logFC_eg))]
# M_logFC_eg <- M_ranking_final$logFC
# names(M_logFC_eg) <- M_ranking_final$EGene
# M_logFC_eg <- M_ranking_final$logFC[M_ranking_final$adj_P_value < 0.05]
# names(M_logFC_eg) <- M_ranking_final$EGene[M_ranking_final$adj_P_value < 0.05]
# M_logFC_eg <- M_logFC_eg[!duplicated(names(M_logFC_eg))]
# M_logFC_eg <- M_logFC_eg[!is.na(names(M_logFC_eg))]
# rm(F_ranking_final, M_ranking_final)
# Alternatively, max P = 1
F_logFC_eg <- F_ranking_final$logFC
names(F_logFC_eg) <- F_ranking_final$EGene
F_logFC_eg <- F_logFC_eg[!duplicated(names(F_logFC_eg))]
F_logFC_eg <- F_logFC_eg[!is.na(names(F_logFC_eg))]
M_logFC_eg <- M_ranking_final$logFC
names(M_logFC_eg) <- M_ranking_final$EGene
M_logFC_eg <- M_logFC_eg[!duplicated(names(M_logFC_eg))]
M_logFC_eg <- M_logFC_eg[!is.na(names(M_logFC_eg))]
rm(F_ranking_final, M_ranking_final)
# Reformat the data.
FM <- merge(x = data.frame(EGene = names(F_logFC_eg), F_log = F_logFC_eg),
y = data.frame(EGene = names(M_logFC_eg), M_log = M_logFC_eg),
......@@ -145,9 +115,10 @@ FM$M_log[is.na(FM$M_log)] <- 0
rm(F_logFC_eg, M_logFC_eg)
# Selection of KEGG pathways to be plotted.
my_ids <- c("hsa04062", "hsa04668", "hsa04721", "hsa04145", "hsa04064", "hsa04657", "hsa04010",
"hsa04151", "hsa04211", "hsa05012", "hsa00190", "hsa00061", "hsa04630", "hsa04210",
"hsa04115", "hsa04620", "hsa04612", "hsa04728")
my_ids <- c("hsa00020","hsa00061","hsa00190","hsa01200","hsa04012","hsa04060","hsa04061",
"hsa04062","hsa04657","hsa04668","hsa04714","hsa04721","hsa04966","hsa05012",
"hsa05017","hsa05020","hsa05110","hsa05120","hsa05130","hsa05132","hsa04728",
"hsa04064")
for (my_id in my_ids) {
pv.out <- pathview(gene.data = FM, pathway.id = my_id, species = "hsa",
......
......@@ -79,8 +79,8 @@ for (i in seq_len(length(config$integrations))) {
# We create a structure for all results.
gsp_data <- c()
male_prefix <- paste0(integration$name, "_VSN_PDVsControl_males_")
female_prefix <- paste0(integration$name, "_VSN_PDVsControl_females_")
male_prefix <- paste0(integration$name, "_PDVsControl_males_")
female_prefix <- paste0(integration$name, "_PDVsControl_females_")
#
# KEGG by CP
......@@ -160,30 +160,6 @@ for (i in seq_len(length(config$integrations))) {
mutate(tool = "CP", source = "GOMF"))
rm(M_filename, F_filename, M_data, F_data)
#
# MSIGDB by G2P
#
M_filename <- paste0(output_data_dir, "G2P_", male_prefix, "max-avg_gs_pivalue_MSIGDB.tsv")
F_filename <- paste0(output_data_dir, "G2P_", female_prefix, "max-avg_gs_pivalue_MSIGDB.tsv")
M_data <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE)
M_data$adj_pv = M_data$pv * dim(M_data)[1]
M_data$adj_pv[M_data$adj_pv > 1] <- 1
M_data <- M_data %>%
select(pid, pname, adj_pv) %>% filter(!is.na(adj_pv)) %>%
rename(ID = pid, Description = pname, p.adjust = adj_pv)
minp_M <- min(M_data$p.adjust[M_data$p.adjust > 0])
M_data$p.adjust[M_data$p.adjust == 0] <- minp_M / 100
F_data <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE)
F_data$adj_pv = F_data$pv * dim(F_data)[1]
F_data$adj_pv[F_data$adj_pv > 1] <- 1
F_data <- F_data %>%
select(pid, pname, adj_pv) %>% filter(!is.na(adj_pv)) %>%
rename(ID = pid, Description = pname, p.adjust = adj_pv)
minp_F <- min(F_data$p.adjust[F_data$p.adjust > 0])
F_data$p.adjust[F_data$p.adjust == 0] <- minp_F / 100
gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
mutate(tool = "G2P", source = "MSIGDB"))
rm(M_filename, F_filename, M_data, F_data, minp_M, minp_F)
# We do not run it yet for the ROT and PF tools since there is no obvious p-value to use (there are several).
# We can later adapt this script for these two tools as well.
......
......@@ -87,8 +87,8 @@ for (i in seq_len(length(config$integrations))) {
# We create a structure for all results.
gsp_data <- c()
male_prefix <- paste0(integration$name, "_VSN_PDVsControl_males_")
female_prefix <- paste0(integration$name, "_VSN_PDVsControl_females_")
male_prefix <- paste0(integration$name, "_PDVsControl_males_")
female_prefix <- paste0(integration$name, "_PDVsControl_females_")
#
# KEGG by CP
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment