Commit 0fab72eb authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Re-adjusting the GDS P values for the gender-dimorphic genes.

parent 69df6c97
......@@ -124,122 +124,22 @@ rm(x, res_females, res_males)
FM_ref <- FM %>% select(SYMBOL, overlap_percentage)
# We save the complete refining data as to keep track of things.
refining_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
selection$name, "_FvsM_refining_data.tsv")
refining_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_ALL_",
selection$name, "_datasetoverlap_fulldata.tsv")
write.table(FM, refining_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(FM, refining_fn)
# D- We then enrich both raw and non raw results for females. The previous computation was based
# on the raw results (complete) but we also enrich the filtered results (because these are
# the results we want ot further investigate, at least now). We simply add one column that
# corresponds to the overlap presented as a percentage.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration.tsv")
res_females <- read.delim(res_females_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
res_females <- merge(x = res_females, y = FM_ref, by = "SYMBOL", ALL.x = TRUE)
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_refined.tsv")
write.table(res_females, res_females_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(res_females_fn)
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")
res_females <- read.delim(res_females_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
res_females <- merge(x = res_females, y = FM_ref, by = "SYMBOL", ALL.x = TRUE)
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw_refined.tsv")
write.table(res_females, res_females_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(k, limma_analysis, res_females_fn)
# E- We enrich both raw and non raw results for males. The previous computation was based
# on the raw results (complete) but we also enrich the filtered results (because these are
# the results we want ot further investigate, at least now). We simply add one column that
# corresponds to the overlap presented as a percentage.
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration.tsv")
res_males <- read.delim(res_males_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
res_males <- merge(x = res_males, y = FM_ref, by = "SYMBOL", ALL.x = TRUE)
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_refined.tsv")
write.table(res_males, res_males_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(res_males_fn)
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")
res_males <- read.delim(res_males_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
res_males <- merge(x = res_males, y = FM_ref, by = "SYMBOL", ALL.x = TRUE)
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw_refined.tsv")
write.table(res_males, res_males_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(k, limma_analysis, res_males_fn)
# F- We enrich both raw and non raw results for GDS. The previous computation was based
# on the raw results (complete) but we also enrich the filtered results (because these are
# the results we want ot further investigate, at least now). We simply add one column that
# corresponds to the overlap presented as a percentage.
k <- 7
limma_analysis <- config$limma_analyses[[k]]
res_gds_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration.tsv")
res_gds <- read.delim(res_gds_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
res_gds <- merge(x = res_gds, y = FM_ref, by = "SYMBOL", ALL.x = TRUE)
res_gds_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_refined.tsv")
write.table(res_gds, res_gds_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(res_gds_fn)
res_gds_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")
res_gds <- read.delim(res_gds_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
res_gds <- merge(x = res_gds, y = FM_ref, by = "SYMBOL", ALL.x = TRUE)
res_gds_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw_refined.tsv")
write.table(res_gds, res_gds_fn,
# We save the simplified data (to be used directly for merging).
refining_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_ALL_",
selection$name, "_datasetoverlap.tsv")
write.table(FM_ref, refining_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(k, limma_analysis, res_gds_fn)
rm(FM_ref, res_females, res_males, res_gds)
rm(FM_ref, refining_fn)
# We clean the workspace and log the session details for reproducibility.
rm(l, selection, vsn, j, integration, int_criteria, i)
......
......@@ -23,36 +23,22 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# Functions
# ================================================================================================
#' @title Function to compute the delta based on the Pi value based rankings.
#' @title Function to compute the gender-specificity score based on the Pi value rankings.
#'
#' @description This function takes a dataframe with the log fold changes and P values of two
#' comparisons termed "ref" and "ctrl". The idea is to rank the genes according to their
#' comparisons termed "ref" and "ctrl". The idea is to compute a score representing their
#' specificity for "ref" using "ctrl" to correct the non specific DEGs.
#' The specificity is based on the Pi values so the actual regulation (up/down) does not
#' matter and two genes with strong but opposite signals will not be considered specific.
#' @param FM The data-frame that contains the experimental data. The expected columns are
#' ref_logFC, ref_Pval, ref_adj_Pval, ctrl_logFC, ctrl_Pval, and ctrl_adj_Pval.
#' @param has_ctrl A boolean indicating whether a "ctrl" comaprison is present. If not then
#' @param has_ctrl A boolean indicating whether a "ctrl" comparison is present. If not then
#' only the Pi value computation and the associated rankings are performed for "ref", and
#' no delta values are returned.
#' @return An enriched data-frame that contains additional columns, in particular one that
#' contains the delta values, representing the specificity to the "ref" comparison (unless
#' has_ctrl was set to FALSE).
compute_delta <- function(FM, has_ctrl = TRUE) {
# Before we start with the real computation, we need to filter out the NA values in the control
# part of the data and keep them for later re-merging.
if (has_ctrl) {
FM_nas <- FM %>% filter(is.na(ctrl_Pval)) %>%
mutate(ref_pivalue = NA) %>%
mutate(ctrl_pivalue = NA) %>%
mutate(ref_pivalue_rank = NA) %>%
mutate(ref_pivalue_rankratio = NA) %>%
mutate(ctrl_adj_pivalue = NA) %>%
mutate(ctrl_pivalue_rank = NA) %>%
mutate(ctrl_pivalue_rankratio = NA) %>%
mutate(Delta = NA) %>%
mutate(ranking_value = 2)
FM <- FM %>% filter(!is.na(ctrl_Pval))
}
#' contains the gender-specificity scores, representing the specificity to the "ref" comparison
#' (unless has_ctrl is set to FALSE).
compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
# First, we compute the Pi values for both comparisons ("ref" and "control"). For this, we use
# the nominal P values and the log fold changes. Using the corrected P values would be more
......@@ -115,7 +101,7 @@ compute_delta <- function(FM, has_ctrl = TRUE) {
# The rank ratios of the genes based on the Pi value ranks (for the "ref"
# comparison). Values are [0, 1]. Small ratios correspond to high Pi values
# (and therefore significant DEGs).
FM$ref_pivalue_rank <- rank(-FM$ref_pivalue, ties.method = "min")
FM$ref_pivalue_rank <- rank(-FM$ref_pivalue, ties.method = "average")
max_rank <- max(FM$ref_pivalue_rank)
FM$ref_pivalue_rankratio <- FM$ref_pivalue_rank / max_rank
rm(max_rank)
......@@ -126,73 +112,19 @@ compute_delta <- function(FM, has_ctrl = TRUE) {
return(FM)
}
# Creating the ranks and rank ratios for "control", based on the Pi values.
# Things are similar than for "ref" but because these rank ratios will only be used to
# correct the "ref" rank ratios, they don't have the same role and we need to take into
# account special cases such as when we correct a gene that is up-expressed in "ref" but
# down-expressed in "control" (or the other way round). In these cases, we can not take the
# "control" Pi values as such since they do not represent the same thing than the "ref"
# Pi values.
#
# Examples:
# Gene A:
# ref: logFC = +2, P value = 1e-5, Pi value = 10
# ctrl: logFC = +1, P value = 1e-3, Pi value = 3
# => It makes sense to correct the "ref" results by the "ctrl" results since A is up-expressed
# in both comparisons.
# Gene B:
# ref: logFC = +2, P value = 1e-5, Pi value = 10
# ctrl: logFC = -3, P value = 1e-7, Pi value = 21
# => It does not make sense to correct the "ref" results by the "ctrl" results since A is
# up-expressed in "ref" and down-expressed in "ctrl".
#
# This is why when the fold changes in both comparisons are of different signs, we modify
# the ranks and rank ratios as if it would be a gene that is not differentially expressed in
# "control".
#
# Example:
# Gene B:
# ref: logFC = +2, P value = 1e-5, Pi value = 10
# ctrl: logFC = ~0, P value = ~1, Pi value = ~0 [pseudo-values only for computation]
# We get the minimum "ctrl" Pi value that is linked to a gene with concordant signal in both
# comparisons and whose P value is not 1 (in "ctrl").
min_pi <- min(FM %>% filter(sign(FM$ref_logFC) == sign(FM$ctrl_logFC)) %>%
filter(ctrl_Pval < 1) %>% select(ctrl_pivalue))
# We will add one field:
# ctrl_adj_pivalue:
# The adjusted Pi values for the "control" comparison. Values are ~[0, 100].
# Small values correspond to non differentially expressed genes, high values
# correspond to differentially expressed genes. The difference with the non
# adjusted Pi values is that discordant cases (e.g., up in "ctrl" and down in
# "ref") and non differentially expressed genes (P value == 1) all get the
# same minimum Pi value as to not discriminate between these bottom line cases.
# We keep the original Pi values as is, but we create the adjusted Pi values.
FM$ctrl_adj_pivalue <- FM$ctrl_pivalue
# We then update the special cases mentioned above.
FM$ctrl_adj_pivalue[sign(FM$ref_logFC) != sign(FM$ctrl_logFC)] <- min_pi
FM$ctrl_adj_pivalue[FM$ctrl_Pval == 1] <- min_pi
rm(min_pi)
# Now creating the rankings per say.
# Now creating the ranks and rank ratios based on the Pi values for the "ctrl" comparison.
# We will add two fields:
# ctrl_pivalue_rank:
# The rank of the genes based on their Pi values (for the "ctrl" comparison).
# Values are ~[1, 22000]. Small ranks correspond to high Pi values (and therefore
# significant DEGs). High ranks correspond to either non differentially expressed
# genes (P value == 1) or to genes with discordant differential expression between
# "ref" and "ctrl" (e.g., up in "ctrl", down in "ref").
# significant DEGs). High ranks correspond to non differentially expressed
# genes (P value == 1).
# ctrl_pivalue_rankratio:
# The rank ratios of the genes based on the Pi value ranks (for the "ctrl"
# comparison). Values are [0, 1]. Small ratios correspond to high Pi values
# (and therefore significant DEGs). High ratios correspond to either non
# differentially expressed genes (P value == 1) or to genes with discordant
# differential expression between "ref" and "ctrl" (e.g., up in "ctrl",
# down in "ref").
FM$ctrl_pivalue_rank <- rank(-FM$ctrl_adj_pivalue, ties.method = "min")
# (and therefore significant DEGs). High ratios correspond to non
# differentially expressed genes (P value == 1).
FM$ctrl_pivalue_rank <- rank(-FM$ctrl_pivalue, ties.method = "average")
max_rank <- max(FM$ctrl_pivalue_rank)
FM$ctrl_pivalue_rankratio <- FM$ctrl_pivalue_rank / max_rank
rm(max_rank)
......@@ -209,10 +141,8 @@ compute_delta <- function(FM, has_ctrl = TRUE) {
# in "ctrl").
FM$Delta <- FM$ref_pivalue_rankratio - FM$ctrl_pivalue_rankratio
# We return the enriched data-frame after we have added back the section we left apart at the
# begining (because of the NAs).
FM$ranking_value <- 1 - FM$Delta
FM <- rbind(FM, FM_nas)
# We return the enriched data-frame.
FM$gender_specific_score <- 1 - FM$Delta
return(FM)
}
......@@ -235,6 +165,7 @@ for (i in seq_len(length(config$integrations))) {
# We otherwise use default parameters SVN = TRUE, probe selection = max-avg
# integration = Marot-Mayer.
# We use four of the seven limma comparisons.
#
# Note: we use the filtered results of the integration ("integration") as well
# as the non filtered results ("integration_raw") but only as a control. The idea
# is that to identify say female specific genes, we focus on the female DEGs
......@@ -244,8 +175,6 @@ for (i in seq_len(length(config$integrations))) {
# and the raw results contain more of these non expressed or non differentially
# expressed genes).
limmas <- config$limma_analyses
B_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[2]]$name,
"_max-avg_integration.tsv")
F_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
"_max-avg_integration.tsv")
Fr_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
......@@ -254,21 +183,19 @@ for (i in seq_len(length(config$integrations))) {
"_max-avg_integration.tsv")
Mr_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
"_max-avg_integration_raw.tsv")
G_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[7]]$name,
"_max-avg_integration.tsv")
B <- read.delim(B_fn, stringsAsFactors = FALSE, row.names = 1)
F <- read.delim(F_fn, stringsAsFactors = FALSE, row.names = 1)
Fr <- read.delim(Fr_fn, stringsAsFactors = FALSE, row.names = 1)
M <- read.delim(M_fn, stringsAsFactors = FALSE, row.names = 1)
Mr <- read.delim(Mr_fn, stringsAsFactors = FALSE, row.names = 1)
G <- read.delim(G_fn, stringsAsFactors = FALSE, row.names = 1)
rm(B_fn, F_fn, Fr_fn, M_fn, Mr_fn, G_fn)
rm(F_fn, Fr_fn, M_fn, Mr_fn)
# We start by merging the male and female rankings. Most genes are present in both and the few
# that are not are removed (since we can not say whether they are gender specific or not).
FM <- merge(x = F, y = Mr, by = "SYMBOL", all.x = TRUE)
MF <- merge(x = M, y = Fr, by = "SYMBOL", all.x = TRUE)
rm(Fr, Mr)
## FM <- merge(x = F, y = Mr, by = "SYMBOL", all.x = TRUE)
## MF <- merge(x = M, y = Fr, by = "SYMBOL", all.x = TRUE)
FM <- merge(x = Fr, y = Mr, by = "SYMBOL")
MF <- merge(x = Mr, y = Fr, by = "SYMBOL")
rm(Fr, Mr, F, M)
# We select the fields we need:
# Gene:
......@@ -293,26 +220,6 @@ for (i in seq_len(length(config$integrations))) {
# ctrl_adj_Pval:
# The corrected P value associated with the "ctrl" comparison.
# Values are [0, 1]. The best P values are the smallest.
B <- B %>% mutate(Gene = SYMBOL,
ref_logFC = wavg_csss_logFC,
ref_Pval = int_csss_pval_marotmayer,
ref_adj_Pval = int_csss_adj_pval_marotmayer) %>%
select(Gene, ref_logFC, ref_Pval, ref_adj_Pval)
F <- F %>% mutate(Gene = SYMBOL,
ref_logFC = wavg_csss_logFC,
ref_Pval = int_csss_pval_marotmayer,
ref_adj_Pval = int_csss_adj_pval_marotmayer) %>%
select(Gene, ref_logFC, ref_Pval, ref_adj_Pval)
M <- M %>% mutate(Gene = SYMBOL,
ref_logFC = wavg_csss_logFC,
ref_Pval = int_csss_pval_marotmayer,
ref_adj_Pval = int_csss_adj_pval_marotmayer) %>%
select(Gene, ref_logFC, ref_Pval, ref_adj_Pval)
G <- G %>% mutate(Gene = SYMBOL,
ref_logFC = wavg_csss_logFC,
ref_Pval = int_csss_pval_marotmayer,
ref_adj_Pval = int_csss_adj_pval_marotmayer) %>%
select(Gene, ref_logFC, ref_Pval, ref_adj_Pval)
FM <- FM %>% mutate(Gene = SYMBOL,
ref_logFC = wavg_csss_logFC.x,
ref_Pval = int_csss_pval_marotmayer.x,
......@@ -329,44 +236,33 @@ for (i in seq_len(length(config$integrations))) {
ctrl_Pval = int_csss_pval_marotmayer.y,
ctrl_adj_Pval = int_csss_adj_pval_marotmayer.y) %>%
select(Gene, ref_logFC, ref_Pval, ref_adj_Pval, ctrl_logFC, ctrl_Pval, ctrl_adj_Pval)
# We compute the delta in both cases.
B_enriched <- compute_delta(B, has_ctrl = FALSE)
F_enriched <- compute_delta(F, has_ctrl = FALSE)
M_enriched <- compute_delta(M, has_ctrl = FALSE)
G_enriched <- compute_delta(G, has_ctrl = FALSE)
FM_enriched <- compute_delta(FM)
MF_enriched <- compute_delta(MF)
rm(FM, MF, G, F, M, B)
FM_enriched <- compute_genderspecificity_scores(FM)
MF_enriched <- compute_genderspecificity_scores(MF)
rm(FM, MF)
# We save the data.
B_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[2]]$name, "_rankings.tsv")
F_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[5]]$name, "_rankings.tsv")
M_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[6]]$name, "_rankings.tsv")
G_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[7]]$name, "_rankings.tsv")
FM_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[5]]$name,
"_specific_rankings.tsv")
MF_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[6]]$name,
"_specific_rankings.tsv")
write.table(B_enriched, file = B_ofile, sep = "\t", quote = FALSE, col.names = NA)
write.table(F_enriched, file = F_ofile, sep = "\t", quote = FALSE, col.names = NA)
write.table(M_enriched, file = M_ofile, sep = "\t", quote = FALSE, col.names = NA)
write.table(G_enriched, file = G_ofile, sep = "\t", quote = FALSE, col.names = NA)
# We save the full data.
FM_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
"_max-avg_genderspecificityscore_fulldata.tsv")
MF_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
"_max-avg_genderspecificityscore_fulldata.tsv")
write.table(FM_enriched, file = FM_ofile, sep = "\t", quote = FALSE, col.names = NA)
write.table(MF_enriched, file = MF_ofile, sep = "\t", quote = FALSE, col.names = NA)
rm(B_ofile, F_ofile, M_ofile, G_ofile, FM_ofile, MF_ofile)
rm(FM_ofile, MF_ofile)
# Manual exploration.
print("Top female genes (Pi values):")
F_enriched %>% arrange(ref_pivalue_rankratio) %>% head(5)
print("Top male genes (Pi values):")
M_enriched %>% arrange(ref_pivalue_rankratio) %>% head(5)
print("Top factorial genes (Pi values):")
G_enriched %>% arrange(ref_pivalue_rankratio) %>% head(5)
print("Top female specific genes (Delta):")
FM_enriched %>% arrange(Delta) %>% head(5)
print("Top male specific genes (Delta):")
MF_enriched %>% arrange(Delta) %>% head(5)
# We save the simplified data (to be merged later on).
FM_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
"_max-avg_genderspecificityscore.tsv")
MF_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
"_max-avg_genderspecificityscore.tsv")
FM_enriched_short <- FM_enriched %>% select(Gene, ref_logFC, ref_Pval, ref_adj_Pval, ctrl_logFC,
ctrl_Pval, ctrl_adj_Pval, gender_specific_score)
MF_enriched_short <- MF_enriched %>% select(Gene, ref_logFC, ref_Pval, ref_adj_Pval, ctrl_logFC,
ctrl_Pval, ctrl_adj_Pval, gender_specific_score)
write.table(FM_enriched_short, file = FM_ofile, sep = "\t", quote = FALSE, col.names = NA)
write.table(MF_enriched_short, file = MF_ofile, sep = "\t", quote = FALSE, col.names = NA)
rm(FM_ofile, MF_ofile, FM_enriched_short, MF_enriched_short)
# Summary statistics of DEG versus all genes.
# The DEGs should have more negative than positive Delta values in general, because
......@@ -384,10 +280,10 @@ for (i in seq_len(length(config$integrations))) {
# This display the effect of the correction (vertically shift = correction effect).
# The bottom line is the genes for which there is litle effect. Its position
# also reflects how many genes are corrected.
FM_deltaplot_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[5]]$name,
"_specific_deltaplot.png")
MF_deltaplot_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[6]]$name,
"_specific_deltaplot.png")
FM_deltaplot_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
"_max-avg_specific_deltaplot.png")
MF_deltaplot_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
"_max-avg_specific_deltaplot.png")
png(FM_deltaplot_ofile)
plot(FM_enriched$ref_pivalue_rank, rank(FM_enriched$Delta))
dev.off()
......@@ -396,36 +292,7 @@ for (i in seq_len(length(config$integrations))) {
dev.off()
rm(FM_deltaplot_ofile, MF_deltaplot_ofile)
# Another way to look at it.
# The gender DEGs and the gender specific DEGs are different if the correction is effective.
# Thus the mean/median change (correction effect) in ranks can be computed (it should not
# be null).
FM_rank_diff <- FM_enriched$ref_pivalue_rank - rank(FM_enriched$Delta)
MF_rank_diff <- MF_enriched$ref_pivalue_rank - rank(MF_enriched$Delta)
print("[Female] Rank changes between Pi and Delta:")
summary(abs(FM_rank_diff))
print("[Male] Rank changes between Pi and Delta:")
summary(abs(MF_rank_diff))
rm(FM_rank_diff, MF_rank_diff)
# We study the differences between the factorial limma analysis and our lists
# of gender specific genes. Technically, they are not the same but there could be
# some overlap. However the way we do things might not allow us to detect it in this plot.
G_FM <- merge(x = G_enriched, y = FM_enriched, by.x = "Gene", by.y = "Gene")
G_MF <- merge(x = G_enriched, y = MF_enriched, by.x = "Gene", by.y = "Gene")
G_FM_cmp_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[7]]$name, "_vs_",
limmas[[5]]$name, "_specific.png")
G_MF_cmp_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[6]]$name, "_vs_",
limmas[[6]]$name, "_specific.png")
png(G_FM_cmp_ofile)
plot(G_FM$ref_pivalue_rankratio.x, rank(G_FM$Delta))
dev.off()
png(G_MF_cmp_ofile)
plot(G_MF$ref_pivalue_rankratio.x, rank(G_MF$Delta))
dev.off()
rm(G_FM, G_MF, G_FM_cmp_ofile, G_MF_cmp_ofile)
rm(B_enriched, G_enriched, F_enriched, M_enriched, FM_enriched, MF_enriched)
rm(FM_enriched, MF_enriched)
rm(integration, limmas)
}
......
......@@ -390,8 +390,8 @@ plot_gene(gene = gene,
selection_name = selection$name,
palette = current_plt,
clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config,
#plot_all = FALSE, plot_datasets = TRUE)
plot_all = TRUE, plot_datasets = FALSE)
plot_all = FALSE, plot_datasets = TRUE)
#plot_all = TRUE, plot_datasets = FALSE)
gene_datasets <- c("GSE20292", "GSE8397", "GSE49036")
plot_gene(gene = gene,
gene_datasets = gene_datasets,
......@@ -457,8 +457,8 @@ limma_analysis <- config$limma_analyses[[k]]
#gene_datasets <- c("GSE20163", "GSE20292", "GSE26927", "GSE49036", "GSE8397", "NBB")
#gene <- "CD93"
#gene_datasets <- c("GSE20159", "GSE20163", "GSE26927", "GSE49036", "GSE8397", "NBB")
#gene <- "CYP1B1"
#gene_datasets <- c("GSE20159", "GSE20163", "GSE20292", "GSE26927", "GSE49036", "GSE8397", "NBB")
gene <- "RFC5"
gene_datasets <- c("GSE20159", "GSE20163", "GSE20292", "GSE26927", "GSE49036", "GSE8397", "NBB")
gene <- "EFNA1"
gene_datasets <- c("GSE20159", "GSE26927", "GSE49036", "GSE8397", "NBB")
......@@ -470,8 +470,8 @@ plot_gene(gene = gene,
selection_name = selection$name,
palette = current_plt,
clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config,
#plot_all = FALSE, plot_datasets = TRUE)
plot_all = TRUE, plot_datasets = FALSE)
plot_all = FALSE, plot_datasets = TRUE)
#plot_all = TRUE, plot_datasets = FALSE)
gene_datasets <- c("GSE49036", "NBB", "GSE26927")
plot_gene(gene = gene,
gene_datasets = gene_datasets,
......
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("tidyverse")
library("stringr")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
input_edata_dir <- paste0(config$global_data_dir, config$local_exp_data_dir)
input_rsedata_dir <- paste0(config$global_data_dir, config$local_rs_exp_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
# ================================================================================================
# Main
# ================================================================================================
# The aim of this script is to take the results of the integration and create gene lists that can
# be further analysed by GSEA and network-based methods. Several filters are applied and new
# scores / metrics are also computed. The general idea is to define the gender-specific and
# gender-dimorphic genes that make sense while keeping the workfow as simple as possible.
# Filtering parameters
specificity_threshold <- 1.1
overlap_threshold <- 0.5
# We focus on one configuration, our main configuration (SNage, VSN, max-avg).
i <- 3
integration <- config$integrations[[i]]
int_criteria <- str_split(integration$criteria, ",")[[1]]
j <- 1
vsn <- config$variance_methods[[j]]
l <- 1
selection <- config$selections[[l]]
# We first read the dataset overlap data (by definition, common to males and females).
data_over_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_ALL_",
selection$name, "_datasetoverlap.tsv")
data_over <- read.delim(data_over_fn,
quote = "",
stringsAsFactors = FALSE) %>%
mutate(Gene = SYMBOL) %>% select(Gene, overlap_percentage)