Commit 69df6c97 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Refactoring the code for the identification of the...

Refactoring the code for the identification of the gender-specific/gender-dimorphic genes [first part only].
parent a13c5720
......@@ -23,4 +23,6 @@ check:
gexpr:
@sbatch ${CODE_FOLDER}create_gene_expr_matrices.sh
doc:
@sbatch ${CODE_FOLDER}doc.sh
\ No newline at end of file
@sbatch ${CODE_FOLDER}doc.sh
rankings:
@sbatch ${CODE_FOLDER}/prepare_rankings.sh
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("tidyverse")
library("stringr")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
input_edata_dir <- paste0(config$global_data_dir, config$local_exp_data_dir)
input_rsedata_dir <- paste0(config$global_data_dir, config$local_rs_exp_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
# ================================================================================================
# Main
# ================================================================================================
# The aim of this script is to apply the second set of filters to the results of the integration.
# The filters for the male and female analyses are the following:
# - The specificity index should be higher than 1.10.
# - The gene should be either dimorphic (up in one gender, down in the other) or the absolute
# value of the log fold-change should be higher for the gender of interest (to be
# gender-specific).
# - The nominal P value for the gender of interest should be lower then 1 (only to remove
# the genes that are then ties with all the same P value of 1). This is because this script
# also produces a general ranking that will be then used for GSEA.
# The filters for the gender disease status analysis are the following:
# - We first filter the matrix as to keep only the genes that are potentially dimorphic (based
# on the female and male analyses). This is done on the raw results as to be able to recompute
# the adjusted P values. We then retain only the genes that pass the first filters (as in the
# non raw results). Namely the filters are:
# - The sign of the female and male log fold-changes should be different.
# - The dataset overlap between the female and male analysis should be strictly higher than 50%.
# - The genes should have confidence scores in females and males above the respective thresholds
# (like for the female and male analysis except that here we filter the GDS data).
# - The nominal P value should be lower then 1 (only to remove the genes that are then ties
# with all the same P value of 1). This is because this script also produces a general ranking
# that will be then used for GSEA.
#
# The results of the analyses are then pre-processed in order to derive the lists of gender-specfiic
# and gender-dimorphic genes. The rules are the following:
# - The gender-specific genes are coming from the female and male analyses.
# - The gender-dimorphic genes are coming from all three analyses.
# - A gender-specific gene needs to be identified as specific before or it is potentially dimorphic
# but the nominal P value in the other gender is 1 (so we can consider that the opposite LFC is
# not meaningful).
# - A gender-dimorphic is therefore a gene identified as potentially dimorphic but for which the
# nominal P value is less than 1 (so the regulation might be significant).
# - The lists are then joined in order to check that the male and female gender-specific lists are
# disjoint and to solve the overlap between the three lists of gender-dimorphic genes. This
# is based on the best pi values accross all lists.
specificity_threshold <- 1.1
pval_threshold <- 1
overlap_threshold <- 0.5
# We only do the SNage integration scheme.
i <- 3
integration <- config$integrations[[i]]
int_criteria <- str_split(integration$criteria, ",")[[1]]
# We perform the analysis only for the "VSN" case.
j <- 1
vsn <- config$variance_methods[[j]]
# We use the first gene-probe selection method (max-avg) only.
l <- 1
selection <- config$selections[[l]]
# A- we first handle the results of the female analysis. We read the enriched results
# and apply the second filters.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_enriched.tsv")
res_females <- read.delim(res_females_fn,
quote = "",
stringsAsFactors = FALSE)
res_females_dimorphic <- res_females %>%
filter(as.numeric(ranking_value) >= specificity_threshold) %>%
filter(as.numeric(int_csss_pval_marotmayer) < pval_threshold) %>%
filter(sign(as.numeric(ref_logFC)) != sign(as.numeric(ctrl_logFC)))
res_females_specific <- res_females %>%
filter(as.numeric(ranking_value) >= specificity_threshold) %>%
filter(as.numeric(int_csss_pval_marotmayer) < pval_threshold) %>%
filter(sign(as.numeric(ref_logFC)) == sign(as.numeric(ctrl_logFC))) %>%
filter(abs(as.numeric(ref_logFC)) > abs(as.numeric(ctrl_logFC)))
res_females_filtered <- rbind(res_females_specific, res_females_dimorphic)
rm(res_females_fn, res_females, res_females_dimorphic, res_females_specific)
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_filtered.tsv")
write.table(res_females_filtered, res_females_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(res_females_fn)
# We defined the purely gender-specific genes and the genes that could potentially be
# gender-dimorphic. The rules are the ones mentioned above.
fres_females_specific_p1 <- res_females_filtered %>%
filter(sign(as.numeric(ref_logFC)) == sign(as.numeric(ctrl_logFC)))
fres_females_specific_p2 <- res_females_filtered %>%
filter(sign(as.numeric(ref_logFC)) != sign(as.numeric(ctrl_logFC))) %>%
filter(as.numeric(ctrl_Pval) == 1)
fres_females_dimorphic <- res_females_filtered %>%
filter(sign(as.numeric(ref_logFC)) != sign(as.numeric(ctrl_logFC))) %>%
filter(as.numeric(ctrl_Pval) < 1)
fres_females_specific <- rbind(fres_females_specific_p1, fres_females_specific_p2)
rm(res_females_filtered, fres_females_specific_p1, fres_females_specific_p2)
rm(k, limma_analysis)
# B- we then handle the results of the male analysis. We read the enriched results
# and apply the second filters.
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_enriched.tsv")
res_males <- read.delim(res_males_fn,
quote = "",
stringsAsFactors = FALSE)
res_males_dimorphic <- res_males %>%
filter(as.numeric(ranking_value) >= specificity_threshold) %>%
filter(as.numeric(int_csss_pval_marotmayer) < pval_threshold) %>%
filter(sign(as.numeric(ref_logFC)) != sign(as.numeric(ctrl_logFC)))
res_males_specific <- res_males %>%
filter(as.numeric(ranking_value) >= specificity_threshold) %>%
filter(as.numeric(int_csss_pval_marotmayer) < pval_threshold) %>%
filter(sign(as.numeric(ref_logFC)) == sign(as.numeric(ctrl_logFC))) %>%
filter(abs(as.numeric(ref_logFC)) > abs(as.numeric(ctrl_logFC)))
res_males_filtered <- rbind(res_males_specific, res_males_dimorphic)
rm(res_males_fn, res_males, res_males_dimorphic, res_males_specific)
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_filtered.tsv")
write.table(res_males_filtered, res_males_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(res_males_fn)
# We defined the purely gender-specific genes and the genes that could potentially be
# gender-dimorphic. The rules are the ones mentioned above.
fres_males_specific_p1 <- res_males_filtered %>%
filter(sign(as.numeric(ref_logFC)) == sign(as.numeric(ctrl_logFC)))
fres_males_specific_p2 <- res_males_filtered %>%
filter(sign(as.numeric(ref_logFC)) != sign(as.numeric(ctrl_logFC))) %>%
filter(as.numeric(ctrl_Pval) == 1)
fres_males_dimorphic <- res_males_filtered %>%
filter(sign(as.numeric(ref_logFC)) != sign(as.numeric(ctrl_logFC))) %>%
filter(as.numeric(ctrl_Pval) < 1)
fres_males_specific <- rbind(fres_males_specific_p1, fres_males_specific_p2)
rm(res_males_filtered, fres_males_specific_p1, fres_males_specific_p2)
rm(k, limma_analysis)
# We check that the overlap between the two specific files is 0 (otherwise we may
# have an issue that would need to be solved.
if (length(intersect(fres_females_specific$SYMBOL, fres_males_specific$SYMBOL)) != 0) {
message(paste0("[", Sys.time(), "] There is some overlap between the the female and male",
" gender-specific gene lists, which is unexpected."))
}
# We select only the fields we want to have in the final TSV file.
fres_females_specific <- fres_females_specific %>%
select("SYMBOL", "GSE20159", "GSE20163", "GSE20164", "GSE20292", "GSE26927", "GSE49036",
"GSE8397", "NBB", "P_GSE20159", "P_GSE20163", "P_GSE20164", "P_GSE20292", "P_GSE26927",
"P_GSE49036", "P_GSE8397", "P_NBB", "ref_logFC", "ref_Pval", "ref_adj_Pval", "ctrl_logFC",
"ctrl_Pval", "ctrl_adj_Pval", "confidence_score", "ranking_value")
fres_males_specific <- fres_males_specific %>%
select("SYMBOL", "GSE20159", "GSE20163", "GSE20292", "GSE26927", "GSE49036", "GSE8397",
"Moreira", "NBB", "P_GSE20159", "P_GSE20163", "P_GSE20292", "P_GSE26927", "P_GSE49036",
"P_GSE8397", "P_Moreira", "P_NBB", "ref_logFC", "ref_Pval", "ref_adj_Pval", "ctrl_logFC",
"ctrl_Pval", "ctrl_adj_Pval", "confidence_score", "ranking_value")
# We save the results (gender-specific).
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_genelist_female_genderspecific.tsv")
write.table(fres_females_specific, res_females_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(res_females_fn, k, limma_analysis)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_genelist_male_genderspecific.tsv")
write.table(fres_males_specific, res_males_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(res_males_fn, k, limma_analysis)
# C- We take care of the GDS results. We read the raw_enriched results.
k <- 7
limma_analysis <- config$limma_analyses[[k]]
res_gds_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw_enriched.tsv")
res_gds <- read.delim(res_gds_fn,
quote = "",
stringsAsFactors = FALSE)
# As indicated above, we restrict the analysis to the dimorphic genes with enough dataset overlap
# between the two genders.
res_gds_dimorphic <- res_gds %>%
filter(sign(as.numeric(F.wavg_csss_logFC)) != sign(as.numeric(M.wavg_csss_logFC))) %>%
filter(as.numeric(overlap_percentage) > overlap_threshold)
rm(res_gds)
# We recompute the adjusted P-values as to consider only these dimorphic genes.
original_pvalues <- as.numeric(res_gds_dimorphic$int_csss_pval_marotmayer)
res_gds_dimorphic$RE.int_csss_adj_pval_marotmayer <- p.adjust(original_pvalues, method = "BH")
res_gds_dimorphic$RE.gds_pi_value <- abs(res_gds_dimorphic$wavg_csss_logFC) *
-log10(res_gds_dimorphic$int_csss_pval_marotmayer)
rm(original_pvalues)
# We run again the first set of filters since we started from the raw results.
# These are equivalent to the filters in the integrate_datasets.R script (#492).
# In this case (GDS data), we have 6 datasets.
conf_threshold <- config$confidence_threshold / 6
nb_db_min <- max(config$nb_min_pval, config$perc_min_pval * 6)
res_gds_dimorphic <- res_gds_dimorphic %>%
filter(as.numeric(confidence_score) >= conf_threshold) %>%
filter(as.numeric(nb_csss_datasets) >= nb_db_min)
rm(conf_threshold, nb_db_min)
# The confidence thresholds are based on the number of data sources we have, 8 in our case
# for both the female and male analyses.
conf_threshold <- config$confidence_threshold / 8
nb_db_min <- max(config$nb_min_pval, config$perc_min_pval * 8)
res_gds_dimorphic <- res_gds_dimorphic %>%
filter(as.numeric(F.confidence_score) >= conf_threshold) %>%
filter(as.numeric(M.confidence_score) >= conf_threshold) %>%
filter(as.numeric(F.nb_csss_datasets) >= nb_db_min) %>%
filter(as.numeric(M.nb_csss_datasets) >= nb_db_min) %>%
filter(as.numeric(int_csss_pval_marotmayer) < pval_threshold)
rm(conf_threshold, nb_db_min)
# We save the filtered results.
res_gds_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_filtered.tsv")
write.table(res_gds_dimorphic, res_gds_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(res_gds_fn)
# We start by filtering the GDS results as to keep only the GDS data and remove the
# results of the male and female analyses.
res_gds_dimorphic <- res_gds_dimorphic %>%
select("SYMBOL",
#"GSE20159", "GSE20163", "GSE20292", "GSE26927", "GSE49036", "GSE8397", "NBB",
#"P_GSE20159", "P_GSE20163", "P_GSE20292", "P_GSE26927", "P_GSE49036", "P_GSE8397",
#"P_NBB",
"wavg_csss_logFC", "int_csss_pval_marotmayer", "int_csss_adj_pval_marotmayer",
"RE.int_csss_adj_pval_marotmayer",
#"confidence_score",
"RE.gds_pi_value")
# We then do the same for the male and female dimorphic results.
fres_females_dimorphic_short <- fres_females_dimorphic %>%
select("SYMBOL", "ref_logFC", "ref_Pval", "ref_adj_Pval", "ref_pivalue")
fres_males_dimorphic_short <- fres_males_dimorphic %>%
select("SYMBOL", "ref_logFC", "ref_Pval", "ref_adj_Pval", "ref_pivalue")
# We do the merge keeping all genes (because some genes only have results for one
# of the analysis, some for two, and some for three).
fres_dimorphic <- merge(merge(res_gds_dimorphic, fres_females_dimorphic_short,
by = "SYMBOL",
all = TRUE),
fres_males_dimorphic_short,
by = "SYMBOL",
all = TRUE,
suffixes = c(".F", ".M"))
fres_dimorphic <- fres_dimorphic %>%
mutate(bestpi = pmax(fres_dimorphic$RE.gds_pi_value, fres_dimorphic$ref_pivalue.F,
fres_dimorphic$ref_pivalue.M, na.rm = TRUE)) %>%
mutate(tmp1 = ifelse(is.na(RE.gds_pi_value), "", ifelse(bestpi == RE.gds_pi_value, "GDS", "")),
tmp2 = ifelse(is.na(ref_pivalue.F), "", ifelse(bestpi == ref_pivalue.F, "F", "")),
tmp3 = ifelse(is.na(ref_pivalue.M), "", ifelse(bestpi == ref_pivalue.M, "M", ""))) %>%
mutate(source = paste0(tmp1, tmp2, tmp3)) %>%
select(!starts_with("tmp"))
# We save the filtered results.
res_gds_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_genelist_genderdimorphic.tsv")
write.table(fres_dimorphic, res_gds_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(res_gds_fn)
rm(k, limma_analysis)
# We check that the overlap between the gender-specific and gender-dimorphic files is 0 (otherwise
# we may have an issue that would need to be solved.
if (length(intersect(fres_females_specific$SYMBOL, fres_dimorphic$SYMBOL)) != 0) {
message(paste0("[", Sys.time(), "] There is some overlap between the the female and male",
" gender-specific gene lists, which is unexpected."))
}
# TODO HERE THEREsI OVERLAOP + CEJHCK MALES AS WELL...
# We clean the workspace and log the session details for reproducibility.
rm(specificity_threshold, pval_threshold, overlap_threshold)
rm(l, selection, vsn, j, integration, int_criteria, i)
rm(config, output_data_dir, input_data_dir, input_edata_dir, input_rsedata_dir)
sessionInfo()
......@@ -38,6 +38,22 @@ message(paste0("[", Sys.time(), "] Configuration done."))
#' has_ctrl was set to FALSE).
compute_delta <- function(FM, has_ctrl = TRUE) {
# Before we start with the real computation, we need to filter out the NA values in the control
# part of the data and keep them for later re-merging.
if (has_ctrl) {
FM_nas <- FM %>% filter(is.na(ctrl_Pval)) %>%
mutate(ref_pivalue = NA) %>%
mutate(ctrl_pivalue = NA) %>%
mutate(ref_pivalue_rank = NA) %>%
mutate(ref_pivalue_rankratio = NA) %>%
mutate(ctrl_adj_pivalue = NA) %>%
mutate(ctrl_pivalue_rank = NA) %>%
mutate(ctrl_pivalue_rankratio = NA) %>%
mutate(Delta = NA) %>%
mutate(ranking_value = 2)
FM <- FM %>% filter(!is.na(ctrl_Pval))
}
# First, we compute the Pi values for both comparisons ("ref" and "control"). For this, we use
# the nominal P values and the log fold changes. Using the corrected P values would be more
# difficult since most values would be 1. We anyway intend to use the gender specific rankings
......@@ -193,8 +209,10 @@ compute_delta <- function(FM, has_ctrl = TRUE) {
# in "ctrl").
FM$Delta <- FM$ref_pivalue_rankratio - FM$ctrl_pivalue_rankratio
# We return the enriched data-frame.
# We return the enriched data-frame after we have added back the section we left apart at the
# begining (because of the NAs).
FM$ranking_value <- 1 - FM$Delta
FM <- rbind(FM, FM_nas)
return(FM)
}
......@@ -226,17 +244,17 @@ for (i in seq_len(length(config$integrations))) {
# and the raw results contain more of these non expressed or non differentially
# expressed genes).
limmas <- config$limma_analyses
B_fn <- paste0(input_data_dir, integration$name, "_VSN_", limmas[[2]]$name,
B_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[2]]$name,
"_max-avg_integration.tsv")
F_fn <- paste0(input_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
F_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
"_max-avg_integration.tsv")
Fr_fn <- paste0(input_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
Fr_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
"_max-avg_integration_raw.tsv")
M_fn <- paste0(input_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
M_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
"_max-avg_integration.tsv")
Mr_fn <- paste0(input_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
Mr_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
"_max-avg_integration_raw.tsv")
G_fn <- paste0(input_data_dir, integration$name, "_VSN_", limmas[[7]]$name,
G_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[7]]$name,
"_max-avg_integration.tsv")
B <- read.delim(B_fn, stringsAsFactors = FALSE, row.names = 1)
F <- read.delim(F_fn, stringsAsFactors = FALSE, row.names = 1)
......@@ -248,8 +266,8 @@ for (i in seq_len(length(config$integrations))) {
# We start by merging the male and female rankings. Most genes are present in both and the few
# that are not are removed (since we can not say whether they are gender specific or not).
FM <- merge(x = F, y = Mr, by = "SYMBOL", all = FALSE)
MF <- merge(x = M, y = Fr, by = "SYMBOL", all = FALSE)
FM <- merge(x = F, y = Mr, by = "SYMBOL", all.x = TRUE)
MF <- merge(x = M, y = Fr, by = "SYMBOL", all.x = TRUE)
rm(Fr, Mr)
# We select the fields we need:
......
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("tidyverse")
library("stringr")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
input_edata_dir <- paste0(config$global_data_dir, config$local_exp_data_dir)
input_rsedata_dir <- paste0(config$global_data_dir, config$local_rs_exp_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
# This function identifies, for each gene, the datasets that have been used for the integration.
# To do this, it takes a row of a dataframe and check which log fold-changes are of the same
# sign as the consensus and extract the relevant datasset names.
identify_datasets <- function(row, datasets, offset = 1) {
consensus <- as.numeric(row["wavg_csss_logFC"])
used_datasets <- c()
for (m in seq_len(length(datasets))) {
current_value <- as.numeric(row[offset + m])
if (!is.na(current_value) & sign(current_value) == sign(consensus)) {
used_datasets <- c(used_datasets, datasets[m])
}
}
return(paste(used_datasets, collapse = "|"))
}
# This function computes, for each gene, the overlap between the datasets that were used in
# the female and male analyses. It also computes the overlap as a percentage.
compute_overlap <- function(row) {
f_ds <- unlist(strsplit(row["female_used_datasets"], "\\|"))
m_ds <- unlist(strsplit(row["male_used_datasets"], "\\|"))
overlap <- intersect(f_ds,m_ds)
local_min <- min(length(f_ds), length(m_ds))
over_perc <-length(overlap) / local_min
over_str <- paste(overlap, collapse = "|")
return(c(over_str, length(f_ds), length(m_ds), length(overlap), over_perc))
}
# ================================================================================================
# Main
# ================================================================================================
# The aim of this script is to enrich the results of the integration by adding one column that
# corresponds to the overlap between the datasets used in the female and male analyses. The
# rationale is that both analyses can be based on quite distinct sets of datasets and in order to
# avoid inconsistencies, we compute the overlap as to be able to select the genes for which the
# male and female signals are consistent within most the datasets.
# We do only the SNage integration scheme.
i <- 3
integration <- config$integrations[[i]]
int_criteria <- str_split(integration$criteria, ",")[[1]]
# We perform the analysis only for the "VSN" case.
j <- 1
vsn <- config$variance_methods[[j]]
# We use the first gene-probe selection method (max-avg) only.
l <- 1
selection <- config$selections[[l]]
# A- we first handle the results of the female analysis. We will read the raw results
# and identify which datasets have been used for each gene (in females).
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")
res_females <- read.delim(res_females_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
datasets <- colnames(res_females)[2:9]
datasets <- datasets[datasets != "GSE20164"]
res_females$used_datasets <- apply(res_females, 1, identify_datasets, datasets = datasets)
rm(k, limma_analysis)
rm(datasets, res_females_fn)
# B- we then handle the results of the male analysis. We will read the raw results again
# and identify which datasets have been used for each gene (in males).
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")
res_males <- read.delim(res_males_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
datasets <- colnames(res_males)[2:9]
datasets <- datasets[datasets != "Moreira"]
res_males$used_datasets <- apply(res_males, 1, identify_datasets, datasets = datasets)
rm(k, limma_analysis)
rm(datasets, res_males_fn)
# C- We compute the overlap between the used datasets in the female and male analyses.
# This is done per gene again. We additionally get the number of used datasets for female,
# male, common datasets. This overlap is also expressed as a percentage with respect to
# smallest of the female and male sets of datasets.
FM <- merge(x = res_females, y = res_males, by = "SYMBOL") %>%
mutate(female_used_datasets = used_datasets.x, male_used_datasets = used_datasets.y) %>%
select(SYMBOL, female_used_datasets, male_used_datasets)
x <- t(apply(FM, 1, compute_overlap))
FM$overlap_datasets <- x[,1]
FM$female_nb_used_datasets <- as.numeric(x[,2])
FM$male_nb_used_datasets <- as.numeric(x[,3])
FM$nb_overlap_datasets <- as.numeric(x[,4])
FM$overlap_percentage <- as.numeric(x[,5])
rm(x, res_females, res_males)
FM_ref <- FM %>% select(SYMBOL, overlap_percentage)
# We save the complete refining data as to keep track of things.
refining_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
selection$name, "_FvsM_refining_data.tsv")
write.table(FM, refining_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(FM, refining_fn)
# D- We then enrich both raw and non raw results for females. The previous computation was based
# on the raw results (complete) but we also enrich the filtered results (because these are
# the results we want ot further investigate, at least now). We simply add one column that
# corresponds to the overlap presented as a percentage.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration.tsv")
res_females <- read.delim(res_females_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
res_females <- merge(x = res_females, y = FM_ref, by = "SYMBOL", ALL.x = TRUE)
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_refined.tsv")
write.table(res_females, res_females_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(res_females_fn)
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")