Commit 9527d599 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Coding style harmonized.

parent bbf6a822
......@@ -50,7 +50,7 @@ for (i in seq_len(length(config$datasets))) {
if (selected_dataset_name == "" || selected_dataset_name == dataset_name) {
# We only do the QC when it is necessary.
if (dataset$first_qc != 'TRUE') {
if (dataset$first_qc != "TRUE") {
message(paste0("[", Sys.time(), "] [", dataset_name, "] QC not necessary."))
next
}
......
......@@ -79,7 +79,7 @@ for (i in seq_len(length(config$platforms))) {
gene_annots_filename,
unique_probe_ids)
rm(gene_annots_filename)
# Then, we make sure we deal with duplicates and reorder the df to match the list we have.
# We do not use EntrezGene and EnsEMBL ids.
gene_annots <- gene_annots_raw %>%
......
......@@ -64,4 +64,4 @@ write.table(nb_samples,
remove(nb_samples_raw, nb_samples, nb_samples_fn)
# We log the session details for reproducibility.
sessionInfo()
\ No newline at end of file
sessionInfo()
......@@ -217,7 +217,7 @@ for (i in seq_len(length(config$datasets))) {
} else if (dataset$tissue == "iPSC-DA") {
# We save the index
ip_indx <- c(ip_indx, i)
# We save the data.
ip_exp_data_vsn[[length(ip_indx)]] <- exp_data_vsn
ip_exp_data_novsn[[length(ip_indx)]] <- exp_data_novsn
......@@ -292,16 +292,16 @@ for (i in seq_len(length(config$biomarkers))) {
da_data_vsn_zsc <- vector()
da_data_novsn_zsc <- vector()
for (ii in seq_len(length(da_exp_data_vsn))) {
# We define the second index as to switch between the global list (full)
# and current list (selection of SN datasets).
jj <- da_indx[ii]
# We get the dataset details (name), and retrieve the correct probe id.
dataset <- config$datasets[[jj]]
dataset_arraytype <- str_replace(dataset$array_type, "-", ".")
probe_id <- bm_ps_data_sel[dataset_arraytype]
# We read the expression data.
# The dataset might not have a probe for that gene.
if (!is.null(probe_id) & !is.na(probe_id) & probe_id != "") {
......@@ -311,7 +311,7 @@ for (i in seq_len(length(config$biomarkers))) {
exp_data_novsn_zsc <- data.frame(Expression = da_exp_data_novsn_zsc[[ii]][probe_id, ])
colnames(exp_data_vsn) <- "Expression"
colnames(exp_data_novsn) <- "Expression"
# We update the global data-frame.
da_data_vsn <- rbind(da_data_vsn, exp_data_vsn)
da_data_novsn <- rbind(da_data_novsn, exp_data_novsn)
......@@ -329,16 +329,16 @@ for (i in seq_len(length(config$biomarkers))) {
ip_data_vsn_zsc <- vector()
ip_data_novsn_zsc <- vector()
for (ii in seq_len(length(ip_exp_data_vsn))) {
# We define the second index as to switch between the global list (full)
# and current list (selection of SN datasets).
jj <- ip_indx[ii]
# We get the dataset details (name), and retrieve the correct probe id.
dataset <- config$datasets[[jj]]
dataset_arraytype <- str_replace(dataset$array_type, "-", ".")
probe_id <- bm_ps_data_sel[dataset_arraytype]
# We read the expression data.
# The dataset might not have a probe for that gene.
if (!is.null(probe_id) & !is.na(probe_id) & probe_id != "") {
......@@ -348,7 +348,7 @@ for (i in seq_len(length(config$biomarkers))) {
exp_data_novsn_zsc <- data.frame(Expression = ip_exp_data_novsn_zsc[[ii]][probe_id, ])
colnames(exp_data_vsn) <- "Expression"
colnames(exp_data_novsn) <- "Expression"
# We update the global data-frame.
ip_data_vsn <- rbind(ip_data_vsn, exp_data_vsn)
ip_data_novsn <- rbind(ip_data_novsn, exp_data_novsn)
......@@ -403,7 +403,7 @@ for (i in seq_len(length(config$biomarkers))) {
rm(sn_data_vsn, sn_data_novsn, sn_data_vsn_zsc, sn_data_novsn_zsc)
rm(da_data_vsn, da_data_novsn, da_data_vsn_zsc, da_data_novsn_zsc)
rm(ip_data_vsn, ip_data_novsn, ip_data_vsn_zsc, ip_data_novsn_zsc)
# We plot the gene.
bm_plot_fn <- paste0(output_data_dir, "Biomarker_", str_replace_all(biomarker$tissue, " ", "_"),
"_", biomarker$name, ".png")
......
......@@ -34,19 +34,19 @@ message(paste0("[", Sys.time(), "] Configuration done."))
#' @param file_prefix A striong representing the prefix of the file names to be used to
#' save the significant genes.
save_sign_genes <- function(data, file_prefix) {
# We filter the data.
sign_genes <- data %>%
dplyr::mutate(diff_exp = ifelse(sign(wavg_csss_logFC) == 1, "up", "down")) %>% # nolint
dplyr::select(SYMBOL, diff_exp)
sign_genes_up <- sign_genes %>% dplyr::filter(diff_exp == "up") # nolint
sign_genes_down <- sign_genes %>% dplyr::filter(diff_exp == "down") # nolint
# We define filenames.
sign_genes_fn <- paste0(file_prefix, "sign_genes.tsv")
sign_genes_up_fn <- paste0(file_prefix, "sign_genes_up.tsv")
sign_genes_down_fn <- paste0(file_prefix, "sign_genes_down.tsv")
# We write down data.
write.table(sign_genes,
file = sign_genes_fn,
......@@ -66,7 +66,7 @@ save_sign_genes <- function(data, file_prefix) {
quote = FALSE,
col.names = FALSE,
row.names = FALSE)
# We log some infos for the user.
loc_log <- paste0(" Total: ", dim(sign_genes)[1],
" Up: ", dim(sign_genes_up)[1],
......@@ -95,34 +95,34 @@ all_count_rnames <- vector()
# As usual, we loop over the integrations / variances / limmas / selections / combinations.
# We start by considering all integration schemes one by one.
for (i in seq_len(length(config$integrations))) {
# We get the integration name for that scheme.
integration <- config$integrations[[i]]
# We loop over the different limma analyses.
for (k in seq_len(length(config$limma_analyses))) {
# We define the current limma analysis.
limma_comparison <- config$limma_analyses[[k]]$name
# We have structures to keep all data, regardless of the
# gene-probe selection nmethod in order to make comparisons later.
sign_genes_all <- list()
selection_names <- vector()
combination_names <- vector()
# We loop over the different gene-probe selection methods.
for (l in seq_len(length(config$selections))) {
# We define the current gene-probe selection method.
selection <- config$selections[[l]]
selection_names <- c(selection_names, selection$name)
# We define the file prefix for this configuration defined
# based on all above loops.
file_prefix <- paste0(output_data_dir, integration$name, "_",
limma_comparison, "_", selection$name, "_")
# We read the results of the integration (contains results for all P value combination
# methods).
int_results_fn <- paste0(file_prefix, "integration.tsv")
......@@ -133,12 +133,6 @@ for (i in seq_len(length(config$integrations))) {
as.is = TRUE)
rm(int_results_fn)
# If there are no results, we do not do anything (rare - only some limma on iPSC-DA).
#if (dim(integration_results)[1] == 0) {
# rm(selection, file_prefix, integration_results)
# next
#}
# We create the data structures to later plot the use counts.
results_core_matrix_full <- integration_results %>%
dplyr::select(-starts_with("sum_of")) %>%
......@@ -148,7 +142,7 @@ for (i in seq_len(length(config$integrations))) {
dplyr::select(-starts_with("wavg")) %>%
dplyr::select(-starts_with("nb_")) %>%
dplyr::select(-starts_with("SYMBOL"))
# We filter out to non consensus log fold changes only for the use count plots.
results_csss_lfc <- rep(integration_results$wavg_csss_logFC, dim(results_core_matrix_full)[2])
dim(results_csss_lfc) <- dim(results_core_matrix_full)
......@@ -161,7 +155,7 @@ for (i in seq_len(length(config$integrations))) {
names(use_count_full)[1] <- "Total"
use_counts <- data.frame(Dataset = names(use_count_full), Count = use_count_full)
rm(results_core_matrix_full, nb_nas, use_count_full, results_csss_lfc)
# Plotting of the use counts (aka the number of P values that were combined via the
# integration).
use_counts_fn <- paste0(file_prefix, "use_counts.png")
......@@ -181,28 +175,28 @@ for (i in seq_len(length(config$integrations))) {
print(myplot)
dev.off()
rm(myplot, use_counts_fn, use_counts)
# Clean the df for further use (now we only need the global P values).
integration_results <- integration_results %>%
dplyr::select(SYMBOL, wavg_csss_logFC, starts_with("int_csss_adj_pval_")) #nolint
# We then study the various P value combination methods one by one.
all_local_counts <- vector()
for (o in seq_len(length(config$p_val_combinations))) {
pval_comb <- config$p_val_combinations[[o]]
combination_names <- c(combination_names, pval_comb$name)
# List the significant genes if any.
sign_genes <- integration_results %>%
dplyr::filter(.data[[paste0("int_csss_adj_pval_",
pval_comb$name)]] < config$p_val_sign_thres)
# Update the global structures (for comparison).
local_index <- length(config$p_val_combinations) * (l - 1)
sign_genes_all[[local_index + o]] <- sign_genes
all_local_counts <- c(all_local_counts, dim(sign_genes)[1])
rm(local_index)
# We plot the eCDFs of the P values (if we have at least one gene).
if (dim(sign_genes)[1] > 0) {
sign_genes_4ecdf <- sign_genes %>%
......@@ -228,13 +222,13 @@ for (i in seq_len(length(config$integrations))) {
dev.off()
rm(sign_genes_4ecdf, ecdf_fn, myplot)
} # End if there is at least one significant DEG.
# We now save the individual gene lists (even if empty).
save_sign_genes(sign_genes, paste0(file_prefix, pval_comb$name, "_"))
rm(pval_comb, sign_genes)
} # End for all P value combination methods.
rm(o, integration_results)
# We reformat the count matrix as to later be able to compare the different
# P value combination methods and gene selection methods.
all_counts <- rbind(all_counts, all_local_counts)
......@@ -243,7 +237,7 @@ for (i in seq_len(length(config$integrations))) {
rm(all_local_counts, selection, file_prefix)
} # End of for each gene-probe selection.
rm(l)
# At this stage, we want to compare the gene-probe selection / P value combination methods.
# We start by looking at the overlaps (raw counts and percentages).
# n covers the gene-probe selection methods and the P value combination methods.
......@@ -271,7 +265,7 @@ for (i in seq_len(length(config$integrations))) {
rm(n)
}
rm(m, sign_genes_all)
# We save these data (overlaps).
subnames <- rep(selection_names, each = length(config$p_val_combinations))
names <- paste0(subnames, "-", combination_names)
......@@ -284,9 +278,9 @@ for (i in seq_len(length(config$integrations))) {
write.table(overlap, file = overlap_fn, sep = "\t", quote = FALSE, col.names = NA)
rm(subnames, names, overlap, overlap_fn, selection_names)
rm(limma_comparison, file_prefix)
} # End of for each limma analysis.
rm(k,integration)
rm(k, integration)
} # End for each data integration scheme.
rm(i)
......
......@@ -25,14 +25,14 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
#' Function to duplicate data-frame rows based on one key field.
#' This field is used to do the duplication based on a string split. The other fields are
#' This field is used to do the duplication based on a string split. The other fields are
#' simply copied as is.
duplicate_row_multiple_gene_names <- function(row, ikey = 3, sep = "|") {
keys <- strsplit(as.character(row[ikey]), sep, fixed = TRUE)[[1]]
if (length(keys) == 0) {
keys <- ""
}
# We duplicate all fields.
new_data <- c()
for (n in seq_len(length(row))) {
......@@ -118,12 +118,6 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
dplyr::select(SYMBOL, !! sym(paste0("int_csss_adj_pval_", pval_comb$name))) %>%
head(nb_hits)
# We add the predefined genes to plot regardless of their adjusted P values.
##other_hits <- results_data_df %>%
## dplyr::select(SYMBOL, !! sym(paste0("int_csss_adj_pval_", pval_comb$name))) %>%
## filter(SYMBOL %in% config$genes_to_plot)
##top_hits <- rbind(top_hits, other_hits)
# We plot each gene sequentially.
for (p in seq_len(dim(top_hits)[1])) {
......@@ -150,13 +144,14 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
probeset_tag <- paste0(dataset_arraytype, "_",
stringr::str_replace(selection_name, "-", "."))
probe_ids <- matching_data_gene[[probeset_tag]]
# We here rely on the fact that even in case of multiple gene mapping only one probe
# should be present here. We still log a warning in case it is not the case
probe_id <- probe_ids[!is.na(probe_ids)]
if (length(probe_id) > 1) {
message(paste0("[", Sys.time(), "] Gene ", gene$SYMBOL, " has multiple probes for dataset ",
dataset_name, " (", dataset_arraytype, "): ", paste0(probe_ids, collapse = "-")))
message(paste0("[", Sys.time(), "] Gene ", gene$SYMBOL,
" has multiple probes for dataset ", dataset_name, " (",
dataset_arraytype, "): ", paste0(probe_ids, collapse = "-")))
}
rm(dataset, probeset_tag, dataset_arraytype, jj, probe_ids)
......@@ -177,7 +172,7 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
}
}
colnames(exp_data) <- "Expression"
# We read the expression data (Z-scores).
# The dataset might not have a probe for that gene.
zexp_data <- t(zexp_data_all[[ii]][1, ])
......@@ -190,7 +185,7 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
}
}
colnames(zexp_data) <- "Expression"
# We assemble clinical and expression and reformat.
local_block <- cbind(clin_data, exp_data) %>%
dplyr::mutate(dataset = dataset_name)
......
......@@ -40,12 +40,12 @@ identify_datasets <- function(row, datasets, offset = 1) {
return(paste(used_datasets, collapse = "|"))
}
# This function computes, for each gene, the overlap between the datasets that were used in
# This function computes, for each gene, the overlap between the datasets that were used in
# the female and male analyses. It also computes the overlap as a percentage.
compute_overlap <- function(row) {
f_ds <- unlist(strsplit(row["female_used_datasets"], "\\|"))
m_ds <- unlist(strsplit(row["male_used_datasets"], "\\|"))
overlap <- intersect(f_ds,m_ds)
overlap <- intersect(f_ds, m_ds)
local_min <- min(length(f_ds), length(m_ds))
over_perc <- length(overlap) / local_min
over_str <- paste(overlap, collapse = "|")
......@@ -58,8 +58,8 @@ compute_overlap <- function(row) {
# The aim of this script is to enrich the results of the integration by adding one column that
# corresponds to the overlap between the datasets used in the female and male analyses. The
# rationale is that both analyses can be based on quite distinct sets of datasets and in order to
# avoid inconsistencies, we compute the overlap as to be able to select the genes for which the
# rationale is that both analyses can be based on quite distinct sets of datasets and in order to
# avoid inconsistencies, we compute the overlap as to be able to select the genes for which the
# male and female signals are consistent within most the datasets.
# We do all integration schemes one by one.
......@@ -70,7 +70,7 @@ for (i in seq_len(length(config$integrations))) {
int_criteria <- str_split(integration$criteria, ",")[[1]]
# We use the first gene-probe selection method (max-avg) only.
l <- 1
l <- 1
selection <- config$selections[[l]]
# A- we first read the raw results of the female analysis.
......@@ -99,7 +99,6 @@ for (i in seq_len(length(config$integrations))) {
"P_")], "P_", "")
rm(k, limma_analysis, res_males_fn)
# C- We then identify which datasets have been used for each gene (for both
# males and females).
datasets <- intersect(female_datasets, male_datasets)
......@@ -116,10 +115,10 @@ for (i in seq_len(length(config$integrations))) {
select(SYMBOL, female_used_datasets, male_used_datasets)
x <- t(apply(FM, 1, compute_overlap))
FM$overlap_datasets <- x[,1]
FM$female_nb_used_datasets <- as.numeric(x[,2])
FM$male_nb_used_datasets <- as.numeric(x[,3])
FM$nb_overlap_datasets <- as.numeric(x[,4])
FM$overlap_percentage <- as.numeric(x[,5])
FM$female_nb_used_datasets <- as.numeric(x[, 2])
FM$male_nb_used_datasets <- as.numeric(x[, 3])
FM$nb_overlap_datasets <- as.numeric(x[, 4])
FM$overlap_percentage <- as.numeric(x[, 5])
rm(x, res_females, res_males)
FM_ref <- FM %>% select(SYMBOL, overlap_percentage)
......
......@@ -24,22 +24,21 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
#' @title Function to compute the gender-specificity score based on the Pi value rankings.
#'
#' @description This function takes a dataframe with the log fold changes and P values of two
#' comparisons termed "ref" and "ctrl". The idea is to compute a score representing their
#' comparisons termed "ref" and "ctrl". The idea is to compute a score representing their
#' specificity for "ref" using "ctrl" to correct the non specific DEGs.
#' The specificity is based on the Pi values so the actual regulation (up/down) does not
#' matter and two genes with strong but opposite signals will not be considered specific.
#' @param FM The data-frame that contains the experimental data. The expected columns are
#' @param FM The data-frame that contains the experimental data. The expected columns are
#' ref_logFC, ref_Pval, ref_adj_Pval, ctrl_logFC, ctrl_Pval, and ctrl_adj_Pval.
#' @param has_ctrl A boolean indicating whether a "ctrl" comparison is present. If not then
#' only the Pi value computation and the associated rankings are performed for "ref", and
#' only the Pi value computation and the associated rankings are performed for "ref", and
#' no delta values are returned.
#' @return An enriched data-frame that contains additional columns, in particular one that
#' @return An enriched data-frame that contains additional columns, in particular one that
#' contains the gender-specificity scores, representing the specificity to the "ref" comparison
#' (unless has_ctrl is set to FALSE).
compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
# First, we compute the Pi values for both comparisons ("ref" and "control"). For this, we use
# the nominal P values and the log fold changes. Using the corrected P values would be more
# difficult since most values would be 1. We anyway intend to use the gender specific rankings
......@@ -59,19 +58,19 @@ compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
# create the ranking for GSEA when we analyse gender DEGs (and not gender specific
# DEGs).
#
# We start by the most obvious cases for which all values are present.
# The Pi value is simply the absolute log fold change * the P value (on a log10 scale).
FM$ref_pivalue <- -log10(FM$ref_Pval) * abs(FM$ref_logFC)
if (has_ctrl) {
FM$ctrl_pivalue <- -log10(FM$ctrl_Pval) * abs(FM$ctrl_logFC)
}
# We continue with special cases, that is when P value == 1.
# These are the genes that are definitely not differentially expressed.
# Because P value == 1, we have Pi value == 0.
# We use a small workaround to substract a small epsilon to the original P value.
# This way, all genes with P value == 1 are still ranked (therefore based on their log
# This way, all genes with P value == 1 are still ranked (therefore based on their log
# fold change only). This is because we want to avoid too many ties.
pseudo_pval_ref <- 0.1 * (9 + max(FM$ref_Pval[FM$ref_Pval < 1]))
pseudo_pi_ref <- -log10(pseudo_pval_ref) * abs(FM$ref_logFC[FM$ref_pivalue == 0])
......@@ -83,12 +82,12 @@ compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
rm(pseudo_pval_ctrl, pseudo_pi_ctrl)
}
rm(pseudo_pval_ref, pseudo_pi_ref)
# We then aim at creating a delta that represents the difference between the differential
# expression in "ref" and in "ctrl". The comparison is done based on the rankings, which
# are themseleves based on the Pi values. We can not use the Pi values directly since they
# can be so different between the two comparisons.
#
# Creating the ranks and rank ratios based on the Pi values for the "ref" comparison.
# Nothing special there, we simply order and rank.
#
......@@ -105,13 +104,13 @@ compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
max_rank <- max(FM$ref_pivalue_rank)
FM$ref_pivalue_rankratio <- FM$ref_pivalue_rank / max_rank
rm(max_rank)
# In case, we have no control, the job is done at this stage.
if (!has_ctrl) {
FM$ranking_value <- FM$ref_pivalue
return(FM)
}
# Now creating the ranks and rank ratios based on the Pi values for the "ctrl" comparison.
# We will add two fields:
# ctrl_pivalue_rank:
......@@ -128,7 +127,7 @@ compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
max_rank <- max(FM$ctrl_pivalue_rank)
FM$ctrl_pivalue_rankratio <- FM$ctrl_pivalue_rank / max_rank
rm(max_rank)
# The delta is simply the difference between the "ref" and "ctrl" rank ratios, with
# rank ratios based on the respective Pi values.
# We will add one field:
......@@ -140,7 +139,7 @@ compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
# correspond to neutral genes (as differentially expressed in "ref" and
# in "ctrl").
FM$Delta <- FM$ref_pivalue_rankratio - FM$ctrl_pivalue_rankratio
# We return the enriched data-frame.
FM$gender_specific_score <- 1 - FM$Delta
return(FM)
......@@ -148,11 +147,11 @@ compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
# To do the ranking stuff but based on FDR instead of pi.
compute_genderspecificity_scores_FDR <- function(FM, has_ctrl = TRUE) {
# We then aim at creating a delta that represents the difference between the differential
# expression in "ref" and in "ctrl". The comparison is done based on the rankings, which
# are themseleves based on the FDR values.
#
# Creating the ranks and rank ratios based on the P values for the "ref" comparison.
# Nothing special there, we simply order and rank.
#
......@@ -169,13 +168,13 @@ compute_genderspecificity_scores_FDR <- function(FM, has_ctrl = TRUE) {
max_rank <- max(FM$ref_pvalue_rank)
FM$ref_pvalue_rankratio <- FM$ref_pvalue_rank / max_rank
rm(max_rank)
# In case, we have no control, the job is done at this stage.
if (!has_ctrl) {
FM$ranking_value <- FM$ref_pvalue
return(FM)
}
# Now creating the ranks and rank ratios based on the P values for the "ctrl" comparison.
# We will add two fields:
# ctrl_pvalue_rank:
......@@ -192,7 +191,7 @@ compute_genderspecificity_scores_FDR <- function(FM, has_ctrl = TRUE) {
max_rank <- max(FM$ctrl_pvalue_rank)
FM$ctrl_pvalue_rankratio <- FM$ctrl_pvalue_rank / max_rank
rm(max_rank)
# The delta is simply the difference between the "ref" and "ctrl" rank ratios, with
# rank ratios based on the respective P values.
# We will add one field:
......@@ -204,7 +203,7 @@ compute_genderspecificity_scores_FDR <- function(FM, has_ctrl = TRUE) {
# correspond to neutral genes (as differentially expressed in "ref" and
# in "ctrl").
FM$Delta <- FM$ref_pvalue_rankratio - FM$ctrl_pvalue_rankratio
# We return the enriched data-frame.
FM$gender_specific_score <- 1 - FM$Delta
return(FM)
......@@ -227,14 +226,13 @@ for (i in seq_len(length(config$integrations))) {
# Inputs. The results of all comparisons of interest (from the integration).
# We otherwise use default parameters SVN = TRUE, probe selection = max-avg
# integration = Marot-Mayer.
# We use four of the seven limma comparisons.
# and integration = Marot-Mayer. We use four of the seven limma comparisons.
#
# Note: we use the filtered results of the integration ("integration") as well
# as the non filtered results ("integration_raw") but only as a control. The idea
# is that to identify say female specific genes, we focus on the female DEGs
# (filtered results) since we want the genes to be at least DEG in females but
# we control for the behaviour in males using the raw results since it does not
# is that to identify say female specific genes, we focus on the female DEGs
# (filtered results) since we want the genes to be at least DEG in females but
# we control for the behaviour in males using the raw results since it does not
# matter if the gene is DEG for males (actually it is even better if it is not
# and the raw results contain more of these non expressed or non differentially
# expressed genes).
......@@ -252,15 +250,14 @@ for (i in seq_len(length(config$integrations))) {
M <- read.delim(M_fn, stringsAsFactors = FALSE, row.names = 1)
Mr <- read.delim(Mr_fn, stringsAsFactors = FALSE, row.names = 1)
rm(F_fn, Fr_fn, M_fn, Mr_fn)
# We start by merging the male and female rankings. Most genes are present in both and the few
# that are not are removed (since we can not say whether they are gender specific or not).
## FM <- merge(x = F, y = Mr, by = "SYMBOL", all.x = TRUE)
## MF <- merge(x = M, y = Fr, by = "SYMBOL", all.x = TRUE)
# We used to rely on "FM <- merge(x = F, y = Mr, by = "SYMBOL", all.x = TRUE)"
# and "MF <- merge(x = M, y = Fr, by = "SYMBOL", all.x = TRUE)" but now we integrate
# both Raw results.
FM <- merge(x = Fr, y = Mr, by = "SYMBOL")
MF <- merge(x = Mr, y = Fr, by = "SYMBOL")
## FM <- merge(x = F, y = M, by = "SYMBOL", all.x = TRUE)
## MF <- merge(x = M, y = F, by = "SYMBOL", all.x = TRUE)
rm(Fr, Mr, F, M)
# We select the fields we need:
......@@ -308,10 +305,9 @@ for (i in seq_len(length(config$integrations))) {
MF_enriched <- compute_genderspecificity_scores(MF)
rm(FM, MF)
FM_enriched_FDR <- compute_genderspecificity_scores_FDR(FM)