Commit fbfefcbd authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Cosmetic changes to the integration step.

parent 3e4e2db5
......@@ -47,7 +47,7 @@ compute_overlap <- function(row) {
m_ds <- unlist(strsplit(row["male_used_datasets"], "\\|"))
overlap <- intersect(f_ds,m_ds)
local_min <- min(length(f_ds), length(m_ds))
over_perc <-length(overlap) / local_min
over_perc <- length(overlap) / local_min
over_str <- paste(overlap, collapse = "|")
return(c(over_str, length(f_ds), length(m_ds), length(overlap), over_perc))
}
......
......@@ -39,7 +39,7 @@ message(paste0("[", Sys.time(), "] Configuration done."))
#' contains the gender-specificity scores, representing the specificity to the "ref" comparison
#' (unless has_ctrl is set to FALSE).
compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
# First, we compute the Pi values for both comparisons ("ref" and "control"). For this, we use
# the nominal P values and the log fold changes. Using the corrected P values would be more
# difficult since most values would be 1. We anyway intend to use the gender specific rankings
......@@ -59,14 +59,14 @@ compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
# create the ranking for GSEA when we analyse gender DEGs (and not gender specific
# DEGs).
#
# We start by the most obvious cases for which all values are present.
# The Pi value is simply the absolute log fold change * the P value (on a log10 scale).
FM$ref_pivalue <- -log10(FM$ref_Pval) * abs(FM$ref_logFC)
if (has_ctrl) {
FM$ctrl_pivalue <- -log10(FM$ctrl_Pval) * abs(FM$ctrl_logFC)
}
# We continue with special cases, that is when P value == 1.
# These are the genes that are definitely not differentially expressed.
# Because P value == 1, we have Pi value == 0.
......@@ -83,12 +83,12 @@ compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
rm(pseudo_pval_ctrl, pseudo_pi_ctrl)
}
rm(pseudo_pval_ref, pseudo_pi_ref)
# We then aim at creating a delta that represents the difference between the differential
# expression in "ref" and in "ctrl". The comparison is done based on the rankings, which
# are themseleves based on the Pi values. We can not use the Pi values directly since they
# can be so different between the two comparisons.
# Creating the ranks and rank ratios based on the Pi values for the "ref" comparison.
# Nothing special there, we simply order and rank.
#
......@@ -105,13 +105,13 @@ compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
max_rank <- max(FM$ref_pivalue_rank)
FM$ref_pivalue_rankratio <- FM$ref_pivalue_rank / max_rank
rm(max_rank)
# In case, we have no control, the job is done at this stage.
if (!has_ctrl) {
FM$ranking_value <- FM$ref_pivalue
return(FM)
}
# Now creating the ranks and rank ratios based on the Pi values for the "ctrl" comparison.
# We will add two fields:
# ctrl_pivalue_rank:
......@@ -128,7 +128,7 @@ compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
max_rank <- max(FM$ctrl_pivalue_rank)
FM$ctrl_pivalue_rankratio <- FM$ctrl_pivalue_rank / max_rank
rm(max_rank)
# The delta is simply the difference between the "ref" and "ctrl" rank ratios, with
# rank ratios based on the respective Pi values.
# We will add one field:
......@@ -140,7 +140,71 @@ compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
# correspond to neutral genes (as differentially expressed in "ref" and
# in "ctrl").
FM$Delta <- FM$ref_pivalue_rankratio - FM$ctrl_pivalue_rankratio
# We return the enriched data-frame.
FM$gender_specific_score <- 1 - FM$Delta
return(FM)
}
# To do the ranking stuff but based on FDR instead of pi.
compute_genderspecificity_scores_FDR <- function(FM, has_ctrl = TRUE) {
# We then aim at creating a delta that represents the difference between the differential
# expression in "ref" and in "ctrl". The comparison is done based on the rankings, which
# are themseleves based on the FDR values.
# Creating the ranks and rank ratios based on the P values for the "ref" comparison.
# Nothing special there, we simply order and rank.
#
# We will add two fields:
# ref_pvalue_rank:
# The rank of the genes based on their P values (for the "ref" comparison).
# Values are ~[1, 22000]. Small ranks correspond to small P values (and therefore
# significant DEGs).
# ref_pvalue_rankratio:
# The rank ratios of the genes based on the P value ranks (for the "ref"
# comparison). Values are [0, 1]. Small ratios correspond to small P values
# (and therefore significant DEGs).
FM$ref_pvalue_rank <- rank(FM$ref_adj_Pval, ties.method = "average")
max_rank <- max(FM$ref_pvalue_rank)
FM$ref_pvalue_rankratio <- FM$ref_pvalue_rank / max_rank
rm(max_rank)
# In case, we have no control, the job is done at this stage.
if (!has_ctrl) {
FM$ranking_value <- FM$ref_pvalue
return(FM)
}
# Now creating the ranks and rank ratios based on the P values for the "ctrl" comparison.
# We will add two fields:
# ctrl_pvalue_rank:
# The rank of the genes based on their P values (for the "ctrl" comparison).
# Values are ~[1, 22000]. Small ranks correspond to small P values (and therefore
# significant DEGs). High ranks correspond to non differentially expressed
# genes (P value == 1).
# ctrl_pvalue_rankratio:
# The rank ratios of the genes based on the P value ranks (for the "ctrl"
# comparison). Values are [0, 1]. Small ratios correspond to small P values
# (and therefore significant DEGs). High ratios correspond to non
# differentially expressed genes (P value == 1).
FM$ctrl_pvalue_rank <- rank(FM$ctrl_adj_Pval, ties.method = "average")
max_rank <- max(FM$ctrl_pvalue_rank)
FM$ctrl_pvalue_rankratio <- FM$ctrl_pvalue_rank / max_rank
rm(max_rank)
# The delta is simply the difference between the "ref" and "ctrl" rank ratios, with
# rank ratios based on the respective P values.
# We will add one field:
# Delta:
# The difference between the "ref" and "ctrl" P based rank ratios.
# Values are [-1, 1]. Negative values correspond to "ref" specific genes
# Positive values correspond non "ref" specific genes, including but not
# limited to "ctrl" specific genes, values around 0 are in the middle and
# correspond to neutral genes (as differentially expressed in "ref" and
# in "ctrl").
FM$Delta <- FM$ref_pvalue_rankratio - FM$ctrl_pvalue_rankratio
# We return the enriched data-frame.
FM$gender_specific_score <- 1 - FM$Delta
return(FM)
......@@ -191,10 +255,12 @@ for (i in seq_len(length(config$integrations))) {
# We start by merging the male and female rankings. Most genes are present in both and the few
# that are not are removed (since we can not say whether they are gender specific or not).
## FM <- merge(x = F, y = Mr, by = "SYMBOL", all.x = TRUE)
## MF <- merge(x = M, y = Fr, by = "SYMBOL", all.x = TRUE)
## FM <- merge(x = F, y = Mr, by = "SYMBOL", all.x = TRUE)
## MF <- merge(x = M, y = Fr, by = "SYMBOL", all.x = TRUE)
FM <- merge(x = Fr, y = Mr, by = "SYMBOL")
MF <- merge(x = Mr, y = Fr, by = "SYMBOL")
## FM <- merge(x = F, y = M, by = "SYMBOL", all.x = TRUE)
## MF <- merge(x = M, y = F, by = "SYMBOL", all.x = TRUE)
rm(Fr, Mr, F, M)
# We select the fields we need:
......@@ -242,6 +308,10 @@ for (i in seq_len(length(config$integrations))) {
MF_enriched <- compute_genderspecificity_scores(MF)
rm(FM, MF)
FM_enriched_FDR <- compute_genderspecificity_scores_FDR(FM)
MF_enriched_FDR <- compute_genderspecificity_scores_FDR(MF)
# We save the full data.
FM_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[5]]$name,
"_max-avg_genderspecificityscore_fulldata.tsv")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment