gender_specific_pathways.R 8.17 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env Rscript

# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("readxl")
library("tidyverse")
library("ggplot2")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))

# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config          <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir  <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))

# ================================================================================================
# Functions
# ================================================================================================

#' @title Compute the specificity scores of enriched pathways.
#' 
#' @description This functions takes two dataframes that contain the enriched pathways of a
#' reference and a control experiments. The functions then merge the two dataframes, computes
#' a Delta score that represents the specificity of the enrichemnt in the reference experiment
#' with respect to the control experiment. It then returs the pathways and their delta scores,
#' possibly filtered to retain only the most significant pathways.
#' 
#' @param DB_ref The reference data-frame, contains at least the pathway identifiers ("ID),
#' descriptions ("Description") and adjusted P values ("p.adjust").
#' @param DB_ctrl The control data-frame, contains at least the pathway identifiers ("ID),
#' descriptions ("Description") and adjusted P values ("p.adjust").
#' @pval_thres The threshold to use to return only the pathways that reach that level of
#' significance either in the control or in the reference experiments.
#' @return A dataframe that contains the original data augmented with the Delta scores.
compute_delta <- function(DB_ref, DB_ctrl, pval_thres = 0.1) {

  # Merging both dataframes, keeping onlyh the common functional terms.
  DB <- merge(x = DB_ref, y = DB_ctrl, by = "ID") %>%
    mutate (Description = Description.x,
            pval_ref    = p.adjust.x,
            pval_ctrl    = p.adjust.y) %>% select(ID, Description, pval_ref, pval_ctrl)

  # We compute the delta (-log P based delta).
  # Delta = -log10(P value reference) + log10( P value control).
  DB$logP_ref  <- -log10(DB$pval_ref)
  DB$logP_ctrl <- -log10(DB$pval_ctrl)
  DB$Delta     <- DB$logP_ref - DB$logP_ctrl

  # We keep only pathways that are significant or almost significant at least once (either
  # in ref or in control).
  res <- DB %>% filter(pval_ref < pval_thres | pval_ctrl < pval_thres) %>% arrange(Delta)
  
  # We clean and return the selected pathways.
  rm(DB)
  return(res)
}

# ================================================================================================
# Main
# ================================================================================================

# This script aims at merging the results of functional enrichment for male and female DEGs in
# order to select the pathways that are only enriched for one of the two genders (gender specific
# pathways).
#

74
75
# We do all integration schemes one by one.
for (i in seq_len(length(config$integrations))) {
76

77
78
  # We get the integration name for that scheme.
  integration <- config$integrations[[i]]
79

80
81
  # We create a structure for all results.
  gsp_data <- c()
82
83
  male_prefix   <- paste0(integration$name, "_PDVsControl_males_")
  female_prefix <- paste0(integration$name, "_PDVsControl_females_")
84
85
86
87
  
  #
  # KEGG by CP
  #
88
89
  M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_KEGG.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_KEGG.tsv")
90
91
92
93
94
95
96
97
98
99
100
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "KEGG"))
  rm(M_filename, F_filename, M_data, F_data)

  #
  # MSIGDB by CP
  #
101
102
  M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_MSIGDB.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_MSIGDB.tsv")
103
104
105
106
107
108
109
110
111
112
113
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "MSIGDB"))
  rm(M_filename, F_filename, M_data, F_data)

  #
  # REACTOME by CP
  #
114
115
  M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_REACTOME.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_REACTOME.tsv")
116
117
118
119
120
121
122
123
124
125
126
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "REACTOME"))
  rm(M_filename, F_filename, M_data, F_data)

  #
  # GO-BP by CP
  #
127
128
  M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOBP.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOBP.tsv")
129
130
131
132
133
134
135
136
137
138
139
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "GOBP"))
  rm(M_filename, F_filename, M_data, F_data)

  #
  # GO-CC by CP
  #
140
141
  M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOCC.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOCC.tsv")
142
143
144
145
146
147
148
149
150
151
152
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "GOCC"))
  rm(M_filename, F_filename, M_data, F_data)

  #
  # GO-MF by CP
  #
153
154
  M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOMF.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOMF.tsv")
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "GOMF"))
  rm(M_filename, F_filename, M_data, F_data)


  # We do not run it yet for the ROT and PF tools since there is no obvious p-value to use (there are several).
  # We can later adapt this script for these two tools as well.

  # Save file with all specific pathways.
  ofile <- paste0(output_data_dir, integration$name, "_gender_specific_pathways.tsv")
  write.table(gsp_data, file = ofile,  sep = "\t", quote = FALSE, col.names = NA)
  rm(ofile, gsp_data)
  message(paste0("[", Sys.time(), "] [", integration$name, "] Gender specific pathways identified."))
  rm(integration, male_prefix, female_prefix)
} # End for each integration.
rm(i)
175
176
177
178

# Final cleaning.
rm(config, input_data_dir, output_data_dir)
sessionInfo()