gender_specific_pathways.R 8.23 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!/usr/bin/env Rscript

# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("readxl")
library("tidyverse")
library("ggplot2")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))

# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config          <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir  <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))

# ================================================================================================
# Functions
# ================================================================================================

#' @title Compute the specificity scores of enriched pathways.
28
#'
29
30
31
32
33
#' @description This functions takes two dataframes that contain the enriched pathways of a
#' reference and a control experiments. The functions then merge the two dataframes, computes
#' a Delta score that represents the specificity of the enrichemnt in the reference experiment
#' with respect to the control experiment. It then returs the pathways and their delta scores,
#' possibly filtered to retain only the most significant pathways.
34
#'
35
36
37
38
39
40
41
42
43
44
45
#' @param DB_ref The reference data-frame, contains at least the pathway identifiers ("ID),
#' descriptions ("Description") and adjusted P values ("p.adjust").
#' @param DB_ctrl The control data-frame, contains at least the pathway identifiers ("ID),
#' descriptions ("Description") and adjusted P values ("p.adjust").
#' @pval_thres The threshold to use to return only the pathways that reach that level of
#' significance either in the control or in the reference experiments.
#' @return A dataframe that contains the original data augmented with the Delta scores.
compute_delta <- function(DB_ref, DB_ctrl, pval_thres = 0.1) {

  # Merging both dataframes, keeping onlyh the common functional terms.
  DB <- merge(x = DB_ref, y = DB_ctrl, by = "ID") %>%
46
47
48
49
    mutate(Description = Description.x,
           pval_ref    = p.adjust.x,
           pval_ctrl    = p.adjust.y) %>%
    select(ID, Description, pval_ref, pval_ctrl)
50
51
52
53
54
55
56
57
58
59

  # We compute the delta (-log P based delta).
  # Delta = -log10(P value reference) + log10( P value control).
  DB$logP_ref  <- -log10(DB$pval_ref)
  DB$logP_ctrl <- -log10(DB$pval_ctrl)
  DB$Delta     <- DB$logP_ref - DB$logP_ctrl

  # We keep only pathways that are significant or almost significant at least once (either
  # in ref or in control).
  res <- DB %>% filter(pval_ref < pval_thres | pval_ctrl < pval_thres) %>% arrange(Delta)
60

61
62
63
64
65
66
67
68
69
70
71
72
73
74
  # We clean and return the selected pathways.
  rm(DB)
  return(res)
}

# ================================================================================================
# Main
# ================================================================================================

# This script aims at merging the results of functional enrichment for male and female DEGs in
# order to select the pathways that are only enriched for one of the two genders (gender specific
# pathways).
#

75
76
# We do all integration schemes one by one.
for (i in seq_len(length(config$integrations))) {
77

78
79
  # We get the integration name for that scheme.
  integration <- config$integrations[[i]]
80

81
82
  # We create a structure for all results.
  gsp_data <- c()
83
84
  male_prefix   <- paste0(integration$name, "_PDVsControl_males_")
  female_prefix <- paste0(integration$name, "_PDVsControl_females_")
85

86
87
88
  #
  # KEGG by CP
  #
89
90
  M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_KEGG.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_KEGG.tsv")
91
92
93
94
95
96
97
98
99
100
101
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "KEGG"))
  rm(M_filename, F_filename, M_data, F_data)

  #
  # MSIGDB by CP
  #
102
103
  M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_MSIGDB.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_MSIGDB.tsv")
104
105
106
107
108
109
110
111
112
113
114
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "MSIGDB"))
  rm(M_filename, F_filename, M_data, F_data)

  #
  # REACTOME by CP
  #
115
116
117
118
  M_filename <- paste0(output_data_dir, "CP_", male_prefix,
                       "max-avg_gs_pivalue_gsea_REACTOME.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix,
                       "max-avg_gs_pivalue_gsea_REACTOME.tsv")
119
120
121
122
123
124
125
126
127
128
129
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "REACTOME"))
  rm(M_filename, F_filename, M_data, F_data)

  #
  # GO-BP by CP
  #
130
131
  M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOBP.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOBP.tsv")
132
133
134
135
136
137
138
139
140
141
142
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "GOBP"))
  rm(M_filename, F_filename, M_data, F_data)

  #
  # GO-CC by CP
  #
143
144
  M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOCC.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOCC.tsv")
145
146
147
148
149
150
151
152
153
154
155
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "GOCC"))
  rm(M_filename, F_filename, M_data, F_data)

  #
  # GO-MF by CP
  #
156
157
  M_filename <- paste0(output_data_dir, "CP_", male_prefix, "max-avg_gs_pivalue_gsea_GOMF.tsv")
  F_filename <- paste0(output_data_dir, "CP_", female_prefix, "max-avg_gs_pivalue_gsea_GOMF.tsv")
158
159
160
161
162
163
164
165
166
  M_data  <- read.delim(M_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  F_data  <- read.delim(F_filename, row.names = 1, stringsAsFactors = FALSE) %>%
    select(ID, Description, p.adjust)
  gsp_data <- rbind(gsp_data, compute_delta(M_data, F_data) %>%
                      mutate(tool = "CP", source = "GOMF"))
  rm(M_filename, F_filename, M_data, F_data)


167
168
  # We do not run it yet for the ROT and PF tools since there is no obvious p-value to
  # use (there are several). We can later adapt this script for these two tools as well.
169
170
171
172
173

  # Save file with all specific pathways.
  ofile <- paste0(output_data_dir, integration$name, "_gender_specific_pathways.tsv")
  write.table(gsp_data, file = ofile,  sep = "\t", quote = FALSE, col.names = NA)
  rm(ofile, gsp_data)
174
175
  message(paste0("[", Sys.time(), "] [", integration$name,
                 "] Gender specific pathways identified."))
176
177
178
  rm(integration, male_prefix, female_prefix)
} # End for each integration.
rm(i)
179
180
181
182

# Final cleaning.
rm(config, input_data_dir, output_data_dir)
sessionInfo()