Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
GeneDER_core
Commits
243303b3
Commit
243303b3
authored
Sep 23, 2020
by
Leon-Charles Tranchevent
Browse files
Cosmetic changes on the enrichment procedure (mostly refactoring file names).
parent
cebef2d7
Changes
6
Hide whitespace changes
Inline
Side-by-side
17-Enrichment_all/CP_enrich.sh
View file @
243303b3
...
...
@@ -3,7 +3,7 @@
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
0
#SBATCH -n 2
5
#SBATCH --time=0-06:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
...
...
17-Enrichment_all/Makefile
View file @
243303b3
...
...
@@ -15,7 +15,7 @@ enrich:
@
sbatch
${CODE_FOLDER}
ROT_enrich.sh
pfenrich
:
@
/bin/bash
${CODE_FOLDER}
PF_enrich.sh
merge
:
@
sbatch
${CODE_FOLDER}
merge.sh
#
merge:
#
@sbatch ${CODE_FOLDER}merge.sh
gsp
:
@
sbatch
${CODE_FOLDER}
gender_specific_pathways.sh
17-Enrichment_all/gender_specific_pathways.R
View file @
243303b3
...
...
@@ -79,14 +79,14 @@ for (i in seq_len(length(config$integrations))) {
# We create a structure for all results.
gsp_data
<-
c
()
male_prefix
<-
paste0
(
integration
$
name
,
"_PDVsControl_males_"
)
female_prefix
<-
paste0
(
integration
$
name
,
"_PDVsControl_females_"
)
male_prefix
<-
paste0
(
integration
$
name
,
"_
VSN_
PDVsControl_males_"
)
female_prefix
<-
paste0
(
integration
$
name
,
"_
VSN_
PDVsControl_females_"
)
#
# KEGG by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"gsea_KEGG.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"gsea_KEGG.tsv"
)
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"
max-avg_gs_pivalue_
gsea_KEGG.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"
max-avg_gs_pivalue_
gsea_KEGG.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
...
...
@@ -98,8 +98,8 @@ for (i in seq_len(length(config$integrations))) {
#
# MSIGDB by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"gsea_MSIGDB.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"gsea_MSIGDB.tsv"
)
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"
max-avg_gs_pivalue_
gsea_MSIGDB.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"
max-avg_gs_pivalue_
gsea_MSIGDB.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
...
...
@@ -111,8 +111,8 @@ for (i in seq_len(length(config$integrations))) {
#
# REACTOME by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"gsea_REACTOME.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"gsea_REACTOME.tsv"
)
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"
max-avg_gs_pivalue_
gsea_REACTOME.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"
max-avg_gs_pivalue_
gsea_REACTOME.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
...
...
@@ -124,8 +124,8 @@ for (i in seq_len(length(config$integrations))) {
#
# GO-BP by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"gsea_GOBP.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"gsea_GOBP.tsv"
)
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"
max-avg_gs_pivalue_
gsea_GOBP.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"
max-avg_gs_pivalue_
gsea_GOBP.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
...
...
@@ -137,8 +137,8 @@ for (i in seq_len(length(config$integrations))) {
#
# GO-CC by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"gsea_GOCC.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"gsea_GOCC.tsv"
)
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"
max-avg_gs_pivalue_
gsea_GOCC.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"
max-avg_gs_pivalue_
gsea_GOCC.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
...
...
@@ -150,8 +150,8 @@ for (i in seq_len(length(config$integrations))) {
#
# GO-MF by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"gsea_GOMF.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"gsea_GOMF.tsv"
)
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"
max-avg_gs_pivalue_
gsea_GOMF.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"
max-avg_gs_pivalue_
gsea_GOMF.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
...
...
@@ -163,8 +163,8 @@ for (i in seq_len(length(config$integrations))) {
#
# MSIGDB by G2P
#
M_filename
<-
paste0
(
output_data_dir
,
"G2P_"
,
male_prefix
,
"MSIGDB.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"G2P_"
,
female_prefix
,
"MSIGDB.tsv"
)
M_filename
<-
paste0
(
output_data_dir
,
"G2P_"
,
male_prefix
,
"
max-avg_gs_pivalue_
MSIGDB.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"G2P_"
,
female_prefix
,
"
max-avg_gs_pivalue_
MSIGDB.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
M_data
$
adj_pv
=
M_data
$
pv
*
dim
(
M_data
)[
1
]
M_data
$
adj_pv
[
M_data
$
adj_pv
>
1
]
<-
1
...
...
17-Enrichment_all/gender_specific_pathways.sh
View file @
243303b3
...
...
@@ -24,6 +24,7 @@ module load lang/R/3.6.0-foss-2019a-bare
# Actual jobs
Rscript
--vanilla
${
CODE_FOLDER
}
/gender_specific_pathways.R
>
${
OUTPUT_FOLDER
}
/gsp_log.out 2>
${
OUTPUT_FOLDER
}
/gsp_log.err
Rscript
--vanilla
${
CODE_FOLDER
}
/pathways_with_gender_specific_genes.R
>
${
OUTPUT_FOLDER
}
/pgsg_log.out 2>
${
OUTPUT_FOLDER
}
/pgsg_log.err
# Moving the slurm log file to data
mv
${
CODE_FOLDER
}
/slurm-
*
out
${
OUTPUT_FOLDER
}
/
17-Enrichment_all/pathways_with_gender_specific_genes.R
0 → 100644
View file @
243303b3
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library
(
"yaml"
)
library
(
"readxl"
)
library
(
"tidyverse"
)
library
(
"ggplot2"
)
source
(
"../libs/conf/confR.R"
)
source
(
"../libs/utils/utils.R"
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Libraries loaded."
))
# ================================================================================================
# Configuration
# ================================================================================================
options
(
bitmapType
=
"cairo"
)
config
<-
read_config
(
config_dirs
=
c
(
"../Confs/"
,
"./"
))
output_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_data_dir
)
input_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_input_data_dir
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Configuration done."
))
# ================================================================================================
# Functions
# ================================================================================================
get_overlap
<-
function
(
row
)
{
genes_ref
<-
str_split
(
row
[
5
],
"/"
)[[
1
]]
genes_ctrl
<-
str_split
(
row
[
6
],
"/"
)[[
1
]]
overlap
<-
length
(
intersect
(
genes_ref
,
genes_ctrl
))
/
min
(
length
(
genes_ref
),
length
(
genes_ctrl
))
return
(
overlap
)
}
#' @title Compute the specificity scores of enriched pathways.
#'
#' @description This functions takes two dataframes that contain the enriched pathways of a
#' reference and a control experiments. The functions then merge the two dataframes, computes
#' a Delta score that represents the specificity of the enrichemnt in the reference experiment
#' with respect to the control experiment. It then returs the pathways and their delta scores,
#' possibly filtered to retain only the most significant pathways.
#'
#' @param DB_ref The reference data-frame, contains at least the pathway identifiers ("ID),
#' descriptions ("Description") and adjusted P values ("p.adjust").
#' @param DB_ctrl The control data-frame, contains at least the pathway identifiers ("ID),
#' descriptions ("Description") and adjusted P values ("p.adjust").
#' @pval_thres The threshold to use to return only the pathways that reach that level of
#' significance either in the control or in the reference experiments.
#' @return A dataframe that contains the original data augmented with the Delta scores.
compute_overlap
<-
function
(
DB_ref
,
DB_ctrl
,
pval_thres
=
0.1
)
{
# Merging both dataframes, keeping onlyh the common functional terms.
DB
<-
merge
(
x
=
DB_ref
,
y
=
DB_ctrl
,
by
=
"ID"
)
%>%
mutate
(
Description
=
Description.x
,
pval_ref
=
p.adjust.x
,
pval_ctrl
=
p.adjust.y
,
genes_ref
=
core_enrichment.x
,
genes_ctrl
=
core_enrichment.y
)
%>%
select
(
ID
,
Description
,
pval_ref
,
pval_ctrl
,
genes_ref
,
genes_ctrl
)
# We compute the overlap.
DB
$
Overlap
<-
apply
(
DB
,
1
,
get_overlap
)
# We keep only pathways that are significant or almost significant at least once (either
# in ref or in control).
res
<-
DB
%>%
filter
(
pval_ref
<
pval_thres
|
pval_ctrl
<
pval_thres
)
%>%
arrange
(
Overlap
)
# We clean and return the selected pathways.
rm
(
DB
)
return
(
res
)
}
# ================================================================================================
# Main
# ================================================================================================
# This script aims at merging the results of functional enrichment for male and female DEGs in
# order to select the pathways that are only enriched for one of the two genders (gender specific
# pathways).
#
# We do all integration schemes one by one.
for
(
i
in
seq_len
(
length
(
config
$
integrations
)))
{
# We get the integration name for that scheme.
integration
<-
config
$
integrations
[[
i
]]
# We create a structure for all results.
gsp_data
<-
c
()
male_prefix
<-
paste0
(
integration
$
name
,
"_VSN_PDVsControl_males_"
)
female_prefix
<-
paste0
(
integration
$
name
,
"_VSN_PDVsControl_females_"
)
#
# KEGG by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"max-avg_all_pivalue_gsea_KEGG.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"max-avg_all_pivalue_gsea_KEGG.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
gsp_data
<-
rbind
(
gsp_data
,
compute_overlap
(
M_data
,
F_data
)
%>%
mutate
(
tool
=
"CP"
,
source
=
"KEGG"
))
rm
(
M_filename
,
F_filename
,
M_data
,
F_data
)
#
# MSIGDB by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"max-avg_all_pivalue_gsea_MSIGDB.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"max-avg_all_pivalue_gsea_MSIGDB.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
gsp_data
<-
rbind
(
gsp_data
,
compute_overlap
(
M_data
,
F_data
)
%>%
mutate
(
tool
=
"CP"
,
source
=
"MSIGDB"
))
rm
(
M_filename
,
F_filename
,
M_data
,
F_data
)
#
# REACTOME by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"max-avg_all_pivalue_gsea_REACTOME.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"max-avg_all_pivalue_gsea_REACTOME.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
gsp_data
<-
rbind
(
gsp_data
,
compute_overlap
(
M_data
,
F_data
)
%>%
mutate
(
tool
=
"CP"
,
source
=
"REACTOME"
))
rm
(
M_filename
,
F_filename
,
M_data
,
F_data
)
#
# GO-BP by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"max-avg_all_pivalue_gsea_GOBP.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"max-avg_all_pivalue_gsea_GOBP.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
gsp_data
<-
rbind
(
gsp_data
,
compute_overlap
(
M_data
,
F_data
)
%>%
mutate
(
tool
=
"CP"
,
source
=
"GOBP"
))
rm
(
M_filename
,
F_filename
,
M_data
,
F_data
)
#
# GO-CC by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"max-avg_all_pivalue_gsea_GOCC.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"max-avg_all_pivalue_gsea_GOCC.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
gsp_data
<-
rbind
(
gsp_data
,
compute_overlap
(
M_data
,
F_data
)
%>%
mutate
(
tool
=
"CP"
,
source
=
"GOCC"
))
rm
(
M_filename
,
F_filename
,
M_data
,
F_data
)
#
# GO-MF by CP
#
M_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
male_prefix
,
"max-avg_all_pivalue_gsea_GOMF.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"CP_"
,
female_prefix
,
"max-avg_all_pivalue_gsea_GOMF.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
%>%
select
(
ID
,
Description
,
p.adjust
,
core_enrichment
)
gsp_data
<-
rbind
(
gsp_data
,
compute_overlap
(
M_data
,
F_data
)
%>%
mutate
(
tool
=
"CP"
,
source
=
"GOMF"
))
rm
(
M_filename
,
F_filename
,
M_data
,
F_data
)
# Save file with all specific pathways.
ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_pathways_with_gender_specific_genes.tsv"
)
write.table
(
gsp_data
,
file
=
ofile
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
rm
(
ofile
,
gsp_data
)
message
(
paste0
(
"["
,
Sys.time
(),
"] ["
,
integration
$
name
,
"] Pathways with gender specific genes identified."
))
rm
(
integration
,
male_prefix
,
female_prefix
)
}
# End for each integration.
rm
(
i
)
# Final cleaning.
rm
(
config
,
input_data_dir
,
output_data_dir
)
sessionInfo
()
Confs/project_config.yml
View file @
243303b3
...
...
@@ -36,7 +36,7 @@ limma_analyses:
coefficient
:
"
PD
-
Control"
name
:
"
PDVsControl"
clinical_factor
:
"
Disease_status"
use_for_network
:
"
TRU
E"
use_for_network
:
"
FALS
E"
-
factor
:
gender_disease_status
coefficient
:
"
F.PD
-
M.PD"
...
...
@@ -66,7 +66,7 @@ limma_analyses:
coefficient
:
"
(F.PD
-
F.Control)
-
(M.PD
-
M.Control)"
name
:
"
Gender_disease_status"
clinical_factor
:
"
Gender_disease_status"
use_for_network
:
"
TRU
E"
use_for_network
:
"
FALS
E"
# Integration schemes
nb_min_pval
:
2
perc_min_pval
:
0.33334
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment