Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
GeneDER_core
Commits
67e7de53
Commit
67e7de53
authored
Jun 22, 2020
by
Leon-Charles Tranchevent
Browse files
New combined scores to integrate both the pi values and the gender-specificity scores.
parent
fdf6b966
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
16-Data_integration_all/extra_figures.R
View file @
67e7de53
...
...
@@ -363,10 +363,8 @@ rm(m)
#for (gene in config$genes_to_plot) {#}
# DEBUG
gene
<-
"RFC5"
##gene_datasets <- c("GSE20159", "GSE20163", "GSE20292", "GSE26927", "GSE49036", "GSE8397", "NBB")
gene_datasets
<-
c
(
"GSE20163"
)
gene
<-
"XIST"
gene_datasets
<-
c
(
"GSE20159"
,
"GSE20163"
,
"GSE20292"
,
"GSE26927"
,
"GSE49036"
,
"GSE8397"
,
"NBB"
)
plot_gene
(
gene
=
gene
,
gene_datasets
=
gene_datasets
,
...
...
16-Data_integration_all/merge_and_filter_rankings.R
View file @
67e7de53
This diff is collapsed.
Click to expand it.
17-Enrichment_all/CP_enrich.R
View file @
67e7de53
...
...
@@ -302,6 +302,78 @@ plot_top_gsea_hits <- function(gsea,
}
}
#' @title Reading a file with LFC and P values and computes the pi values.
#'
#' @description This functions reads a results files given an integration scheme, a limma
#' tag and a file tag. It then compute the pi values, mapped the official symbols to
#' EntrezGene identifiers.
#'
#' @param integration The integration scheme to use (SN, DA, SNage).
#' @param limma_tag The name of the limma analysis to use.
#' @param file_tag The name of the file tag to use.
#' @param negative A boolean indicating whether the sign of the pi values should be inversed.
#' @return A dataframe with the gene official symbols, the EntrezGene identifiers (can be NA),
#' the log fold-change, P values and pi values.
get_pi_values
<-
function
(
integration
,
limma_tag
,
file_tag
,
negative
=
FALSE
)
{
# We define the I/Os.
analysis_prefix
<-
paste0
(
integration
$
name
,
"_VSN_"
,
limma_tag
,
"_max-avg_"
,
file_tag
)
ofile_prefix
<-
paste0
(
"CP_"
,
analysis_prefix
,
"_"
)
ranking_file
<-
paste0
(
output_data_dir
,
analysis_prefix
,
"_rankings.tsv"
)
# We read the prepared ranking and keep only the value used to rank.
ranking
<-
read.delim
(
ranking_file
,
stringsAsFactors
=
FALSE
)
rm
(
ranking_file
)
# We clean genes (no duplicate entries with pipe separated gene symbols).
# Only if necessary (that is if we have at least one pipe).
ranking_clean
<-
NULL
if
(
sum
(
grepl
(
"\\|"
,
ranking
$
Gene
))
>
0
)
{
tmp_res
<-
t
(
apply
(
ranking
,
1
,
duplicate_row_first_cell
))
ranking_clean
<-
data.frame
(
do.call
(
rbind
,
c
(
tmp_res
)),
row.names
=
NULL
,
stringsAsFactors
=
FALSE
)
ranking_clean
$
log_fold_change
<-
as.numeric
(
ranking_clean
$
log_fold_change
)
ranking_clean
$
P_value
<-
as.numeric
(
ranking_clean
$
P_value
)
rm
(
tmp_res
)
}
else
{
ranking_clean
<-
ranking
ranking_clean
$
P_value
<-
as.numeric
(
ranking_clean
$
P_value
)
ranking_clean
$
log_fold_change
<-
as.numeric
(
ranking_clean
$
log_fold_change
)
}
rm
(
ranking
)
# We then add the pi values, retracting a small epsilon to the P values when they are equal
# to 1, as to avoid ties (which are therefore separated via the associated log fold-change).
ranking_clean
<-
ranking_clean
%>%
mutate
(
pi_value
=
-
log10
(
P_value
)
*
abs
(
log_fold_change
))
pseudo_pval_ref
<-
0.1
*
(
9
+
max
(
ranking_clean
$
P_value
[
ranking_clean
$
P_value
<
1
]))
pseudo_pi_ref
<-
-
log10
(
pseudo_pval_ref
)
*
abs
(
ranking_clean
$
log_fold_change
[
ranking_clean
$
pi_value
==
0
])
ranking_clean
$
pi_value
[
ranking_clean
$
pi_value
==
0
]
<-
pseudo_pi_ref
if
(
negative
)
{
ranking_clean
$
pi_value
<-
-
ranking_clean
$
pi_value
}
rm
(
pseudo_pval_ref
,
pseudo_pi_ref
)
# We map the gene symbols to the EntrezGene identifiers.
mapped_egenes
<-
bitr
(
ranking_clean
$
Gene
,
fromType
=
"SYMBOL"
,
toType
=
c
(
"ENTREZID"
),
OrgDb
=
org.Hs.eg.db
)
#nolint
ranking_final
<-
merge
(
x
=
ranking_clean
,
y
=
mapped_egenes
,
by.x
=
"Gene"
,
by.y
=
"SYMBOL"
,
all.x
=
TRUE
)
%>%
mutate
(
EGene
=
ENTREZID
)
%>%
select
(
Gene
,
EGene
,
log_fold_change
,
P_value
,
pi_value
)
ranking_final
$
log_fold_change
<-
as.numeric
(
ranking_final
$
log_fold_change
)
ranking_final
$
P_value
<-
as.numeric
(
ranking_final
$
P_value
)
ranking_final
$
pi_value
<-
as.numeric
(
ranking_final
$
pi_value
)
ranking_final
<-
ranking_final
%>%
arrange
(
desc
(
pi_value
))
rm
(
mapped_egenes
,
ranking_clean
)
return
(
ranking_final
)
}
# ================================================================================================
# Main
# ================================================================================================
...
...
@@ -316,139 +388,91 @@ for (i in seq_len(length(config$integrations))) {
if
(
integration
$
use_for_enrichment
==
"FALSE"
)
{
next
}
# We do all limma comparisons one by one.
for
(
j
in
seq_len
(
length
(
config
$
limma_analyses
)))
{
# We extract the Limma parameters.
limma_parameters
<-
config
$
limma_analyses
[[
j
]]
# We only do the enrichment if it is necessary.
if
(
limma_parameters
$
use_for_enrichment
==
"FALSE"
)
{
next
# For each defined strategy
for
(
j
in
seq_len
(
length
(
config
$
enrichment_strategies
)))
{
# We get the integration name for that scheme.
enrichment_strategy
<-
config
$
enrichment_strategies
[[
j
]]
# I/Os
analysis_prefix
<-
paste0
(
integration
$
name
,
"_VSN_"
,
enrichment_strategy
$
limmas
[
1
],
"_max-avg_"
,
enrichment_strategy
$
file_tags
[
1
])
ofile_prefix
<-
paste0
(
"CP_"
,
analysis_prefix
,
"_"
)
# We get the pi values of the two files
ranking_final
<-
get_pi_values
(
integration
,
enrichment_strategy
$
limmas
[
1
],
enrichment_strategy
$
file_tags
[
1
],
negative
=
FALSE
)
if
(
!
is.na
(
enrichment_strategy
$
limmas
[
2
]))
{
ranking_final_neg
<-
get_pi_values
(
integration
,
enrichment_strategy
$
limmas
[
2
],
enrichment_strategy
$
file_tags
[
2
],
negative
=
TRUE
)
ranking_final
<-
rbind
(
ranking_final
,
ranking_final_neg
)
}
for
(
tag
in
limma_parameters
$
enrichment_file_tags
)
{
# We define the I/Os.
analysis_prefix
<-
paste0
(
integration
$
name
,
"_VSN_"
,
limma_parameters
$
name
,
"_max-avg_"
,
tag
)
ofile_prefix
<-
paste0
(
"CP_"
,
analysis_prefix
,
"_"
)
ranking_file
<-
paste0
(
output_data_dir
,
analysis_prefix
,
"_rankings.tsv"
)
# We read the prepared ranking and keep only the value used to rank.
ranking
<-
read.delim
(
ranking_file
,
stringsAsFactors
=
FALSE
)
rm
(
ranking_file
)
# We clean genes (no duplicate entries with pipe separated gene symbols).
# Only if necessary (that is if we have at least one pipe).
ranking_clean
<-
NULL
if
(
sum
(
grepl
(
"\\|"
,
ranking
$
Gene
))
>
0
)
{
tmp_res
<-
t
(
apply
(
ranking
,
1
,
duplicate_row_first_cell
))
ranking_clean
<-
data.frame
(
do.call
(
rbind
,
c
(
tmp_res
)),
row.names
=
NULL
,
stringsAsFactors
=
FALSE
)
ranking_clean
$
log_fold_change
<-
as.numeric
(
ranking_clean
$
log_fold_change
)
ranking_clean
$
P_value
<-
as.numeric
(
ranking_clean
$
P_value
)
rm
(
tmp_res
)
}
else
{
ranking_clean
<-
ranking
ranking_clean
$
P_value
<-
as.numeric
(
ranking_clean
$
P_value
)
ranking_clean
$
log_fold_change
<-
as.numeric
(
ranking_clean
$
log_fold_change
)
# We prepare the gene id specific rankings (using either gene names or entrezgene ids).
ranking_final_eg
<-
ranking_final
%>%
filter
(
!
is.na
(
EGene
))
ranking_eg
<-
ranking_final_eg
$
pi_value
ranking_sy
<-
ranking_final
$
pi_value
logFC_eg
<-
ranking_final_eg
$
log_fold_change
logFC_sy
<-
ranking_final
$
log_fold_change
names
(
ranking_eg
)
<-
ranking_final_eg
$
EGene
names
(
ranking_sy
)
<-
ranking_final
$
Gene
names
(
logFC_eg
)
<-
ranking_final_eg
$
EGene
names
(
logFC_sy
)
<-
ranking_final
$
Gene
ranking_eg
<-
ranking_eg
[
!
duplicated
(
names
(
ranking_eg
))]
ranking_sy
<-
ranking_sy
[
!
duplicated
(
names
(
ranking_sy
))]
logFC_eg
<-
logFC_eg
[
!
duplicated
(
names
(
logFC_eg
))]
logFC_sy
<-
logFC_sy
[
!
duplicated
(
names
(
logFC_sy
))]
rm
(
ranking_final
,
ranking_final_eg
)
# We loop over the functional sources.
for
(
k
in
seq_len
(
length
(
config
$
functional_sources
)))
{
# We define the current source.
func_source
<-
config
$
functional_sources
[[
k
]]
if
(
is.null
(
func_source
$
cp_name
)
||
func_source
$
cp_name
==
""
)
{
next
}
rm
(
ranking
)
# We then add the pi values, retracting a small epsilon to the P values when they are equal
# to 1, as to avoid ties (which are therefore separated via the associated log fold-change).
ranking_clean
<-
ranking_clean
%>%
mutate
(
pi_value
=
-
log10
(
P_value
)
*
abs
(
log_fold_change
))
pseudo_pval_ref
<-
0.1
*
(
9
+
max
(
ranking_clean
$
P_value
[
ranking_clean
$
P_value
<
1
]))
pseudo_pi_ref
<-
-
log10
(
pseudo_pval_ref
)
*
abs
(
ranking_clean
$
log_fold_change
[
ranking_clean
$
pi_value
==
0
])
ranking_clean
$
pi_value
[
ranking_clean
$
pi_value
==
0
]
<-
pseudo_pi_ref
rm
(
pseudo_pval_ref
,
pseudo_pi_ref
)
# We map the gene symbols to the EntrezGene identifiers.
mapped_egenes
<-
bitr
(
ranking_clean
$
Gene
,
fromType
=
"SYMBOL"
,
toType
=
c
(
"ENTREZID"
),
OrgDb
=
org.Hs.eg.db
)
#nolint
ranking_final
<-
merge
(
x
=
ranking_clean
,
y
=
mapped_egenes
,
by.x
=
"Gene"
,
by.y
=
"SYMBOL"
,
all.x
=
TRUE
)
%>%
mutate
(
EGene
=
ENTREZID
)
%>%
select
(
Gene
,
EGene
,
log_fold_change
,
P_value
,
pi_value
)
ranking_final
$
log_fold_change
<-
as.numeric
(
ranking_final
$
log_fold_change
)
ranking_final
$
P_value
<-
as.numeric
(
ranking_final
$
P_value
)
ranking_final
$
pi_value
<-
as.numeric
(
ranking_final
$
pi_value
)
ranking_final
<-
ranking_final
%>%
arrange
(
desc
(
pi_value
))
ranking_final_eg
<-
ranking_final
%>%
filter
(
!
is.na
(
EGene
))
rm
(
mapped_egenes
,
ranking_clean
)
# We prepare the gene id specific rankings (using either gene names or entrezgene ids).
ranking_eg
<-
ranking_final_eg
$
pi_value
ranking_sy
<-
ranking_final
$
pi_value
logFC_eg
<-
ranking_final_eg
$
log_fold_change
logFC_sy
<-
ranking_final
$
log_fold_change
names
(
ranking_eg
)
<-
ranking_final_eg
$
EGene
names
(
ranking_sy
)
<-
ranking_final
$
Gene
names
(
logFC_eg
)
<-
ranking_final_eg
$
EGene
names
(
logFC_sy
)
<-
ranking_final
$
Gene
ranking_eg
<-
ranking_eg
[
!
duplicated
(
names
(
ranking_eg
))]
ranking_sy
<-
ranking_sy
[
!
duplicated
(
names
(
ranking_sy
))]
logFC_eg
<-
logFC_eg
[
!
duplicated
(
names
(
logFC_eg
))]
logFC_sy
<-
logFC_sy
[
!
duplicated
(
names
(
logFC_sy
))]
rm
(
ranking_final
,
ranking_final_eg
)
# We loop over the functional sources.
for
(
k
in
seq_len
(
length
(
config
$
functional_sources
)))
{
# We define the current source.
func_source
<-
config
$
functional_sources
[[
k
]]
if
(
is.null
(
func_source
$
cp_name
)
||
func_source
$
cp_name
==
""
)
{
next
}
# We define the reference genes (depending on the source).
ranking
<-
NULL
logFC
<-
NULL
if
(
func_source
$
cp_gene_id_type
==
"entrezgene"
)
{
ranking
<-
ranking_eg
logFC
<-
logFC_eg
}
else
if
(
func_source
$
cp_gene_id_type
==
"symbol"
)
{
ranking
<-
ranking_sy
logFC
<-
logFC_sy
}
# We run the GSEA itself.
gsea_pi
<-
perform_gsea
(
ranking
,
type
=
func_source
$
cp_name
,
file_dir
=
output_data_dir
,
file_prefix
=
ofile_prefix
,
simplify
=
FALSE
)
# We plot all figures for the gsea (PVAL * FC).
plot_enrichment
(
gsea_pi
,
sign
(
logFC
),
ont_tag
=
func_source
$
cp_name
,
type_tag
=
"gsea"
,
file_dir
=
output_data_dir
,
file_prefix
=
ofile_prefix
)
# We define the reference genes (depending on the source).
ranking
<-
NULL
logFC
<-
NULL
if
(
func_source
$
cp_gene_id_type
==
"entrezgene"
)
{
ranking
<-
ranking_eg
logFC
<-
logFC_eg
}
else
if
(
func_source
$
cp_gene_id_type
==
"symbol"
)
{
ranking
<-
ranking_sy
logFC
<-
logFC_sy
}
# For GSEA, in addition, we plot the top 10 hits individually.
plot_top_gsea_hits
(
gsea_pi
,
n
=
10
,
ont_tag
=
func_source
$
cp_name
,
file_dir
=
output_data_dir
,
file_prefix
=
ofile_prefix
,
file_tag
=
ofile_prefix
)
message
(
paste0
(
"["
,
Sys.time
(),
"]["
,
analysis_prefix
,
"] CP GSEA enrichment ("
,
func_source
$
cp_name
,
") done."
))
rm
(
func_source
,
gsea_pi
,
logFC
,
ranking
)
}
# End for each functional source.
rm
(
k
,
ranking_eg
,
ranking_sy
,
logFC_eg
,
logFC_sy
,
analysis_prefix
,
ofile_prefix
)
}
# End for each enrichment file tag.
rm
(
tag
,
limma_parameters
)
}
# End for each limma comparison.
# We run the GSEA itself.
gsea_pi
<-
perform_gsea
(
ranking
,
type
=
func_source
$
cp_name
,
file_dir
=
output_data_dir
,
file_prefix
=
ofile_prefix
,
simplify
=
FALSE
)
# We plot all figures for the gsea (PVAL and FC).
plot_enrichment
(
gsea_pi
,
sign
(
logFC
),
ont_tag
=
func_source
$
cp_name
,
type_tag
=
"gsea"
,
file_dir
=
output_data_dir
,
file_prefix
=
ofile_prefix
)
# For GSEA, in addition, we plot the top 10 hits individually.
plot_top_gsea_hits
(
gsea_pi
,
n
=
10
,
ont_tag
=
func_source
$
cp_name
,
file_dir
=
output_data_dir
,
file_prefix
=
ofile_prefix
,
file_tag
=
ofile_prefix
)
message
(
paste0
(
"["
,
Sys.time
(),
"]["
,
analysis_prefix
,
"] CP GSEA enrichment ("
,
func_source
$
cp_name
,
") done."
))
rm
(
func_source
,
gsea_pi
,
logFC
,
ranking
)
}
# End for each functional source.
rm
(
k
,
ranking_eg
,
ranking_sy
,
logFC_eg
,
logFC_sy
,
analysis_prefix
,
ofile_prefix
)
}
# End for each enrichment strategy.
rm
(
j
,
integration
)
}
# End for each integration.
rm
(
i
)
...
...
Confs/project_config.yml
View file @
67e7de53
...
...
@@ -30,56 +30,42 @@ limma_analyses:
coefficient
:
"
F
-
M"
name
:
"
FemaleVsMale"
clinical_factor
:
"
Gender"
use_for_enrichment
:
"
FALSE"
enrichment_file_tags
:
[]
use_for_network
:
"
FALSE"
-
factor
:
Disease.status
coefficient
:
"
PD
-
Control"
name
:
"
PDVsControl"
clinical_factor
:
"
Disease_status"
use_for_enrichment
:
"
FALSE"
enrichment_file_tags
:
[]
use_for_network
:
"
TRUE"
-
factor
:
gender_disease_status
coefficient
:
"
F.PD
-
M.PD"
name
:
"
FemaleVsMale_PD"
clinical_factor
:
"
Gender_PD"
use_for_enrichment
:
"
FALSE"
enrichment_file_tags
:
[]
use_for_network
:
"
FALSE"
-
factor
:
gender_disease_status
coefficient
:
"
F.Control
-
M.Control"
name
:
"
FemaleVsMale_control"
clinical_factor
:
"
Gender_Control"
use_for_enrichment
:
"
FALSE"
enrichment_file_tags
:
[]
use_for_network
:
"
FALSE"
-
factor
:
gender_disease_status
coefficient
:
"
F.PD
-
F.Control"
name
:
"
PDVsControl_females"
clinical_factor
:
"
Disease_status_females"
use_for_enrichment
:
"
TRUE"
enrichment_file_tags
:
[
"
genderspecific"
,
"
genderspecific_genderdimorphic"
]
use_for_network
:
"
TRUE"
-
factor
:
gender_disease_status
coefficient
:
"
M.PD
-
M.Control"
name
:
"
PDVsControl_males"
clinical_factor
:
"
Disease_status_males"
use_for_enrichment
:
"
TRUE"
enrichment_file_tags
:
[
"
genderspecific"
,
"
genderspecific_genderdimorphic"
]
use_for_network
:
"
TRUE"
-
factor
:
gender_disease_status
coefficient
:
"
(F.PD
-
F.Control)
-
(M.PD
-
M.Control)"
name
:
"
Gender_disease_status"
clinical_factor
:
"
Gender_disease_status"
use_for_enrichment
:
"
TRUE"
enrichment_file_tags
:
[
"
genderdimorphic"
]
use_for_network
:
"
TRUE"
# Integration schemes
nb_min_pval
:
2
...
...
@@ -89,12 +75,12 @@ integrations:
-
name
:
SN
criteria
:
tissue;SN
use_for_enrichment
:
"
FALS
E"
use_for_enrichment
:
"
TRU
E"
use_for_network
:
"
FALSE"
-
name
:
DA
criteria
:
tissue;DA
use_for_enrichment
:
"
FALS
E"
use_for_enrichment
:
"
TRU
E"
use_for_network
:
"
FALSE"
-
name
:
SNage
...
...
@@ -128,6 +114,22 @@ p_val_combinations:
short_name
:
zt
method
:
"
z.transform"
# Enrichments parameters
enrichment_strategies
:
-
limmas
:
[
"
PDVsControl_females"
,
"
PDVsControl_males"
]
file_tags
:
[
"
genderspecific"
,
"
genderspecific"
]
-
limmas
:
[
"
PDVsControl_males"
,
"
PDVsControl_females"
]
file_tags
:
[
"
genderspecific"
,
"
genderspecific"
]
-
limmas
:
[
"
PDVsControl_females"
,
"
PDVsControl_males"
]
file_tags
:
[
"
genderspecific_genderdimorphic"
,
"
genderspecific"
]
-
limmas
:
[
"
PDVsControl_males"
,
"
PDVsControl_females"
]
file_tags
:
[
"
genderspecific_genderdimorphic"
,
"
genderspecific"
]
-
limmas
:
[
"
Gender_disease_status"
]
file_tags
:
[
"
genderdimorphic"
]
nb_permutations
:
150000
min_gs_size
:
10
max_gs_size
:
300
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment