Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
GeneDER_core
Commits
a7976e94
Commit
a7976e94
authored
Jun 22, 2020
by
Leon-Charles Tranchevent
Browse files
Functional enrichment of all ranking files instead of only the files of the main configuration.
parent
67e7de53
Changes
7
Hide whitespace changes
Inline
Side-by-side
16-Data_integration_all/merge_and_filter_rankings.R
View file @
a7976e94
...
...
@@ -284,7 +284,9 @@ for (i in seq_len(length(config$integrations))) {
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gsgd_pivalue_rankings.tsv"
)
write.table
(
res_females
%>%
select
(
Gene
,
pi_value
)
%>%
arrange
(
desc
(
pi_value
)),
mutate
(
ranking_value
=
pi_value
)
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
,
ranking_value
)
%>%
arrange
(
desc
(
ranking_value
)),
res_females_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -295,7 +297,9 @@ for (i in seq_len(length(config$integrations))) {
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gsgd_pivalue_rankings.tsv"
)
write.table
(
res_males
%>%
select
(
Gene
,
pi_value
)
%>%
arrange
(
desc
(
pi_value
)),
mutate
(
ranking_value
=
pi_value
)
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
,
ranking_value
)
%>%
arrange
(
desc
(
ranking_value
)),
res_males_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -307,9 +311,10 @@ for (i in seq_len(length(config$integrations))) {
limma_analysis
<-
config
$
limma_analyses
[[
k
]]
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gsgd_gdrscore_rankings.tsv"
)
write.table
(
merge
(
res_females
%>%
select
(
Gene
),
spec_females
%>%
select
(
Gene
,
gender_specific_score
),
by
=
"Gene"
)
%>%
arrange
(
desc
(
gender_specific_score
)),
write.table
(
merge
(
res_females
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_females
%>%
mutate
(
ranking_value
=
gender_specific_score
)
%>%
select
(
Gene
,
ranking_value
),
by
=
"Gene"
)
%>%
arrange
(
desc
(
ranking_value
)),
res_females_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -319,9 +324,10 @@ for (i in seq_len(length(config$integrations))) {
limma_analysis
<-
config
$
limma_analyses
[[
k
]]
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gsgd_gdrscore_rankings.tsv"
)
write.table
(
merge
(
res_males
%>%
select
(
Gene
),
spec_males
%>%
select
(
Gene
,
gender_specific_score
),
by
=
"Gene"
)
%>%
arrange
(
desc
(
gender_specific_score
)),
write.table
(
merge
(
res_males
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_males
%>%
mutate
(
ranking_value
=
gender_specific_score
)
%>%
select
(
Gene
,
ranking_value
),
by
=
"Gene"
)
%>%
arrange
(
desc
(
ranking_value
)),
res_males_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -334,11 +340,11 @@ for (i in seq_len(length(config$integrations))) {
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gsgd_combinedscores_rankings.tsv"
)
n
<-
dim
(
res_females
)[
1
]
write.table
(
merge
(
res_females
%>%
select
(
Gene
,
pi_value
),
write.table
(
merge
(
res_females
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_females
%>%
select
(
Gene
,
gender_specific_score
),
by
=
"Gene"
)
%>%
mutate
(
combined_scor
e
=
((
1
-
rank
(
-
pi_value
)
/
n
)
+
(
1
-
rank
(
-
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
combined_scor
e
)
%>%
arrange
(
desc
(
combined_scor
e
)),
by
=
"Gene"
)
%>%
mutate
(
ranking_valu
e
=
((
rank
(
pi_value
)
/
n
)
+
(
rank
(
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
,
ranking_valu
e
)
%>%
arrange
(
desc
(
ranking_valu
e
)),
res_females_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -349,11 +355,11 @@ for (i in seq_len(length(config$integrations))) {
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gsgd_combinedscores_rankings.tsv"
)
n
<-
dim
(
res_males
)[
1
]
write.table
(
merge
(
res_males
%>%
select
(
Gene
,
pi_value
),
write.table
(
merge
(
res_males
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_males
%>%
select
(
Gene
,
gender_specific_score
),
by
=
"Gene"
)
%>%
mutate
(
combined_scor
e
=
((
1
-
rank
(
-
pi_value
)
/
n
)
+
(
1
-
rank
(
-
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
combined_scor
e
)
%>%
arrange
(
desc
(
combined_scor
e
)),
by
=
"Gene"
)
%>%
mutate
(
ranking_valu
e
=
((
rank
(
pi_value
)
/
n
)
+
(
rank
(
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
,
ranking_valu
e
)
%>%
arrange
(
desc
(
ranking_valu
e
)),
res_males_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -366,7 +372,10 @@ for (i in seq_len(length(config$integrations))) {
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gs_pivalue_rankings.tsv"
)
write.table
(
res_females
%>%
select
(
Gene
,
pi_value
)
%>%
filter
(
!
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
pi_value
)),
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
)
%>%
mutate
(
ranking_value
=
pi_value
)
%>%
filter
(
!
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
ranking_value
)),
res_females_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -377,7 +386,10 @@ for (i in seq_len(length(config$integrations))) {
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gs_pivalue_rankings.tsv"
)
write.table
(
res_males
%>%
select
(
Gene
,
pi_value
)
%>%
filter
(
!
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
pi_value
)),
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
)
%>%
mutate
(
ranking_value
=
pi_value
)
%>%
filter
(
!
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
ranking_value
)),
res_males_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -390,11 +402,12 @@ for (i in seq_len(length(config$integrations))) {
limma_analysis
<-
config
$
limma_analyses
[[
k
]]
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gs_gdrscore_rankings.tsv"
)
write.table
(
merge
(
res_females
%>%
select
(
Gene
),
spec_females
%>%
select
(
Gene
,
gender_specific_score
),
write.table
(
merge
(
res_females
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_females
%>%
mutate
(
ranking_value
=
gender_specific_score
)
%>%
select
(
Gene
,
ranking_value
),
by
=
"Gene"
)
%>%
filter
(
!
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
gender_specific_scor
e
)),
arrange
(
desc
(
ranking_valu
e
)),
res_females_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -404,18 +417,18 @@ for (i in seq_len(length(config$integrations))) {
limma_analysis
<-
config
$
limma_analyses
[[
k
]]
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gs_gdrscore_rankings.tsv"
)
write.table
(
merge
(
res_males
%>%
select
(
Gene
),
spec_males
%>%
select
(
Gene
,
gender_specific_score
),
write.table
(
merge
(
res_males
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_males
%>%
mutate
(
ranking_value
=
gender_specific_score
)
%>%
select
(
Gene
,
ranking_value
),
by
=
"Gene"
)
%>%
filter
(
!
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
gender_specific_scor
e
)),
arrange
(
desc
(
ranking_valu
e
)),
res_males_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
row.names
=
FALSE
)
rm
(
res_males_fn
,
k
,
limma_analysis
)
# We continue with the rankings based on both metrics, for the gender-specific
# genes only.
k
<-
5
...
...
@@ -423,12 +436,12 @@ for (i in seq_len(length(config$integrations))) {
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gs_combinedscores_rankings.tsv"
)
n
<-
dim
(
res_females
)[
1
]
write.table
(
merge
(
res_females
%>%
select
(
Gene
,
pi_value
),
write.table
(
merge
(
res_females
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_females
%>%
select
(
Gene
,
gender_specific_score
),
by
=
"Gene"
)
%>%
mutate
(
combined_scor
e
=
((
1
-
rank
(
-
pi_value
)
/
n
)
+
(
1
-
rank
(
-
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
combined_scor
e
)
%>%
by
=
"Gene"
)
%>%
mutate
(
ranking_valu
e
=
((
rank
(
pi_value
)
/
n
)
+
(
rank
(
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
,
ranking_valu
e
)
%>%
filter
(
!
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
combined_scor
e
)),
arrange
(
desc
(
ranking_valu
e
)),
res_females_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -439,12 +452,12 @@ for (i in seq_len(length(config$integrations))) {
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gs_combinedscores_rankings.tsv"
)
n
<-
dim
(
res_males
)[
1
]
write.table
(
merge
(
res_males
%>%
select
(
Gene
,
pi_value
),
write.table
(
merge
(
res_males
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_males
%>%
select
(
Gene
,
gender_specific_score
),
by
=
"Gene"
)
%>%
mutate
(
combined_scor
e
=
((
1
-
rank
(
-
pi_value
)
/
n
)
+
(
1
-
rank
(
-
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
combined_scor
e
)
%>%
by
=
"Gene"
)
%>%
mutate
(
ranking_valu
e
=
((
rank
(
pi_value
)
/
n
)
+
(
rank
(
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
,
ranking_valu
e
)
%>%
filter
(
!
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
combined_scor
e
)),
arrange
(
desc
(
ranking_valu
e
)),
res_males_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -457,7 +470,10 @@ for (i in seq_len(length(config$integrations))) {
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gd_pivalue_rankings.tsv"
)
write.table
(
res_females
%>%
select
(
Gene
,
pi_value
)
%>%
filter
(
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
pi_value
)),
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
)
%>%
mutate
(
ranking_value
=
pi_value
)
%>%
filter
(
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
ranking_value
)),
res_females_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -468,24 +484,28 @@ for (i in seq_len(length(config$integrations))) {
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gd_pivalue_rankings.tsv"
)
write.table
(
res_males
%>%
select
(
Gene
,
pi_value
)
%>%
filter
(
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
pi_value
)),
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
)
%>%
mutate
(
ranking_value
=
pi_value
)
%>%
filter
(
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
ranking_value
)),
res_males_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
row.names
=
FALSE
)
rm
(
res_males_fn
,
k
,
limma_analysis
)
# We continue with the rankings based on gender-specificity scores, for the gender-dimorphic
# genes only.
k
<-
5
limma_analysis
<-
config
$
limma_analyses
[[
k
]]
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gd_gdrscore_rankings.tsv"
)
write.table
(
merge
(
res_females
%>%
select
(
Gene
),
spec_females
%>%
select
(
Gene
,
gender_specific_score
),
write.table
(
merge
(
res_females
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_females
%>%
mutate
(
ranking_value
=
gender_specific_score
)
%>%
select
(
Gene
,
ranking_value
),
by
=
"Gene"
)
%>%
filter
(
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
gender_specific_scor
e
)),
arrange
(
desc
(
ranking_valu
e
)),
res_females_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -495,11 +515,12 @@ for (i in seq_len(length(config$integrations))) {
limma_analysis
<-
config
$
limma_analyses
[[
k
]]
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gd_gdrscore_rankings.tsv"
)
write.table
(
merge
(
res_males
%>%
select
(
Gene
),
spec_males
%>%
select
(
Gene
,
gender_specific_score
),
write.table
(
merge
(
res_males
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_males
%>%
mutate
(
ranking_value
=
gender_specific_score
)
%>%
select
(
Gene
,
ranking_value
),
by
=
"Gene"
)
%>%
filter
(
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
gender_specific_scor
e
)),
arrange
(
desc
(
ranking_valu
e
)),
res_males_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -513,12 +534,12 @@ for (i in seq_len(length(config$integrations))) {
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gd_combinedscores_rankings.tsv"
)
n
<-
dim
(
res_females
)[
1
]
write.table
(
merge
(
res_females
%>%
select
(
Gene
,
pi_value
),
write.table
(
merge
(
res_females
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_females
%>%
select
(
Gene
,
gender_specific_score
),
by
=
"Gene"
)
%>%
mutate
(
combined_scor
e
=
((
1
-
rank
(
-
pi_value
)
/
n
)
+
(
1
-
rank
(
-
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
combined_scor
e
)
%>%
by
=
"Gene"
)
%>%
mutate
(
ranking_valu
e
=
((
rank
(
pi_value
)
/
n
)
+
(
rank
(
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
,
ranking_valu
e
)
%>%
filter
(
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
combined_scor
e
)),
arrange
(
desc
(
ranking_valu
e
)),
res_females_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
@@ -529,12 +550,12 @@ for (i in seq_len(length(config$integrations))) {
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_gd_combinedscores_rankings.tsv"
)
n
<-
dim
(
res_males
)[
1
]
write.table
(
merge
(
res_males
%>%
select
(
Gene
,
pi_value
),
write.table
(
merge
(
res_males
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
),
spec_males
%>%
select
(
Gene
,
gender_specific_score
),
by
=
"Gene"
)
%>%
mutate
(
combined_scor
e
=
((
1
-
rank
(
-
pi_value
)
/
n
)
+
(
1
-
rank
(
-
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
combined_scor
e
)
%>%
by
=
"Gene"
)
%>%
mutate
(
ranking_valu
e
=
((
rank
(
pi_value
)
/
n
)
+
(
rank
(
gender_specific_score
)
/
n
)))
%>%
select
(
Gene
,
log_fold_change
,
P_value
,
pi_value
,
ranking_valu
e
)
%>%
filter
(
Gene
%in%
potGD
$
Gene
)
%>%
arrange
(
desc
(
combined_scor
e
)),
arrange
(
desc
(
ranking_valu
e
)),
res_males_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
...
...
17-Enrichment_all/CP_enrich.R
View file @
a7976e94
...
...
@@ -45,7 +45,13 @@ duplicate_row_first_cell <- function(row, sep = "|") {
keys
<-
strsplit
(
row
[
1
],
sep
,
fixed
=
TRUE
)[[
1
]]
val_1
<-
rep
(
row
[
2
],
length
(
keys
))
val_2
<-
rep
(
row
[
3
],
length
(
keys
))
to_ret
<-
cbind
(
Gene
=
keys
,
log_fold_change
=
val_1
,
P_value
=
val_2
)
val_3
<-
rep
(
row
[
4
],
length
(
keys
))
val_4
<-
rep
(
row
[
5
],
length
(
keys
))
to_ret
<-
cbind
(
Gene
=
keys
,
log_fold_change
=
val_1
,
P_value
=
val_2
,
pi_value
=
val_3
,
ranking_value
=
val_4
)
return
(
to_ret
)
}
...
...
@@ -75,7 +81,7 @@ perform_gsea <- function(ranked_genes,
file_dir
=
""
,
file_prefix
=
""
,
simplify
=
TRUE
)
{
# By default, we set the results to NULL.
gsea
<-
NULL
switch
(
type
,
...
...
@@ -176,7 +182,7 @@ perform_gsea <- function(ranked_genes,
TERM2GENE
=
msigdb
)
}
)
# We simplify the enrichment results by removing the potentially redundant terms.
if
(
simplify
==
TRUE
)
{
gsea
<-
clusterProfiler
::
simplify
(
gsea
,
...
...
@@ -184,11 +190,11 @@ perform_gsea <- function(ranked_genes,
by
=
"p.adjust"
,
select_fun
=
min
)
}
# We save the table as TSV file.
gsea_fn
<-
paste0
(
file_dir
,
file_prefix
,
"gsea_"
,
type
,
".tsv"
)
write.table
(
gsea
,
file
=
gsea_fn
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
# We return the enrichResult object.
return
(
gsea
)
}
...
...
@@ -214,7 +220,7 @@ plot_enrichment <- function(enrich,
type_tag
=
"enrich"
,
file_dir
=
""
,
file_prefix
=
""
)
{
# We start by defining the text size of the y labels wrt
# to the length of these labels. Default size is 9.
# This was empirically defined and it is used to circumvent a bug that prevents
...
...
@@ -228,7 +234,7 @@ plot_enrichment <- function(enrich,
y_text_elmt
<-
ggplot2
::
element_blank
()
}
}
# Dotplot of the the top enriched terms.
enrich_dotplot_fn
<-
paste0
(
file_dir
,
file_prefix
,
type_tag
,
"_"
,
ont_tag
,
"_dotplot.png"
)
...
...
@@ -240,7 +246,7 @@ plot_enrichment <- function(enrich,
plot.title
=
element_text
(
size
=
10
))
print
(
p
)
dev.off
()
# The heatplot between genes and top enriched terms.
enrich_heatplot_fn
<-
paste0
(
file_dir
,
file_prefix
,
type_tag
,
"_"
,
ont_tag
,
"_heatplot.png"
)
...
...
@@ -253,7 +259,7 @@ plot_enrichment <- function(enrich,
axis.text.x
=
element_blank
())
print
(
p
)
dev.off
()
# Network of top enriched terms (emapplot).
enrich_emapplot_fn
<-
paste0
(
file_dir
,
file_prefix
,
type_tag
,
"_"
,
ont_tag
,
"_emapplot.png"
)
...
...
@@ -302,29 +308,25 @@ plot_top_gsea_hits <- function(gsea,
}
}
#' @title Reading a file with LFC and P values and computes the pi values.
#'
#' @description This functions reads a results files given an integration scheme, a limma
#' tag and a file tag. It then compute the pi values, mapped the official symbols to
#' EntrezGene identifiers.
#'
#' @param integration The integration scheme to use (SN, DA, SNage).
#' @param limma_tag The name of the limma analysis to use.
#' @param file_tag The name of the file tag to use.
#' @param negative A boolean indicating whether the sign of the pi values should be inversed.
#' @return A dataframe with the gene official symbols, the EntrezGene identifiers (can be NA),
#' the log fold-change, P values and pi values.
get_pi_values
<-
function
(
integration
,
limma_tag
,
file_tag
,
negative
=
FALSE
)
{
# ================================================================================================
# Main
# ================================================================================================
# We perform the enrichment on all rankings files.
ranking_files
<-
list.files
(
path
=
output_data_dir
,
pattern
=
"*rankings.tsv"
)
# We do all integration schemes one by one.
for
(
ranking_file
in
ranking_files
)
{
# We define the I/Os.
analysis_prefix
<-
paste0
(
integration
$
name
,
"_VSN_"
,
limma_tag
,
"_max-avg_"
,
file_tag
)
analysis_prefix
<-
strsplit
(
ranking_file
,
"_rankings.tsv"
)[[
1
]]
ofile_prefix
<-
paste0
(
"CP_"
,
analysis_prefix
,
"_"
)
ranking
_file
<-
paste0
(
output_data_dir
,
analysis_prefix
,
"_
ranking
s.tsv"
)
input
_file
name
<-
paste0
(
output_data_dir
,
ranking
_file
)
# We read the prepared ranking and keep only the value used to rank.
ranking
<-
read.delim
(
ranking_fil
e
,
stringsAsFactors
=
FALSE
)
rm
(
ranking_fil
e
)
ranking
<-
read.delim
(
input_filenam
e
,
stringsAsFactors
=
FALSE
)
rm
(
input_filenam
e
)
# We clean genes (no duplicate entries with pipe separated gene symbols).
# Only if necessary (that is if we have at least one pipe).
ranking_clean
<-
NULL
...
...
@@ -333,28 +335,16 @@ get_pi_values <- function(integration, limma_tag, file_tag, negative = FALSE) {
ranking_clean
<-
data.frame
(
do.call
(
rbind
,
c
(
tmp_res
)),
row.names
=
NULL
,
stringsAsFactors
=
FALSE
)
ranking_clean
$
log_fold_change
<-
as.numeric
(
ranking_clean
$
log_fold_change
)
ranking_clean
$
P_value
<-
as.numeric
(
ranking_clean
$
P_value
)
rm
(
tmp_res
)
}
else
{
ranking_clean
<-
ranking
ranking_clean
$
P_value
<-
as.numeric
(
ranking_clean
$
P_value
)
ranking_clean
$
log_fold_change
<-
as.numeric
(
ranking_clean
$
log_fold_change
)
}
ranking_clean
$
log_fold_change
<-
as.numeric
(
ranking_clean
$
log_fold_change
)
ranking_clean
$
P_value
<-
as.numeric
(
ranking_clean
$
P_value
)
ranking_clean
$
pi_value
<-
as.numeric
(
ranking_clean
$
pi_value
)
ranking_clean
$
ranking_value
<-
as.numeric
(
ranking_clean
$
ranking_value
)
rm
(
ranking
)
# We then add the pi values, retracting a small epsilon to the P values when they are equal
# to 1, as to avoid ties (which are therefore separated via the associated log fold-change).
ranking_clean
<-
ranking_clean
%>%
mutate
(
pi_value
=
-
log10
(
P_value
)
*
abs
(
log_fold_change
))
pseudo_pval_ref
<-
0.1
*
(
9
+
max
(
ranking_clean
$
P_value
[
ranking_clean
$
P_value
<
1
]))
pseudo_pi_ref
<-
-
log10
(
pseudo_pval_ref
)
*
abs
(
ranking_clean
$
log_fold_change
[
ranking_clean
$
pi_value
==
0
])
ranking_clean
$
pi_value
[
ranking_clean
$
pi_value
==
0
]
<-
pseudo_pi_ref
if
(
negative
)
{
ranking_clean
$
pi_value
<-
-
ranking_clean
$
pi_value
}
rm
(
pseudo_pval_ref
,
pseudo_pi_ref
)
# We map the gene symbols to the EntrezGene identifiers.
mapped_egenes
<-
bitr
(
ranking_clean
$
Gene
,
fromType
=
"SYMBOL"
,
...
...
@@ -365,117 +355,77 @@ get_pi_values <- function(integration, limma_tag, file_tag, negative = FALSE) {
by.x
=
"Gene"
,
by.y
=
"SYMBOL"
,
all.x
=
TRUE
)
%>%
mutate
(
EGene
=
ENTREZID
)
%>%
select
(
Gene
,
EGene
,
log_fold_change
,
P_value
,
pi
_value
)
mutate
(
EGene
=
ENTREZID
)
%>%
select
(
Gene
,
EGene
,
log_fold_change
,
ranking
_value
)
ranking_final
$
log_fold_change
<-
as.numeric
(
ranking_final
$
log_fold_change
)
ranking_final
$
P_value
<-
as.numeric
(
ranking_final
$
P
_value
)
ranking_final
$
pi_value
<-
as.numeric
(
ranking_final
$
pi
_value
)
ranking_final
<-
ranking_final
%>%
arrange
(
desc
(
pi_valu
e
))
ranking_final
$
ranking_value
<-
as.numeric
(
ranking_final
$
ranking
_value
)
ranking_final
<-
ranking_final
%>%
arrange
(
desc
(
ranking
_value
)
)
ranking_final
_eg
<-
ranking_final
%>%
filter
(
!
is.na
(
EGen
e
))
rm
(
mapped_egenes
,
ranking_clean
)
return
(
ranking_final
)
}
# ================================================================================================
# Main
# ================================================================================================
# We do all integration schemes one by one.
for
(
i
in
seq_len
(
length
(
config
$
integrations
)))
{
# We get the integration name for that scheme.
integration
<-
config
$
integrations
[[
i
]]
# We only do the enrichment if it is necessary.
if
(
integration
$
use_for_enrichment
==
"FALSE"
)
{
next
}
# We prepare the gene id specific rankings (using either gene names or entrezgene ids).
ranking_eg
<-
ranking_final_eg
$
ranking_value
ranking_sy
<-
ranking_final
$
ranking_value
logFC_eg
<-
ranking_final_eg
$
log_fold_change
logFC_sy
<-
ranking_final
$
log_fold_change
names
(
ranking_eg
)
<-
ranking_final_eg
$
EGene
names
(
ranking_sy
)
<-
ranking_final
$
Gene
names
(
logFC_eg
)
<-
ranking_final_eg
$
EGene
names
(
logFC_sy
)
<-
ranking_final
$
Gene
ranking_eg
<-
ranking_eg
[
!
duplicated
(
names
(
ranking_eg
))]
ranking_sy
<-
ranking_sy
[
!
duplicated
(
names
(
ranking_sy
))]
logFC_eg
<-
logFC_eg
[
!
duplicated
(
names
(
logFC_eg
))]
logFC_sy
<-
logFC_sy
[
!
duplicated
(
names
(
logFC_sy
))]
rm
(
ranking_final
,
ranking_final_eg
)
#
For each defined strategy
for
(
j
in
seq_len
(
length
(
config
$
enrichment_strategi
es
)))
{
#
We loop over the functional sources.
for
(
k
in
seq_len
(
length
(
config
$
functional_sourc
es
)))
{
# We get the integration name for that scheme.
enrichment_strategy
<-
config
$
enrichment_strategies
[[
j
]]
# I/Os
analysis_prefix
<-
paste0
(
integration
$
name
,
"_VSN_"
,
enrichment_strategy
$
limmas
[
1
],
"_max-avg_"
,
enrichment_strategy
$
file_tags
[
1
])
ofile_prefix
<-
paste0
(
"CP_"
,
analysis_prefix
,
"_"
)
# We get the pi values of the two files
ranking_final
<-
get_pi_values
(
integration
,
enrichment_strategy
$
limmas
[
1
],
enrichment_strategy
$
file_tags
[
1
],
negative
=
FALSE
)
if
(
!
is.na
(
enrichment_strategy
$
limmas
[
2
]))
{
ranking_final_neg
<-
get_pi_values
(
integration
,
enrichment_strategy
$
limmas
[
2
],
enrichment_strategy
$
file_tags
[
2
],
negative
=
TRUE
)
ranking_final
<-
rbind
(
ranking_final
,
ranking_final_neg
)
# We define the current source.
func_source
<-
config
$
functional_sources
[[
k
]]
if
(
is.null
(
func_source
$
cp_name
)
||
func_source
$
cp_name
==
""
)
{
next
}
# We prepare the gene id specific rankings (using either gene names or entrezgene ids).
ranking_final_eg
<-
ranking_final
%>%
filter
(
!
is.na
(
EGene
))
ranking_eg
<-
ranking_final_eg
$
pi_value
ranking_sy
<-
ranking_final
$
pi_value
logFC_eg
<-
ranking_final_eg
$
log_fold_change
logFC_sy
<-
ranking_final
$
log_fold_change
names
(
ranking_eg
)
<-
ranking_final_eg
$
EGene
names
(
ranking_sy
)
<-
ranking_final
$
Gene
names
(
logFC_eg
)
<-
ranking_final_eg
$
EGene
names
(
logFC_sy
)
<-
ranking_final
$
Gene
ranking_eg
<-
ranking_eg
[
!
duplicated
(
names
(
ranking_eg
))]
ranking_sy
<-
ranking_sy
[
!
duplicated
(
names
(
ranking_sy
))]
logFC_eg
<-
logFC_eg
[
!
duplicated
(
names
(
logFC_eg
))]
logFC_sy
<-
logFC_sy
[
!
duplicated
(
names
(
logFC_sy
))]
rm
(
ranking_final
,
ranking_final_eg
)
# We loop over the functional sources.
for
(
k
in
seq_len
(
length
(
config
$
functional_sources
)))
{
# We define the current source.
func_source
<-
config
$
functional_sources
[[
k
]]
if
(
is.null
(
func_source
$
cp_name
)
||
func_source
$
cp_name
==
""
)
{
next
}
# We define the reference genes (depending on the source).
ranking
<-
NULL
logFC
<-
NULL
if
(
func_source
$
cp_gene_id_type
==
"entrezgene"
)
{
ranking
<-
ranking_eg
logFC
<-
logFC_eg
}
else
if
(
func_source
$
cp_gene_id_type
==
"symbol"
)
{
ranking
<-
ranking_sy
logFC
<-
logFC_sy
}
# We define the reference genes (depending on the source).
ranking
<-
NULL
logFC
<-
NULL
if
(
func_source
$
cp_gene_id_type
==
"entrezgene"
)
{
ranking
<-
ranking_eg
logFC
<-
logFC_eg
}
else
if
(
func_source
$
cp_gene_id_type
==
"symbol"
)
{
ranking
<-
ranking_sy
logFC
<-
logFC_sy
}
# We run the GSEA itself.
gsea_pi
<-
perform_gsea
(
ranking
,
type
=
func_source
$
cp_name
,