Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
GeneDER_core
Commits
3e4e2db5
Commit
3e4e2db5
authored
Dec 21, 2020
by
Leon-Charles Tranchevent
Browse files
Refined the gender specific pathway figures.
parent
84aa8519
Changes
5
Hide whitespace changes
Inline
Side-by-side
06-Data_integration/extra_figures.R
View file @
3e4e2db5
...
...
@@ -303,14 +303,6 @@ i <- 3
integration
<-
config
$
integrations
[[
i
]]
int_criteria
<-
str_split
(
integration
$
criteria
,
","
)[[
1
]]
# We rely on the last limma analyses (gender_disease_status) but this is only to get the
# 4 categories of interest.
##k <- 5
##limma_analysis <- config$limma_analyses[[k]]
# We use the first gene-probe selection method (max-avg).
l
<-
1
selection
<-
config
$
selections
[[
l
]]
...
...
@@ -446,6 +438,20 @@ for (gene in top_genes) {
message
(
paste0
(
"["
,
gene
,
"]["
,
dataset
,
"] plotted."
))
}
}
# Manual figures via rstudio (for manuscript).
# dataset <- "GSE49036"
# gene <- "EFNA1"
# plot_gene(gene = gene,
# gene_datasets = c(dataset),
# output_data_dir = output_data_dir,
# integration_name = integration$name,
# limma_analysis = config$limma_analyses[[5]],
# selection_name = selection$name,
# palette = gds_plt,
# clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config,
# plot_all = FALSE, plot_datasets = TRUE)
rm
(
gene
,
top_genes
,
common_datasets
)
rm
(
selection
,
exp_data_all
,
zexp_data_all
,
clin_data_all
,
m_indx
,
m_indx_map
)
rm
(
l
,
gds_plt
,
vsn
)
...
...
06-Data_integration/refine_GD_rankings.R
View file @
3e4e2db5
...
...
@@ -29,7 +29,7 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# create a single GD ranking based on both by summing up the male and female signals.
# We focus on one configuration, our main configuration (SNage, VSN, max-avg) but we also
# run the other integration schemes (SN
and
DA) with the same default parameters.
# run the other integration schemes (SN
/ DA / iPSC-
DA) with the same default parameters.
for
(
i
in
seq_len
(
length
(
config
$
integrations
)))
{
# We get the integration name for that scheme.
...
...
07-Enrichment/create_visualizations.R
View file @
3e4e2db5
...
...
@@ -25,34 +25,15 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# Functions
# ================================================================================================
#' @title Duplicate rows by splitting on the first cell.
#'
#' @description This function takes one row and split its first cell.
#' It then creates the necessary extra rows while copying the second cell.
#' Ex: "a|b", "whatever" --> "a", "whatever"
#' "b", "whatever"
#'
#' @param row A vector of two values consisting of a set of values (possibly only one) and
#' some other value on the second cell (Not to be splitted).
#' @param sep The string separator for the split.
#' @return A matrix of rows (possibly only one row).
duplicate_row_first_cell
<-
function
(
row
,
sep
=
"|"
)
{
keys
<-
strsplit
(
row
[
1
],
sep
,
fixed
=
TRUE
)[[
1
]]
val_1
<-
rep
(
row
[
2
],
length
(
keys
))
val_2
<-
rep
(
row
[
3
],
length
(
keys
))
to_ret
<-
cbind
(
Gene
=
keys
,
logFC
=
val_1
,
adj_P_value
=
val_2
)
return
(
to_ret
)
}
# ================================================================================================
# Main
# ================================================================================================
# We define the I/Os.
F_ranking_file
<-
paste0
(
output_data_dir
,
"SNage_
VSN_
PDVsControl_females_max-avg_all_pivalue_rankings.tsv"
)
"SNage_PDVsControl_females_max-avg_all_pivalue_rankings.tsv"
)
M_ranking_file
<-
paste0
(
output_data_dir
,
"SNage_
VSN_
PDVsControl_males_max-avg_all_pivalue_rankings.tsv"
)
"SNage_PDVsControl_males_max-avg_all_pivalue_rankings.tsv"
)
# We read the prepared ranking and keep only the value used to rank.
F_ranking
<-
read.delim
(
F_ranking_file
,
stringsAsFactors
=
FALSE
)
%>%
...
...
@@ -61,49 +42,38 @@ M_ranking <- read.delim(M_ranking_file, stringsAsFactors = FALSE) %>%
select
(
Gene
,
log_fold_change
,
adj_P_value
)
rm
(
F_ranking_file
,
M_ranking_file
)
# We clean genes (no duplicate entries with pipe separated gene symbols).
F_tmp_res
<-
t
(
apply
(
F_ranking
,
1
,
duplicate_row_first_cell
))
M_tmp_res
<-
t
(
apply
(
M_ranking
,
1
,
duplicate_row_first_cell
))
F_ranking_clean
<-
data.frame
(
do.call
(
rbind
,
c
(
F_tmp_res
)),
row.names
=
NULL
,
stringsAsFactors
=
FALSE
)
M_ranking_clean
<-
data.frame
(
do.call
(
rbind
,
c
(
M_tmp_res
)),
row.names
=
NULL
,
stringsAsFactors
=
FALSE
)
rm
(
F_tmp_res
,
F_ranking
,
M_tmp_res
,
M_ranking
)
# We map the gene symbols to the EntrezGene identifiers.
F_mapped_egenes
<-
bitr
(
F_ranking
_clean
$
Gene
,
F_mapped_egenes
<-
bitr
(
F_ranking
$
Gene
,
fromType
=
"SYMBOL"
,
toType
=
c
(
"ENTREZID"
),
OrgDb
=
org.Hs.eg.db
)
#nolint
F_ranking_final
<-
merge
(
x
=
F_ranking
_clean
,
F_ranking_final
<-
merge
(
x
=
F_ranking
,
y
=
F_mapped_egenes
,
by.x
=
"Gene"
,
by.y
=
"SYMBOL"
,
all.x
=
TRUE
)
%>%
mutate
(
EGene
=
ENTREZID
)
%>%
select
(
Gene
,
EGene
,
log
FC
,
adj_P_value
)
M_mapped_egenes
<-
bitr
(
M_ranking
_clean
$
Gene
,
mutate
(
EGene
=
ENTREZID
)
%>%
select
(
Gene
,
EGene
,
log
_fold_change
,
adj_P_value
)
M_mapped_egenes
<-
bitr
(
M_ranking
$
Gene
,
fromType
=
"SYMBOL"
,
toType
=
c
(
"ENTREZID"
),
OrgDb
=
org.Hs.eg.db
)
#nolint
M_ranking_final
<-
merge
(
x
=
M_ranking
_clean
,
M_ranking_final
<-
merge
(
x
=
M_ranking
,
y
=
M_mapped_egenes
,
by.x
=
"Gene"
,
by.y
=
"SYMBOL"
,
all.x
=
TRUE
)
%>%
mutate
(
EGene
=
ENTREZID
)
%>%
select
(
Gene
,
EGene
,
log
FC
,
adj_P_value
)
F_ranking_final
$
logFC
<-
as.numeric
(
F_ranking_final
$
log
FC
)
mutate
(
EGene
=
ENTREZID
)
%>%
select
(
Gene
,
EGene
,
log
_fold_change
,
adj_P_value
)
F_ranking_final
$
logFC
<-
as.numeric
(
F_ranking_final
$
log
_fold_change
)
F_ranking_final
$
adj_P_value
<-
as.numeric
(
F_ranking_final
$
adj_P_value
)
M_ranking_final
$
logFC
<-
as.numeric
(
M_ranking_final
$
log
FC
)
M_ranking_final
$
logFC
<-
as.numeric
(
M_ranking_final
$
log
_fold_change
)
M_ranking_final
$
adj_P_value
<-
as.numeric
(
M_ranking_final
$
adj_P_value
)
rm
(
F_mapped_egenes
,
F_ranking_clean
,
M_mapped_egenes
,
M_ranking_clean
)
rm
(
F_mapped_egenes
,
M_mapped_egenes
)
# We also save the data for GeneGo (manual work on the pathway maps).
FM
<-
merge
(
x
=
F_ranking_final
,
y
=
M_ranking_final
,
by
=
"Gene"
,
all
=
TRUE
,
suffixes
=
c
(
"_F"
,
"_M"
))
%>%
select
(
Gene
,
logFC_F
,
adj_P_value_F
,
logFC_M
,
adj_P_value_M
,)
FM_fn
<-
paste0
(
output_data_dir
,
"SNage_
VSN_
PDVsControl_both_max-avg_all_merged_rankings_4GeneGo.tsv"
)
FM_fn
<-
paste0
(
output_data_dir
,
"SNage_PDVsControl_both_max-avg_all_merged_rankings_4GeneGo.tsv"
)
FM
$
logFC_F
[
is.na
(
FM
$
logFC_F
)]
<-
0
FM
$
adj_P_value_F
[
is.na
(
FM
$
adj_P_value_F
)]
<-
1
FM
$
logFC_M
[
is.na
(
FM
$
logFC_M
)]
<-
0
...
...
@@ -113,27 +83,27 @@ rm(FM, FM_fn)
# We prepare the gene id specific rankings (using entrezgene ids for KEGG).
# max P = 0.05
F_logFC_eg
<-
F_ranking_final
$
logFC
[
F_ranking_final
$
adj_P_value
<
0.05
]
names
(
F_logFC_eg
)
<-
F_ranking_final
$
EGene
[
F_ranking_final
$
adj_P_value
<
0.05
]
F_logFC_eg
<-
F_logFC_eg
[
!
duplicated
(
names
(
F_logFC_eg
))]
F_logFC_eg
<-
F_logFC_eg
[
!
is.na
(
names
(
F_logFC_eg
))]
M_logFC_eg
<-
M_ranking_final
$
logFC
[
M_ranking_final
$
adj_P_value
<
0.05
]
names
(
M_logFC_eg
)
<-
M_ranking_final
$
EGene
[
M_ranking_final
$
adj_P_value
<
0.05
]
M_logFC_eg
<-
M_logFC_eg
[
!
duplicated
(
names
(
M_logFC_eg
))]
M_logFC_eg
<-
M_logFC_eg
[
!
is.na
(
names
(
M_logFC_eg
))]
rm
(
F_ranking_final
,
M_ranking_final
)
# Alternatively, max P = 1
# F_logFC_eg <- F_ranking_final$logFC
# names(F_logFC_eg) <- F_ranking_final$EGene
# F_logFC_eg <- F_ranking_final$logFC[F_ranking_final$adj_P_value < 0.05]
# names(F_logFC_eg) <- F_ranking_final$EGene[F_ranking_final$adj_P_value < 0.05]
# F_logFC_eg <- F_logFC_eg[!duplicated(names(F_logFC_eg))]
# F_logFC_eg <- F_logFC_eg[!is.na(names(F_logFC_eg))]
# M_logFC_eg <- M_ranking_final$logFC
# names(M_logFC_eg) <- M_ranking_final$EGene
# M_logFC_eg <- M_ranking_final$logFC
[M_ranking_final$adj_P_value < 0.05]
# names(M_logFC_eg) <- M_ranking_final$EGene
[M_ranking_final$adj_P_value < 0.05]
# M_logFC_eg <- M_logFC_eg[!duplicated(names(M_logFC_eg))]
# M_logFC_eg <- M_logFC_eg[!is.na(names(M_logFC_eg))]
# rm(F_ranking_final, M_ranking_final)
# Alternatively, max P = 1
F_logFC_eg
<-
F_ranking_final
$
logFC
names
(
F_logFC_eg
)
<-
F_ranking_final
$
EGene
F_logFC_eg
<-
F_logFC_eg
[
!
duplicated
(
names
(
F_logFC_eg
))]
F_logFC_eg
<-
F_logFC_eg
[
!
is.na
(
names
(
F_logFC_eg
))]
M_logFC_eg
<-
M_ranking_final
$
logFC
names
(
M_logFC_eg
)
<-
M_ranking_final
$
EGene
M_logFC_eg
<-
M_logFC_eg
[
!
duplicated
(
names
(
M_logFC_eg
))]
M_logFC_eg
<-
M_logFC_eg
[
!
is.na
(
names
(
M_logFC_eg
))]
rm
(
F_ranking_final
,
M_ranking_final
)
# Reformat the data.
FM
<-
merge
(
x
=
data.frame
(
EGene
=
names
(
F_logFC_eg
),
F_log
=
F_logFC_eg
),
y
=
data.frame
(
EGene
=
names
(
M_logFC_eg
),
M_log
=
M_logFC_eg
),
...
...
@@ -145,9 +115,10 @@ FM$M_log[is.na(FM$M_log)] <- 0
rm
(
F_logFC_eg
,
M_logFC_eg
)
# Selection of KEGG pathways to be plotted.
my_ids
<-
c
(
"hsa04062"
,
"hsa04668"
,
"hsa04721"
,
"hsa04145"
,
"hsa04064"
,
"hsa04657"
,
"hsa04010"
,
"hsa04151"
,
"hsa04211"
,
"hsa05012"
,
"hsa00190"
,
"hsa00061"
,
"hsa04630"
,
"hsa04210"
,
"hsa04115"
,
"hsa04620"
,
"hsa04612"
,
"hsa04728"
)
my_ids
<-
c
(
"hsa00020"
,
"hsa00061"
,
"hsa00190"
,
"hsa01200"
,
"hsa04012"
,
"hsa04060"
,
"hsa04061"
,
"hsa04062"
,
"hsa04657"
,
"hsa04668"
,
"hsa04714"
,
"hsa04721"
,
"hsa04966"
,
"hsa05012"
,
"hsa05017"
,
"hsa05020"
,
"hsa05110"
,
"hsa05120"
,
"hsa05130"
,
"hsa05132"
,
"hsa04728"
,
"hsa04064"
)
for
(
my_id
in
my_ids
)
{
pv.out
<-
pathview
(
gene.data
=
FM
,
pathway.id
=
my_id
,
species
=
"hsa"
,
...
...
07-Enrichment/gender_specific_pathways.R
View file @
3e4e2db5
...
...
@@ -79,8 +79,8 @@ for (i in seq_len(length(config$integrations))) {
# We create a structure for all results.
gsp_data
<-
c
()
male_prefix
<-
paste0
(
integration
$
name
,
"_
VSN_
PDVsControl_males_"
)
female_prefix
<-
paste0
(
integration
$
name
,
"_
VSN_
PDVsControl_females_"
)
male_prefix
<-
paste0
(
integration
$
name
,
"_PDVsControl_males_"
)
female_prefix
<-
paste0
(
integration
$
name
,
"_PDVsControl_females_"
)
#
# KEGG by CP
...
...
@@ -160,30 +160,6 @@ for (i in seq_len(length(config$integrations))) {
mutate
(
tool
=
"CP"
,
source
=
"GOMF"
))
rm
(
M_filename
,
F_filename
,
M_data
,
F_data
)
#
# MSIGDB by G2P
#
M_filename
<-
paste0
(
output_data_dir
,
"G2P_"
,
male_prefix
,
"max-avg_gs_pivalue_MSIGDB.tsv"
)
F_filename
<-
paste0
(
output_data_dir
,
"G2P_"
,
female_prefix
,
"max-avg_gs_pivalue_MSIGDB.tsv"
)
M_data
<-
read.delim
(
M_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
M_data
$
adj_pv
=
M_data
$
pv
*
dim
(
M_data
)[
1
]
M_data
$
adj_pv
[
M_data
$
adj_pv
>
1
]
<-
1
M_data
<-
M_data
%>%
select
(
pid
,
pname
,
adj_pv
)
%>%
filter
(
!
is.na
(
adj_pv
))
%>%
rename
(
ID
=
pid
,
Description
=
pname
,
p.adjust
=
adj_pv
)
minp_M
<-
min
(
M_data
$
p.adjust
[
M_data
$
p.adjust
>
0
])
M_data
$
p.adjust
[
M_data
$
p.adjust
==
0
]
<-
minp_M
/
100
F_data
<-
read.delim
(
F_filename
,
row.names
=
1
,
stringsAsFactors
=
FALSE
)
F_data
$
adj_pv
=
F_data
$
pv
*
dim
(
F_data
)[
1
]
F_data
$
adj_pv
[
F_data
$
adj_pv
>
1
]
<-
1
F_data
<-
F_data
%>%
select
(
pid
,
pname
,
adj_pv
)
%>%
filter
(
!
is.na
(
adj_pv
))
%>%
rename
(
ID
=
pid
,
Description
=
pname
,
p.adjust
=
adj_pv
)
minp_F
<-
min
(
F_data
$
p.adjust
[
F_data
$
p.adjust
>
0
])
F_data
$
p.adjust
[
F_data
$
p.adjust
==
0
]
<-
minp_F
/
100
gsp_data
<-
rbind
(
gsp_data
,
compute_delta
(
M_data
,
F_data
)
%>%
mutate
(
tool
=
"G2P"
,
source
=
"MSIGDB"
))
rm
(
M_filename
,
F_filename
,
M_data
,
F_data
,
minp_M
,
minp_F
)
# We do not run it yet for the ROT and PF tools since there is no obvious p-value to use (there are several).
# We can later adapt this script for these two tools as well.
...
...
07-Enrichment/pathways_with_gender_specific_genes.R
View file @
3e4e2db5
...
...
@@ -87,8 +87,8 @@ for (i in seq_len(length(config$integrations))) {
# We create a structure for all results.
gsp_data
<-
c
()
male_prefix
<-
paste0
(
integration
$
name
,
"_
VSN_
PDVsControl_males_"
)
female_prefix
<-
paste0
(
integration
$
name
,
"_
VSN_
PDVsControl_females_"
)
male_prefix
<-
paste0
(
integration
$
name
,
"_PDVsControl_males_"
)
female_prefix
<-
paste0
(
integration
$
name
,
"_PDVsControl_females_"
)
#
# KEGG by CP
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment