Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
GeneDER_core
Commits
0fab72eb
Commit
0fab72eb
authored
Jun 11, 2020
by
Leon-Charles Tranchevent
Browse files
Re-adjusting the GDS P values for the gender-dimorphic genes.
parent
69df6c97
Changes
5
Hide whitespace changes
Inline
Side-by-side
16-Data_integration_all/compute_
overlap_percentage
.R
→
16-Data_integration_all/compute_
dataset_overlap
.R
View file @
0fab72eb
...
...
@@ -124,122 +124,22 @@ rm(x, res_females, res_males)
FM_ref
<-
FM
%>%
select
(
SYMBOL
,
overlap_percentage
)
# We save the complete refining data as to keep track of things.
refining_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
selection
$
name
,
"_
FvsM_refining_
data.tsv"
)
refining_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_
ALL_
"
,
selection
$
name
,
"_
datasetoverlap_full
data.tsv"
)
write.table
(
FM
,
refining_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
row.names
=
FALSE
)
rm
(
FM
,
refining_fn
)
# D- We then enrich both raw and non raw results for females. The previous computation was based
# on the raw results (complete) but we also enrich the filtered results (because these are
# the results we want ot further investigate, at least now). We simply add one column that
# corresponds to the overlap presented as a percentage.
k
<-
5
limma_analysis
<-
config
$
limma_analyses
[[
k
]]
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration.tsv"
)
res_females
<-
read.delim
(
res_females_fn
,
row.names
=
1
,
quote
=
""
,
stringsAsFactors
=
FALSE
)
res_females
<-
merge
(
x
=
res_females
,
y
=
FM_ref
,
by
=
"SYMBOL"
,
ALL.x
=
TRUE
)
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration_refined.tsv"
)
write.table
(
res_females
,
res_females_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
row.names
=
FALSE
)
rm
(
res_females_fn
)
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration_raw.tsv"
)
res_females
<-
read.delim
(
res_females_fn
,
row.names
=
1
,
quote
=
""
,
stringsAsFactors
=
FALSE
)
res_females
<-
merge
(
x
=
res_females
,
y
=
FM_ref
,
by
=
"SYMBOL"
,
ALL.x
=
TRUE
)
res_females_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration_raw_refined.tsv"
)
write.table
(
res_females
,
res_females_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
row.names
=
FALSE
)
rm
(
k
,
limma_analysis
,
res_females_fn
)
# E- We enrich both raw and non raw results for males. The previous computation was based
# on the raw results (complete) but we also enrich the filtered results (because these are
# the results we want ot further investigate, at least now). We simply add one column that
# corresponds to the overlap presented as a percentage.
k
<-
6
limma_analysis
<-
config
$
limma_analyses
[[
k
]]
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration.tsv"
)
res_males
<-
read.delim
(
res_males_fn
,
row.names
=
1
,
quote
=
""
,
stringsAsFactors
=
FALSE
)
res_males
<-
merge
(
x
=
res_males
,
y
=
FM_ref
,
by
=
"SYMBOL"
,
ALL.x
=
TRUE
)
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration_refined.tsv"
)
write.table
(
res_males
,
res_males_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
row.names
=
FALSE
)
rm
(
res_males_fn
)
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration_raw.tsv"
)
res_males
<-
read.delim
(
res_males_fn
,
row.names
=
1
,
quote
=
""
,
stringsAsFactors
=
FALSE
)
res_males
<-
merge
(
x
=
res_males
,
y
=
FM_ref
,
by
=
"SYMBOL"
,
ALL.x
=
TRUE
)
res_males_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration_raw_refined.tsv"
)
write.table
(
res_males
,
res_males_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
row.names
=
FALSE
)
rm
(
k
,
limma_analysis
,
res_males_fn
)
# F- We enrich both raw and non raw results for GDS. The previous computation was based
# on the raw results (complete) but we also enrich the filtered results (because these are
# the results we want ot further investigate, at least now). We simply add one column that
# corresponds to the overlap presented as a percentage.
k
<-
7
limma_analysis
<-
config
$
limma_analyses
[[
k
]]
res_gds_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration.tsv"
)
res_gds
<-
read.delim
(
res_gds_fn
,
row.names
=
1
,
quote
=
""
,
stringsAsFactors
=
FALSE
)
res_gds
<-
merge
(
x
=
res_gds
,
y
=
FM_ref
,
by
=
"SYMBOL"
,
ALL.x
=
TRUE
)
res_gds_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration_refined.tsv"
)
write.table
(
res_gds
,
res_gds_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
row.names
=
FALSE
)
rm
(
res_gds_fn
)
res_gds_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration_raw.tsv"
)
res_gds
<-
read.delim
(
res_gds_fn
,
row.names
=
1
,
quote
=
""
,
stringsAsFactors
=
FALSE
)
res_gds
<-
merge
(
x
=
res_gds
,
y
=
FM_ref
,
by
=
"SYMBOL"
,
ALL.x
=
TRUE
)
res_gds_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_"
,
limma_analysis
$
name
,
"_"
,
selection
$
name
,
"_integration_raw_refined.tsv"
)
write.table
(
res_gds
,
res_gds_fn
,
# We save the simplified data (to be used directly for merging).
refining_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_ALL_"
,
selection
$
name
,
"_datasetoverlap.tsv"
)
write.table
(
FM_ref
,
refining_fn
,
quote
=
FALSE
,
sep
=
"\t"
,
row.names
=
FALSE
)
rm
(
k
,
limma_analysis
,
res_gds_fn
)
rm
(
FM_ref
,
res_females
,
res_males
,
res_gds
)
rm
(
FM_ref
,
refining_fn
)
# We clean the workspace and log the session details for reproducibility.
rm
(
l
,
selection
,
vsn
,
j
,
integration
,
int_criteria
,
i
)
...
...
16-Data_integration_all/compute_gender_specificity.R
View file @
0fab72eb
...
...
@@ -23,36 +23,22 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# Functions
# ================================================================================================
#' @title Function to compute the
delta
based on the Pi value
based
rankings.
#' @title Function to compute the
gender-specificity score
based on the Pi value rankings.
#'
#' @description This function takes a dataframe with the log fold changes and P values of two
#' comparisons termed "ref" and "ctrl". The idea is to
rank the genes according to
their
#' comparisons termed "ref" and "ctrl". The idea is to
compute a score representing
their
#' specificity for "ref" using "ctrl" to correct the non specific DEGs.
#' The specificity is based on the Pi values so the actual regulation (up/down) does not
#' matter and two genes with strong but opposite signals will not be considered specific.
#' @param FM The data-frame that contains the experimental data. The expected columns are
#' ref_logFC, ref_Pval, ref_adj_Pval, ctrl_logFC, ctrl_Pval, and ctrl_adj_Pval.
#' @param has_ctrl A boolean indicating whether a "ctrl" com
a
prison is present. If not then
#' @param has_ctrl A boolean indicating whether a "ctrl" comp
a
rison is present. If not then
#' only the Pi value computation and the associated rankings are performed for "ref", and
#' no delta values are returned.
#' @return An enriched data-frame that contains additional columns, in particular one that
#' contains the delta values, representing the specificity to the "ref" comparison (unless
#' has_ctrl was set to FALSE).
compute_delta
<-
function
(
FM
,
has_ctrl
=
TRUE
)
{
# Before we start with the real computation, we need to filter out the NA values in the control
# part of the data and keep them for later re-merging.
if
(
has_ctrl
)
{
FM_nas
<-
FM
%>%
filter
(
is.na
(
ctrl_Pval
))
%>%
mutate
(
ref_pivalue
=
NA
)
%>%
mutate
(
ctrl_pivalue
=
NA
)
%>%
mutate
(
ref_pivalue_rank
=
NA
)
%>%
mutate
(
ref_pivalue_rankratio
=
NA
)
%>%
mutate
(
ctrl_adj_pivalue
=
NA
)
%>%
mutate
(
ctrl_pivalue_rank
=
NA
)
%>%
mutate
(
ctrl_pivalue_rankratio
=
NA
)
%>%
mutate
(
Delta
=
NA
)
%>%
mutate
(
ranking_value
=
2
)
FM
<-
FM
%>%
filter
(
!
is.na
(
ctrl_Pval
))
}
#' contains the gender-specificity scores, representing the specificity to the "ref" comparison
#' (unless has_ctrl is set to FALSE).
compute_genderspecificity_scores
<-
function
(
FM
,
has_ctrl
=
TRUE
)
{
# First, we compute the Pi values for both comparisons ("ref" and "control"). For this, we use
# the nominal P values and the log fold changes. Using the corrected P values would be more
...
...
@@ -115,7 +101,7 @@ compute_delta <- function(FM, has_ctrl = TRUE) {
# The rank ratios of the genes based on the Pi value ranks (for the "ref"
# comparison). Values are [0, 1]. Small ratios correspond to high Pi values
# (and therefore significant DEGs).
FM
$
ref_pivalue_rank
<-
rank
(
-
FM
$
ref_pivalue
,
ties.method
=
"
min
"
)
FM
$
ref_pivalue_rank
<-
rank
(
-
FM
$
ref_pivalue
,
ties.method
=
"
average
"
)
max_rank
<-
max
(
FM
$
ref_pivalue_rank
)
FM
$
ref_pivalue_rankratio
<-
FM
$
ref_pivalue_rank
/
max_rank
rm
(
max_rank
)
...
...
@@ -126,73 +112,19 @@ compute_delta <- function(FM, has_ctrl = TRUE) {
return
(
FM
)
}
# Creating the ranks and rank ratios for "control", based on the Pi values.
# Things are similar than for "ref" but because these rank ratios will only be used to
# correct the "ref" rank ratios, they don't have the same role and we need to take into
# account special cases such as when we correct a gene that is up-expressed in "ref" but
# down-expressed in "control" (or the other way round). In these cases, we can not take the
# "control" Pi values as such since they do not represent the same thing than the "ref"
# Pi values.
#
# Examples:
# Gene A:
# ref: logFC = +2, P value = 1e-5, Pi value = 10
# ctrl: logFC = +1, P value = 1e-3, Pi value = 3
# => It makes sense to correct the "ref" results by the "ctrl" results since A is up-expressed
# in both comparisons.
# Gene B:
# ref: logFC = +2, P value = 1e-5, Pi value = 10
# ctrl: logFC = -3, P value = 1e-7, Pi value = 21
# => It does not make sense to correct the "ref" results by the "ctrl" results since A is
# up-expressed in "ref" and down-expressed in "ctrl".
#
# This is why when the fold changes in both comparisons are of different signs, we modify
# the ranks and rank ratios as if it would be a gene that is not differentially expressed in
# "control".
#
# Example:
# Gene B:
# ref: logFC = +2, P value = 1e-5, Pi value = 10
# ctrl: logFC = ~0, P value = ~1, Pi value = ~0 [pseudo-values only for computation]
# We get the minimum "ctrl" Pi value that is linked to a gene with concordant signal in both
# comparisons and whose P value is not 1 (in "ctrl").
min_pi
<-
min
(
FM
%>%
filter
(
sign
(
FM
$
ref_logFC
)
==
sign
(
FM
$
ctrl_logFC
))
%>%
filter
(
ctrl_Pval
<
1
)
%>%
select
(
ctrl_pivalue
))
# We will add one field:
# ctrl_adj_pivalue:
# The adjusted Pi values for the "control" comparison. Values are ~[0, 100].
# Small values correspond to non differentially expressed genes, high values
# correspond to differentially expressed genes. The difference with the non
# adjusted Pi values is that discordant cases (e.g., up in "ctrl" and down in
# "ref") and non differentially expressed genes (P value == 1) all get the
# same minimum Pi value as to not discriminate between these bottom line cases.
# We keep the original Pi values as is, but we create the adjusted Pi values.
FM
$
ctrl_adj_pivalue
<-
FM
$
ctrl_pivalue
# We then update the special cases mentioned above.
FM
$
ctrl_adj_pivalue
[
sign
(
FM
$
ref_logFC
)
!=
sign
(
FM
$
ctrl_logFC
)]
<-
min_pi
FM
$
ctrl_adj_pivalue
[
FM
$
ctrl_Pval
==
1
]
<-
min_pi
rm
(
min_pi
)
# Now creating the rankings per say.
# Now creating the ranks and rank ratios based on the Pi values for the "ctrl" comparison.
# We will add two fields:
# ctrl_pivalue_rank:
# The rank of the genes based on their Pi values (for the "ctrl" comparison).
# Values are ~[1, 22000]. Small ranks correspond to high Pi values (and therefore
# significant DEGs). High ranks correspond to either non differentially expressed
# genes (P value == 1) or to genes with discordant differential expression between
# "ref" and "ctrl" (e.g., up in "ctrl", down in "ref").
# significant DEGs). High ranks correspond to non differentially expressed
# genes (P value == 1).
# ctrl_pivalue_rankratio:
# The rank ratios of the genes based on the Pi value ranks (for the "ctrl"
# comparison). Values are [0, 1]. Small ratios correspond to high Pi values
# (and therefore significant DEGs). High ratios correspond to either non
# differentially expressed genes (P value == 1) or to genes with discordant
# differential expression between "ref" and "ctrl" (e.g., up in "ctrl",
# down in "ref").
FM
$
ctrl_pivalue_rank
<-
rank
(
-
FM
$
ctrl_adj_pivalue
,
ties.method
=
"min"
)
# (and therefore significant DEGs). High ratios correspond to non
# differentially expressed genes (P value == 1).
FM
$
ctrl_pivalue_rank
<-
rank
(
-
FM
$
ctrl_pivalue
,
ties.method
=
"average"
)
max_rank
<-
max
(
FM
$
ctrl_pivalue_rank
)
FM
$
ctrl_pivalue_rankratio
<-
FM
$
ctrl_pivalue_rank
/
max_rank
rm
(
max_rank
)
...
...
@@ -209,10 +141,8 @@ compute_delta <- function(FM, has_ctrl = TRUE) {
# in "ctrl").
FM
$
Delta
<-
FM
$
ref_pivalue_rankratio
-
FM
$
ctrl_pivalue_rankratio
# We return the enriched data-frame after we have added back the section we left apart at the
# begining (because of the NAs).
FM
$
ranking_value
<-
1
-
FM
$
Delta
FM
<-
rbind
(
FM
,
FM_nas
)
# We return the enriched data-frame.
FM
$
gender_specific_score
<-
1
-
FM
$
Delta
return
(
FM
)
}
...
...
@@ -235,6 +165,7 @@ for (i in seq_len(length(config$integrations))) {
# We otherwise use default parameters SVN = TRUE, probe selection = max-avg
# integration = Marot-Mayer.
# We use four of the seven limma comparisons.
#
# Note: we use the filtered results of the integration ("integration") as well
# as the non filtered results ("integration_raw") but only as a control. The idea
# is that to identify say female specific genes, we focus on the female DEGs
...
...
@@ -244,8 +175,6 @@ for (i in seq_len(length(config$integrations))) {
# and the raw results contain more of these non expressed or non differentially
# expressed genes).
limmas
<-
config
$
limma_analyses
B_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_VSN_"
,
limmas
[[
2
]]
$
name
,
"_max-avg_integration.tsv"
)
F_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_VSN_"
,
limmas
[[
5
]]
$
name
,
"_max-avg_integration.tsv"
)
Fr_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_VSN_"
,
limmas
[[
5
]]
$
name
,
...
...
@@ -254,21 +183,19 @@ for (i in seq_len(length(config$integrations))) {
"_max-avg_integration.tsv"
)
Mr_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_VSN_"
,
limmas
[[
6
]]
$
name
,
"_max-avg_integration_raw.tsv"
)
G_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_VSN_"
,
limmas
[[
7
]]
$
name
,
"_max-avg_integration.tsv"
)
B
<-
read.delim
(
B_fn
,
stringsAsFactors
=
FALSE
,
row.names
=
1
)
F
<-
read.delim
(
F_fn
,
stringsAsFactors
=
FALSE
,
row.names
=
1
)
Fr
<-
read.delim
(
Fr_fn
,
stringsAsFactors
=
FALSE
,
row.names
=
1
)
M
<-
read.delim
(
M_fn
,
stringsAsFactors
=
FALSE
,
row.names
=
1
)
Mr
<-
read.delim
(
Mr_fn
,
stringsAsFactors
=
FALSE
,
row.names
=
1
)
G
<-
read.delim
(
G_fn
,
stringsAsFactors
=
FALSE
,
row.names
=
1
)
rm
(
B_fn
,
F_fn
,
Fr_fn
,
M_fn
,
Mr_fn
,
G_fn
)
rm
(
F_fn
,
Fr_fn
,
M_fn
,
Mr_fn
)
# We start by merging the male and female rankings. Most genes are present in both and the few
# that are not are removed (since we can not say whether they are gender specific or not).
FM
<-
merge
(
x
=
F
,
y
=
Mr
,
by
=
"SYMBOL"
,
all.x
=
TRUE
)
MF
<-
merge
(
x
=
M
,
y
=
Fr
,
by
=
"SYMBOL"
,
all.x
=
TRUE
)
rm
(
Fr
,
Mr
)
## FM <- merge(x = F, y = Mr, by = "SYMBOL", all.x = TRUE)
## MF <- merge(x = M, y = Fr, by = "SYMBOL", all.x = TRUE)
FM
<-
merge
(
x
=
Fr
,
y
=
Mr
,
by
=
"SYMBOL"
)
MF
<-
merge
(
x
=
Mr
,
y
=
Fr
,
by
=
"SYMBOL"
)
rm
(
Fr
,
Mr
,
F
,
M
)
# We select the fields we need:
# Gene:
...
...
@@ -293,26 +220,6 @@ for (i in seq_len(length(config$integrations))) {
# ctrl_adj_Pval:
# The corrected P value associated with the "ctrl" comparison.
# Values are [0, 1]. The best P values are the smallest.
B
<-
B
%>%
mutate
(
Gene
=
SYMBOL
,
ref_logFC
=
wavg_csss_logFC
,
ref_Pval
=
int_csss_pval_marotmayer
,
ref_adj_Pval
=
int_csss_adj_pval_marotmayer
)
%>%
select
(
Gene
,
ref_logFC
,
ref_Pval
,
ref_adj_Pval
)
F
<-
F
%>%
mutate
(
Gene
=
SYMBOL
,
ref_logFC
=
wavg_csss_logFC
,
ref_Pval
=
int_csss_pval_marotmayer
,
ref_adj_Pval
=
int_csss_adj_pval_marotmayer
)
%>%
select
(
Gene
,
ref_logFC
,
ref_Pval
,
ref_adj_Pval
)
M
<-
M
%>%
mutate
(
Gene
=
SYMBOL
,
ref_logFC
=
wavg_csss_logFC
,
ref_Pval
=
int_csss_pval_marotmayer
,
ref_adj_Pval
=
int_csss_adj_pval_marotmayer
)
%>%
select
(
Gene
,
ref_logFC
,
ref_Pval
,
ref_adj_Pval
)
G
<-
G
%>%
mutate
(
Gene
=
SYMBOL
,
ref_logFC
=
wavg_csss_logFC
,
ref_Pval
=
int_csss_pval_marotmayer
,
ref_adj_Pval
=
int_csss_adj_pval_marotmayer
)
%>%
select
(
Gene
,
ref_logFC
,
ref_Pval
,
ref_adj_Pval
)
FM
<-
FM
%>%
mutate
(
Gene
=
SYMBOL
,
ref_logFC
=
wavg_csss_logFC.x
,
ref_Pval
=
int_csss_pval_marotmayer.x
,
...
...
@@ -329,44 +236,33 @@ for (i in seq_len(length(config$integrations))) {
ctrl_Pval
=
int_csss_pval_marotmayer.y
,
ctrl_adj_Pval
=
int_csss_adj_pval_marotmayer.y
)
%>%
select
(
Gene
,
ref_logFC
,
ref_Pval
,
ref_adj_Pval
,
ctrl_logFC
,
ctrl_Pval
,
ctrl_adj_Pval
)
# We compute the delta in both cases.
B_enriched
<-
compute_delta
(
B
,
has_ctrl
=
FALSE
)
F_enriched
<-
compute_delta
(
F
,
has_ctrl
=
FALSE
)
M_enriched
<-
compute_delta
(
M
,
has_ctrl
=
FALSE
)
G_enriched
<-
compute_delta
(
G
,
has_ctrl
=
FALSE
)
FM_enriched
<-
compute_delta
(
FM
)
MF_enriched
<-
compute_delta
(
MF
)
rm
(
FM
,
MF
,
G
,
F
,
M
,
B
)
FM_enriched
<-
compute_genderspecificity_scores
(
FM
)
MF_enriched
<-
compute_genderspecificity_scores
(
MF
)
rm
(
FM
,
MF
)
# We save the data.
B_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
limmas
[[
2
]]
$
name
,
"_rankings.tsv"
)
F_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
limmas
[[
5
]]
$
name
,
"_rankings.tsv"
)
M_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
limmas
[[
6
]]
$
name
,
"_rankings.tsv"
)
G_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
limmas
[[
7
]]
$
name
,
"_rankings.tsv"
)
FM_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
limmas
[[
5
]]
$
name
,
"_specific_rankings.tsv"
)
MF_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
limmas
[[
6
]]
$
name
,
"_specific_rankings.tsv"
)
write.table
(
B_enriched
,
file
=
B_ofile
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
write.table
(
F_enriched
,
file
=
F_ofile
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
write.table
(
M_enriched
,
file
=
M_ofile
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
write.table
(
G_enriched
,
file
=
G_ofile
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
# We save the full data.
FM_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_VSN_"
,
limmas
[[
5
]]
$
name
,
"_max-avg_genderspecificityscore_fulldata.tsv"
)
MF_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_VSN_"
,
limmas
[[
6
]]
$
name
,
"_max-avg_genderspecificityscore_fulldata.tsv"
)
write.table
(
FM_enriched
,
file
=
FM_ofile
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
write.table
(
MF_enriched
,
file
=
MF_ofile
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
rm
(
B_ofile
,
F_ofile
,
M_ofile
,
G_ofile
,
FM_ofile
,
MF_ofile
)
rm
(
FM_ofile
,
MF_ofile
)
# Manual exploration.
print
(
"Top female genes (Pi values):"
)
F_enriched
%>%
arrange
(
ref_pivalue_rankratio
)
%>%
head
(
5
)
print
(
"Top male genes (Pi values):"
)
M_enriched
%>%
arrange
(
ref_pivalue_rankratio
)
%>%
head
(
5
)
print
(
"Top factorial genes (Pi values):"
)
G_enriched
%>%
arrange
(
ref_pivalue_rankratio
)
%>%
head
(
5
)
print
(
"Top female specific genes (Delta):"
)
FM_enriched
%>%
arrange
(
Delta
)
%>%
head
(
5
)
print
(
"Top male specific genes (Delta):"
)
MF_enriched
%>%
arrange
(
Delta
)
%>%
head
(
5
)
# We save the simplified data (to be merged later on).
FM_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_VSN_"
,
limmas
[[
5
]]
$
name
,
"_max-avg_genderspecificityscore.tsv"
)
MF_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_VSN_"
,
limmas
[[
6
]]
$
name
,
"_max-avg_genderspecificityscore.tsv"
)
FM_enriched_short
<-
FM_enriched
%>%
select
(
Gene
,
ref_logFC
,
ref_Pval
,
ref_adj_Pval
,
ctrl_logFC
,
ctrl_Pval
,
ctrl_adj_Pval
,
gender_specific_score
)
MF_enriched_short
<-
MF_enriched
%>%
select
(
Gene
,
ref_logFC
,
ref_Pval
,
ref_adj_Pval
,
ctrl_logFC
,
ctrl_Pval
,
ctrl_adj_Pval
,
gender_specific_score
)
write.table
(
FM_enriched_short
,
file
=
FM_ofile
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
write.table
(
MF_enriched_short
,
file
=
MF_ofile
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
rm
(
FM_ofile
,
MF_ofile
,
FM_enriched_short
,
MF_enriched_short
)
# Summary statistics of DEG versus all genes.
# The DEGs should have more negative than positive Delta values in general, because
...
...
@@ -384,10 +280,10 @@ for (i in seq_len(length(config$integrations))) {
# This display the effect of the correction (vertically shift = correction effect).
# The bottom line is the genes for which there is litle effect. Its position
# also reflects how many genes are corrected.
FM_deltaplot_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
limmas
[[
5
]]
$
name
,
"_specific_deltaplot.png"
)
MF_deltaplot_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
limmas
[[
6
]]
$
name
,
"_specific_deltaplot.png"
)
FM_deltaplot_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_
VSN_
"
,
limmas
[[
5
]]
$
name
,
"_
max-avg_
specific_deltaplot.png"
)
MF_deltaplot_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_
VSN_
"
,
limmas
[[
6
]]
$
name
,
"_
max-avg_
specific_deltaplot.png"
)
png
(
FM_deltaplot_ofile
)
plot
(
FM_enriched
$
ref_pivalue_rank
,
rank
(
FM_enriched
$
Delta
))
dev.off
()
...
...
@@ -396,36 +292,7 @@ for (i in seq_len(length(config$integrations))) {
dev.off
()
rm
(
FM_deltaplot_ofile
,
MF_deltaplot_ofile
)
# Another way to look at it.
# The gender DEGs and the gender specific DEGs are different if the correction is effective.
# Thus the mean/median change (correction effect) in ranks can be computed (it should not
# be null).
FM_rank_diff
<-
FM_enriched
$
ref_pivalue_rank
-
rank
(
FM_enriched
$
Delta
)
MF_rank_diff
<-
MF_enriched
$
ref_pivalue_rank
-
rank
(
MF_enriched
$
Delta
)
print
(
"[Female] Rank changes between Pi and Delta:"
)
summary
(
abs
(
FM_rank_diff
))
print
(
"[Male] Rank changes between Pi and Delta:"
)
summary
(
abs
(
MF_rank_diff
))
rm
(
FM_rank_diff
,
MF_rank_diff
)
# We study the differences between the factorial limma analysis and our lists
# of gender specific genes. Technically, they are not the same but there could be
# some overlap. However the way we do things might not allow us to detect it in this plot.
G_FM
<-
merge
(
x
=
G_enriched
,
y
=
FM_enriched
,
by.x
=
"Gene"
,
by.y
=
"Gene"
)
G_MF
<-
merge
(
x
=
G_enriched
,
y
=
MF_enriched
,
by.x
=
"Gene"
,
by.y
=
"Gene"
)
G_FM_cmp_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
limmas
[[
7
]]
$
name
,
"_vs_"
,
limmas
[[
5
]]
$
name
,
"_specific.png"
)
G_MF_cmp_ofile
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
limmas
[[
6
]]
$
name
,
"_vs_"
,
limmas
[[
6
]]
$
name
,
"_specific.png"
)
png
(
G_FM_cmp_ofile
)
plot
(
G_FM
$
ref_pivalue_rankratio.x
,
rank
(
G_FM
$
Delta
))
dev.off
()
png
(
G_MF_cmp_ofile
)
plot
(
G_MF
$
ref_pivalue_rankratio.x
,
rank
(
G_MF
$
Delta
))
dev.off
()
rm
(
G_FM
,
G_MF
,
G_FM_cmp_ofile
,
G_MF_cmp_ofile
)
rm
(
B_enriched
,
G_enriched
,
F_enriched
,
M_enriched
,
FM_enriched
,
MF_enriched
)
rm
(
FM_enriched
,
MF_enriched
)
rm
(
integration
,
limmas
)
}
...
...
16-Data_integration_all/extra_figures.R
View file @
0fab72eb
...
...
@@ -390,8 +390,8 @@ plot_gene(gene = gene,
selection_name
=
selection
$
name
,
palette
=
current_plt
,
clin_data_all
,
exp_data_all
,
zexp_data_all
,
m_indx_map
,
config
,
#
plot_all = FALSE, plot_datasets = TRUE)
plot_all
=
TRUE
,
plot_datasets
=
FALSE
)
plot_all
=
FALSE
,
plot_datasets
=
TRUE
)
#
plot_all = TRUE, plot_datasets = FALSE)
gene_datasets
<-
c
(
"GSE20292"
,
"GSE8397"
,
"GSE49036"
)
plot_gene
(
gene
=
gene
,
gene_datasets
=
gene_datasets
,
...
...
@@ -457,8 +457,8 @@ limma_analysis <- config$limma_analyses[[k]]
#gene_datasets <- c("GSE20163", "GSE20292", "GSE26927", "GSE49036", "GSE8397", "NBB")
#gene <- "CD93"
#gene_datasets <- c("GSE20159", "GSE20163", "GSE26927", "GSE49036", "GSE8397", "NBB")
#
gene <- "
CYP1B1
"
#
gene_datasets <- c("GSE20159", "GSE20163", "GSE20292", "GSE26927", "GSE49036", "GSE8397", "NBB")
gene
<-
"
RFC5
"
gene_datasets
<-
c
(
"GSE20159"
,
"GSE20163"
,
"GSE20292"
,
"GSE26927"
,
"GSE49036"
,
"GSE8397"
,
"NBB"
)
gene
<-
"EFNA1"
gene_datasets
<-
c
(
"GSE20159"
,
"GSE26927"
,
"GSE49036"
,
"GSE8397"
,
"NBB"
)
...
...
@@ -470,8 +470,8 @@ plot_gene(gene = gene,
selection_name
=
selection
$
name
,
palette
=
current_plt
,
clin_data_all
,
exp_data_all
,
zexp_data_all
,
m_indx_map
,
config
,
#
plot_all = FALSE, plot_datasets = TRUE)
plot_all
=
TRUE
,
plot_datasets
=
FALSE
)
plot_all
=
FALSE
,
plot_datasets
=
TRUE
)
#
plot_all = TRUE, plot_datasets = FALSE)
gene_datasets
<-
c
(
"GSE49036"
,
"NBB"
,
"GSE26927"
)
plot_gene
(
gene
=
gene
,
gene_datasets
=
gene_datasets
,
...
...
16-Data_integration_all/merge_and_filter_rankings.R
0 → 100644
View file @
0fab72eb
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library
(
"yaml"
)
library
(
"tidyverse"
)
library
(
"stringr"
)
source
(
"../libs/conf/confR.R"
)
source
(
"../libs/utils/utils.R"
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Libraries loaded."
))
# ================================================================================================
# Configuration
# ================================================================================================
options
(
bitmapType
=
"cairo"
)
config
<-
read_config
(
config_dirs
=
c
(
"../Confs/"
,
"./"
))
output_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_data_dir
)
input_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_input_data_dir
)
input_edata_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_exp_data_dir
)
input_rsedata_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_rs_exp_data_dir
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Configuration done."
))
# ================================================================================================
# Functions
# ================================================================================================
# ================================================================================================
# Main
# ================================================================================================
# The aim of this script is to take the results of the integration and create gene lists that can
# be further analysed by GSEA and network-based methods. Several filters are applied and new
# scores / metrics are also computed. The general idea is to define the gender-specific and
# gender-dimorphic genes that make sense while keeping the workfow as simple as possible.
# Filtering parameters
specificity_threshold
<-
1.1
overlap_threshold
<-
0.5
# We focus on one configuration, our main configuration (SNage, VSN, max-avg).
i
<-
3
integration
<-
config
$
integrations
[[
i
]]
int_criteria
<-
str_split
(
integration
$
criteria
,
","
)[[
1
]]
j
<-
1
vsn
<-
config
$
variance_methods
[[
j
]]
l
<-
1
selection
<-
config
$
selections
[[
l
]]
# We first read the dataset overlap data (by definition, common to males and females).
data_over_fn
<-
paste0
(
output_data_dir
,
integration
$
name
,
"_"
,
vsn
$
name
,
"_ALL_"
,
selection
$
name
,
"_datasetoverlap.tsv"
)
data_over
<-
read.delim
(
data_over_fn
,
quote
=
""
,
stringsAsFactors
=
FALSE
)
%>%
mutate
(
Gene
=
SYMBOL
)
%>%
select
(
Gene
,
overlap_percentage
)