Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
GeneDER_core
Commits
77f8c71f
Commit
77f8c71f
authored
Nov 18, 2020
by
Leon-Charles Tranchevent
Browse files
Minor modifications in step 06 to optimize the code (duplicate_rows).
parent
fe85f14c
Changes
7
Expand all
Hide whitespace changes
Inline
Side-by-side
06-Data_integration/analyse.sh
View file @
77f8c71f
...
...
@@ -24,7 +24,6 @@ module load lang/R/3.6.0-foss-2019a-bare
# Actual jobs
Rscript
--vanilla
${
CODE_FOLDER
}
analyse_integration_results.R
>
${
OUTPUT_FOLDER
}
analyse_log.out 2>
${
OUTPUT_FOLDER
}
analyse_log.err
Rscript
--vanilla
${
CODE_FOLDER
}
analyse_integration_results_figures.R
>
${
OUTPUT_FOLDER
}
analyse_fig_log.out 2>
${
OUTPUT_FOLDER
}
analyse_fig_log.err
Rscript
--vanilla
${
CODE_FOLDER
}
compare_integrations.R
>
${
OUTPUT_FOLDER
}
compare_integrations_log.out 2>
${
OUTPUT_FOLDER
}
compare_integrations_log.err
# Moving the slurm log file to data
mv
${
CODE_FOLDER
}
slurm-
*
out
${
OUTPUT_FOLDER
}
...
...
06-Data_integration/analyse_integration_results.R
View file @
77f8c71f
This diff is collapsed.
Click to expand it.
06-Data_integration/analyse_integration_results_figures.R
View file @
77f8c71f
...
...
@@ -25,6 +25,26 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# Functions
# ================================================================================================
#' Function to duplicate data-frame rows based on one key field.
#' This field is used to do the duplication based on a string split. The other fields are
#' simply copied as is.
duplicate_row_multiple_gene_names
<-
function
(
row
,
ikey
=
3
,
sep
=
"|"
)
{
keys
<-
strsplit
(
as.character
(
row
[
ikey
]),
sep
,
fixed
=
TRUE
)[[
1
]]
if
(
length
(
keys
)
==
0
)
{
keys
<-
""
}
# We duplicate all fields.
new_data
<-
c
()
for
(
n
in
seq_len
(
length
(
row
)))
{
new_data
<-
cbind
(
new_data
,
rep
(
row
[
n
],
length
(
keys
)))
}
# We adjust the keys and the column names.
new_data
[,
ikey
]
<-
keys
return
(
new_data
)
}
#' @title Plot the differential expression of top hits.
#'
#' @description This function create one plot for each top hit (gene) of a given limma
...
...
@@ -55,7 +75,7 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
nb_hits
=
10
)
{
# We set the I/Os.
file_prefix_short
<-
paste0
(
output_data_dir
,
integration_name
,
"_"
,
vsn
$
name
,
"_"
,
file_prefix_short
<-
paste0
(
output_data_dir
,
integration_name
,
"_"
,
limma_analysis
$
name
,
"_"
)
# Read the matching data.
...
...
@@ -65,7 +85,14 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
sep
=
"\t"
,
row.names
=
NULL
,
as.is
=
TRUE
)
rm
(
matching_data_fn
)
tmp_res
<-
t
(
apply
(
matching_data
,
1
,
duplicate_row_multiple_gene_names
))
matching_data_clean
<-
data.frame
(
do.call
(
rbind
,
c
(
tmp_res
)),
row.names
=
NULL
,
stringsAsFactors
=
FALSE
)
names
(
matching_data_clean
)
<-
names
(
matching_data
)
matching_data
<-
matching_data_clean
rm
(
matching_data_fn
,
tmp_res
,
matching_data_clean
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Matching data read."
))
# We read the results of the integration.
...
...
@@ -103,8 +130,9 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
# We get the relevant probe identifiers.
gene
<-
top_hits
[
p
,
]
matching_data_gene
<-
matching_data
%>%
dplyr
::
filter
(
SYMBOL
==
gene
$
SYMBOL
)
%>%
dplyr
::
filter
(
genes
==
gene
$
SYMBOL
)
%>%
dplyr
::
select
(
ends_with
(
str_replace
(
selection_name
,
"-"
,
"."
)))
# We do all datasets one by one and collect the clinical and expression data.
...
...
@@ -122,8 +150,16 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
dataset_arraytype
<-
stringr
::
str_replace
(
dataset
$
array_type
,
"-"
,
"."
)
probeset_tag
<-
paste0
(
dataset_arraytype
,
"_"
,
stringr
::
str_replace
(
selection_name
,
"-"
,
"."
))
probe_id
<-
matching_data_gene
[[
probeset_tag
]]
rm
(
dataset
,
probeset_tag
,
dataset_arraytype
,
jj
)
probe_ids
<-
matching_data_gene
[[
probeset_tag
]]
# We here rely on the fact that even in case of multiple gene mapping only one probe
# should be present here. We still log a warning in case it is not the case
probe_id
<-
probe_ids
[
!
is.na
(
probe_ids
)]
if
(
length
(
probe_id
)
>
1
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Gene "
,
gene
$
SYMBOL
,
" has multiple probes for dataset "
,
dataset_name
,
" ("
,
dataset_arraytype
,
"): "
,
paste0
(
probe_ids
,
collapse
=
"-"
)))
}
rm
(
dataset
,
probeset_tag
,
dataset_arraytype
,
jj
,
probe_ids
)
# We read the clinical data.
clin_data
<-
clin_data_all
[[
ii
]]
%>%
...
...
@@ -135,8 +171,10 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
exp_data
<-
t
(
exp_data_all
[[
ii
]][
1
,
])
exp_data
[,
1
]
<-
NA
if
(
!
is.null
(
probe_id
))
{
if
(
!
is.na
(
probe_id
))
{
exp_data
<-
t
(
exp_data_all
[[
ii
]][
probe_id
,
])
if
(
length
(
probe_id
)
>
0
)
{
if
(
!
is.na
(
probe_id
))
{
exp_data
<-
t
(
exp_data_all
[[
ii
]][
probe_id
,
])
}
}
}
colnames
(
exp_data
)
<-
"Expression"
...
...
@@ -146,8 +184,10 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
zexp_data
<-
t
(
zexp_data_all
[[
ii
]][
1
,
])
zexp_data
[,
1
]
<-
NA
if
(
!
is.null
(
probe_id
))
{
if
(
!
is.na
(
probe_id
))
{
zexp_data
<-
t
(
zexp_data_all
[[
ii
]][
probe_id
,
])
if
(
length
(
probe_id
)
>
0
)
{
if
(
!
is.na
(
probe_id
))
{
zexp_data
<-
t
(
zexp_data_all
[[
ii
]][
probe_id
,
])
}
}
}
colnames
(
zexp_data
)
<-
"Expression"
...
...
@@ -243,8 +283,7 @@ ctrl_plt <- c("#7da7de", "#c2e0ae")
# (so far regardless of the significance). Data are plotted per dataset (each one in an individual
# subplot) as well as all together (when the data are first transformed into z-scores).
# We create once for all the list of all datasets (microarray and RNA-seq).
datasets
<-
c
(
config
$
datasets
,
config
$
rs_datasets
)
datasets
<-
config
$
datasets
# We do all integration schemes one by one.
for
(
i
in
seq_len
(
length
(
config
$
integrations
)))
{
...
...
@@ -253,115 +292,104 @@ for (i in seq_len(length(config$integrations))) {
integration
<-
config
$
integrations
[[
i
]]
int_criteria
<-
str_split
(
integration
$
criteria
,
","
)[[
1
]]
# We repeat the analysis for all VSN usages.
for
(
j
in
seq_len
(
length
(
config
$
variance_methods
)))
{
# We get the current VSN configuration.
vsn
<-
config
$
variance_methods
[[
j
]]
# We loop over the different limma analyses.
for
(
k
in
seq_len
(
length
(
config
$
limma_analyses
)))
{
# We define the current limma analysis.
limma_analysis
<-
config
$
limma_analyses
[[
k
]]
# We define the color palette accordingly.
current_plt
<-
gds_plt
if
(
k
==
1
|
k
==
3
)
{
current_plt
<-
diz_plt
}
else
if
(
k
==
4
)
{
current_plt
<-
ctrl_plt
}
else
if
(
k
==
6
)
{
current_plt
<-
male_plt
}
print
(
paste0
(
k
,
" set to "
,
str
(
current_plt
)))
print
(
" "
)
# We loop over the different gene-probe selection methods.
for
(
l
in
seq_len
(
length
(
config
$
selections
)))
{
# We define the current gene-probe selection method.
selection
<-
config
$
selections
[[
l
]]
# We loop over all datasets to read only the relevant datasets
exp_data_all
<-
list
()
zexp_data_all
<-
list
()
clin_data_all
<-
list
()
m_indx
<-
1
m_indx_map
<-
list
()
for
(
m
in
seq_len
(
length
(
datasets
)))
{
# We extract the configuration of the current dataset.
dataset
<-
datasets
[[
m
]]
dataset_name
<-
dataset
$
dataset_name
dataset_arraytype
<-
dataset
$
array_type
# We only read the dataset if it corresponds to the integration criteria.
satisfy_all
<-
TRUE
for
(
c
in
seq_len
(
length
(
int_criteria
)))
{
int_criterium
<-
str_split
(
int_criteria
[
c
],
";"
)[[
1
]]
if
(
dataset
[
int_criterium
[
1
]]
!=
int_criterium
[
2
])
{
satisfy_all
<-
FALSE
}
rm
(
int_criterium
)
}
rm
(
c
)
if
(
satisfy_all
!=
TRUE
)
{
rm
(
satisfy_all
,
dataset
,
dataset_name
,
dataset_arraytype
)
next
# We loop over the different limma analyses.
for
(
k
in
seq_len
(
length
(
config
$
limma_analyses
)))
{
# We define the current limma analysis.
limma_analysis
<-
config
$
limma_analyses
[[
k
]]
# We define the color palette accordingly.
current_plt
<-
gds_plt
if
(
k
==
1
|
k
==
3
)
{
current_plt
<-
diz_plt
}
else
if
(
k
==
4
)
{
current_plt
<-
ctrl_plt
}
else
if
(
k
==
6
)
{
current_plt
<-
male_plt
}
print
(
paste0
(
k
,
" set to "
,
str
(
current_plt
)))
print
(
" "
)
# We loop over the different gene-probe selection methods.
for
(
l
in
seq_len
(
length
(
config
$
selections
)))
{
# We define the current gene-probe selection method.
selection
<-
config
$
selections
[[
l
]]
# We loop over all datasets to read only the relevant datasets
exp_data_all
<-
list
()
zexp_data_all
<-
list
()
clin_data_all
<-
list
()
m_indx
<-
1
m_indx_map
<-
list
()
for
(
m
in
seq_len
(
length
(
datasets
)))
{
# We extract the configuration of the current dataset.
dataset
<-
datasets
[[
m
]]
dataset_name
<-
dataset
$
dataset_name
dataset_arraytype
<-
dataset
$
array_type
# We only read the dataset if it corresponds to the integration criteria.
satisfy_all
<-
TRUE
for
(
c
in
seq_len
(
length
(
int_criteria
)))
{
int_criterium
<-
str_split
(
int_criteria
[
c
],
";"
)[[
1
]]
if
(
dataset
[
int_criterium
[
1
]]
!=
int_criterium
[
2
])
{
satisfy_all
<-
FALSE
}
rm
(
int_criterium
)
}
rm
(
c
)
if
(
satisfy_all
!=
TRUE
)
{
rm
(
satisfy_all
,
dataset
,
dataset_name
,
dataset_arraytype
)
next
}
# We define the dataset specific I/Os.
local_input_edata_dir
<-
input_edata_dir
file_suffix
<-
paste0
(
"_"
,
vsn
$
name
,
".tsv"
)
file_long_suffix
<-
paste0
(
"_"
,
vsn
$
name
,
"_zscores.tsv"
)
if
(
dataset_arraytype
==
"RNAseq_EG"
)
{
local_input_edata_dir
<-
input_rsedata_dir
file_suffix
<-
paste0
(
"_cpm.tsv"
)
file_long_suffix
<-
paste0
(
"_cpm_zscores.tsv"
)
}
exp_data_fn
<-
paste0
(
local_input_edata_dir
,
dataset_name
,
"_normalized"
,
file_suffix
)
zexp_data_fn
<-
paste0
(
local_input_edata_dir
,
dataset_name
,
"_normalized"
,
file_long_suffix
)
clin_data_fn
<-
paste0
(
local_input_edata_dir
,
dataset_name
,
"_clinical.tsv"
)
rm
(
local_input_edata_dir
,
file_suffix
,
file_long_suffix
)
# We read the data.
clin_data_all
[[
m_indx
]]
<-
read.table
(
clin_data_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
NULL
,
as.is
=
TRUE
)
exp_data_all
[[
m_indx
]]
<-
read.table
(
exp_data_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
1
,
as.is
=
TRUE
)
zexp_data_all
[[
m_indx
]]
<-
read.table
(
zexp_data_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
1
,
as.is
=
TRUE
)
m_indx_map
[[
m_indx
]]
<-
m
m_indx
<-
m_indx
+
1
rm
(
dataset
,
dataset_name
,
dataset_arraytype
,
exp_data_fn
,
zexp_data_fn
,
clin_data_fn
)
}
# End for each dataset.
rm
(
m
)
# Actual plotting of the top hits (as we ll as the genes we want to plot anyway from
# the local configuration file).
plot_top_hits
(
output_data_dir
=
output_data_dir
,
integration_name
=
integration
$
name
,
limma_analysis
=
limma_analysis
,
selection_name
=
selection
$
name
,
palette
=
current_plt
,
clin_data_all
,
exp_data_all
,
zexp_data_all
,
m_indx_map
,
config
)
rm
(
selection
,
exp_data_all
,
zexp_data_all
,
clin_data_all
,
m_indx
,
m_indx_map
)
}
# End for each gene-probe selection method.
rm
(
l
,
limma_analysis
,
current_plt
)
}
# End for each limma analysis.
rm
(
k
,
vsn
)
}
# End for each variance configuration.
rm
(
j
,
integration
,
int_criteria
)
# We define the dataset specific I/Os.
local_input_edata_dir
<-
input_edata_dir
local_input_data_dir
<-
input_data_dir
vsn_name
<-
config
$
modifications
[[
m
]]
$
best_variance
file_suffix
<-
paste0
(
"_"
,
vsn_name
,
".tsv"
)
file_long_suffix
<-
paste0
(
"_"
,
vsn_name
,
"_zscores.tsv"
)
exp_data_fn
<-
paste0
(
local_input_edata_dir
,
dataset_name
,
"_normalized"
,
file_suffix
)
zexp_data_fn
<-
paste0
(
local_input_data_dir
,
dataset_name
,
"_normalized"
,
file_long_suffix
)
clin_data_fn
<-
paste0
(
local_input_edata_dir
,
dataset_name
,
"_clinical.tsv"
)
rm
(
local_input_edata_dir
,
local_input_data_dir
,
vsn_name
,
file_suffix
,
file_long_suffix
)
# We read the data.
clin_data_all
[[
m_indx
]]
<-
read.table
(
clin_data_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
NULL
,
as.is
=
TRUE
)
exp_data_all
[[
m_indx
]]
<-
read.table
(
exp_data_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
1
,
as.is
=
TRUE
)
zexp_data_all
[[
m_indx
]]
<-
read.table
(
zexp_data_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
1
,
as.is
=
TRUE
)
m_indx_map
[[
m_indx
]]
<-
m
m_indx
<-
m_indx
+
1
rm
(
dataset
,
dataset_name
,
dataset_arraytype
,
exp_data_fn
,
zexp_data_fn
,
clin_data_fn
)
}
# End for each dataset.
rm
(
m
)
# Actual plotting of the top hits (as we ll as the genes we want to plot anyway from
# the local configuration file).
plot_top_hits
(
output_data_dir
=
output_data_dir
,
integration_name
=
integration
$
name
,
limma_analysis
=
limma_analysis
,
selection_name
=
selection
$
name
,
palette
=
current_plt
,
clin_data_all
,
exp_data_all
,
zexp_data_all
,
m_indx_map
,
config
)
rm
(
selection
,
exp_data_all
,
zexp_data_all
,
clin_data_all
,
m_indx
,
m_indx_map
)
}
# End for each gene-probe selection method.
rm
(
l
,
limma_analysis
,
current_plt
)
}
# End for each limma analysis.
rm
(
k
,
integration
,
int_criteria
)
}
# End for each data integration scheme.
rm
(
i
,
datasets
,
ctrl_plt
,
diz_plt
,
gds_plt
,
male_plt
)
...
...
06-Data_integration/compare_integrations.R
deleted
100644 → 0
View file @
fe85f14c
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library
(
"yaml"
)
library
(
"ggplot2"
)
library
(
"tidyverse"
)
source
(
"../libs/conf/confR.R"
)
source
(
"../libs/utils/utils.R"
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Libraries loaded."
))
# ================================================================================================
# Configuration
# ================================================================================================
options
(
bitmapType
=
"cairo"
)
config
<-
read_config
(
config_dirs
=
c
(
"../Confs/"
,
"./"
))
output_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_data_dir
)
input_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_input_data_dir
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Configuration done."
))
# ================================================================================================
# Functions
# ================================================================================================
#' @title Boxplot of P values from common / unique genes from two analyses.
#'
#' @descrtiption Function to plot the -log10( P values) of the genes found in common between the SN and
#' SNage analyses and the genes unique to each analysis.
#'
#' @param exp_tag The string tag corresponding to the limma analysis under study.
#' @param nb_genes The number of top genes to consider when comparing the analyses.
plotit
<-
function
(
exp_tag
=
"Gender_disease_status"
,
nb_genes
=
100
)
{
# Read the data from the current integration (MA+RS) and the previous one using only
# microarray data (MA).
F06_fn
<-
paste0
(
output_data_dir
,
"SN_VSN_"
,
exp_tag
,
"_max-avg_integration.tsv"
)
F06f
<-
read.delim
(
F06_fn
,
header
=
TRUE
,
stringsAsFactors
=
FALSE
)
%>%
mutate
(
pvalue
=
int_csss_pval_marotmayer
)
%>%
select
(
SYMBOL
,
pvalue
)
%>%
arrange
(
pvalue
)
F16_fn
<-
paste0
(
output_data_dir
,
"SNage_VSN_"
,
exp_tag
,
"_max-avg_integration.tsv"
)
F16f
<-
read.delim
(
F16_fn
,
header
=
TRUE
,
stringsAsFactors
=
FALSE
)
%>%
mutate
(
pvalue
=
int_csss_pval_marotmayer
)
%>%
select
(
SYMBOL
,
pvalue
)
%>%
arrange
(
pvalue
)
F06
<-
head
(
F06f
,
nb_genes
)
F16
<-
head
(
F16f
,
nb_genes
)
rm
(
F06_fn
,
F16_fn
)
# We compute the overall rank correlation.
Ff
<-
merge
(
x
=
F06f
,
y
=
F16f
,
by
=
"SYMBOL"
)
res
<-
cor.test
(
Ff
$
pvalue.x
,
Ff
$
pvalue.y
,
method
=
"spearman"
)
print
(
paste0
(
"["
,
exp_tag
,
"] rho = "
,
round
(
res
$
estimate
,
3
)))
rm
(
F06f
,
F16f
,
Ff
,
res
)
# We merge the two resulting dataframe as to get the common and unique genes.
F
<-
merge
(
x
=
F06
,
y
=
F16
,
by
=
"SYMBOL"
)
F06uniq
<-
setdiff
(
F06
,
data.frame
(
SYMBOL
=
F
$
SYMBOL
,
pvalue
=
F
$
pvalue.x
))
F16uniq
<-
setdiff
(
F16
,
data.frame
(
SYMBOL
=
F
$
SYMBOL
,
pvalue
=
F
$
pvalue.y
))
all
<-
rbind
(
data.frame
(
log_pvalue
=
-
log10
(
F
$
pvalue.x
))
%>%
mutate
(
Category
=
"SN - common genes"
),
data.frame
(
log_pvalue
=
-
log10
(
F
$
pvalue.y
))
%>%
mutate
(
Category
=
"SNage - common genes"
),
data.frame
(
log_pvalue
=
-
log10
(
F06uniq
$
pvalue
))
%>%
mutate
(
Category
=
"SN - unique genes"
),
data.frame
(
log_pvalue
=
-
log10
(
F16uniq
$
pvalue
))
%>%
mutate
(
Category
=
"SNage - unique genes"
)
)
perc
<-
dim
(
F
)[
1
]
/
dim
(
F06
)[
1
]
*
100
rm
(
F06
,
F16
,
F
,
F06uniq
,
F16uniq
)
# We build the boxplots.
bx_fn
<-
paste0
(
output_data_dir
,
"SN_VSN_"
,
exp_tag
,
"_comparison_SN_SNage.png"
)
png
(
bx_fn
)
myplot
<-
ggplot
(
all
,
aes
(
x
=
Category
,
y
=
log_pvalue
,
colour
=
Category
))
+
geom_boxplot
(
lwd
=
1.2
,
outlier.shape
=
NA
)
+
geom_jitter
(
size
=
3
,
width
=
0.2
,
height
=
0
)
+
theme
(
legend.position
=
"bottom"
,
axis.title.x
=
element_blank
(),
plot.title
=
element_text
(
size
=
15
,
hjust
=
0.5
),
panel.background
=
element_blank
(),
axis.line
=
element_line
(
colour
=
"black"
),
panel.grid.major
=
element_line
(
colour
=
"lightgrey"
),
strip.background
=
element_rect
(
fill
=
"black"
),
strip.text
=
element_text
(
colour
=
"white"
))
+
scale_color_brewer
(
palette
=
"Dark2"
)
+
ggtitle
(
paste0
(
"["
,
exp_tag
,
"] Analysis of top "
,
nb_genes
,
" genes (common = "
,
perc
,
"%)."
))
print
(
myplot
)
dev.off
()
rm
(
all
,
myplot
,
bx_fn
)
}
# ================================================================================================
# Main
# ================================================================================================
plotit
(
exp_tag
=
"Gender_disease_status"
,
nb_genes
=
100
)
plotit
(
exp_tag
=
"PDVsControl"
,
nb_genes
=
100
)
plotit
(
exp_tag
=
"PDVsControl_females"
,
nb_genes
=
100
)
plotit
(
exp_tag
=
"PDVsControl_males"
,
nb_genes
=
100
)
# We log the session details for reproducibility.
rm
(
config
,
output_data_dir
,
input_data_dir
)
sessionInfo
()
06-Data_integration/integrate.sh
View file @
77f8c71f
...
...
@@ -15,8 +15,7 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo
""
# Defining global parameters.
INPUT_FOLDER
=
/home/users/ltranchevent/Data/GeneDER/Analysis/04/
CINPUT_FOLDER
=
/home/users/ltranchevent/Data/GeneDER/Analysis/05/
INPUT_FOLDER
=
/home/users/ltranchevent/Data/GeneDER/Analysis/05/
OUTPUT_FOLDER
=
/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER
=
/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
...
...
@@ -28,22 +27,7 @@ source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# Preparing the job.
rm
-rf
${
OUTPUT_FOLDER
}
clinical_categories_summarized.tsv
nbDatasets
=
${#
datasets__dataset_name
[@]
}
for
((
i
=
0
;
i<
$nbDatasets
;
i++
))
do
datasetName
=
${
datasets__dataset_name
[
$i
]
}
cut
-f
2-3
${
MA_INPUT_FOLDER
}${
datasetName
}
_clinical.tsv |
grep
-v
Disease |
sort
|
uniq
-c
|
sed
's/^\s*//'
|
sed
-r
's/\s/\t/g'
|
awk
-v
OFS
=
"
\t
"
-F
"
\t
"
'{print $3, $2, $1}'
|
sed
-r
's/\t/./'
|
sed
-r
's/^/'
"
${
datasetName
}
"
'\t/g'
>>
${
OUTPUT_FOLDER
}
clinical_categories_summarized.tsv
done
nbDatasets
=
${#
rs_datasets__dataset_name
[@]
}
for
((
i
=
0
;
i<
$nbDatasets
;
i++
))
do
datasetName
=
${
rs_datasets__dataset_name
[
$i
]
}
cut
-f
2-3
${
RS_INPUT_FOLDER
}${
datasetName
}
_clinical.tsv |
grep
-v
Disease |
sort
|
uniq
-c
|
sed
's/^\s*//'
|
sed
-r
's/\s/\t/g'
|
awk
-v
OFS
=
"
\t
"
-F
"
\t
"
'{print $3, $2, $1}'
|
sed
-r
's/\t/./'
|
sed
-r
's/^/'
"
${
datasetName
}
"
'\t/g'
>>
${
OUTPUT_FOLDER
}
clinical_categories_summarized.tsv
done
head
-n
1
${
MA_CINPUT_FOLDER
}
clinical_categories_summarized.tsv
>
${
OUTPUT_FOLDER
}
clinical_categories_summarized_wide.tsv
cat
${
MA_CINPUT_FOLDER
}
clinical_categories_summarized.tsv |
grep
-v
Dataset
>>
${
OUTPUT_FOLDER
}
clinical_categories_summarized_wide.tsv
cat
${
RS_CINPUT_FOLDER
}
clinical_categories_summarized.tsv |
grep
-v
Dataset
>>
${
OUTPUT_FOLDER
}
clinical_categories_summarized_wide.tsv
cp
-rf
${
INPUT_FOLDER
}
clinical_categories_summarized.tsv
${
OUTPUT_FOLDER
}
clinical_categories_summarized.tsv
# Actual job
Rscript
--vanilla
${
CODE_FOLDER
}
integrate_datasets.R
>
${
OUTPUT_FOLDER
}
integrate_log.out 2>
${
OUTPUT_FOLDER
}
integrate_log.err
...
...
06-Data_integration/integrate_datasets.R
View file @
77f8c71f
This diff is collapsed.
Click to expand it.
06-Data_integration/summarize_gene_level.R
View file @
77f8c71f
...
...
@@ -103,6 +103,54 @@ select_best <- function(row, array_indexes) {
return
(
to_return
)
}
# duplicate_row_multiple_gene_names <- function(row, sep = "|") {
# keys <- strsplit(as.character(row[2]), sep, fixed = TRUE)[[1]]
# if (length(keys) == 0) {
# keys <- ""
# }
# val_1 <- rep(row[1], length(keys))
# val_3 <- rep(row[3], length(keys))
# val_4 <- rep(row[4], length(keys))
# val_5 <- rep(row[5], length(keys))
# val_6 <- rep(row[6], length(keys))
# val_7 <- rep(row[7], length(keys))
# val_8 <- rep(row[8], length(keys))
# val_9 <- rep(row[9], length(keys))
# val_10 <- rep(row[10], length(keys))
# to_ret <- cbind(val_1,
# keys,
# val_3,
# val_4,
# val_5,
# val_6,
# val_7,
# val_8,
# val_9,
# val_10)
# names(to_ret) <- names(row)
# return(to_ret)
# }
#' Function to duplicate data-frame rows based on one key field.
#' This field is used to do the duplication based on a string split. The other fields are
#' simply copied as is.
duplicate_row_multiple_gene_names
<-
function
(
row
,
ikey
=
2
,
sep
=
"|"
)
{
keys
<-
strsplit
(
as.character
(
row
[
ikey
]),
sep
,
fixed
=
TRUE
)[[
1
]]
if
(
length
(
keys
)
==
0
)
{
keys
<-
""
}
# We duplicate all fields.
new_data
<-
c
()
for
(
n
in
seq_len
(
length
(
row
)))
{
new_data
<-
cbind
(
new_data
,
rep
(
row
[
n
],
length
(
keys
)))
}
# We adjust the keys and the column names.
new_data
[,
ikey
]
<-
keys
return
(
new_data
)
}
# ================================================================================================
# Main
# ================================================================================================
...
...
@@ -185,17 +233,33 @@ for (i in seq_len(length(config$integrations))) {
# Now, we are sure the current dataset is relevant and should be integrated.
# We read the results of the limma analysis.
local_input_data_dir
<-
input_data_dir
file_suffix
<-
paste0
(
"_"
,
config
$
modifications
[[
l
]]
$
best_variance
,
".tsv"
)
deps_analysis_fn
<-
paste0
(
local_input_data_dir
,
dataset_name
,
"_toptable_"
,
limma_analysis
$
name
,
file_suffix
)
datasets_data
[[
i_ds
]]
<-
read.table
(
deps_analysis_fn
,
local_input_data_dir
<-
input_data_dir
file_suffix
<-
paste0
(
"_"
,
config
$
modifications
[[
l
]]
$
best_variance
,
".tsv"
)
deps_analysis_fn
<-
paste0
(
local_input_data_dir
,
dataset_name
,
"_toptable_"
,
limma_analysis
$
name
,
file_suffix
)
local_exp_mat
<-
read.table
(
deps_analysis_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
NULL
,
as.is
=
TRUE
)
rm
(
deps_analysis_fn
,
local_input_data_dir
)
rownames
(
datasets_data
[[
i_ds
]])
<-
datasets_data
[[
i_ds
]]
$
X
local_exp_mat_clean
<-
local_exp_mat
if
(
sum
(
grepl
(
"\\|"
,
local_exp_mat
$
SYMBOL
))
>
0
)
{
tmp_res
<-
t
(
apply
(
local_exp_mat
,
1
,
duplicate_row_multiple_gene_names
))
local_exp_mat_clean
<-
data.frame
(
do.call
(
rbind
,
c
(
tmp_res
)),
row.names
=
NULL
,
stringsAsFactors
=
FALSE
)
names
(
local_exp_mat_clean
)
<-
names
(
local_exp_mat
)
local_exp_mat_clean
$
logFC
<-
as.numeric
(
local_exp_mat_clean
$
logFC
)