Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
GeneDER_core
Commits
3c574ea8
Commit
3c574ea8
authored
Nov 11, 2021
by
Leon-Charles Tranchevent
Browse files
New method to plot the cell type biomarkers per dataset.
parent
5be415ac
Changes
1
Hide whitespace changes
Inline
Side-by-side
06-Data_integration/plot_biomarkers_perdataset.R
0 → 100644
View file @
3c574ea8
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library
(
"yaml"
)
library
(
"tidyverse"
)
library
(
"stringr"
)
source
(
"../libs/conf/confR.R"
)
source
(
"../libs/utils/utils.R"
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Libraries loaded."
))
# ================================================================================================
# Configuration
# ================================================================================================
options
(
bitmapType
=
"cairo"
)
config
<-
read_config
(
config_dirs
=
c
(
"../Confs/"
,
"./"
))
output_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_data_dir
)
input_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_input_data_dir
)
input_edata_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_exp_data_dir
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Configuration done."
))
# ================================================================================================
# Functions
# ================================================================================================
#' Function to duplicate data-frame rows based on one key field.
#' This field is used to do the duplication based on a string split. The other fields are
#' simply copied as is.
duplicate_row_multiple_gene_names
<-
function
(
row
,
ikey
=
3
,
sep
=
"|"
)
{
keys
<-
strsplit
(
as.character
(
row
[
ikey
]),
sep
,
fixed
=
TRUE
)[[
1
]]
if
(
length
(
keys
)
==
0
)
{
keys
<-
""
}
# We duplicate all fields.
new_data
<-
c
()
for
(
n
in
seq_len
(
length
(
row
)))
{
new_data
<-
cbind
(
new_data
,
rep
(
row
[
n
],
length
(
keys
)))
}
# We adjust the keys and the column names.
new_data
[,
ikey
]
<-
keys
return
(
new_data
)
}
#' @title Plot the differential expression of a given gene.
#'
#' @description This function create one plot for a given gene of a given limma
#' comparison (obtained for a given configuration). The plot contains the expression
#' values across samples (grouped per dataset and per relevant clinical descriptor).
#'
#' @param genes The genes to plot defined by their gene symbols.
#' @param gene_datasets The datasets to use for plotting (discarding the others).
#' @param output_data_dir A string to indicate the folder in which plots should be saved.
#' @param integration_name A string that contains the name of the current integration
#' scheme.
#' @param limma_analysis A complete configuration of the Limma comparison.
#' @param selection_name A string corresponding to the used gene--probe selection
#' method.
#' @param palette The color palette to use for plotting.
#' @param clin_data_all A list that contains one data-frame per dataset with its
#' clinical data.
#' @param exp_data_all A list that contains one data-frame per dataset with its
#' expression profiles (not transformed).
#' @param zexp_data_all A list that contains one data-frame per dataset with its
#' expression profiles (Z-scores).
#' @param m_indx_map A named list representing a map between two dataset indexes,
#' namely the index of the global dataset configuration (all datasets) and the list
#' of relevant datasets for this specific configuration. The list contains the second
#' index (relevant only) while its names are for the first index (all datasets).
#' @param plot_all A boolean indicating whether the combined data (Z-scores) should be
#' plotted (default value FALSE). One of plot_all and plot_dataset must be set to true.
#' @param plot_datasets A boolean indicating whether the individual data should be
#' plotted (default value TRUE). One of plot_all and plot_dataset must be set to true.
plot_gene
<-
function
(
genes
,
gene_datasets
,
output_data_dir
,
integration_name
,
limma_analysis
,
selection_name
,
palette
,
clin_data_all
,
exp_data_all
,
zexp_data_all
,
m_indx_map
,
config
,
plot_all
=
FALSE
,
plot_datasets
=
TRUE
,
title_name
)
{
# We set the I/Os.
file_prefix_short
<-
paste0
(
output_data_dir
,
integration_name
,
"_"
,
limma_analysis
$
name
,
"_"
)
# Read the matching data.
matching_data_fn
<-
paste0
(
file_prefix_short
,
"matching_data.tsv"
)
matching_data
<-
read.table
(
matching_data_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
NULL
,
as.is
=
TRUE
)
tmp_res
<-
t
(
apply
(
matching_data
,
1
,
duplicate_row_multiple_gene_names
))
matching_data_clean
<-
data.frame
(
do.call
(
rbind
,
c
(
tmp_res
)),
row.names
=
NULL
,
stringsAsFactors
=
FALSE
)
names
(
matching_data_clean
)
<-
names
(
matching_data
)
matching_data
<-
matching_data_clean
rm
(
matching_data_fn
,
tmp_res
,
matching_data_clean
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Matching data read."
))
# We read the results of the integration.
int_results_fn
<-
paste0
(
file_prefix_short
,
selection_name
,
"_integration.tsv"
)
results_data_df
<-
read.table
(
int_results_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
NULL
,
as.is
=
TRUE
)
rm
(
int_results_fn
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Integration results read."
))
# We plot the gene defined by the Marot-Mayer combination method.
# We define the I/Os.
o
<-
1
pval_comb
<-
config
$
p_val_combinations
[[
o
]]
file_prefix
<-
paste0
(
file_prefix_short
,
selection_name
,
"_"
,
pval_comb
$
name
,
"_"
)
full_profiles
<-
vector
()
full_zprofiles
<-
vector
()
for
(
gene
in
genes
)
{
matching_data_gene
<-
matching_data
%>%
dplyr
::
filter
(
genes
==
gene
)
%>%
dplyr
::
select
(
ends_with
(
str_replace
(
selection_name
,
"-"
,
"."
)))
# We do all datasets one by one and collect the clinical and expression data.
for
(
ii
in
seq_len
(
length
(
clin_data_all
)))
{
# We define the second index as to switch between the global list (full)
# and current list (selection).
jj
<-
m_indx_map
[[
ii
]]
# We get the dataset details (name), and retrieve the correct probe id.
dataset
<-
datasets
[[
jj
]]
dataset_name
<-
dataset
$
dataset_name
dataset_arraytype
<-
stringr
::
str_replace
(
dataset
$
array_type
,
"-"
,
"."
)
probeset_tag
<-
paste0
(
dataset_arraytype
,
"_"
,
stringr
::
str_replace
(
selection_name
,
"-"
,
"."
))
probe_ids
<-
matching_data_gene
[[
probeset_tag
]]
# We here rely on the fact that even in case of multiple gene mapping only one probe
# should be present here. We still log a warning in case it is not the case
probe_id
<-
probe_ids
[
!
is.na
(
probe_ids
)]
# TOP # beware if we loose a row, it might mess up with our data.
if
(
length
(
probe_id
)
>
1
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Gene "
,
gene
$
SYMBOL
,
" has multiple probes for dataset "
,
dataset_name
,
" ("
,
dataset_arraytype
,
"): "
,
paste0
(
probe_ids
,
collapse
=
"-"
)))
}
rm
(
dataset
,
probeset_tag
,
dataset_arraytype
,
jj
,
probe_ids
)
# We read the clinical data.
clin_data
<-
clin_data_all
[[
ii
]]
%>%
dplyr
::
mutate
(
gender_disease_status
=
paste
(
Gender
,
Disease.status
,
sep
=
"."
))
%>%
dplyr
::
select
(
"Disease.status"
,
"Gender"
,
"gender_disease_status"
)
# We read the expression data (expression levels).
# The dataset might not have a probe for that gene.
exp_data
<-
t
(
exp_data_all
[[
ii
]][
1
,
])
exp_data
[,
1
]
<-
NA
if
(
!
is.null
(
probe_id
))
{
if
(
length
(
probe_id
)
>
0
)
{
if
(
!
is.na
(
probe_id
))
{
exp_data
<-
t
(
exp_data_all
[[
ii
]][
probe_id
,
])
# TOP # loop over ids...
}
}
}
colnames
(
exp_data
)
<-
"Expression"
# TOP # rep "Expression" with NB_genes...
# We read the expression data (Z-scores).
# The dataset might not have a probe for that gene.
zexp_data
<-
t
(
zexp_data_all
[[
ii
]][
1
,
])
zexp_data
[,
1
]
<-
NA
if
(
!
is.null
(
probe_id
))
{
if
(
length
(
probe_id
)
>
0
)
{
if
(
!
is.na
(
probe_id
))
{
zexp_data
<-
t
(
zexp_data_all
[[
ii
]][
probe_id
,
])
# TOP # loop over ids...
}
}
}
colnames
(
zexp_data
)
<-
"Expression"
# TOP # rep "Expression" with NB_genes...
# We assemble clinical and expression and reformat.
local_block
<-
cbind
(
clin_data
,
exp_data
)
%>%
dplyr
::
mutate
(
dataset
=
dataset_name
)
%>%
dplyr
::
mutate
(
gene
=
gene
)
colnames
(
local_block
)[
colnames
(
local_block
)
==
probe_id
]
<-
"Expression"
# TOP # rep "Expression" with NB_genes...
zlocal_block
<-
cbind
(
clin_data
,
zexp_data
)
%>%
dplyr
::
mutate
(
dataset
=
dataset_name
)
%>%
dplyr
::
mutate
(
gene
=
gene
)
colnames
(
zlocal_block
)[
colnames
(
zlocal_block
)
==
probe_id
]
<-
"Expression"
# TOP # rep "Expression" with NB_genes...
rm
(
dataset_name
,
probe_id
,
clin_data
,
exp_data
,
zexp_data
)
# We do not need to keep only the relevant samples (we want all samples this time).
full_profiles
<-
rbind
(
full_profiles
,
local_block
)
full_zprofiles
<-
rbind
(
full_zprofiles
,
zlocal_block
)
rm
(
local_block
,
zlocal_block
)
}
# End for all relevant datasets.
rm
(
ii
)
}
# end for each gene.
rm
(
gene
)
# We check if the clinical factor is correctly formatted.
clinical_factor
<-
limma_analysis
$
factor
# We do not combine the Z-scores values to the regular values for plotting purposes.
full_profiles
<-
full_profiles
%>%
filter
(
dataset
%in%
gene_datasets
)
full_zprofiles
<-
full_zprofiles
%>%
filter
(
dataset
%in%
gene_datasets
)
%>%
mutate
(
dataset
=
"ALL"
)
plot_profiles
<-
c
()
if
(
plot_datasets
)
{
plot_profiles
<-
full_profiles
}
rm
(
full_profiles
,
full_zprofiles
)
# If we have missing values, we clean them.
plot_profiles
<-
plot_profiles
%>%
filter
(
!
is.na
(
Expression
))
# If there is no data at this point (all biomarkers are missing genes),
# we do not plot anything.
if
(
dim
(
plot_profiles
)[
1
]
==
0
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Plotting "
,
paste0
(
genes
,
collapse
=
", "
),
" with clinical factor "
,
clinical_factor
))
return
()
}
# We can create the plot.
message
(
paste0
(
"["
,
Sys.time
(),
"] Plotting "
,
paste0
(
genes
,
collapse
=
", "
),
" with clinical factor "
,
clinical_factor
))
myplot
<-
NULL
if
(
plot_all
)
{
# Not yet implemented...
}
else
{
myplot
<-
ggplot
(
plot_profiles
,
aes
(
x
=
gender_disease_status
,
y
=
Expression
,
colour
=
!!
ensym
(
clinical_factor
)))
+
geom_boxplot
(
lwd
=
1.2
,
outlier.shape
=
NA
)
+
geom_jitter
(
size
=
3
,
width
=
0.1
,
height
=
0
)
+
facet_wrap
(
~
gene
,
scales
=
"free"
)
+
theme
(
legend.position
=
"bottom"
,
axis.title.x
=
element_blank
(),
axis.title.y
=
element_text
(
size
=
16
,
face
=
"bold"
),
axis.text.x
=
element_blank
(),
axis.text.y
=
element_text
(
size
=
16
,
face
=
"bold"
),
plot.title
=
element_text
(
size
=
16
,
hjust
=
0.5
,
face
=
"bold"
),
panel.background
=
element_blank
(),
axis.line
=
element_line
(
colour
=
"black"
,
size
=
1.5
),
panel.grid.major
=
element_line
(
colour
=
"lightgrey"
),
strip.background
=
element_rect
(
fill
=
"black"
),
strip.text
=
element_text
(
colour
=
"white"
,
size
=
12
,
face
=
"bold"
),
legend.text
=
element_text
(
size
=
16
,
face
=
"bold"
))
+
scale_color_manual
(
values
=
palette
)
+
ggtitle
(
paste0
(
paste0
(
gene_datasets
,
collapse
=
" - "
),
" ("
,
title_name
,
")"
))
}
print
(
myplot
)
rm
(
myplot
,
clinical_factor
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Genes "
,
paste0
(
genes
,
collapse
=
", "
),
" done."
))
rm
(
genes
,
matching_data_gene
,
plot_profiles
)
rm
(
pval_comb
,
file_prefix
)
rm
(
o
,
results_data_df
,
matching_data
,
file_prefix_short
)
}
# End of function.
# ================================================================================================
# Main
# ================================================================================================
# We define the color palette we want to use (always all categories in this case).
# F.CTRL, F.PD, M.CTRL, M.PD
gds_plt
<-
c
(
"#7da7de"
,
"#00599d"
,
"#c2e0ae"
,
"#62a73b"
)
# The objective is simply to plot selected genes.
# Data are plotted per dataset (each one in an individual
# subplot) as well as all together (when the data are first transformed into z-scores).
# We create once for all the list of all datasets (microarray and RNA-seq).
datasets
<-
config
$
datasets
# We do only the SNage integration scheme.
i
<-
3
integration
<-
config
$
integrations
[[
i
]]
int_criteria
<-
str_split
(
integration
$
criteria
,
","
)[[
1
]]
# We use the first gene-probe selection method (max-avg).
l
<-
1
selection
<-
config
$
selections
[[
l
]]
# We loop over all datasets to read only the relevant datasets
exp_data_all
<-
list
()
zexp_data_all
<-
list
()
clin_data_all
<-
list
()
m_indx
<-
1
m_indx_map
<-
list
()
for
(
m
in
seq_len
(
length
(
datasets
)))
{
# We extract the configuration of the current dataset.
dataset
<-
datasets
[[
m
]]
dataset_name
<-
dataset
$
dataset_name
dataset_arraytype
<-
dataset
$
array_type
# We only read the dataset if it corresponds to the integration criteria.
satisfy_all
<-
TRUE
for
(
c
in
seq_len
(
length
(
int_criteria
)))
{
int_criterium
<-
str_split
(
int_criteria
[
c
],
";"
)[[
1
]]
if
(
dataset
[
int_criterium
[
1
]]
!=
int_criterium
[
2
])
{
satisfy_all
<-
FALSE
}
rm
(
int_criterium
)
}
rm
(
c
)
if
(
satisfy_all
!=
TRUE
)
{
rm
(
satisfy_all
,
dataset
,
dataset_name
,
dataset_arraytype
)
next
}
rm
(
satisfy_all
)
# We define the dataset specific I/Os.
local_input_edata_dir
<-
input_edata_dir
local_input_data_dir
<-
input_data_dir
vsn_name
<-
config
$
modifications
[[
m
]]
$
best_variance
file_suffix
<-
paste0
(
"_"
,
vsn_name
,
".tsv"
)
file_long_suffix
<-
paste0
(
"_"
,
vsn_name
,
"_zscores.tsv"
)
exp_data_fn
<-
paste0
(
local_input_edata_dir
,
dataset_name
,
"_normalized"
,
file_suffix
)
zexp_data_fn
<-
paste0
(
local_input_data_dir
,
dataset_name
,
"_normalized"
,
file_long_suffix
)
clin_data_fn
<-
paste0
(
local_input_edata_dir
,
dataset_name
,
"_clinical.tsv"
)
rm
(
local_input_edata_dir
,
local_input_data_dir
,
vsn_name
,
file_suffix
,
file_long_suffix
)
# We read the data.
clin_data_all
[[
m_indx
]]
<-
read.table
(
clin_data_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
NULL
,
as.is
=
TRUE
)
exp_data_all
[[
m_indx
]]
<-
read.table
(
exp_data_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
1
,
as.is
=
TRUE
)
zexp_data_all
[[
m_indx
]]
<-
read.table
(
zexp_data_fn
,
header
=
TRUE
,
sep
=
"\t"
,
row.names
=
1
,
as.is
=
TRUE
)
m_indx_map
[[
m_indx
]]
<-
m
m_indx
<-
m_indx
+
1
rm
(
dataset
,
dataset_name
,
dataset_arraytype
,
exp_data_fn
,
zexp_data_fn
,
clin_data_fn
)
}
# End for each dataset.
rm
(
m
)
# List of the datasets to consider.
common_datasets
<-
c
(
"GSE20163"
,
"GSE20164"
,
"GSE20292"
,
"GSE8397"
,
"GSE26927"
,
"GSE20159"
,
"GSE49036"
,
"NBB"
)
# We want to run through the biomarkers cell type by cell type.
# So we re-organize the data.
x
<-
unlist
(
config
$
biomarkers
)
dim
(
x
)
<-
c
(
2
,
length
(
x
)
/
2
)
biomarkers
<-
data.frame
(
t
(
x
),
stringsAsFactors
=
FALSE
)
rm
(
x
)
names
(
biomarkers
)
<-
c
(
"Gene"
,
"CellType"
)
# We loop through cell types.
for
(
celltype
in
unique
(
biomarkers
$
CellType
))
{
# We define the current biomarkers
CT_markers
<-
biomarkers
%>%
filter
(
CellType
==
celltype
)
for
(
biomarker
in
CT_markers
$
Gene
)
{
# Now we loop over the datasets to create one plot per dataset with all CT biomarkers.
for
(
i
in
seq_len
(
length
(
datasets
)))
{
dataset
<-
datasets
[[
i
]]
if
(
!
dataset
$
dataset_name
%in%
common_datasets
)
{
rm
(
dataset
)
next
;
}
plotfilename
<-
paste0
(
output_data_dir
,
"SNage_PDVsControl_females_max-avg_CT_biomarker_"
,
biomarker
,
"_"
,
celltype
,
"_"
,
dataset
$
dataset_name
,
".png"
)
png
(
plotfilename
,
width
=
450
,
height
=
300
,
units
=
"px"
)
plot_gene
(
genes
=
biomarker
,
gene_datasets
=
c
(
dataset
$
dataset_name
),
output_data_dir
=
output_data_dir
,
integration_name
=
integration
$
name
,
limma_analysis
=
config
$
limma_analyses
[[
5
]],
selection_name
=
selection
$
name
,
palette
=
gds_plt
,
clin_data_all
,
exp_data_all
,
zexp_data_all
,
m_indx_map
,
config
,
plot_all
=
FALSE
,
plot_datasets
=
TRUE
,
title_name
=
celltype
)
dev.off
()
plotfilename
<-
paste0
(
output_data_dir
,
"SNage_PDVsControl_males_max-avg_CT_biomarker_"
,
biomarker
,
"_"
,
celltype
,
"_"
,
dataset
$
dataset_name
,
".png"
)
png
(
plotfilename
,
width
=
450
,
height
=
300
,
units
=
"px"
)
plot_gene
(
genes
=
biomarker
,
gene_datasets
=
c
(
dataset
$
dataset_name
),
output_data_dir
=
output_data_dir
,
integration_name
=
integration
$
name
,
limma_analysis
=
config
$
limma_analyses
[[
6
]],
selection_name
=
selection
$
name
,
palette
=
gds_plt
,
clin_data_all
,
exp_data_all
,
zexp_data_all
,
m_indx_map
,
config
,
plot_all
=
FALSE
,
plot_datasets
=
TRUE
,
title_name
=
celltype
)
dev.off
()
rm
(
plotfilename
)
message
(
paste0
(
"["
,
celltype
,
"]["
,
biomarker
,
"]["
,
dataset
$
dataset_name
,
"] plotted."
))
rm
(
dataset
)
}
rm
(
i
)
}
rm
(
biomarker
,
CT_markers
)
}
rm
(
celltype
,
biomarkers
,
common_datasets
)
rm
(
selection
,
exp_data_all
,
zexp_data_all
,
clin_data_all
,
m_indx
,
m_indx_map
)
rm
(
l
,
gds_plt
)
rm
(
integration
,
int_criteria
)
rm
(
datasets
)
# We log the session details for reproducibility.
rm
(
config
,
output_data_dir
,
input_data_dir
,
input_edata_dir
)
sessionInfo
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment