Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
GeneDER_core
Commits
f293ccca
Commit
f293ccca
authored
Oct 13, 2021
by
Leon-Charles Tranchevent
Browse files
Updated scRNA-seq analysis pipeline.
parent
fd93d4c9
Changes
20
Expand all
Hide whitespace changes
Inline
Side-by-side
09-Single-cell_analysis/CP_enrich_seurat.R
0 → 100644
View file @
f293ccca
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library
(
"yaml"
)
library
(
"stringr"
)
library
(
"org.Hs.eg.db"
)
library
(
"tidyverse"
)
library
(
"ReactomePA"
)
library
(
"DOSE"
)
library
(
"clusterProfiler"
)
library
(
"msigdbr"
)
source
(
"../libs/conf/confR.R"
)
source
(
"../libs/utils/utils.R"
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Libraries loaded."
))
# ================================================================================================
# Configuration
# ================================================================================================
options
(
bitmapType
=
"cairo"
)
config
<-
read_config
(
config_dirs
=
c
(
"../Confs/"
,
"./"
))
raw_data_dir
<-
config
$
global_raw_data_dir
output_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_data_dir
)
input_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_input_data_dir
)
# Optional command line parameter to process only one dataset.
args
<-
commandArgs
(
trailingOnly
=
TRUE
)
selected_dataset_name
<-
""
if
(
length
(
args
)
>
0
)
{
selected_dataset_name
<-
args
[
1
]
}
rm
(
args
)
# Configure MSigDB
msigdb
<-
msigdbr
(
species
=
"Homo sapiens"
)
%>%
dplyr
::
select
(
gs_name
,
entrez_gene
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Configuration done."
))
# ================================================================================================
# Functions
# ================================================================================================
#' @title Perform a regular enrichment analysis using Fisher's over-representation.
#'
#' @description This functions relies on clusterProfiler and the associated packages
#' to run the enrichment for a given ontology among DO, GO (all three subtypes), KEGG and
#' REACTOME. It saves the results in the provided folder and return the enrichResult object
#' for further plotting.
#'
#' @param genes The list of genes to analyse. The gene identifiers have to match what is
#' expected - be it entrezgene identifiers or gene symbols.
#' @param background The list of all genes to be used as a background for correct
#' estimation of the statistics. The genes identifiers also have to match what is expected.
#' @param type The name of the ontology to use among "DO", "GOMF", "GOBP", "GOCC",
#' "KEGG" and "REACTOME". The GO analyses relies on gene symbols, the others use entrezgene ids.
#' Default to "DO".
#' @param file_dir The folder in which the tables will be saved. Default to "".
#' @param file_prefix The string that can be used as a prefix to name the file. Default to "".
#' @param symplify A boolean indicating whether the results should be simplified (by removing
#' redundant terms). Default to TRUE.
#' @return The enrichResult object that contains the enriched terms.
perform_enrichment
<-
function
(
genes
,
background
,
type
=
"DO"
,
file_dir
=
""
,
file_prefix
=
""
,
simplify
=
TRUE
)
{
# Default case, we just set the result to NULL.
enrich
<-
NULL
switch
(
type
,
DO
=
{
enrich
<-
DOSE
::
enrichDO
(
gene
=
genes
,
universe
=
background
,
ont
=
"DO"
,
minGSSize
=
config
$
min_gs_size
,
maxGSSize
=
config
$
max_gs_size
,
pvalueCutoff
=
1
,
qvalueCutoff
=
1
,
pAdjustMethod
=
"BH"
,
readable
=
TRUE
)
},
GOMF
=
{
enrich
<-
clusterProfiler
::
enrichGO
(
gene
=
genes
,
universe
=
background
,
keyType
=
"SYMBOL"
,
OrgDb
=
org.Hs.eg.db
,
#nolint
ont
=
"MF"
,
minGSSize
=
10
,
maxGSSize
=
config
$
max_gs_size
,
pvalueCutoff
=
1
,
qvalueCutoff
=
1
,
pAdjustMethod
=
"BH"
,
readable
=
FALSE
)
},
GOBP
=
{
enrich
<-
clusterProfiler
::
enrichGO
(
gene
=
genes
,
universe
=
background
,
keyType
=
"SYMBOL"
,
OrgDb
=
org.Hs.eg.db
,
#nolint
ont
=
"BP"
,
minGSSize
=
config
$
min_gs_size
,
maxGSSize
=
config
$
max_gs_size
,
pvalueCutoff
=
1
,
qvalueCutoff
=
1
,
pAdjustMethod
=
"BH"
,
readable
=
FALSE
)
},
GOCC
=
{
enrich
<-
clusterProfiler
::
enrichGO
(
gene
=
genes
,
universe
=
background
,
keyType
=
"SYMBOL"
,
OrgDb
=
org.Hs.eg.db
,
#nolint
ont
=
"CC"
,
minGSSize
=
config
$
min_gs_size
,
maxGSSize
=
config
$
max_gs_size
,
pvalueCutoff
=
1
,
qvalueCutoff
=
1
,
pAdjustMethod
=
"BH"
,
readable
=
FALSE
)
},
KEGG
=
{
enrich
<-
clusterProfiler
::
enrichKEGG
(
gene
=
genes
,
universe
=
background
,
organism
=
"hsa"
,
minGSSize
=
config
$
min_gs_size
,
maxGSSize
=
config
$
max_gs_size
,
pvalueCutoff
=
1
,
qvalueCutoff
=
1
,
pAdjustMethod
=
"BH"
,
use_internal_data
=
FALSE
)
},
REACTOME
=
{
enrich
<-
ReactomePA
::
enrichPathway
(
gene
=
genes
,
universe
=
background
,
organism
=
"human"
,
minGSSize
=
config
$
min_gs_size
,
maxGSSize
=
config
$
max_gs_size
,
pvalueCutoff
=
1
,
qvalueCutoff
=
1
,
pAdjustMethod
=
"BH"
,
readable
=
TRUE
)
}
)
# We simplify the enrichment results by removing the potentially redundant terms.
if
(
simplify
==
TRUE
)
{
enrich
<-
clusterProfiler
::
simplify
(
enrich
,
cutoff
=
0.75
,
by
=
"p.adjust"
,
select_fun
=
min
)
}
# We save the table as TSV file.
enrich_fn
<-
paste0
(
file_dir
,
file_prefix
,
"enrich_"
,
type
,
".tsv"
)
write.table
(
enrich
,
file
=
enrich_fn
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
# We return the enrichResult object.
return
(
enrich
)
}
# ================================================================================================
# Main
# ================================================================================================
# We do all datasets one by one.
for
(
i
in
seq_len
(
length
(
config
$
scdatasets
)))
{
# We get the dataset details.
dataset
<-
config
$
scdatasets
[[
i
]]
dataset_name
<-
dataset
$
dataset_name
# We run the quality control of the current dataset (if necessary).
if
(
selected_dataset_name
==
""
||
selected_dataset_name
==
dataset_name
)
{
# We define the I/Os
raw_data_subdir
<-
paste0
(
raw_data_dir
,
dataset_name
,
"/data/"
)
output_data_subdir
<-
paste0
(
output_data_dir
,
dataset_name
,
"_seurat/"
)
# We are loading the background
background_fn
<-
paste0
(
raw_data_subdir
,
"genes.tsv"
)
background
<-
read.table
(
background_fn
,
header
=
FALSE
,
sep
=
"\t"
,
row.names
=
NULL
,
as.is
=
TRUE
)
$
V2
rm
(
background_fn
)
# We map gene symbols to entrezgene ids for background.
background_map
<-
bitr
(
background
,
fromType
=
"SYMBOL"
,
toType
=
c
(
"ENTREZID"
),
OrgDb
=
org.Hs.eg.db
)
#nolint
rm
(
background
)
# We perform the enrichment on all rankings files.
ranking_files
<-
list.files
(
path
=
output_data_subdir
,
pattern
=
"*rankings.tsv"
)
# We do all integration schemes one by one.
for
(
ranking_file
in
ranking_files
)
{
# We define the I/Os.
analysis_prefix
<-
strsplit
(
ranking_file
,
".tsv"
)[[
1
]]
ofile_prefix
<-
paste0
(
"CP_"
,
analysis_prefix
,
"_"
)
input_filename
<-
paste0
(
output_data_subdir
,
ranking_file
)
# We read the prepared ranking and keep only the value used to rank.
ranking
<-
read.delim
(
input_filename
,
stringsAsFactors
=
FALSE
)
%>%
select
(
Gene
,
adj_P_value
)
rm
(
input_filename
)
ranking_clean
<-
ranking
ranking_clean
$
adj_P_value
<-
as.numeric
(
ranking_clean
$
adj_P_value
)
rm
(
ranking
)
# We map the gene symbols to the EntrezGene identifiers.
mapped_egenes
<-
bitr
(
ranking_clean
$
Gene
,
fromType
=
"SYMBOL"
,
toType
=
c
(
"ENTREZID"
),
OrgDb
=
org.Hs.eg.db
)
#nolint
ranking_final
<-
merge
(
x
=
ranking_clean
,
y
=
mapped_egenes
,
by.x
=
"Gene"
,
by.y
=
"SYMBOL"
,
all.x
=
TRUE
)
%>%
mutate
(
EGene
=
ENTREZID
)
%>%
select
(
Gene
,
EGene
,
adj_P_value
)
ranking_final
$
adj_P_value
<-
as.numeric
(
ranking_final
$
adj_P_value
)
ranking_final
<-
ranking_final
%>%
arrange
(
adj_P_value
)
ranking_final_eg
<-
ranking_final
%>%
filter
(
!
is.na
(
EGene
))
rm
(
mapped_egenes
,
ranking_clean
)
# We then prepare the gene lists for Fisher-based enrichment. These are just gene lists based on
# the adjusted P values.
selection_sy
<-
ranking_final
%>%
filter
(
adj_P_value
<
0.05
)
selection_eg
<-
selection_sy
%>%
filter
(
!
is.na
(
EGene
))
rm
(
ranking_final
,
ranking_final_eg
)
# We loop over the functional sources.
for
(
k
in
seq_len
(
length
(
config
$
functional_sources
)))
{
# We define the current source.
func_source
<-
config
$
functional_sources
[[
k
]]
if
(
is.null
(
func_source
$
cp_name
)
||
func_source
$
cp_name
==
""
)
{
next
}
# We do not do the MSIGDB enrichment (was only for GSEA).
if
(
func_source
$
cp_name
==
"MSIGDB"
)
{
next
;
}
# We define the reference genes (depending on the source).
selection
<-
NULL
background
<-
NULL
if
(
func_source
$
cp_gene_id_type
==
"entrezgene"
)
{
selection
<-
selection_eg
$
EGene
background
<-
background_map
$
ENTREZID
}
else
if
(
func_source
$
cp_gene_id_type
==
"symbol"
)
{
selection
<-
selection_sy
$
Gene
background
<-
background_map
$
SYMBOL
}
# We run the Fisher analysis.
fisher_adjP
<-
perform_enrichment
(
selection
,
background
,
type
=
func_source
$
cp_name
,
file_dir
=
output_data_subdir
,
file_prefix
=
ofile_prefix
,
simplify
=
FALSE
)
message
(
paste0
(
"["
,
Sys.time
(),
"]["
,
analysis_prefix
,
"] CP enrichment ("
,
func_source
$
cp_name
,
") done."
))
rm
(
func_source
,
fisher_adjP
,
selection
,
background
)
}
# End for each functional source.
rm
(
k
,
selection_sy
,
selection_eg
)
rm
(
analysis_prefix
,
ofile_prefix
)
}
# End for each integration.
rm
(
ranking_file
,
ranking_files
,
output_data_subdir
,
raw_data_subdir
,
background_map
)
}
# End if dataset should be run.
rm
(
dataset
,
dataset_name
)
}
# End for each dataset
rm
(
i
)
# We log the session details for reproducibility.
rm
(
config
,
input_data_dir
,
output_data_dir
,
msigdb
)
sessionInfo
()
09-Single-cell_analysis/CP_enrich_seurat.sh
0 → 100755
View file @
f293ccca
#!/bin/bash -l
#SBATCH -J geneder:09:cp_enrich
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH --time=0-08:00:00
#SBATCH --mem=10GB
#SBATCH -p batch
#SBATCH --qos=normal
echo
"== Starting run at
$(
date
)
"
echo
"== Job ID:
${
SLURM_JOBID
}
"
echo
"== Node list:
${
SLURM_NODELIST
}
"
echo
"== Submit dir. :
${
SLURM_SUBMIT_DIR
}
"
echo
""
# Defining global parameters.
OUTPUT_FOLDER
=
/home/users/ltranchevent/Data/GeneDER/Analysis/09/
CODE_FOLDER
=
/home/users/ltranchevent/Projects/GeneDER/Analysis/09-Single-cell_analysis/
REF
=
${
1
}
# Loading modules.
module load lang/R/3.6.2-foss-2019b-bare
# Load configuration
source
../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# For all scRNA datasets
nbScDatasets
=
${#
scdatasets__dataset_name
[@]
}
for
((
i
=
0
;
i<
$nbScDatasets
;
i++
))
do
datasetName
=
${
scdatasets__dataset_name
[
$i
]
}
if
[
"
${
datasetName
}
"
==
"
${
REF
}
"
]
then
# We run the functional enrichment.
echo
"== Job
$i
started (
${
datasetName
}
) =="
Rscript
--vanilla
${
CODE_FOLDER
}
/CP_enrich_seurat.R
${
datasetName
}
>
${
OUTPUT_FOLDER
}
/
${
datasetName
}
_seurat/CP_enrich_log.out 2>
${
OUTPUT_FOLDER
}
/
${
datasetName
}
_seurat/CP_enrich_log.err
echo
"== Job
$i
ended (
${
datasetName
}
) =="
fi
done
# Moving the slurm log file to data
mv
${
CODE_FOLDER
}
/slurm-
*
out
${
OUTPUT_FOLDER
}
/
09-Single-cell_analysis/Makefile
View file @
f293ccca
...
...
@@ -6,5 +6,28 @@ clean:
@
rm
-rf
*
~
clean_outputs
:
@
rm
-rf
${OUTPUT_FOLDER}
/
*
analyse
:
@
sbatch
${CODE_FOLDER}
analyse.sh
@
/bin/bash
${CODE_FOLDER}
getdata.sh
s_ana
:
@
sbatch
--time
=
0-02:30:00
--mem
=
64GB
${CODE_FOLDER}
analyse_seurat.sh GSE157783
@
sbatch
--time
=
0-20:00:00
--mem
=
196GB
-p
bigmem
${CODE_FOLDER}
analyse_seurat.sh GSE178265
#s_ana_stest:
# @sbatch --time=0-00:20:00 --mem=8GB ${CODE_FOLDER}sanalyse_seurat.sh GSE157783s
# @sbatch --time=0-00:20:00 --mem=8GB ${CODE_FOLDER}sanalyse_seurat.sh GSE178265s
#s_ana_mtest:
# @sbatch --time=0-02:00:00 --mem=64GB ${CODE_FOLDER}analyse_seurat.sh GSE157783m
# @sbatch --time=0-02:00:00 --mem=64GB ${CODE_FOLDER}analyse_seurat.sh GSE178265m
l_ana
:
@
sbatch
--time
=
0-17:00:00
--mem
=
64GB
${CODE_FOLDER}
analyse_liger.sh GSE157783
@
sbatch
--time
=
2-00:00:00
--mem
=
400GB
-p
bigmem
${CODE_FOLDER}
analyse_liger.sh GSE178265
#l_ana_stest:
# @sbatch --time=0-00:20:00 --mem=6GB ${CODE_FOLDER}sanalyse_liger.sh GSE157783s
# @sbatch --time=0-00:20:00 --mem=6GB ${CODE_FOLDER}sanalyse_liger.sh GSE178265s
#l_ana_mtest:
# @sbatch --time=0-02:00:00 --mem=64GB ${CODE_FOLDER}analyse_liger.sh GSE157783m
# @sbatch --time=0-02:00:00 --mem=64GB ${CODE_FOLDER}analyse_liger.sh GSE178265m
prep
:
@
sbatch
${CODE_FOLDER}
prepare_rankings_seurat.sh GSE157783
#@sbatch
${CODE_FOLDER}
prepare_rankings_seurat.sh GSE178265
enrich
:
@
sbatch
${CODE_FOLDER}
CP_enrich_seurat.sh GSE157783
#@sbatch
${CODE_FOLDER}
CP_enrich_seurat.sh GSE178265
09-Single-cell_analysis/README.md
View file @
f293ccca
...
...
@@ -2,6 +2,7 @@
The objectives of this step is to analyse the single-cell data (PD versus control) for male and female samples.
# Details and instructions
Beware, we use a version of SEurat that needs spatstat v1.64 not v2.x.
# Prerequisites
A prerequisite is to have the scRNA data ready in the project data folder.
\ No newline at end of file
A prerequisite is to have the scRNA data ready in the project data folder.
09-Single-cell_analysis/analyse_liger.R
0 → 100644
View file @
f293ccca
This diff is collapsed.
Click to expand it.
09-Single-cell_analysis/analyse.sh
→
09-Single-cell_analysis/analyse
_liger
.sh
100644 → 100755
View file @
f293ccca
#!/bin/bash -l
#SBATCH -J geneder:09:
scRNA
#SBATCH -J geneder:09:
liger
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH --ntasks-per-socket=14
#SBATCH --ntasks-per-node=28
#SBATCH -c 1
#SBATCH --mem=
32
GB
#SBATCH --time=0-0
2
:00:00
#SBATCH --mem=
64
GB
#SBATCH --time=0-0
4
:00:00
#SBATCH -p batch
#SBATCH --qos=normal
...
...
@@ -20,6 +20,7 @@ echo ""
# Defining global parameters.
OUTPUT_FOLDER
=
/home/users/ltranchevent/Data/GeneDER/Analysis/09/
CODE_FOLDER
=
/home/users/ltranchevent/Projects/GeneDER/Analysis/09-Single-cell_analysis/
REF
=
${
1
}
# Loading modules.
module load lang/R/3.6.2-foss-2019b-bare
...
...
@@ -28,22 +29,20 @@ module load lang/R/3.6.2-foss-2019b-bare
source
../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# We get the Ensembl and HGNC data
wget
-O
${
OUTPUT_FOLDER
}
mitochondrial_genes.tsv
'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Filter name = "chromosome_name" value = "MT"/><Attribute name = "ensembl_gene_id" /></Dataset></Query>'
wget
-O
${
OUTPUT_FOLDER
}
ribosomalrna_genes.tsv
'https://www.genenames.org/cgi-bin/genegroup/download?id=848&type=branch'
# For all scRNA datasets
#nbScDatasets=${#scdatasets__dataset_name[@]}
#for (( i=0; i<$nbScDatasets; i++ ))
#do
i
=
1
nbScDatasets
=
${#
scdatasets__dataset_name
[@]
}
for
((
i
=
0
;
i<
$nbScDatasets
;
i++
))
do
datasetName
=
${
scdatasets__dataset_name
[
$i
]
}
echo
"== Job
$i
started (
${
datasetName
}
) =="
rm
-rf
${
OUTPUT_FOLDER
}${
datasetName
}
/
mkdir
${
OUTPUT_FOLDER
}${
datasetName
}
/
Rscript
--vanilla
${
CODE_FOLDER
}
analyse.R
${
datasetName
}
>
${
OUTPUT_FOLDER
}${
datasetName
}
/analysis_log.out 2>
${
OUTPUT_FOLDER
}${
datasetName
}
/analysis_log.err
echo
"== Job
$i
ended (
${
datasetName
}
) =="
#done
if
[
"
${
datasetName
}
"
==
"
${
REF
}
"
]
then
echo
"== Job
$i
started (
${
datasetName
}
) =="
rm
-rf
${
OUTPUT_FOLDER
}${
datasetName
}
_liger/
mkdir
${
OUTPUT_FOLDER
}${
datasetName
}
_liger/
Rscript
--vanilla
${
CODE_FOLDER
}
analyse_liger.R
${
datasetName
}
>
${
OUTPUT_FOLDER
}${
datasetName
}
_liger/analysis_log.out 2>
${
OUTPUT_FOLDER
}${
datasetName
}
_liger/analysis_log.err
echo
"== Job
$i
ended (
${
datasetName
}
) =="
fi
done
# Moving the slurm log file to data
mv
${
CODE_FOLDER
}
/slurm-
*
out
${
OUTPUT_FOLDER
}
/
09-Single-cell_analysis/analyse.R
→
09-Single-cell_analysis/analyse
_seurat
.R
View file @
f293ccca
This diff is collapsed.
Click to expand it.
09-Single-cell_analysis/analyse_seurat.sh
0 → 100755
View file @
f293ccca
#!/bin/bash -l
#SBATCH -J geneder:09:seurat
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH --ntasks-per-socket=14
#SBATCH --ntasks-per-node=28
#SBATCH -c 1
#SBATCH --mem=64GB
#SBATCH --time=0-04:00:00
#SBATCH -p batch
#SBATCH --qos=normal
echo
"== Starting run at
$(
date
)
"
echo
"== Job ID:
${
SLURM_JOBID
}
"
echo
"== Node list:
${
SLURM_NODELIST
}
"
echo
"== Submit dir. :
${
SLURM_SUBMIT_DIR
}
"
echo
""
# Defining global parameters.
OUTPUT_FOLDER
=
/home/users/ltranchevent/Data/GeneDER/Analysis/09/
CODE_FOLDER
=
/home/users/ltranchevent/Projects/GeneDER/Analysis/09-Single-cell_analysis/
REF
=
${
1
}
# Loading modules.
module load lang/R/3.6.2-foss-2019b-bare
# Load configuration
source
../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# For all scRNA datasets
nbScDatasets
=
${#
scdatasets__dataset_name
[@]
}
for
((
i
=
0
;
i<
$nbScDatasets
;
i++
))
do
datasetName
=
${
scdatasets__dataset_name
[
$i
]
}
if
[
"
${
datasetName
}
"
==
"
${
REF
}
"
]
then
echo
"== Job
$i
started (
${
datasetName
}
) =="
rm
-rf
${
OUTPUT_FOLDER
}${
datasetName
}
_seurat/
mkdir
${
OUTPUT_FOLDER
}${
datasetName
}
_seurat/
Rscript
--vanilla
${
CODE_FOLDER
}
analyse_seurat.R
${
datasetName
}
>
${
OUTPUT_FOLDER
}${
datasetName
}
_seurat/analysis_log.out 2>
${
OUTPUT_FOLDER
}${
datasetName
}
_seurat/analysis_log.err
echo
"== Job
$i
ended (
${
datasetName
}
) =="
fi
done
# Moving the slurm log file to data
mv
${
CODE_FOLDER
}
/slurm-
*
out
${
OUTPUT_FOLDER
}
/
09-Single-cell_analysis/compute_gender_specificity_seurat.R
0 → 100644
View file @
f293ccca
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library
(
"yaml"
)
library
(
"tidyverse"
)
library
(
"ggplot2"
)
source
(
"../libs/conf/confR.R"
)
source
(
"../libs/utils/utils.R"
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Libraries loaded."
))
# ================================================================================================
# Configuration
# ================================================================================================
options
(
bitmapType
=
"cairo"
)
config
<-
read_config
(
config_dirs
=
c
(
"../Confs/"
,
"./"
))
output_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_data_dir
)
input_data_dir
<-
paste0
(
config
$
global_data_dir
,
config
$
local_input_data_dir
)
# Optional command line parameter to process only one dataset.
args
<-
commandArgs
(
trailingOnly
=
TRUE
)
selected_dataset_name
<-
""
if
(
length
(
args
)
>
0
)
{
selected_dataset_name
<-
args
[
1
]
}
rm
(
args
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Configuration done."
))
# ================================================================================================
# Functions
# ================================================================================================
#' @title Function to compute the gender-specificity score based on the Pi value rankings.
#' @description This function takes a dataframe with the log fold changes and P values of two
#' comparisons termed "ref" and "ctrl". The idea is to compute a score representing their
#' specificity for "ref" using "ctrl" to correct the non specific DEGs.
#' The specificity is based on the Pi values so the actual regulation (up/down) does not
#' matter and two genes with strong but opposite signals will not be considered specific.
#' @param FM The data-frame that contains the experimental data. The expected columns are
#' ref_logFC, ref_Pval, ref_adj_Pval, ctrl_logFC, ctrl_Pval, and ctrl_adj_Pval.
#' @param has_ctrl A boolean indicating whether a "ctrl" comparison is present. If not then
#' only the Pi value computation and the associated rankings are performed for "ref", and
#' no delta values are returned.
#' @return An enriched data-frame that contains additional columns, in particular one that
#' contains the gender-specificity scores, representing the specificity to the "ref" comparison
#' (unless has_ctrl is set to FALSE).
compute_genderspecificity_scores
<-
function
(
FM
,
has_ctrl
=
TRUE
)
{
# First, we compute the Pi values for both comparisons ("ref" and "control"). For this, we use
# the nominal P values and the log fold changes. Using the corrected P values would be more
# difficult since most values would be 1. We anyway intend to use the gender specific rankings
# produced here for GSEA, and only the ranking matters, not so much the actual values.
#
# We will add two fields:
# ref_pivalue:
# The Pi values for the "ref" comparison. Values are ~[0, 100].
# Small values correspond to non differentially expressed genes, high values
# correspond to differentially expressed genes. These values are the ones used to
# create the ranking for GSEA when we analyse gender DEGs (and not gender specific
# DEGs).
# ctrl_pivalue:
# The Pi values for the "ctrl" comparison. Values are ~[0, 100].
# Small values correspond to non differentially expressed genes, high values
# correspond to differentially expressed genes. These values are the ones used to
# create the ranking for GSEA when we analyse gender DEGs (and not gender specific
# DEGs).
#
# We start by the most obvious cases for which all values are present.
# The Pi value is simply the absolute log fold change * the P value (on a log10 scale).
FM
$
ref_pivalue
<-
-
log10
(
FM
$
ref_Pval
)
*
abs
(
FM
$
ref_logFC
)
if
(
has_ctrl
)
{
FM
$
ctrl_pivalue
<-
-
log10
(
FM
$
ctrl_Pval
)
*
abs
(
FM
$
ctrl_logFC
)
}
# We continue with special cases, that is when P value == 1.
# These are the genes that are definitely not differentially expressed.
# Because P value == 1, we have Pi value == 0.
# We use a small workaround to substract a small epsilon to the original P value.
# This way, all genes with P value == 1 are still ranked (therefore based on their log
# fold change only). This is because we want to avoid too many ties.
pseudo_pval_ref
<-
0.1
*
(
9
+
max
(
FM
$
ref_Pval
[
FM
$
ref_Pval
<
1
]))
pseudo_pi_ref
<-
-
log10
(
pseudo_pval_ref
)
*
abs
(
FM
$
ref_logFC
[
FM
$
ref_pivalue
==
0
])
FM
$
ref_pivalue
[
FM
$
ref_pivalue
==
0
]
<-
pseudo_pi_ref
if
(
has_ctrl
)
{
pseudo_pval_ctrl
<-
0.1
*
(
9
+
max
(
FM
$
ctrl_Pval
[
FM
$
ctrl_Pval
<
1
]))
pseudo_pi_ctrl
<-
-
log10
(
pseudo_pval_ctrl
)
*
abs
(
FM
$
ctrl_logFC
[
FM
$
ctrl_pivalue
==
0
])
FM
$
ctrl_pivalue
[
FM
$
ctrl_pivalue
==
0
]
<-
pseudo_pi_ctrl
rm
(
pseudo_pval_ctrl
,
pseudo_pi_ctrl
)
}
rm
(
pseudo_pval_ref
,
pseudo_pi_ref
)
# We then aim at creating a delta that represents the difference between the differential
# expression in "ref" and in "ctrl". The comparison is done based on the rankings, which
# are themseleves based on the Pi values. We can not use the Pi values directly since they