GeneDER_core (BDS / GeneDER) · Commit 62ce0a0b
Authored Sep 23, 2020 by Leon-Charles Tranchevent
Another attempt to get Carnival to run (bioc package version).
Parent: ae497b09
Changes: 2 files
18-RegulatoryNetwork_all/prepare_carnival.sh
...
...
@@ -18,6 +18,7 @@ echo ""
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
ALT_INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
DATA_FOLDER=/home/users/ltranchevent/Data/GeneDER/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
# We load the module (R is used in part II).
...
...
@@ -43,8 +44,7 @@ cut -f 1,2,4 ${OUTPUT_FOLDER}Omnipath_network.tsv | awk -v FS=$'\t' -v OFS=$'\t'
#echo "*************************************************************************"
# Part II - we now take care of the PROGENy linear model (useful to create the CARNIVAL weight matrix).
# We obtain the latest file from the CARNIVAL R package.
Rscript --vanilla ${CODE_FOLDER}extract_progeny_model.R > ${OUTPUT_FOLDER}extract_progeny_model_log.out 2> ${OUTPUT_FOLDER}extract_progeny_model_log.err
cp -rf ${DATA_FOLDER}/Original/Else/Progeny/Progeny_model.csv ${OUTPUT_FOLDER}Progeny_model.csv
# We process the file further to extract the full list of gene ids used by PROGENy.
cut -f 1 -d ',' ${OUTPUT_FOLDER}Progeny_model.csv | sed -r "s/\"//g" | grep -v "^$" | sort -u > ${OUTPUT_FOLDER}Progeny_geneids.tsv
...
...
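Note on Part II above: the Rscript call runs extract_progeny_model.R, which is not part of this diff. A minimal sketch of such a script, assuming the model matrix is taken from the Bioconductor progeny package via getModel() (the committed script may instead pull the data bundled with CARNIVAL; the accessor, its arguments, and the output path are assumptions, not taken from the repository):

#!/usr/bin/env Rscript
# Sketch only, not the committed extract_progeny_model.R: export the PROGENy
# coefficient matrix to CSV so the shell step above can read gene ids from the
# first column. getModel() and its arguments are assumptions about the package version.
library("progeny")
progeny_model <- getModel(organism = "Human", top = 100)  # genes x pathways weight matrix
write.csv(progeny_model, file = "Progeny_model.csv", row.names = TRUE, quote = TRUE)

The quoted gene names written by write.csv are what the sed -r "s/\"//g" step strips before the ids are sorted and deduplicated.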
18-RegulatoryNetwork_all/run_carnival.R
...
...
@@ -8,6 +8,7 @@ library("stringr")
library("tidyverse")
library("viper")
library("CARNIVAL")
library("progeny")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
...
...
@@ -40,7 +41,8 @@ duplicate_row_first_cell <- function(row, sep = "|") {
  keys <- strsplit(row[1], sep, fixed = TRUE)[[1]]
  val_1 <- rep(row[2], length(keys))
  val_2 <- rep(row[3], length(keys))
  to_ret <- cbind(Gene = keys, LFC = val_1, ranking_value = val_2)
  val_3 <- rep(row[4], length(keys))
  to_ret <- cbind(Gene = keys, LFC = val_1, P_value = val_2, adj_P_value = val_3)
  return(to_ret)
}
...
...
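For illustration, duplicate_row_first_cell above expands a row whose first cell holds pipe-separated gene symbols into one row per symbol, repeating the remaining cells. The values below are toy values, not taken from the real ranking files:

# Toy input (hypothetical values): Gene, LFC, P_value, adj_P_value.
row_in <- c("GENE1|GENE2", "1.5", "0.01", "0.04")
duplicate_row_first_cell(row_in)
#      Gene    LFC   P_value adj_P_value
# [1,] "GENE1" "1.5" "0.01"  "0.04"
# [2,] "GENE2" "1.5" "0.01"  "0.04"

Because cbind() on character input returns a character matrix, the calling code converts the numeric columns back with as.numeric() after the expanded rows are rbind-ed together.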
@@ -101,51 +103,88 @@ map_genenames_uniprots <- read.delim(map_genenames_uniprots_fn,
                                      stringsAsFactors = FALSE,
                                      header = TRUE)
# We loop over the different files to analyse.
for (i in seq_len(length(config$ranking_files))) {

  # We only do the first three cases because then we do not have P values so
  # we can not really create the "mySignature" object.
  if (i > 4) {
    next
  }

  # We define the I/Os.
  ranking_file <- paste0(input_data_dir, config$ranking_files[[i]])
  file_prefix <- strsplit(config$ranking_files[[i]], split = "rankings.tsv")[[1]]

  # We read the prepared ranking and keep only the value used to rank.
  ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
    select(Gene, ref_logFC, ranking_value)
  rm(ranking_file)

  # We do all integration schemes one by one.
  for (i in seq_len(length(config$integrations))) {

    # We clean genes (no duplicate entries with pipe separated gene symbols).
    tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
    ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
                                row.names = NULL, stringsAsFactors = FALSE)
    ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
    ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
    ranking_clean <- ranking_clean %>% arrange(desc(ranking_value))
    rm(tmp_res, ranking)

    # We get the integration name for that scheme.
    integration <- config$integrations[[i]]

    # Estimate Z-score values for the GES (As in DoRothEA manual, check VIPER manual for details).
    # In our case, we compute the pseudo P values based on the pi values with 10 ^ -pi.
    # In addition, we correct through the median logFC (since pi = -log(p) * logFC).
    myStatistics <- matrix(ranking_clean$LFC, dimnames = list(ranking_clean$Gene, "logFC"))
    mypPvalue <- matrix(10 ^ (-ranking_clean$ranking_value * median(abs(ranking_clean$LFC))),
                        dimnames = list(ranking_clean$Gene, "pP.Value"))
    mySignature <- (qnorm(mypPvalue / 2, lower.tail = FALSE) * sign(myStatistics))[, 1]
    mySignature <- mySignature[order(mySignature, decreasing = T)]
    rm(ranking_clean, mypPvalue, myStatistics)

    # We only do the enrichment if it is necessary.
    if (integration$use_for_network == "FALSE") {
      rm(integration)
      next
    }

    # Estimate TF activities via DoRothEA.
    TF_activities <- runDoRothEA(data.frame(EGF = mySignature, row.names = names(mySignature)),
                                 regulon = viper_regulon,
                                 confidence_level = c('A', 'B', 'C', 'D'))
    TF_activities$SYMBOL <- row.names(TF_activities)

    # We do all limma comparisons one by one.
    for (j in seq_len(length(config$limma_analyses))) {

      # We extract the Limma parameters.
      limma_parameters <- config$limma_analyses[[j]]

      # We only do the enrichment if it is necessary.
      if (limma_parameters$use_for_network == "FALSE") {
        rm(limma_parameters)
        next
      }

      # We define the I/Os.
      file_prefix <- paste0(integration$name, "_VSN_", limma_parameters$name, "_max-avg_all_pivalue_")
      ranking_file <- paste0(input_data_dir, file_prefix, "rankings.tsv")

      # We read the prepared ranking and keep only the value used to rank.
      ranking <- read.delim(ranking_file, stringsAsFactors = FALSE) %>%
        select(Gene, log_fold_change, P_value, adj_P_value)
      rm(ranking_file)

      # We clean genes (no duplicate entries with pipe separated gene symbols).
      tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
      ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
                                  row.names = NULL, stringsAsFactors = FALSE)
      ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
      ranking_clean$P_value <- as.numeric(ranking_clean$P_value)
      ranking_clean$adj_P_value <- as.numeric(ranking_clean$adj_P_value)
      ranking_clean <- ranking_clean %>% arrange(P_value)
      rm(tmp_res, ranking)

      # Estimate Z-score values for the GES (as in DoRothEA manual, check VIPER manual for details).
      # In our case, we compute the pseudo P values based on the pi values with 10 ^ -pi.
      # In addition, we correct through the median logFC (since pi = -log(p) * logFC).
      myStatistics <- matrix(ranking_clean$LFC, dimnames = list(ranking_clean$Gene, "logFC"))
      myPvalue <- matrix(ranking_clean$adj_P_value, dimnames = list(ranking_clean$Gene, "P.Value"))
      mySignature <- (qnorm(myPvalue / 2, lower.tail = FALSE) * sign(myStatistics))[, 1]
      mySignature <- mySignature[order(mySignature, decreasing = T)]
      rm(ranking_clean, myPvalue, myStatistics)

      # Estimate TF activities via DoRothEA.
      TF_activities <- runDoRothEA(data.frame(EGF = mySignature, row.names = names(mySignature)),
                                   regulon = viper_regulon,
                                   confidence_level = c('A', 'B', 'C', 'D'))
      TF_activities$SYMBOL <- row.names(TF_activities)

      # Estimate TF activities via Viper.
      mrs <- msviper(ges = mySignature, regulon = viper_regulon, minsize = 4, ges.filter = F)

      # Format the results.
      tf_name <- sapply(strsplit(names(mrs$es$nes), split = "_"), head, 1)
      evidence_level <- sapply(strsplit(names(mrs$es$nes), split = "_"), tail, 1)
      TF_activities <- data.frame(Regulon = tf_name,
                                  evdc_lvl = evidence_level,
                                  Size = mrs$es$size[names(mrs$es$nes)],
                                  NES = mrs$es$nes,
                                  p.value = mrs$es$p.value)
      TF_activities <- TF_activities[order(TF_activities$p.value), ] %>%
        filter(evdc_lvl %in% c("A", "B", "C", "D")) %>%
        mutate(FDR = p.adjust(p.value, method = "fdr")) %>%
        rename(SYMBOL = Regulon)
      row.names(TF_activities) <- TF_activities$SYMBOL
      ##rm(mrs, mySignature, tf_name, evidence_level)

      # We then convert the gene names to Uniprot Ids since CARNIVAL requires Uniprot ids.
      TF_activities_uniprotids <- merge(x = TF_activities,
                                        y = map_genenames_uniprots,
...
...
@@ -186,6 +225,7 @@ for (i in seq_len(length(config$ranking_files))) {
    # We then move the results to the data folder.
    system(paste0("mv ", result_dir, " ", output_data_dir))
  }
}
# End for each file to analyse.
rm(i, viper_regulon)
...
...
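For reference, the GES construction used twice in run_carnival.R combines a (pseudo) P value with the sign of the logFC into a signed z-score via qnorm, following the DoRothEA/VIPER recipe. A small self-contained illustration with made-up numbers; the pi-to-pseudo-P conversion mirrors the script's own comment (pi = -log(p) * logFC), and the global median |logFC| factor is the script's approximation rather than an exact per-gene inversion:

# Toy pi values and log fold changes; purely illustrative, not real data.
pi_values <- c(gene_a = 2.1, gene_b = 0.8)   # pi-value rankings (larger = stronger signal)
log_fcs   <- c(gene_a = 1.3, gene_b = -0.4)  # matching log fold changes

# Pseudo P values as in the first block: 10^(-pi), rescaled by the median |logFC|.
pseudo_p <- 10 ^ (-pi_values * median(abs(log_fcs)))

# Two-sided P value plus the logFC sign gives the signed z-score used as the GES.
signature <- qnorm(pseudo_p / 2, lower.tail = FALSE) * sign(log_fcs)
signature[order(signature, decreasing = TRUE)]

In the limma-specific inner loop the same qnorm transform is applied directly to the adjusted P values rather than to pseudo P values.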