Commit 8bcecb01 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent

Refactoring step 08 in agreement with the latest changes in the overall workflow.

parent fbfefcbd
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/07/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/08/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/08-RegulatoryNetwork/
clean:
@rm -rf *~
......@@ -44,5 +44,12 @@ refineEnrich:
# @sbatch ${CODE_FOLDER}run_carnival.sh
#refineCarnival:
# @sbatch ${CODE_FOLDER}refine_carnival.sh
tfstats:
@sbatch ${CODE_FOLDER}compute_TF_stats.sh
selectInts:
@echo "********************************************************************************"
@echo "* PLEASE PROCEED WITH THE MANUAL CURATION OF THE *_statistics.tsv FILE *"
@echo "* LOCATED IN THE DATA FOLDER. RELEVANT TF SHOULD BE SELECTED BASED ON THE *"
@echo "* STATISTICS. *"
@echo "********************************************************************************"
@sbatch ${CODE_FOLDER}select_interactions.sh
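For the manual curation requested above, the *_statistics.tsv file written by compute_TF_stats.R contains one row per TF with the columns Source, Nb_targets, Perc_targets and Median_delta_diffexp_targets (see the R script further down). A minimal R sketch of how such a table could be pre-screened before the manual selection is given below; the cut-off values are hypothetical examples only, not the project's actual selection criteria.
```
# Illustrative pre-screening of the *_statistics.tsv file before manual curation.
# The thresholds are hypothetical; the real selection of relevant TFs is done by hand.
library(dplyr)

stats_fn <- "SNage_PDVsControl_all_lists_combined_GRN_internal_refined_selected_statistics.tsv"
tf_stats <- read.delim(stats_fn, header = TRUE, stringsAsFactors = FALSE)

tf_shortlist <- tf_stats %>%
  filter(Nb_targets >= 5,                      # ignore TFs with very few targets (hypothetical)
         Median_delta_diffexp_targets >= 0.5,  # clear sex difference on the targets (hypothetical)
         Perc_targets >= 25) %>%               # decent fraction of targets above the delta threshold (hypothetical)
  arrange(desc(Median_delta_diffexp_targets))
print(tf_shortlist)
```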
......@@ -3,7 +3,7 @@ The objective of this step is to investigate the potential regulators behind th
# Details and instructions
For DoRothEA and CARNIVAL, it is important to make sure that the local repositories are up-to-date and contain the relevant data.
The folders are at ~/Data/GeneDER/Original/Else/DoRothEA and ~/Data/GeneDER/Original/Else/CARNIVAL.
The folder is ~/Data/GeneDER/Original/Else/DoRothEA.
We start by selecting the genes we want to investigate. This is again based on the PI values, using a rather permissive threshold so as to have enough genes to start with. This also means that we will need to check the differential expression of the genes at the end as well, since some genes might be only weakly differentially expressed.
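As an illustration only (not the project's actual commands, which follow below), the PI-value-based selection could look like the sketch here; the rankings file name follows the pattern used in this step, and the column name `pi_value` as well as the threshold are assumptions.
```
# Sketch only: keep genes whose pi-value passes a deliberately permissive threshold.
# The column name "pi_value" and the cut-off are assumptions for illustration.
library(dplyr)

rankings <- read.delim("SNage_PDVsControl_females_max-avg_all_pivalue_rankings.tsv",
                       stringsAsFactors = FALSE)
pi_threshold <- 1
selected_genes <- rankings %>%
  filter(pi_value >= pi_threshold) %>%
  pull(Gene)
length(selected_genes)  # check that enough genes are retained to start with
```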
```
......
......@@ -18,6 +18,7 @@ options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
delta_diffexp_threshold <- 0.5
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
......@@ -49,14 +50,14 @@ duplicate_row_first_cell <- function(row, sep = "|") {
# ================================================================================================
# Read the GeneGo and DoRothEA data (regulatory networks extracted from their databases).
dr_data_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_all_lists_dorothea_GRN_internal_refined.tsv")
dr_data_fn <- paste0(output_data_dir, "SNage_PDVsControl_all_lists_dorothea_GRN_internal_refined.tsv")
dr_data <- read.delim(dr_data_fn, header = TRUE, stringsAsFactors = FALSE)
gg_data_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_all_lists_genego_GRN_internal_refined.tsv")
gg_data_fn <- paste0(output_data_dir, "SNage_PDVsControl_all_lists_genego_GRN_internal_refined.tsv")
gg_data <- read.delim(gg_data_fn, header = TRUE, stringsAsFactors = FALSE)
rm(dr_data_fn, gg_data_fn)
# For the computations, we simply put all unknown log FC values to 0 and the adjusted P values to 1.
# This simplification is useful for Cytoscape.
# For the computations, we simply put all unknown log FC values to 0 and the unknown adjusted
# P values to 1. This simplification is useful for Cytoscape.
dr_data$Source.F.logFC[is.na(dr_data$Source.F.logFC)] <- 0
dr_data$Target.F.logFC[is.na(dr_data$Target.F.logFC)] <- 0
dr_data$Source.M.logFC[is.na(dr_data$Source.M.logFC)] <- 0
......@@ -98,14 +99,17 @@ dr_data_sel <- dr_data_sel %>%
mutate(Source_best_sign = ifelse(abs(Source.F.logFC) > abs(Source.M.logFC),
sign(Source.F.logFC), sign(Source.M.logFC))) %>%
mutate(Target_best_sign = ifelse(abs(Target.F.logFC) > abs(Target.M.logFC),
sign(Target.F.logFC), sign(Target.M.logFC)))
sign(Target.F.logFC), sign(Target.M.logFC))) %>%
mutate(Delta_diffexp_target = abs(Target.F.logFC - Target.M.logFC))
gg_data_sel <- gg_data_sel %>%
mutate(Source_F_best = abs(Source.F.logFC) > abs(Source.M.logFC)) %>%
mutate(Target_F_best = abs(Target.F.logFC) > abs(Target.M.logFC)) %>%
mutate(Source_best_sign = ifelse(abs(Source.F.logFC) > abs(Source.M.logFC),
sign(Source.F.logFC), sign(Source.M.logFC))) %>%
mutate(Target_best_sign = ifelse(abs(Target.F.logFC) > abs(Target.M.logFC),
sign(Target.F.logFC), sign(Target.M.logFC)))
sign(Target.F.logFC), sign(Target.M.logFC))) %>%
mutate(Delta_diffexp_target = abs(Target.F.logFC - Target.M.logFC))
# We then first select the suitable activations.
dr_data_sel_act <- dr_data_sel %>% filter(Regulation == "Activation") %>%
......@@ -148,71 +152,18 @@ gg_data_sel_unspe_inh <- gg_data_sel %>% filter(Regulation == "Unspecified") %>
mutate(Obs_regulation = "Inhibition")
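The exact filters applied to the activation and inhibition links are not shown here; the fragment below is only one plausible sign-consistency rule, written as an assumption rather than the author's actual criterion, using the Source_best_sign and Target_best_sign columns computed above (dplyr is already attached in the script).
```
# Assumption: keep an activation when source and target move in the same direction,
# and an inhibition when they move in opposite directions (sketch, not the exact rule used).
dr_data_sel_act_sketch <- dr_data_sel %>%
  filter(Regulation == "Activation", Source_best_sign == Target_best_sign) %>%
  mutate(Obs_regulation = "Activation")
dr_data_sel_inh_sketch <- dr_data_sel %>%
  filter(Regulation == "Inhibition", Source_best_sign == -Target_best_sign) %>%
  mutate(Obs_regulation = "Inhibition")
```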
# We concatenate all selected interactions and save the files.
dr_data_sel_final <- rbind(dr_data_sel_act, dr_data_sel_inh, dr_data_sel_unspe_act, dr_data_sel_unspe_inh) %>% unique()
gg_data_sel_final <- rbind(gg_data_sel_act, gg_data_sel_inh, gg_data_sel_unspe_act, gg_data_sel_unspe_inh) %>% unique()
rm(dr_data_sel, dr_data_sel_act, dr_data_sel_inh, dr_data_sel_unspe_act, dr_data_sel_unspe_inh)
rm(gg_data_sel, gg_data_sel_act, gg_data_sel_inh, gg_data_sel_unspe_act, gg_data_sel_unspe_inh)
dr_data_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_all_lists_dorothea_GRN_internal_refined_selected.tsv")
gg_data_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_all_lists_genego_GRN_internal_refined_selected.tsv")
dr_data_sel_final <- rbind(dr_data_sel_act, dr_data_sel_inh, dr_data_sel_unspe_act,
dr_data_sel_unspe_inh) %>% unique()
gg_data_sel_final <- rbind(gg_data_sel_act, gg_data_sel_inh, gg_data_sel_unspe_act,
gg_data_sel_unspe_inh) %>% unique()
rm(dr_data_sel_act, dr_data_sel_inh, dr_data_sel_unspe_act, dr_data_sel_unspe_inh)
rm(gg_data_sel_act, gg_data_sel_inh, gg_data_sel_unspe_act, gg_data_sel_unspe_inh)
dr_data_fn <- paste0(output_data_dir, "SNage_PDVsControl_all_lists_dorothea_GRN_internal_refined_selected.tsv")
gg_data_fn <- paste0(output_data_dir, "SNage_PDVsControl_all_lists_genego_GRN_internal_refined_selected.tsv")
write.table(dr_data_sel_final, file = dr_data_fn, sep = "\t", quote = FALSE, row.names = FALSE)
write.table(gg_data_sel_final, file = gg_data_fn, sep = "\t", quote = FALSE, row.names = FALSE)
rm(dr_data_fn, gg_data_fn)
# We prepare the pseudo SIF files (for Cytoscape).
dr_data_assif <- dr_data_sel_final %>% mutate(sif = paste(Source, Obs_regulation, Target)) %>% select(sif)
gg_data_assif <- gg_data_sel_final %>% mutate(sif = paste(Source, Obs_regulation, Target)) %>% select(sif)
# We prepare the annotations (for Cytoscape again).
dr_data_annots <- rbind(dr_data_sel_final %>%
select(Source, Source.F.logFC, Source.F.adj.P.value, Source.M.logFC, Source.M.adj.P.value) %>%
unique() %>%
rename(Gene = Source,
F.logFC = Source.F.logFC,
F.adj.P.value = Source.F.adj.P.value,
M.logFC = Source.M.logFC,
M.adj.P.value = Source.M.adj.P.value),
dr_data_sel_final %>%
select(Target, Target.F.logFC, Target.F.adj.P.value, Target.M.logFC, Target.M.adj.P.value) %>%
unique() %>%
rename(Gene = Target,
F.logFC = Target.F.logFC,
F.adj.P.value = Target.F.adj.P.value,
M.logFC = Target.M.logFC,
M.adj.P.value = Target.M.adj.P.value)) %>%
mutate(is_TF = case_when(Gene %in% sel_tfs ~ 1, TRUE ~ 0)) %>% unique() %>%
mutate(core_set = (is_TF | abs(F.logFC - M.logFC) >= 1 | sign(F.logFC) == -sign(M.logFC)))
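# In words: a gene enters the core set if it is one of the selected TFs, if its female and male logFC
# differ by at least 1, or if it is differentially expressed in opposite directions in the two sexes.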
gg_data_annots <- rbind(gg_data_sel_final %>%
select(Source, Source.F.logFC, Source.F.adj.P.value, Source.M.logFC, Source.M.adj.P.value) %>%
unique() %>%
rename(Gene = Source,
F.logFC = Source.F.logFC,
F.adj.P.value = Source.F.adj.P.value,
M.logFC = Source.M.logFC,
M.adj.P.value = Source.M.adj.P.value),
gg_data_sel_final %>%
select(Target, Target.F.logFC, Target.F.adj.P.value, Target.M.logFC, Target.M.adj.P.value) %>%
unique() %>%
rename(Gene = Target,
F.logFC = Target.F.logFC,
F.adj.P.value = Target.F.adj.P.value,
M.logFC = Target.M.logFC,
M.adj.P.value = Target.M.adj.P.value)) %>%
mutate(is_TF = case_when(Gene %in% sel_tfs ~ 1, TRUE ~ 0)) %>% unique() %>%
mutate(core_set = (is_TF | abs(F.logFC - M.logFC) >= 1 | sign(F.logFC) == -sign(M.logFC)))
rm(dr_data_sel_final, gg_data_sel_final)
# Save the network and annot files.
dr_data_sif_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_all_lists_dorothea_GRN_internal_refined_selected_network.sif")
write.table(dr_data_assif, file = dr_data_sif_fn, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
gg_data_sif_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_all_lists_genego_GRN_internal_refined_selected_network.sif")
write.table(gg_data_assif, file = gg_data_sif_fn, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
rm(dr_data_sif_fn, dr_data_assif, gg_data_sif_fn, gg_data_assif)
dr_data_annots_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_all_lists_dorothea_GRN_internal_refined_selected_annotations.tsv")
write.table(dr_data_annots, file = dr_data_annots_fn, sep = "\t", quote = FALSE, row.names = FALSE)
gg_data_annots_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_all_lists_genego_GRN_internal_refined_selected_annotations.tsv")
write.table(gg_data_annots, file = gg_data_annots_fn, sep = "\t", quote = FALSE, row.names = FALSE)
rm(dr_data_annots_fn, dr_data_annots, gg_data_annots_fn, gg_data_annots)
# In a second step, we now take care of the regulon interactions that are obtained via the R package.
# These are the interactions used by DoRothEA to compute the enrichment, and they can be
# different from the ones DoRothEA retrieves from OmniPath.
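The code that actually flattens the regulon object into source/target rows is not visible above. As a minimal sketch, assuming the standard viper regulon structure (a named list whose elements carry a `tfmode` vector over the targets) and list names of the form TF_evidenceLevel as used further down, the conversion could look like this:
```
# Sketch only: flatten a viper-style regulon list into a Source/Target/Regulation table.
# Assumes viper_regulon[[key]]$tfmode is a named numeric vector (targets as names,
# positive mode = activation, negative mode = inhibition).
vr_rows <- lapply(names(viper_regulon), function(key) {
  tf     <- strsplit(key, "_")[[1]][1]   # TF name; the evidence level follows the "_"
  tfmode <- viper_regulon[[key]]$tfmode
  data.frame(Source     = tf,
             Target     = names(tfmode),
             Regulation = ifelse(tfmode > 0, "Activation", "Inhibition"),
             stringsAsFactors = FALSE)
})
vr_sketch <- do.call(rbind, vr_rows)
```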
......@@ -249,10 +200,10 @@ dim(vp_fullnames) <- c(2, length(names(viper_regulon)))
# the third integration scheme (SNage).
integration <- config$integrations[[3]]
limma_parameters <- config$limma_analyses[[5]]
analysis_prefix_F <- paste0(integration$name, "_VSN_", limma_parameters$name, "_max-avg")
analysis_prefix_F <- paste0(integration$name, "_", limma_parameters$name, "_max-avg")
ranking_file_F <- paste0(input_data_dir, analysis_prefix_F, "_all_pivalue_rankings.tsv")
limma_parameters <- config$limma_analyses[[6]]
analysis_prefix_M <- paste0(integration$name, "_VSN_", limma_parameters$name, "_max-avg")
analysis_prefix_M <- paste0(integration$name, "_", limma_parameters$name, "_max-avg")
ranking_file_M <- paste0(input_data_dir, analysis_prefix_M, "_all_pivalue_rankings.tsv")
rm(integration, limma_parameters)
......@@ -262,7 +213,7 @@ ranking_F <- read.delim(ranking_file_F, stringsAsFactors = FALSE) %>%
ranking_M <- read.delim(ranking_file_M, stringsAsFactors = FALSE) %>%
select(Gene, log_fold_change, P_value, adj_P_value)
rm(ranking_file_F, analysis_prefix_F, ranking_file_M, analysis_prefix_M)
# We clean the gene entries (no duplicate entries with pipe-separated gene symbols).
if (sum(grepl("\\|", ranking_F$Gene)) > 0) {
tmp_res <- t(apply(ranking_F, 1, duplicate_row_first_cell))
......@@ -276,7 +227,7 @@ if (sum(grepl("\\|", ranking_F$Gene)) > 0) {
rm(tmp_res)
} else {
ranking_clean_F <- ranking_F
ranking_clean_F$LFC <- as.numeric(ranking_clean_F$ref_logFC)
ranking_clean_F$LFC <- as.numeric(ranking_clean_F$log_fold_change)
ranking_clean_F$P_value <- as.numeric(ranking_clean_F$P_value)
ranking_clean_F$adj_P_value <- as.numeric(ranking_clean_F$adj_P_value)
ranking_clean_F <- ranking_clean_F %>% arrange(adj_P_value)
......@@ -295,7 +246,7 @@ if (sum(grepl("\\|", ranking_M$Gene)) > 0) {
rm(tmp_res)
} else {
ranking_clean_M <- ranking_M
ranking_clean_M$LFC <- as.numeric(ranking_clean_M$ref_logFC)
ranking_clean_M$LFC <- as.numeric(ranking_clean_M$log_fold_change)
ranking_clean_M$P_value <- as.numeric(ranking_clean_M$P_value)
ranking_clean_M$adj_P_value <- as.numeric(ranking_clean_M$adj_P_value)
ranking_clean_M <- ranking_clean_M %>% arrange(adj_P_value)
......@@ -311,8 +262,8 @@ all_diff_exp_genes <- c((ranking_clean_F %>% filter(adj_P_value < 0.05) %>% sele
# is the transcription factor name followed by its evidence level.
names <- vp_fullnames[1, ]
evidence_levels <- vp_fullnames[2, ]
sel_tfs_keys <- paste(names[vp_fullnames[1, ] %in% sel_tfs],
evidence_levels[vp_fullnames[1, ] %in% sel_tfs], sep = "_")
sel_tfs_keys <- paste(names[vp_fullnames[1, ] %in% sel_tfs],
evidence_levels[vp_fullnames[1, ] %in% sel_tfs], sep = "_")
rm(vp_fullnames, names, evidence_levels)
nmax <- 1000
......@@ -364,7 +315,7 @@ vr_data <- vr_data[1:(idf-1),]
rm(viper_regulon, idf, sel_tfs_keys)
rm(all_diff_exp_genes, ranking_clean_F, ranking_clean_M)
# We now have approxiamtelly the same format as for DR and GG so we follow the same procedure.
# We now have approximately the same format as for DR and GG so we follow the same procedure.
# The code has been copied from above.
vr_data$Source.F.logFC[is.na(vr_data$Source.F.logFC)] <- 0
vr_data$Target.F.logFC[is.na(vr_data$Target.F.logFC)] <- 0
......@@ -376,7 +327,7 @@ vr_data$Source.M.adj.P.value[is.na(vr_data$Source.M.adj.P.value)] <- 1
vr_data$Target.M.adj.P.value[is.na(vr_data$Target.M.adj.P.value)] <- 1
# No filtering in this case as the filtering already took place while building the dataframe.
vr_data_sel <-vr_data
vr_data_sel <- vr_data
rm(vr_data)
# We then filter the links based on the differential expression and the known regulation.
......@@ -389,7 +340,8 @@ vr_data_sel <- vr_data_sel %>%
mutate(Source_best_sign = ifelse(abs(Source.F.logFC) > abs(Source.M.logFC),
sign(Source.F.logFC), sign(Source.M.logFC))) %>%
mutate(Target_best_sign = ifelse(abs(Target.F.logFC) > abs(Target.M.logFC),
sign(Target.F.logFC), sign(Target.M.logFC)))
sign(Target.F.logFC), sign(Target.M.logFC))) %>%
mutate(Delta_diffexp_target = abs(Target.F.logFC - Target.M.logFC))
# We then first select the suitable activations.
vr_data_sel_act <- vr_data_sel %>% filter(Regulation == "Activation") %>%
......@@ -407,44 +359,69 @@ vr_data_sel_inh <- vr_data_sel %>% filter(Regulation == "Inhibition") %>%
# We concatenate all selected interactions and save the files.
vr_data_sel_final <- rbind(vr_data_sel_act, vr_data_sel_inh) %>% unique()
rm(vr_data_sel, vr_data_sel_act, vr_data_sel_inh)
vr_data_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_all_lists_regulon_GRN_internal_refined_selected.tsv")
rm(vr_data_sel_act, vr_data_sel_inh)
vr_data_fn <- paste0(output_data_dir, "SNage_PDVsControl_all_lists_regulon_GRN_internal_refined_selected.tsv")
write.table(vr_data_sel_final, file = vr_data_fn, sep = "\t", quote = FALSE, row.names = FALSE)
rm(vr_data_fn)
# We prepare the pseudo SIF files (for Cytoscape).
vr_data_assif <- vr_data_sel_final %>% mutate(sif = paste(Source, Obs_regulation, Target)) %>% select(sif)
# We prepare the annotations (for Cytoscape again).
vr_data_annots <- rbind(vr_data_sel_final %>%
select(Source, Source.F.logFC, Source.F.adj.P.value, Source.M.logFC, Source.M.adj.P.value) %>%
unique() %>%
rename(Gene = Source,
F.logFC = Source.F.logFC,
F.adj.P.value = Source.F.adj.P.value,
M.logFC = Source.M.logFC,
M.adj.P.value = Source.M.adj.P.value),
vr_data_sel_final %>%
select(Target, Target.F.logFC, Target.F.adj.P.value, Target.M.logFC, Target.M.adj.P.value) %>%
unique() %>%
rename(Gene = Target,
F.logFC = Target.F.logFC,
F.adj.P.value = Target.F.adj.P.value,
M.logFC = Target.M.logFC,
M.adj.P.value = Target.M.adj.P.value)) %>%
mutate(is_TF = case_when(Gene %in% sel_tfs ~ 1, TRUE ~ 0)) %>% unique() %>%
mutate(core_set = (is_TF | abs(F.logFC - M.logFC) >= 1 | sign(F.logFC) == -sign(M.logFC)))
rm(vr_data_sel_final, sel_tfs)
# Save the network and annot files.
vr_data_sif_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_all_lists_regulon_GRN_internal_refined_selected_network.sif")
write.table(vr_data_assif, file = vr_data_sif_fn, sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
rm(vr_data_sif_fn, vr_data_assif)
vr_data_annots_fn <- paste0(output_data_dir, "SNage_VSN_PDVsControl_all_lists_regulon_GRN_internal_refined_selected_annotations.tsv")
write.table(vr_data_annots, file = vr_data_annots_fn, sep = "\t", quote = FALSE, row.names = FALSE)
rm(vr_data_annots_fn, vr_data_annots)
# We now need to compute more statistics in order to select the best TFs to investigate further.
# We start by merging the original data with the refined data so as to keep all rows.
dr_data_sel_final_full <- merge(dr_data_sel_final, dr_data_sel, all = TRUE)
dr_data_sel_final_full$Obs_regulation[is.na(dr_data_sel_final_full$Obs_regulation)] <- "Deleted"
gg_data_sel_final_full <- merge(gg_data_sel_final, gg_data_sel, all = TRUE)
gg_data_sel_final_full$Obs_regulation[is.na(gg_data_sel_final_full$Obs_regulation)] <- "Deleted"
vr_data_sel_final_full <- merge(vr_data_sel_final, vr_data_sel, all = TRUE)
vr_data_sel_final_full$Obs_regulation[is.na(vr_data_sel_final_full$Obs_regulation)] <- "Deleted"
rm(dr_data_sel_final, dr_data_sel)
rm(gg_data_sel_final, gg_data_sel)
rm(vr_data_sel_final, vr_data_sel)
# We save the full dataframes (that also include the rows that have been tagged for deletion).
dr_data_fn <- paste0(output_data_dir, "SNage_PDVsControl_all_lists_dorothea_GRN_internal_refined_selected_details.tsv")
write.table(dr_data_sel_final_full, file = dr_data_fn, sep = "\t", quote = FALSE, row.names = FALSE)
rm(dr_data_fn)
gg_data_fn <- paste0(output_data_dir, "SNage_PDVsControl_all_lists_genego_GRN_internal_refined_selected_details.tsv")
write.table(gg_data_sel_final_full, file = gg_data_fn, sep = "\t", quote = FALSE, row.names = FALSE)
rm(gg_data_fn)
vr_data_fn <- paste0(output_data_dir, "SNage_PDVsControl_all_lists_regulon_GRN_internal_refined_selected_details.tsv")
write.table(vr_data_sel_final_full, file = vr_data_fn, sep = "\t", quote = FALSE, row.names = FALSE)
rm(vr_data_fn)
# We merge all datasets into a single one and compute the statistics on that combined set.
data_final <- rbind(dr_data_sel_final_full %>% select(Source, Target, Delta_diffexp_target),
gg_data_sel_final_full %>% select(Source, Target, Delta_diffexp_target),
vr_data_sel_final_full %>% select(Source, Target, Delta_diffexp_target)) %>%
unique()
rm(dr_data_sel_final_full, gg_data_sel_final_full, vr_data_sel_final_full)
# For each TF, we compute the median delta LFC across all targets.
data_stats <- data_final %>%
group_by(Source) %>%
summarize(Median_delta_diffexp_targets = median(Delta_diffexp_target))
# For each TF, we compute the percentage of targets whose delta LFC is above a given threshold.
data_perc <- merge(data_final %>%
group_by(Source) %>%
summarize(Nb_targets = n()),
data_final %>%
filter(Delta_diffexp_target > delta_diffexp_threshold) %>%
group_by(Source) %>%
summarize(count = n()),
all = TRUE)
data_perc$count[is.na(data_perc$count)] <- 0
data_perc <- data_perc %>% mutate(Perc_targets = round(100 * count / Nb_targets, 1)) %>%
select(Source, Nb_targets, Perc_targets)
rm(data_final)
# We keep a single data frame for all statistics and retain only the selected TFs.
data_stats <- merge(data_stats, data_perc) %>% filter(Source %in% sel_tfs)
rm(data_perc, sel_tfs)
# We save the file with the statistics.
data_fn <- paste0(output_data_dir, "SNage_PDVsControl_all_lists_combined_GRN_internal_refined_selected_statistics.tsv")
write.table(data_stats, file = data_fn, sep = "\t", quote = FALSE, row.names = FALSE)
rm(data_fn, data_stats)
# We log the session details for reproducibility.
rm(config, output_data_dir, input_data_dir)
rm(config, output_data_dir, input_data_dir, delta_diffexp_threshold)
sessionInfo()
#!/bin/bash -l
#SBATCH -J geneder:08:TFstats
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 10
#SBATCH --time=0-00:02:00
#SBATCH -p batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
module load math/CPLEX/12.9-foss-2019a
# Defining global parameters.
INPUT_FOLDER=/home/leon/Projects/GeneDER/Results/Regulatory_v06/Refined/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/08/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/08-RegulatoryNetwork/
# Copy the data locally (the manually refined GeneGo and DoRothEA files).
cp ${INPUT_FOLDER}SNage_PDVsControl_all_lists*_GRN_*internal_refined.tsv ${OUTPUT_FOLDER}
cp ${INPUT_FOLDER}SelectedTFs_fulllist.tsv ${OUTPUT_FOLDER}
# Actual job in R.
Rscript --vanilla ${CODE_FOLDER}compute_TF_stats.R > ${OUTPUT_FOLDER}compute_TF_stats_log.out 2> ${OUTPUT_FOLDER}compute_TF_stats_log.err
# Moving the slurm log file to the data folder.
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
#!/bin/bash -l
#SBATCH -J geneder:18:extract
#SBATCH -J geneder:08:extract
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:05:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......@@ -15,9 +15,9 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/07/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/08/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/08-RegulatoryNetwork/
# Actual job in bash. This is a simple join between the TF-targets link database and the experimental data (differentially expressed genes).
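# For each dataset below, the first join keeps the regulon interactions whose target gene appears in the
# rankings/genes file (attaching that gene's data), the paste/cut block then moves the TF to the first
# column, and the second join additionally requires the TF itself to be present, which yields the
# "internal" network.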
......@@ -35,22 +35,22 @@ done < "$input"
mv ${OUTPUT_FOLDER}Dorothea_temp.tsv ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv
# First the female data.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_max-avg_all_Pvalue_rankings_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_internal.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_max-avg_all_Pvalue_rankings_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_internal.tsv
# Second the male data.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_max-avg_all_Pvalue_rankings_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_internal.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_max-avg_all_Pvalue_rankings_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_internal.tsv
# Third, the combination of all files as one.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_all_max-avg_all_lists_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_max-avg_all_lists_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_all_max-avg_all_lists_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_internal.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_max-avg_all_lists_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_internal.tsv
# Moving the slurm log file to the data folder.
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
#!/bin/bash -l
#SBATCH -J geneder:18:getdb
#SBATCH -J geneder:08:getdb
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:06:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......@@ -15,15 +15,14 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
ALT_INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/07/
ALT_INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/08/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/08-RegulatoryNetwork/
# Actual job in bash.
# We download the latest DoRothEA release from OmniPath. We use the evidence levels A, B and C and disregard the levels D and E.
# We could ignore C as well, since these interactions have only limited support, but we could also include D to get many more interactions.
curl -o ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv "http://omnipathdb.org/interactions?datasets=tfregulons&tfregulons_levels=A,B,C&genesymbols=1&fields=sources,tfregulons_level"
curl -o ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv "https://omnipathdb.org/interactions?datasets=tfregulons&tfregulons_levels=A,B,C&genesymbols=1&fields=sources,tfregulons_level"
# We remove the unnecessary fields. So far, we do not filter the TFs that are very frequent.
# The most frequent one (with evidence A, B and C) has ~1k targets.
......@@ -35,7 +34,7 @@ grep -v "source_genesymbol" ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv | cut -f 3
cat <(cut -f 1 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) <(cut -f 2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) | grep -v "source_genesymbol" | grep -v "target_genesymbol" | sort -u > ${OUTPUT_FOLDER}Dorothea_geneids.tsv
# We also extract all gene ids used in GeneDER.
cut -f 1 ${ALT_INPUT_FOLDER}Combined_probe_matching.tsv | grep -v "genes" | sort -u > ${OUTPUT_FOLDER}Geneder_geneids.tsv
cut -f 2 ${ALT_INPUT_FOLDER}Combined_probe_matching.tsv | grep -v "genes" | sort -u > ${OUTPUT_FOLDER}Geneder_geneids.tsv
# We compute the overlap and, more interestingly, the genes from DoRothEA that do not match the GeneDER genes.
comm -23 ${OUTPUT_FOLDER}Dorothea_geneids.tsv ${OUTPUT_FOLDER}Geneder_geneids.tsv > ${OUTPUT_FOLDER}Dorothea_geneids_unmapped.tsv
......@@ -73,7 +72,7 @@ wc -l ${OUTPUT_FOLDER}Dorothea_geneids_unmapped.tsv
# Build the final clean map (that would still need to be checked manually).
join -1 2 -2 1 ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_mapped.tsv <(comm -23 <(cut -f 2 ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_mapped.tsv | sort | uniq -c | sort -g | grep '\s1\s' | sed -r 's/1 /1\t/g' | cut -f 2) ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_missing.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_mapped_clean.tsv
join -t$'t' -a 1 -1 1 -2 1 ${OUTPUT_FOLDER}Dorothea_geneids.tsv ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_mapped_clean.tsv | cut -f 2 > ${OUTPUT_FOLDER}Dorothea_geneids_automatic.tsv
join -t$'\t' -a 1 -1 1 -2 1 ${OUTPUT_FOLDER}Dorothea_geneids.tsv <(sort -k1,1n ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_mapped_clean.tsv) | cut -f 2 > ${OUTPUT_FOLDER}Dorothea_geneids_automatic.tsv
# Warning for user.
echo "*************************************************************************"
......@@ -119,7 +118,7 @@ else
# Build the final clean map (that would still need to be checked manually).
join -1 2 -2 1 ${OUTPUT_FOLDER}Dorothea_source_geneids_unmapped_mapped.tsv <(comm -23 <(cut -f 2 ${OUTPUT_FOLDER}Dorothea_source_geneids_unmapped_mapped.tsv | sort | uniq -c | sort -g | grep '\s1\s' | sed -r 's/1 /1\t/g' | cut -f 2) ${OUTPUT_FOLDER}Dorothea_source_geneids_unmapped_missing.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Dorothea_source_geneids_unmapped_mapped_clean.tsv
fi
join -t$'t' -a 1 -1 1 -2 1 ${OUTPUT_FOLDER}Dorothea_source_geneids.tsv ${OUTPUT_FOLDER}Dorothea_source_geneids_unmapped_mapped_clean.tsv | cut -f 2 > ${OUTPUT_FOLDER}Dorothea_source_geneids_automatic.tsv
join -t$'\t' -a 1 -1 1 -2 1 ${OUTPUT_FOLDER}Dorothea_source_geneids.tsv <(sort -k1,1n ${OUTPUT_FOLDER}Dorothea_source_geneids_unmapped_mapped_clean.tsv) | cut -f 2 > ${OUTPUT_FOLDER}Dorothea_source_geneids_automatic.tsv
# Warning for user.
echo "*************************************************************************"
......
local_input_data_dir: !!str '07/'
local_data_dir: !!str '08/'
local_code_dir: !!str '08-RegulatoryNetwork/'
#!/bin/bash -l
#SBATCH -J geneder:18:prepref
#SBATCH -J geneder:08:prepref
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:05:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......@@ -16,22 +16,19 @@ echo ""
# Defining global parameters.
GG_DATA_FOLDER=/home/users/ltranchevent/Data/GeneDER/Original/Else/GeneGo/
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/07/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/08/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/08-RegulatoryNetwork/
# Actual job in bash.
# Copy the GeneGo data (the GeneGo export must have been run manually beforehand).
cp -rf ${GG_DATA_FOLDER}* ${OUTPUT_FOLDER}
# We derive the list of TF present in the extended networks and not the internal networks.
# We derive the list of TFs present in the internal networks.
# We will need this list to go through GeneGo as well, to get the official gene names and thus potentially
# the GeneDER ids for these genes.
cut -f 2 ${OUTPUT_FOLDER}*genego*internal.tsv | grep -v "Network Object " | grep -v "^$" | sort -u > ${OUTPUT_FOLDER}F1
cut -f 2 ${OUTPUT_FOLDER}*genego*extended.tsv | grep -v "Network Object " | grep -v "^$" | sort -u > ${OUTPUT_FOLDER}F2
comm -13 ${OUTPUT_FOLDER}F1 ${OUTPUT_FOLDER}F2 > ${OUTPUT_FOLDER}genego_extended_newTFs.tsv
rm ${OUTPUT_FOLDER}F1 ${OUTPUT_FOLDER}F2
cut -f 2 ${OUTPUT_FOLDER}*genego*internal.tsv | grep -v "Network Object " | grep -v "^$" | sort -u > ${OUTPUT_FOLDER}genego_extended_newTFs.tsv
# Moving the slurm log file to the data folder.
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
#!/bin/bash -l
#SBATCH -J geneder:18:prepcrnvl
#SBATCH -J geneder:08:prepcrnvl
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:06:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......@@ -15,11 +15,11 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
ALT_INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/