Commit a661ba2b authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Added the enrichment analysis for the MA+RS integration.

parent dd6b7c73
This diff is collapsed.
#!/bin/bash -l
#SBATCH -J geneder:17:cp_enrich
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 20
#SBATCH --time=0-02:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/17-Enrichment_all/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
# Actual jobs: run the clusterProfiler enrichment, logging stdout and stderr separately.
# Expansions are quoted to avoid word splitting/globbing on the paths (shellcheck SC2086).
Rscript --vanilla "${CODE_FOLDER}/CP_enrich.R" > "${OUTPUT_FOLDER}/CP_enrich_log.out" 2> "${OUTPUT_FOLDER}/CP_enrich_log.err"
# Moving the slurm log file to data (the glob must stay outside the quotes to expand).
mv "${CODE_FOLDER}"/slurm-*out "${OUTPUT_FOLDER}/"
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("stringr")
library("GSEABase")
library("gep2pep")
library("tidyverse")
# Project-local helpers: configuration reader (read_config) and misc utilities.
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
# Use cairo for bitmap devices (works on headless cluster nodes without X11).
options(bitmapType = "cairo")
# Merge the global configuration with the local (analysis-specific) one.
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' @title Duplicate rows by splitting on the first cell.
#'
#' @description Splits the first cell of a three-cell row on `sep` and emits one
#' output row per resulting key, replicating the second and third cells.
#' Ex: ("a|b", "x", "y") --> ("a", "x", "y")
#'                           ("b", "x", "y")
#'
#' @param row A character vector of three cells: a (possibly separator-joined) set of
#' gene symbols, a log fold-change, and a ranking value (the last two are not split).
#' @param sep The string separator for the split (matched literally, not as a regex).
#' @return A character matrix with columns Gene, LFC and ranking_value
#' (possibly only one row).
duplicate_row_first_cell <- function(row, sep = "|") {
  genes <- strsplit(row[1], sep, fixed = TRUE)[[1]]
  n_copies <- length(genes)
  cbind(Gene = genes,
        LFC = rep(row[2], n_copies),
        ranking_value = rep(row[3], n_copies))
}
# ================================================================================================
# Main
# ================================================================================================
# Import the MsigDB data (latest release at the time of writing: v7.0 XML dump).
db_dir <- paste0(config$global_raw_data_dir, "MsigDB/")
my_db <- paste0(db_dir, "msigdb_v7.0.xml")
db <- importMSigDB.xml(my_db)
# gep2pep requires a CategorizedCollection to build a repository from the gene sets.
db <- as.CategorizedCollection(db)
rm(my_db)
# We loop over the different files to analyse.
# For each ranking file: build a fresh gep2pep repository, compute the PEPs
# (pathway expression profiles) from the gene ranking, then extract the
# enrichment score (ES) and P value for every pathway of every collection,
# and write one TSV per ranking file.
for (i in seq_len(length(config$ranking_files))) {
  # Create a fresh repository for this ranking (deleted again at the end).
  repoRoot <- file.path(db_dir, "gep2pep_data_tmp")
  unlink(repoRoot, recursive = TRUE)
  rp <- createRepository(repoRoot, db)
  # We define the I/Os.
  ranking_file <- paste0(output_data_dir, config$ranking_files[[i]])
  file_prefix <- paste0("G2P_", strsplit(config$ranking_files[[i]], split = "rankings.tsv")[[1]])
  # We read the prepared ranking and keep only the value used to rank.
  ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
    select(Gene, ref_logFC, ranking_value)
  rm(ranking_file)
  # We clean genes (no duplicate entries with pipe separated gene symbols).
  tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
  ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
                              row.names = NULL,
                              stringsAsFactors = FALSE)
  ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
  ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
  ranking_clean <- ranking_clean %>% arrange(desc(ranking_value))
  rm(tmp_res, ranking)
  # We prepare the gene id specific rankings (using gene names).
  ranking_final <- ranking_clean %>% mutate(ranking = base::rank(-ranking_value,
                                                                 na.last = TRUE,
                                                                 ties.method = "first")) %>%
    select(ranking)
  rownames(ranking_final) <- ranking_clean$Gene
  rm(ranking_clean)
  # We build the PEPs (~differentially expressed pathways).
  buildPEPs(rp, ranking_final)
  rm(ranking_final)
  # We then need to extract the results for each collection.
  collections <- getCollections(rp)
  peps_msigdb_res <- list()
  for (j in seq_len(length(collections))) {
    # Load the results.
    message("[", j, " started]")
    collec <- collections[j]
    # BUG FIX: the enrichment scores must come from loadESmatrix(); the original
    # code called loadPVmatrix() twice, so the "es" column held P values.
    rawES <- loadESmatrix(rp, collec)
    rawPV <- loadPVmatrix(rp, collec)
    resES <- data.frame(pid = rownames(rawES), es = rawES[, 1])
    resPV <- data.frame(pid = rownames(rawPV), pv = rawPV[, 1])
    res <- merge(x = resES, y = resPV, by = "pid") %>% mutate(collection = collec)
    # Get id to name mapping, same for description (takes time).
    collec_sets <- loadCollection(rp, collec)
    res$pname <- setId2setName(collec_sets, res$pid)
    res$pdesc <- apply(data.frame(res$pname), 1, function(x) description(collec_sets[[x]]))
    peps_msigdb_res[[j]] <- res
    rm(collec, rawES, rawPV, resES, resPV, res, collec_sets)
    message("[", j, " done]")
  } # End for each collection.
  rm(j, collections)
  # We format the results and save them (one row per pathway, all collections).
  peps_msigdb_res <- data.frame(do.call(rbind, c(peps_msigdb_res)),
                                row.names = NULL,
                                stringsAsFactors = FALSE)
  peps_msigdb_res_fn <- paste0(output_data_dir, file_prefix, "MSIGDB.tsv")
  write.table(peps_msigdb_res,
              file = peps_msigdb_res_fn,
              sep = "\t",
              quote = FALSE,
              col.names = NA)
  rm(peps_msigdb_res, peps_msigdb_res_fn)
  # Cleaning: drop the temporary repository.
  unlink(repoRoot, recursive = TRUE)
  rm(rp, repoRoot)
  message(paste0("[", Sys.time(), "][", file_prefix, "] G2P GSEA enrichment (MsigDB) done."))
  rm(file_prefix)
} # End for each file to analyse.
# Release the remaining objects before logging the session.
rm(i, db, db_dir)
# We log the session details for reproducibility.
rm(config, output_data_dir, input_data_dir)
sessionInfo()
#!/bin/bash -l
#SBATCH -J geneder:17:g2p_enrich
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 8
#SBATCH --time=0-02:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/17-Enrichment_all/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
# Actual jobs: run the gep2pep enrichment, logging stdout and stderr separately.
# Expansions are quoted to avoid word splitting/globbing on the paths (shellcheck SC2086).
Rscript --vanilla "${CODE_FOLDER}/G2P_enrich.R" > "${OUTPUT_FOLDER}/G2P_enrich_log.out" 2> "${OUTPUT_FOLDER}/G2P_enrich_log.err"
# Moving the slurm log file to data (the glob must stay outside the quotes to expand).
mv "${CODE_FOLDER}"/slurm-*out "${OUTPUT_FOLDER}/"
# Data and code locations for this analysis step (17-Enrichment_all).
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/17-Enrichment_all/
# NOTE(review): these targets do not produce files named after themselves;
# they should presumably be declared .PHONY - confirm.
# Remove editor backup files.
clean:
@rm -rf *~
# Remove all generated data for this analysis step.
clean_outputs:
@rm -rf ${OUTPUT_FOLDER}/*
# Prepare the rankings (see subtract_rankings.sh).
prep:
@sbatch ${CODE_FOLDER}/subtract_rankings.sh
# Submit the three SLURM enrichment jobs (CP, G2P, ROT).
enrich:
@sbatch ${CODE_FOLDER}/CP_enrich.sh
@sbatch ${CODE_FOLDER}/G2P_enrich.sh
@sbatch ${CODE_FOLDER}/ROT_enrich.sh
# pathfindR enrichment is run directly (its launcher uses xvfb-run), not via sbatch.
pfenrich:
@/bin/bash ${CODE_FOLDER}/PF_enrich.sh
# Merge the per-method enrichment results.
merge:
@sbatch ${CODE_FOLDER}/merge.sh
# Extract gender-specific pathways.
gsp:
@sbatch ${CODE_FOLDER}/gender_specific_pathways.sh
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("stringr")
library("pathfindR")
library("tidyverse")
# Project-local helpers: configuration reader (read_config) and misc utilities.
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
# Use cairo for bitmap devices (works on headless cluster nodes without X11).
options(bitmapType = "cairo")
# Merge the global configuration with the local (analysis-specific) one.
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' @title Duplicate rows by splitting on the first cell.
#'
#' @description Takes a three-cell row, splits its first cell on `sep`, and
#' returns one row per key, each carrying copies of the second and third cells.
#' Ex: ("a|b", "x", "y") --> ("a", "x", "y")
#'                           ("b", "x", "y")
#'
#' @param row A character vector of three cells; only the first cell (a set of
#' gene symbols, possibly separator-joined) is split.
#' @param sep The string separator for the split (matched literally).
#' @return A character matrix with columns Gene, LFC and ranking_value
#' (possibly only one row).
duplicate_row_first_cell <- function(row, sep = "|") {
  split_keys <- strsplit(row[1], sep, fixed = TRUE)[[1]]
  expanded <- cbind(Gene = split_keys,
                    LFC = rep_len(row[2], length(split_keys)),
                    ranking_value = rep_len(row[3], length(split_keys)))
  return(expanded)
}
# ================================================================================================
# Main
# ================================================================================================
# We loop over the different files to analyse.
for (i in seq_len(length(config$ranking_files))) {
# We need a P value to start with (PF requirement) so we only run PF on the purely
# limma derived cases and ignore the gender specific cases since the deltas we computed
# ourselves are not associated with any P value.
# NOTE(review): the cutoff of 4 is hard-coded; presumably the first four entries of
# config$ranking_files are the limma-derived rankings - confirm against the config.
if (i > 4) {
next
}
# We define the I/Os.
ranking_file <- paste0(output_data_dir, config$ranking_files[[i]])
file_prefix <- paste0("PF_", strsplit(config$ranking_files[[i]], split = "rankings.tsv")[[1]])
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
select(Gene, ref_logFC, ref_adj_Pval)
rm(ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
# duplicate_row_first_cell renames the columns to Gene / LFC / ranking_value, so
# "ranking_value" below actually holds the adjusted P values (ref_adj_Pval).
tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
# Keep only the significant genes (adjusted P < 0.05), most significant first.
ranking_final <- ranking_clean %>% arrange(ranking_value) %>% filter(ranking_value < 0.05)
rm(tmp_res, ranking, ranking_clean)
# We loop over the functional sources.
for (j in seq_len(length(config$functional_sources))) {
# We define the current source.
func_source <- config$functional_sources[[j]]
# Skip sources that have no pathfindR gene-set name configured.
if (is.null(func_source$pf_name) || func_source$pf_name == "") {
next
}
# We run the Network based ORA (pathfindR, Biogrid protein-interaction network).
output_dir <- paste0(output_data_dir, file_prefix, "tmp")
output_df <- pathfindR::run_pathfindR(input = ranking_final,
gene_sets = func_source$pf_name,
enrichment_threshold = 1,
iterations = 10,
output_dir = output_dir,
visualize_enriched_terms = FALSE,
pin_name_path = "Biogrid")
otable_fn <- paste0(output_data_dir, file_prefix, func_source$pf_name, ".tsv")
write.table(output_df,
file = otable_fn,
sep = "\t",
quote = FALSE,
col.names = NA)
# Drop the temporary pathfindR working directory; only the TSV is kept.
unlink(output_dir, recursive = TRUE)
rm(output_dir, otable_fn, output_df)
message(paste0("[", Sys.time(), "][", file_prefix, "] PF GSEA enrichment (",
func_source$pf_name, ") done."))
rm(func_source)
} # End for each functional source.
rm(j, file_prefix, ranking_final)
} # End for each file to analyse.
rm(i)
# We log the session details for reproducibility.
rm(config, output_data_dir, input_data_dir)
sessionInfo()
#!/bin/bash -l
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/17-Enrichment_all/
# Actual jobs: xvfb-run supplies a virtual X display for the R script.
# Expansions are quoted to avoid word splitting/globbing on the paths (shellcheck SC2086).
xvfb-run Rscript --vanilla "${CODE_FOLDER}/PF_enrich.R" > "${OUTPUT_FOLDER}/PF_enrich_log.out" 2> "${OUTPUT_FOLDER}/PF_enrich_log.err"
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("stringr")
library("org.Hs.eg.db")
library("tidyverse")
library("ROntoTools")
library("graph")
library("clusterProfiler")
# Project-local helpers: configuration reader (read_config) and misc utilities.
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
# Use cairo for bitmap devices (works on headless cluster nodes without X11).
options(bitmapType = "cairo")
# Merge the global configuration with the local (analysis-specific) one.
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' @title Duplicate rows by splitting on the first cell.
#'
#' @description Expands a single three-cell row into a matrix: the first cell is
#' split on `sep`, and the second and third cells are replicated for each piece.
#' Ex: ("a|b", "x", "y") --> ("a", "x", "y")
#'                           ("b", "x", "y")
#'
#' @param row A character vector of three cells; only the first (a possibly
#' separator-joined set of gene symbols) is split.
#' @param sep The string separator for the split (matched literally).
#' @return A character matrix with columns Gene, LFC and ranking_value
#' (possibly only one row).
duplicate_row_first_cell <- function(row, sep = "|") {
  pieces <- strsplit(row[1], sep, fixed = TRUE)[[1]]
  result <- matrix(c(pieces,
                     rep(row[2], length(pieces)),
                     rep(row[3], length(pieces))),
                   ncol = 3,
                   dimnames = list(NULL, c("Gene", "LFC", "ranking_value")))
  return(result)
}
# ================================================================================================
# Main
# ================================================================================================
# Get the latest Kegg pathways (graphs and human-readable names).
kpg <- keggPathwayGraphs("hsa", updateCache = TRUE, verbose = TRUE)
kpn <- keggPathwayNames("hsa")
# Fine tune the edge weights according to the interaction type: activating edges
# propagate the signal (+1), inhibiting edges invert it (-1), others are ignored (0).
kpg <- setEdgeWeights(kpg, edgeTypeAttr = "subtype",
edgeWeightByType = list(activation = 1, inhibition = -1,
expression = 1, repression = -1),
defaultWeight = 0)
# We loop over the different files to analyse.
for (i in seq_len(length(config$ranking_files))) {
# We define the I/Os.
ranking_file <- paste0(output_data_dir, config$ranking_files[[i]])
file_prefix <- paste0("ROT_", strsplit(config$ranking_files[[i]], split = "rankings.tsv")[[1]])
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
select(Gene, ref_logFC, ranking_value)
rm(ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
rm(tmp_res, ranking)
# We map the gene symbols to the EntrezGene identifiers.
# NOTE(review): symbols without a match keep NA after the left join and thus
# become "hsa:NA" below - confirm this is intended.
mapped_egenes <- bitr(ranking_clean$Gene,
fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db) #nolint
ranking_final <- merge(x = ranking_clean,
y = mapped_egenes,
by.x = "Gene",
by.y = "SYMBOL",
all.x = TRUE) %>%
mutate(EGene = paste0("hsa:", ENTREZID)) %>% select(EGene, LFC, ranking_value)
ranking_final$LFC <- as.numeric(ranking_final$LFC)
ranking_final$ranking_value <- as.numeric(ranking_final$ranking_value)
ranking_final <- ranking_final %>% arrange(desc(ranking_value))
rm(mapped_egenes, ranking_clean)
# We prepare the gene id specific rankings (using entrezgene ids).
logFC_eg <- ranking_final$LFC
ranking_eg <- ranking_final$ranking_value
names(logFC_eg) <- ranking_final$EGene
names(ranking_eg) <- ranking_final$EGene
# Keep only the first (i.e. best-ranked, since sorted above) occurrence of each id.
logFC_eg <- logFC_eg[!duplicated(names(logFC_eg))]
ranking_eg <- ranking_eg[!duplicated(names(ranking_eg))]
rm(ranking_final)
# We set the pathway node weights from the gene Pi values.
# Here, we go back from the original Pi values [0, +Inf] to pseudo P values [0, 1].
# alphaMLG then uses -log10(Pseudo P values) to assign the weights that are therefore the
# original Pi values.
loc_kpg <- kpg
loc_kpg <- setNodeWeights(loc_kpg, weights = alphaMLG(10 ^ (-ranking_eg)), defaultWeight = 1)
rm(ranking_eg)
# We run the GSEA itself (topology-aware pathway analysis, seeded for reproducibility).
gsea_topo_kegg <- pe(x = logFC_eg,
graphs = loc_kpg,
ref = NULL,
nboot = 25000,
verbose = TRUE,
seed = config$seed)
rm(logFC_eg)
# We format the results (ordered by perturbation P value) and save them.
gsea_topo_kegg_table <- summary(gsea_topo_kegg, pathNames = kpn, order.by = "pPert")
gsea_topo_kegg_table_fn <- paste0(output_data_dir, file_prefix, "KEGG.tsv")
write.table(gsea_topo_kegg_table,
file = gsea_topo_kegg_table_fn,
sep = "\t",
quote = FALSE,
col.names = NA)
rm(gsea_topo_kegg, gsea_topo_kegg_table, gsea_topo_kegg_table_fn)
message(paste0("[", Sys.time(), "][", file_prefix, "] ROT GSEA enrichment (KEGG) done."))
rm(loc_kpg, file_prefix)
} # End for each file to analyse.
rm(i, kpn, kpg)
# We log the session details for reproducibility.
rm(config, input_data_dir, output_data_dir)
sessionInfo()
#!/bin/bash -l
#SBATCH -J geneder:17:rot_enrich
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 8
#SBATCH --time=0-02:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/17-Enrichment_all/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
# Actual jobs: run the ROntoTools enrichment, logging stdout and stderr separately.
# Expansions are quoted to avoid word splitting/globbing on the paths (shellcheck SC2086).
Rscript --vanilla "${CODE_FOLDER}/ROT_enrich.R" > "${OUTPUT_FOLDER}/ROT_enrich_log.out" 2> "${OUTPUT_FOLDER}/ROT_enrich_log.err"
# Moving the slurm log file to data (the glob must stay outside the quotes to expand).
mv "${CODE_FOLDER}"/slurm-*out "${OUTPUT_FOLDER}/"
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("stringr")
library("org.Hs.eg.db")
library("tidyverse")
library("ReactomePA")
library("DOSE")
library("clusterProfiler")
library("msigdbr")
library("pathview")
# Project-local helpers: configuration reader (read_config) and misc utilities.
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
# Use cairo for bitmap devices (works on headless cluster nodes without X11).
options(bitmapType = "cairo")
# Merge the global configuration with the local (analysis-specific) one.
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
# Configure MSigDB: human gene sets as (set name, EntrezGene id) pairs.
msigdb <- msigdbr(species = "Homo sapiens") %>% dplyr::select(gs_name, entrez_gene)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' @title Duplicate rows by splitting on the first cell.
#'
#' @description Splits the first cell of a three-cell row on `sep`, producing one
#' output row per gene symbol while carrying over the other two cells unchanged.
#' Ex: ("a|b", "x", "y") --> ("a", "x", "y")
#'                           ("b", "x", "y")
#'
#' @param row A character vector of three cells: gene symbols (possibly joined by
#' the separator), a log fold-change, and a ranking value.
#' @param sep The string separator for the split (matched literally).
#' @return A character matrix with columns Gene, LFC and ranking_value
#' (possibly only one row).
duplicate_row_first_cell <- function(row, sep = "|") {
  symbols <- strsplit(row[1], sep, fixed = TRUE)[[1]]
  do.call(cbind, list(Gene = symbols,
                      LFC = rep(row[2], length(symbols)),
                      ranking_value = rep(row[3], length(symbols))))
}
# =========================================
# i <- 2
# =========================================
# NOTE(review): the ranking-file index is hard-coded here (looks like a debug
# leftover); the sibling scripts loop over config$ranking_files - presumably
# this script should too. Confirm before relying on its output.
i <-2
# We define the I/Os.
ranking_file <- paste0(output_data_dir, config$ranking_files[[i]])
file_prefix <- paste0("CP_", strsplit(config$ranking_files[[i]], split = "rankings.tsv")[[1]])
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%