Commit d3a82871 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Refactoring step 06 to process all datasets at once (Part II).

parent 77f8c71f
......@@ -132,6 +132,12 @@ for (i in seq_len(length(config$integrations))) {
row.names = 1,
as.is = TRUE)
rm(int_results_fn)
# If there are no results, we do not do anything (rare - only some limma on iPSC-DA).
#if (dim(integration_results)[1] == 0) {
# rm(selection, file_prefix, integration_results)
# next
#}
# We create the data structures to later plot the use counts.
results_core_matrix_full <- integration_results %>%
......@@ -146,7 +152,9 @@ for (i in seq_len(length(config$integrations))) {
# We filter out to non consensus log fold changes only for the use count plots.
results_csss_lfc <- rep(integration_results$wavg_csss_logFC, dim(results_core_matrix_full)[2])
dim(results_csss_lfc) <- dim(results_core_matrix_full)
results_core_matrix_full[sign(results_core_matrix_full) != sign(results_csss_lfc)] <- NA
if (dim(results_core_matrix_full)[1] > 0) {
results_core_matrix_full[sign(results_core_matrix_full) != sign(results_csss_lfc)] <- NA
}
nb_nas <- c(0, sapply(results_core_matrix_full,
function(y) sum(is.na(y))))
use_count_full <- dim(results_core_matrix_full)[1] - nb_nas
......@@ -221,7 +229,7 @@ for (i in seq_len(length(config$integrations))) {
rm(sign_genes_4ecdf, ecdf_fn, myplot)
} # End if there is at least one significant DEG.
# We now save the individual gene lists (even if empty).
# We now save the individual gene lists (even if empty).
save_sign_genes(sign_genes, paste0(file_prefix, pval_comb$name, "_"))
rm(pval_comb, sign_genes)
} # End for all P value combination methods.
......
......@@ -45,99 +45,91 @@ for (i in seq_len(length(config$integrations))) {
# We get the integration name for that scheme.
integration <- config$integrations[[i]]
# We repeat the analysis for all VSN usages.
for (j in seq_len(length(config$variance_methods))) {
# We get the current VSN configuration.
vsn <- config$variance_methods[[j]]
# We loop over the different limma analyses.
for (k in seq_len(length(config$limma_analyses))) {
# We define the current limma analysis.
limma_analysis <- config$limma_analyses[[k]]
# We only focus on the PD vs Control limma analysis.
if (limma_analysis$name != "PDVsControl") {
rm(limma_analysis)
next;
}
# We loop over the different gene-probe selection methods.
for (l in seq_len(length(config$selections))) {
# We define the current gene-probe selection method.
selection <- config$selections[[l]]
# We define the file prefix for this configuration defined
# based on all above loops.
file_prefix <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_")
# We read the results of the integration (contains results for all P value combination
# methods).
int_results_fn <- paste0(file_prefix, "integration.tsv")
integration_results <- read.table(int_results_fn,
header = TRUE,
sep = "\t",
row.names = NULL,
as.is = TRUE)
rm(int_results_fn)
for (o in seq_len(length(config$p_val_combinations))) {
# We define the I/Os.
pval_comb <- config$p_val_combinations[[o]]
pval_meth <- paste0("int_csss_adj_pval_", pval_comb$name)
# We filter the results for that particular P value combination method.
diff_exp_results <- integration_results %>%
dplyr::select(SYMBOL, !! sym(pval_meth))
# We filter the list of PD associated genes as well to keep only the genes that
# are in both lists.
pd_genes_loc <- merge(pd_genes, diff_exp_results, by.x = "V1", by.y = "SYMBOL")
# We need to compute the contingency table.
nb_tt <- dim(pd_genes_loc %>%
dplyr::filter(.data[[pval_meth]] < config$p_val_sign_thres))[1]
nb_tf <- dim(pd_genes_loc %>%
dplyr::filter(.data[[pval_meth]] >= config$p_val_sign_thres))[1]
nb_ft <- dim(diff_exp_results %>%
dplyr::filter(.data[[pval_meth]] < config$p_val_sign_thres))[1]
nb_ff <- dim(diff_exp_results %>%
dplyr::filter(.data[[pval_meth]] >= config$p_val_sign_thres))[1]
cont_table <- matrix(c(nb_tt, nb_tf, nb_ft, nb_ff), nrow = 2,
dimnames = list(c("DEGs", "Non DEGs"),
c("PD biomarkers", "Non PD biomarkers")))
rm(pval_meth, diff_exp_results)
# We run the Fisher exact test.
ft_res <- fisher.test(cont_table, alternative = "two.sided")
# We print the results and log.
all_res <- rbind(all_res, c(integration$name, vsn$name, limma_analysis$name,
selection$name, pval_comb$name, nb_tt, nb_tf,
nb_ft, nb_ff, ft_res$p.value))
message(paste0("[", Sys.time(), "][", integration$name, "][", vsn$name, "][",
# We loop over the different limma analyses.
for (k in seq_len(length(config$limma_analyses))) {
# We define the current limma analysis.
limma_analysis <- config$limma_analyses[[k]]
# We only focus on the PD vs Control limma analysis.
if (limma_analysis$name != "PDVsControl") {
rm(limma_analysis)
next;
}
# We loop over the different gene-probe selection methods.
for (l in seq_len(length(config$selections))) {
# We define the current gene-probe selection method.
selection <- config$selections[[l]]
# We define the file prefix for this configuration defined
# based on all above loops.
file_prefix <- paste0(output_data_dir, integration$name, "_",
limma_analysis$name, "_", selection$name, "_")
# We read the results of the integration (contains results for all P value combination
# methods).
int_results_fn <- paste0(file_prefix, "integration.tsv")
integration_results <- read.table(int_results_fn,
header = TRUE,
sep = "\t",
row.names = NULL,
as.is = TRUE)
rm(int_results_fn)
for (o in seq_len(length(config$p_val_combinations))) {
# We define the I/Os.
pval_comb <- config$p_val_combinations[[o]]
pval_meth <- paste0("int_csss_adj_pval_", pval_comb$name)
# We filter the results for that particular P value combination method.
diff_exp_results <- integration_results %>%
dplyr::select(SYMBOL, !! sym(pval_meth))
# We filter the list of PD associated genes as well to keep only the genes that
# are in both lists.
pd_genes_loc <- merge(pd_genes, diff_exp_results, by.x = "V1", by.y = "SYMBOL")
# We need to compute the contingency table.
nb_tt <- dim(pd_genes_loc %>%
dplyr::filter(.data[[pval_meth]] < config$p_val_sign_thres))[1]
nb_tf <- dim(pd_genes_loc %>%
dplyr::filter(.data[[pval_meth]] >= config$p_val_sign_thres))[1]
nb_ft <- dim(diff_exp_results %>%
dplyr::filter(.data[[pval_meth]] < config$p_val_sign_thres))[1]
nb_ff <- dim(diff_exp_results %>%
dplyr::filter(.data[[pval_meth]] >= config$p_val_sign_thres))[1]
cont_table <- matrix(c(nb_tt, nb_tf, nb_ft, nb_ff), nrow = 2,
dimnames = list(c("DEGs", "Non DEGs"),
c("PD biomarkers", "Non PD biomarkers")))
rm(pval_meth, diff_exp_results)
# We run the Fisher exact test.
ft_res <- fisher.test(cont_table, alternative = "two.sided")
# We print the results and log.
all_res <- rbind(all_res, c(integration$name, limma_analysis$name,
selection$name, pval_comb$name, nb_tt, nb_tf,
nb_ft, nb_ff, ft_res$p.value))
message(paste0("[", Sys.time(), "][", integration$name, "][",
limma_analysis$name, "][", selection$name, "][", pval_comb$name,
"] Enrichment computed (P = ", ft_res$p.value, ")."))
rm(cont_table, nb_tt, nb_tf, nb_ft, nb_ff, ft_res, pd_genes_loc, pval_comb)
}
rm(o)
rm(selection, file_prefix, integration_results)
} # End of for each gene-probe selection.
rm(l, limma_analysis)
} # End of for each limma analysis.
rm(k, vsn)
} # End for each variance configuration.
rm(j, integration)
rm(cont_table, nb_tt, nb_tf, nb_ft, nb_ff, ft_res, pd_genes_loc, pval_comb)
}
rm(o)
rm(selection, file_prefix, integration_results)
} # End of for each gene-probe selection.
rm(l, limma_analysis)
} # End of for each limma analysis.
rm(k, integration)
} # End for each data integration scheme.
rm(i, pd_genes)
# We save the full results (with names).
colnames(all_res) <- c("Integration", "Variance", "Comparison", "Probe selection",
"P value combination", "#TT", "#TF", "#FT", "#FF", "P")
colnames(all_res) <- c("Integration", "Comparison", "Probe selection",
"P value combination", "#TT", "#TF", "#FT", "#FF", "P")
all_res_fn <- paste0(output_data_dir, "Enrichment_PD_associated_genes.tsv")
write.table(all_res, file = all_res_fn, sep = "\t", quote = FALSE, col.names = TRUE,
row.names = FALSE)
......
......@@ -19,44 +19,30 @@ config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
exp_data_dir <- paste0(config$global_data_dir, config$local_exp_data_dir)
rsexp_data_dir <- paste0(config$global_data_dir, config$local_rs_exp_data_dir)
clinical_data_dir <- paste0(config$global_data_dir, config$local_clinical_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' @title To select the best probe for a gene (simplified version)
#'
#' @description For a given dataset, the best probe will be the one with the highest average
#' expression. Contrary to the more complex version, this one is based on the expression data
#' alone and not on the differential expression. In addition, here each dataset is treated
#' independently.
#'
#' @param row A dataframe row that contains one probe list (in addition to the official gene name).
#' @return The identifier of the selected probe.
select_best_simplified <- function(row) {
probes <- row$probes
if (is.na(probes)) {
return("")
#' @title Duplicate data-frame rows based on one key field.
#'
#' @description The key field is expected to contain several values concatenated with a
#' separator (e.g., "A|B|C"). The row is duplicated once per value; all other fields are
#' simply copied as is. Rows whose key does not split (empty string) yield a single copy
#' with an empty key.
#'
#' @param row A single row (character vector, e.g., as produced by apply(df, 1, ...)).
#' @param ikey Index of the key field to split (default 3).
#' @param sep Separator used to split the key field (default "|", taken literally).
#' @return A character matrix with one row per key value and one column per input field.
duplicate_row_multiple_gene_names <- function(row, ikey = 3, sep = "|") {
  # fixed = TRUE: the separator is taken literally, not as a regex ("|" would
  # otherwise split between every character).
  keys <- strsplit(as.character(row[ikey]), sep, fixed = TRUE)[[1]]
  if (length(keys) == 0) {
    # strsplit("") returns character(0); keep a single row with an empty key.
    keys <- ""
  }
  # We duplicate all fields: one column per original field, each value repeated
  # once per key (avoids growing the matrix inside a loop).
  new_data <- do.call(cbind, lapply(seq_along(row),
                                    function(n) rep(row[n], length(keys))))
  # We adjust the key column so each duplicated row carries its own key value.
  new_data[, ikey] <- keys
  return(new_data)
}
# ================================================================================================
......@@ -64,101 +50,121 @@ select_best_simplified <- function(row) {
# ================================================================================================
# The objective here is to use the previously defined gene-probe match data to refine the existing
# expression data for all datasets and all limma analyses. The datasets are treated
# individually. For each dataset configuration (i.e., variance correction),
# each dataset is treated.
# We read the matching data (for all array types).
# This will then later be adapted for each particular case.
matching_data_fn <- paste0(output_data_dir, "Combined_probe_matching.tsv")
matching_data <- read.table(file = matching_data_fn,
sep = "\t",
header = TRUE,
row.names = 1,
as.is = TRUE)
rm(matching_data_fn)
# We create once for all the list of all datasets (microarray and RNA-seq).
all_datasets <- c(config$datasets, config$rs_datasets)
# We do all datasets one by one.
for (i in seq_len(length(all_datasets))) {
# We get the dataset details.
dataset <- all_datasets[[i]]
dataset_name <- dataset$dataset_name
dataset_arraytype <- dataset$array_type
# expression data for all datasets and all limma analyses but only for the main integration scheme
# (SNage in this case). The datasets are treated individually and the best variance configuration
# is always used.
# We only do the SNage integration.
integration <- config$integrations[[3]]
int_criteria <- str_split(integration$criteria, ",")[[1]]
# We repeat the analysis for all VSN usages.
for (j in seq_len(length(config$variance_methods))) {
# We loop over the different limma analyses.
for (k in seq_len(length(config$limma_analyses))) {
# We get the current VSN configuration.
vsn <- config$variance_methods[[j]]
# We define the current limma analysis.
limma_analysis <- config$limma_analyses[[k]]
# We do all datasets one by one.
for (i in seq_len(length(config$datasets))) {
# We get the dataset details.
dataset <- config$datasets[[i]]
dataset_name <- dataset$dataset_name
dataset_arraytype <- dataset$array_type
# We only read the dataset if it corresponds to the integration criteria.
satisfy_all <- TRUE
for (c in seq_len(length(int_criteria))) {
int_criterium <- str_split(int_criteria[c], ";")[[1]]
if (dataset[int_criterium[1]] != int_criterium[2]) {
satisfy_all <- FALSE
}
rm(int_criterium)
}
rm(c)
if (satisfy_all != TRUE) {
rm(satisfy_all, dataset, dataset_name, dataset_arraytype)
next
}
rm(satisfy_all)
# We get the expression data.
local_exp_data_dir <- exp_data_dir
file_suffix <- paste0("_", vsn$name, ".tsv")
if (dataset_arraytype == "RNAseq_EG") {
local_exp_data_dir <- rsexp_data_dir
file_suffix <- paste0("_cpm.tsv")
}
vsn_name <- config$modifications[[i]]$best_variance
file_suffix <- paste0("_", vsn_name, ".tsv")
exp_data_fn <- paste0(local_exp_data_dir, dataset_name, "_normalized", file_suffix)
exp_mat <- ArrayUtils::read_eset(exp_data_fn, as_eset = FALSE)
rm(exp_data_fn, file_suffix, local_exp_data_dir)
message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name,
rm(exp_data_fn, file_suffix, local_exp_data_dir, vsn_name)
message(paste0("[", Sys.time(), "][", dataset_name,
"] Expression data read."))
# Read the matching data.
file_prefix_short <- paste0(output_data_dir, integration$name, "_",
limma_analysis$name, "_")
matching_data_fn <- paste0(file_prefix_short, "matching_data.tsv")
matching_data <- read.table(matching_data_fn,
header = TRUE,
sep = "\t",
row.names = NULL,
as.is = TRUE)
tmp_res <- t(apply(matching_data, 1,
duplicate_row_multiple_gene_names))
matching_data_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
names(matching_data_clean) <- names(matching_data)
rm(matching_data_fn, tmp_res, matching_data, file_prefix_short)
message(paste0("[", Sys.time(), "] Matching data read."))
# It could be that there is not enough samples to consider that dataset for the current
# Limma analysis but instead of reading again the number of samples, we simply rely on
# the matching data.
array_type_name <- paste0(str_replace(dataset$array_type, "-", "."), "_max.avg")
if (is.null(matching_data_clean[[array_type_name]])) {
rm(matching_data_clean, exp_mat, array_type_name)
rm(dataset_arraytype, dataset_name, dataset)
next
}
# We select the best probe for each gene and for each platform (for this analysis).
array_type_name <- str_replace(dataset$array_type, "-", ".")
local_matching_data <- data.frame(genes = rownames(matching_data),
probes = matching_data[[array_type_name]],
stringsAsFactors = FALSE)
rm(array_type_name)
# We then do the selection. Note: this can be CPU intensive.
tmp <- local_matching_data %>%
dplyr::rowwise() %>%
dplyr::do(tmp = select_best_simplified(.))
# We refine the above results (reformating / names).
new_data <- unlist(tmp$tmp)
new_matching_data <- local_matching_data %>%
mutate(best_probe = new_data) %>%
filter(best_probe != "")
rm(tmp, new_data, local_matching_data)
new_matching_data <- data.frame(genes = matching_data_clean$genes,
probes = matching_data_clean[[array_type_name]],
stringsAsFactors = FALSE) %>%
filter(!is.na(probes))
rm(array_type_name, matching_data_clean)
# We finally filter the original expression data (probe based)
# to keep only the best probes.
exp_mat_bestprobe <- exp_mat[new_matching_data$best_probe, ]
exp_mat_bestprobe <- exp_mat[intersect(new_matching_data$probes, row.names(exp_mat)), ]
exp_mat_bestprobe <- exp_mat_bestprobe %>%
as.data.frame() %>%
mutate(best_probe = rownames(exp_mat_bestprobe))
exp_mat_matched <- merge(x = new_matching_data,
y = exp_mat_bestprobe,
by = "best_probe") %>%
select(-one_of("best_probe", "probes"))
by.x = "probes",
by.y = "best_probe") %>%
select(-one_of("probes")) %>% unique()
rm(exp_mat_bestprobe, new_matching_data, exp_mat)
# We save the matched data.
gexpr_data_fn <- paste0(output_data_dir, dataset_name, "_", vsn$name,
"_gene_expr_data.tsv")
gexpr_data_fn <- paste0(output_data_dir, dataset_name, "_", integration$name,
"_", limma_analysis$name, "_gene_expr_data.tsv")
write.table(exp_mat_matched,
row.names = FALSE,
file = gexpr_data_fn,
sep = "\t",
quote = FALSE)
rm(gexpr_data_fn, exp_mat_matched)
message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name,
message(paste0("[", Sys.time(), "][", dataset_name, "][", limma_analysis$name,
"] Expression data matched and saved."))
rm(vsn)
} # End of foreach VSN configuration.
rm(j, dataset_arraytype, dataset_name, dataset)
rm(dataset_arraytype, dataset_name, dataset)
} # End for each limma analysis.
rm(i, limma_analysis)
} # End for each dataset.
rm(i, matching_data, all_datasets)
rm(k, integration, int_criteria)
# We log the session details for reproducibility.
rm(config, output_data_dir, exp_data_dir, input_data_dir, clinical_data_dir, rsexp_data_dir)
rm(config, output_data_dir, exp_data_dir, input_data_dir)
sessionInfo()
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 5
#SBATCH --time=0-00:35:00
#SBATCH --time=0-01:20:00
#SBATCH -p batch
#SBATCH --qos=normal
......
This diff is collapsed.
......@@ -103,34 +103,6 @@ select_best <- function(row, array_indexes) {
return(to_return)
}
# duplicate_row_multiple_gene_names <- function(row, sep = "|") {
# keys <- strsplit(as.character(row[2]), sep, fixed = TRUE)[[1]]
# if (length(keys) == 0) {
# keys <- ""
# }
# val_1 <- rep(row[1], length(keys))
# val_3 <- rep(row[3], length(keys))
# val_4 <- rep(row[4], length(keys))
# val_5 <- rep(row[5], length(keys))
# val_6 <- rep(row[6], length(keys))
# val_7 <- rep(row[7], length(keys))
# val_8 <- rep(row[8], length(keys))
# val_9 <- rep(row[9], length(keys))
# val_10 <- rep(row[10], length(keys))
# to_ret <- cbind(val_1,
# keys,
# val_3,
# val_4,
# val_5,
# val_6,
# val_7,
# val_8,
# val_9,
# val_10)
# names(to_ret) <- names(row)
# return(to_ret)
# }
#' Function to duplicate data-frame rows based on one key field.
#' This field is used to do the duplication based on a string split. The other fields are
#' simply copied as is.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment