Commit 84aa8519 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Few adjustment for gene figures (both for top hits and selected genes).

parent 0ef244af
......@@ -18,7 +18,6 @@ config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
input_edata_dir <- paste0(config$global_data_dir, config$local_exp_data_dir)
input_rsedata_dir <- paste0(config$global_data_dir, config$local_rs_exp_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
......@@ -231,7 +230,7 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
y = Expression,
colour = !!ensym(clinical_factor))) +
geom_boxplot(lwd = 1.2, outlier.shape = NA) +
geom_jitter(size = 3, width = 0.2, height = 0.1) +
geom_jitter(size = 3, width = 0.2, height = 0) +
facet_wrap(~ dataset, scales = "free") +
theme(legend.position = "bottom",
axis.title.x = element_blank(),
......@@ -343,6 +342,7 @@ for (i in seq_len(length(config$integrations))) {
rm(satisfy_all, dataset, dataset_name, dataset_arraytype)
next
}
rm(satisfy_all)
# We define the dataset specific I/Os.
local_input_edata_dir <- input_edata_dir
......@@ -394,5 +394,5 @@ for (i in seq_len(length(config$integrations))) {
rm(i, datasets, ctrl_plt, diz_plt, gds_plt, male_plt)
# We log the session details for reproducibility.
rm(config, output_data_dir, input_data_dir, input_edata_dir, input_rsedata_dir)
rm(config, output_data_dir, input_data_dir, input_edata_dir)
sessionInfo()
......@@ -18,13 +18,32 @@ config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
input_edata_dir <- paste0(config$global_data_dir, config$local_exp_data_dir)
input_rsedata_dir <- paste0(config$global_data_dir, config$local_rs_exp_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' Function to duplicate data-frame rows based on one key field.
#' This field is used to do the duplication based on a string split. The other fields are
#' simply copied as is.
duplicate_row_multiple_gene_names <- function(row, ikey = 3, sep = "|") {
keys <- strsplit(as.character(row[ikey]), sep, fixed = TRUE)[[1]]
if (length(keys) == 0) {
keys <- ""
}
# We duplicate all fields.
new_data <- c()
for (n in seq_len(length(row))) {
new_data <- cbind(new_data, rep(row[n], length(keys)))
}
# We adjust the keys and the column names.
new_data[, ikey] <- keys
return(new_data)
}
#' @title Plot the differential expression of a given gene.
#'
#' @description This function create one plot for a given gene of a given limma
......@@ -59,7 +78,7 @@ plot_gene <- function(gene, gene_datasets, output_data_dir, integration_name, li
m_indx_map, config, plot_all = FALSE, plot_datasets = TRUE) {
# We set the I/Os.
file_prefix_short <- paste0(output_data_dir, integration_name, "_", vsn$name, "_",
file_prefix_short <- paste0(output_data_dir, integration_name, "_",
limma_analysis$name, "_")
# Read the matching data.
......@@ -69,7 +88,14 @@ plot_gene <- function(gene, gene_datasets, output_data_dir, integration_name, li
sep = "\t",
row.names = NULL,
as.is = TRUE)
rm(matching_data_fn)
tmp_res <- t(apply(matching_data, 1,
duplicate_row_multiple_gene_names))
matching_data_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
names(matching_data_clean) <- names(matching_data)
matching_data <- matching_data_clean
rm(matching_data_fn, tmp_res, matching_data_clean)
message(paste0("[", Sys.time(), "] Matching data read."))
# We read the results of the integration.
......@@ -82,34 +108,41 @@ plot_gene <- function(gene, gene_datasets, output_data_dir, integration_name, li
rm(int_results_fn)
message(paste0("[", Sys.time(), "] Integration results read."))
# We plot the top hits defined by the Marot-Mayer combination method.
# We plot the gene defined by the Marot-Mayer combination method.
# We define the I/Os.
o <- 1
pval_comb <- config$p_val_combinations[[o]]
file_prefix <- paste0(file_prefix_short, selection_name, "_", pval_comb$name, "_")
# We get the relevant probe identifiers.
matching_data_gene <- matching_data %>%
dplyr::filter(SYMBOL == gene) %>%
dplyr::filter(genes == gene) %>%
dplyr::select(ends_with(str_replace(selection_name, "-", ".")))
# We do all datasets one by one and collect the clinical and expression data.
full_profiles <- vector()
full_zprofiles <- vector()
for (ii in seq_len(length(clin_data_all))) {
# We define the second index as to switch between the global list (full)
# and current list (selection).
jj <- m_indx_map[[ii]]
# We get the dataset details (name), and retrieve the correct probe id.
dataset <- datasets[[jj]]
dataset_name <- dataset$dataset_name
dataset_arraytype <- stringr::str_replace(dataset$array_type, "-", ".")
probeset_tag <- paste0(dataset_arraytype, "_",
stringr::str_replace(selection_name, "-", "."))
probe_id <- matching_data_gene[[probeset_tag]]
rm(dataset, probeset_tag, dataset_arraytype, jj)
probe_ids <- matching_data_gene[[probeset_tag]]
# We here rely on the fact that even in case of multiple gene mapping only one probe
# should be present here. We still log a warning in case it is not the case
probe_id <- probe_ids[!is.na(probe_ids)]
if (length(probe_id) > 1) {
message(paste0("[", Sys.time(), "] Gene ", gene$SYMBOL, " has multiple probes for dataset ",
dataset_name, " (", dataset_arraytype, "): ", paste0(probe_ids, collapse = "-")))
}
rm(dataset, probeset_tag, dataset_arraytype, jj, probe_ids)
# We read the clinical data.
clin_data <- clin_data_all[[ii]] %>%
......@@ -121,23 +154,27 @@ plot_gene <- function(gene, gene_datasets, output_data_dir, integration_name, li
exp_data <- t(exp_data_all[[ii]][1, ])
exp_data[, 1] <- NA
if (!is.null(probe_id)) {
if (!is.na(probe_id)) {
exp_data <- t(exp_data_all[[ii]][probe_id, ])
if (length(probe_id) > 0) {
if (!is.na(probe_id)) {
exp_data <- t(exp_data_all[[ii]][probe_id, ])
}
}
}
colnames(exp_data) <- "Expression"
# We read the expression data (Z-scores).
# The dataset might not have a probe for that gene.
zexp_data <- t(zexp_data_all[[ii]][1, ])
zexp_data[, 1] <- NA
if (!is.null(probe_id)) {
if (!is.na(probe_id)) {
zexp_data <- t(zexp_data_all[[ii]][probe_id, ])
if (length(probe_id) > 0) {
if (!is.na(probe_id)) {
zexp_data <- t(zexp_data_all[[ii]][probe_id, ])
}
}
}
colnames(zexp_data) <- "Expression"
# We assemble clinical and expression and reformat.
local_block <- cbind(clin_data, exp_data) %>%
dplyr::mutate(dataset = dataset_name)
......@@ -147,18 +184,10 @@ plot_gene <- function(gene, gene_datasets, output_data_dir, integration_name, li
colnames(zlocal_block)[colnames(zlocal_block) == probe_id] <- "Expression"
rm(dataset_name, probe_id, clin_data, exp_data, zexp_data)
# We need to keep only the relevant samples (i.e., the M.PD and F.PD for
# Gender_PD for instance since the M.Control and F.Control were not used in the
# limma analysis and will only pollute the signal.
## relevant_categories <- str_trim(str_split(limma_analysis$coefficient,
## pattern = "[-()]")[[1]])
## local_block <- local_block %>%
## dplyr::filter(!! sym(limma_analysis$factor) %in% relevant_categories)
full_profiles <- rbind(full_profiles, local_block)
## zlocal_block <- zlocal_block %>%
## dplyr::filter(!! sym(limma_analysis$factor) %in% relevant_categories)
# We do not need to keep only the relevant samples (we want all samples this time).
full_profiles <- rbind(full_profiles, local_block)
full_zprofiles <- rbind(full_zprofiles, zlocal_block)
rm(local_block, zlocal_block) ##, relevant_categories)
rm(local_block, zlocal_block)
} # End for all relevant datasets.
rm(ii)
......@@ -182,7 +211,7 @@ plot_gene <- function(gene, gene_datasets, output_data_dir, integration_name, li
# We can create the plot.
output_figure_file <- paste0(file_prefix, "extraplot_", gene, ".png")
# png(output_figure_file)
##png(output_figure_file)
message(paste0("[", Sys.time(), "] Plotting ", gene, " with clinical factor ",
clinical_factor))
myplot <- NULL
......@@ -246,11 +275,10 @@ plot_gene <- function(gene, gene_datasets, output_data_dir, integration_name, li
ggtitle(gene)
}
print(myplot)
# dev.off()
##dev.off()
rm(myplot, output_figure_file, clinical_factor)
message(paste0("[", Sys.time(), "] Gene ", gene, " done."))
rm(gene, matching_data_gene, plot_profiles)
rm(pval_comb, file_prefix)
rm(o, results_data_df, matching_data, file_prefix_short)
} # End of function.
......@@ -261,29 +289,28 @@ plot_gene <- function(gene, gene_datasets, output_data_dir, integration_name, li
# We define the color palette we want to use (always all categories in this case).
# F.CTRL, F.PD, M.CTRL, M.PD
current_plt <- c("#7da7de", "#00599d", "#c2e0ae", "#62a73b")
gds_plt <- c("#7da7de", "#00599d", "#c2e0ae", "#62a73b")
# The objective is simply to plot the top differentially expressed genes.
# The objective is simply to plot selected genes.
# Data are plotted per dataset (each one in an individual
# subplot) as well as all together (when the data are first transformed into z-scores).
# We create once for all the list of all datasets (microarray and RNA-seq).
datasets <- c(config$datasets, config$rs_datasets)
datasets <- config$datasets
# We do only the SNage integration scheme.
i <- 3
integration <- config$integrations[[i]]
int_criteria <- str_split(integration$criteria, ",")[[1]]
# We perform the analysis only for the "VSN" case.
j <- 1
vsn <- config$variance_methods[[j]]
# We rely on the last limma analyses (gender_disease_status) but this is only to get the
# 4 categories of interest.
##k <- 5
##limma_analysis <- config$limma_analyses[[k]]
# We use the first gene-probe selection method (max-avg).
l <- 1
selection <- config$selections[[l]]
......@@ -319,17 +346,14 @@ for (m in seq_len(length(datasets))) {
# We define the dataset specific I/Os.
local_input_edata_dir <- input_edata_dir
file_suffix <- paste0("_", vsn$name, ".tsv")
file_long_suffix <- paste0("_", vsn$name, "_zscores.tsv")
if (dataset_arraytype == "RNAseq_EG") {
local_input_edata_dir <- input_rsedata_dir
file_suffix <- paste0("_cpm.tsv")
file_long_suffix <- paste0("_cpm_zscores.tsv")
}
local_input_data_dir <- input_data_dir
vsn_name <- config$modifications[[m]]$best_variance
file_suffix <- paste0("_", vsn_name, ".tsv")
file_long_suffix <- paste0("_", vsn_name, "_zscores.tsv")
exp_data_fn <- paste0(local_input_edata_dir, dataset_name, "_normalized", file_suffix)
zexp_data_fn <- paste0(local_input_edata_dir, dataset_name, "_normalized", file_long_suffix)
zexp_data_fn <- paste0(local_input_data_dir, dataset_name, "_normalized", file_long_suffix)
clin_data_fn <- paste0(local_input_edata_dir, dataset_name, "_clinical.tsv")
rm(local_input_edata_dir, file_suffix, file_long_suffix)
rm(local_input_edata_dir, local_input_data_dir, vsn_name, file_suffix, file_long_suffix)
# We read the data.
clin_data_all[[m_indx]] <- read.table(clin_data_fn,
......@@ -360,7 +384,7 @@ top_genes <- c("H2AC6", "GDPD5", "POGK", "CA2", "SALL1", "SGSH", "CX3CL1",
"FAM189A1", "DYNC1LI1", "EFNA1", "NDUFA10", "MCM3AP", "IFITM2", "MAPK1",
"ID3", "ECE1", "GRHL3", "RGS2", "YBX3")
for (gene in top_genes) {
plotfilename <- paste0(output_data_dir, "SNage_VSN_PDVsControl_females_max-avg_", gene, ".png")
plotfilename <- paste0(output_data_dir, "SNage_PDVsControl_females_max-avg_", gene, ".png")
png(plotfilename)
plot_gene(gene = gene,
gene_datasets = common_datasets,
......@@ -368,11 +392,11 @@ for (gene in top_genes) {
integration_name = integration$name,
limma_analysis = config$limma_analyses[[5]],
selection_name = selection$name,
palette = current_plt,
palette = gds_plt,
clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config,
plot_all = FALSE, plot_datasets = TRUE)
dev.off()
plotfilename <- paste0(output_data_dir, "SNage_VSN_PDVsControl_males_max-avg_", gene, ".png")
plotfilename <- paste0(output_data_dir, "SNage_PDVsControl_males_max-avg_", gene, ".png")
png(plotfilename)
plot_gene(gene = gene,
gene_datasets = common_datasets,
......@@ -380,7 +404,7 @@ for (gene in top_genes) {
integration_name = integration$name,
limma_analysis = config$limma_analyses[[6]],
selection_name = selection$name,
palette = current_plt,
palette = gds_plt,
clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config,
plot_all = FALSE, plot_datasets = TRUE)
dev.off()
......@@ -388,11 +412,46 @@ for (gene in top_genes) {
message(paste0("[",gene, "] plotted."))
}
rm(gene, top_genes, common_datasets)
# Actual plotting of the selected genes.
common_datasets <- c("GSE20159", "GSE20163", "GSE20292", "GSE26927", "GSE49036", "GSE8397", "NBB")
top_genes <- c("CA2", "NR4A2", "EFNA1")
for (gene in top_genes) {
for (dataset in common_datasets) {
plotfilename <- paste0(output_data_dir, "SNage_PDVsControl_females_max-avg_", gene, "_", dataset, ".png")
png(plotfilename)
plot_gene(gene = gene,
gene_datasets = c(dataset),
output_data_dir = output_data_dir,
integration_name = integration$name,
limma_analysis = config$limma_analyses[[5]],
selection_name = selection$name,
palette = gds_plt,
clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config,
plot_all = FALSE, plot_datasets = TRUE)
dev.off()
plotfilename <- paste0(output_data_dir, "SNage_PDVsControl_males_max-avg_", gene, "_", dataset, ".png")
png(plotfilename)
plot_gene(gene = gene,
gene_datasets = c(dataset),
output_data_dir = output_data_dir,
integration_name = integration$name,
limma_analysis = config$limma_analyses[[6]],
selection_name = selection$name,
palette = gds_plt,
clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config,
plot_all = FALSE, plot_datasets = TRUE)
dev.off()
rm(plotfilename)
message(paste0("[",gene, "][", dataset, "] plotted."))
}
}
rm(gene, top_genes, common_datasets)
rm(selection, exp_data_all, zexp_data_all, clin_data_all, m_indx, m_indx_map)
rm(l, current_plt, vsn)
rm(l, gds_plt, vsn)
rm(j, integration, int_criteria)
rm(i, datasets)
# We log the session details for reproducibility.
rm(config, output_data_dir, input_data_dir, input_edata_dir, input_rsedata_dir)
rm(config, output_data_dir, input_data_dir, input_edata_dir)
sessionInfo()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment