Commit 3c574ea8 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

New method to plot the cell type biomarkers per dataset.

parent 5be415ac
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("tidyverse")
library("stringr")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
input_edata_dir <- paste0(config$global_data_dir, config$local_exp_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' Function to duplicate data-frame rows based on one key field.
#' This field is used to do the duplication based on a string split. The other fields are
#' simply copied as is.
duplicate_row_multiple_gene_names <- function(row, ikey = 3, sep = "|") {
keys <- strsplit(as.character(row[ikey]), sep, fixed = TRUE)[[1]]
if (length(keys) == 0) {
keys <- ""
}
# We duplicate all fields.
new_data <- c()
for (n in seq_len(length(row))) {
new_data <- cbind(new_data, rep(row[n], length(keys)))
}
# We adjust the keys and the column names.
new_data[, ikey] <- keys
return(new_data)
}
#' @title Plot the differential expression of a given gene.
#'
#' @description This function create one plot for a given gene of a given limma
#' comparison (obtained for a given configuration). The plot contains the expression
#' values across samples (grouped per dataset and per relevant clinical descriptor).
#'
#' @param genes The genes to plot defined by their gene symbols.
#' @param gene_datasets The datasets to use for plotting (discarding the others).
#' @param output_data_dir A string to indicate the folder in which plots should be saved.
#' @param integration_name A string that contains the name of the current integration
#' scheme.
#' @param limma_analysis A complete configuration of the Limma comparison.
#' @param selection_name A string corresponding to the used gene--probe selection
#' method.
#' @param palette The color palette to use for plotting.
#' @param clin_data_all A list that contains one data-frame per dataset with its
#' clinical data.
#' @param exp_data_all A list that contains one data-frame per dataset with its
#' expression profiles (not transformed).
#' @param zexp_data_all A list that contains one data-frame per dataset with its
#' expression profiles (Z-scores).
#' @param m_indx_map A named list representing a map between two dataset indexes,
#' namely the index of the global dataset configuration (all datasets) and the list
#' of relevant datasets for this specific configuration. The list contains the second
#' index (relevant only) while its names are for the first index (all datasets).
#' @param plot_all A boolean indicating whether the combined data (Z-scores) should be
#' plotted (default value FALSE). One of plot_all and plot_dataset must be set to true.
#' @param plot_datasets A boolean indicating whether the individual data should be
#' plotted (default value TRUE). One of plot_all and plot_dataset must be set to true.
plot_gene <- function(genes, gene_datasets, output_data_dir, integration_name, limma_analysis,
selection_name, palette, clin_data_all, exp_data_all, zexp_data_all,
m_indx_map, config, plot_all = FALSE, plot_datasets = TRUE, title_name) {
# We set the I/Os.
file_prefix_short <- paste0(output_data_dir, integration_name, "_",
limma_analysis$name, "_")
# Read the matching data.
matching_data_fn <- paste0(file_prefix_short, "matching_data.tsv")
matching_data <- read.table(matching_data_fn,
header = TRUE,
sep = "\t",
row.names = NULL,
as.is = TRUE)
tmp_res <- t(apply(matching_data, 1,
duplicate_row_multiple_gene_names))
matching_data_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
names(matching_data_clean) <- names(matching_data)
matching_data <- matching_data_clean
rm(matching_data_fn, tmp_res, matching_data_clean)
message(paste0("[", Sys.time(), "] Matching data read."))
# We read the results of the integration.
int_results_fn <- paste0(file_prefix_short, selection_name, "_integration.tsv")
results_data_df <- read.table(int_results_fn,
header = TRUE,
sep = "\t",
row.names = NULL,
as.is = TRUE)
rm(int_results_fn)
message(paste0("[", Sys.time(), "] Integration results read."))
# We plot the gene defined by the Marot-Mayer combination method.
# We define the I/Os.
o <- 1
pval_comb <- config$p_val_combinations[[o]]
file_prefix <- paste0(file_prefix_short, selection_name, "_", pval_comb$name, "_")
full_profiles <- vector()
full_zprofiles <- vector()
for (gene in genes) {
matching_data_gene <- matching_data %>%
dplyr::filter(genes == gene) %>%
dplyr::select(ends_with(str_replace(selection_name, "-", ".")))
# We do all datasets one by one and collect the clinical and expression data.
for (ii in seq_len(length(clin_data_all))) {
# We define the second index as to switch between the global list (full)
# and current list (selection).
jj <- m_indx_map[[ii]]
# We get the dataset details (name), and retrieve the correct probe id.
dataset <- datasets[[jj]]
dataset_name <- dataset$dataset_name
dataset_arraytype <- stringr::str_replace(dataset$array_type, "-", ".")
probeset_tag <- paste0(dataset_arraytype, "_",
stringr::str_replace(selection_name, "-", "."))
probe_ids <- matching_data_gene[[probeset_tag]]
# We here rely on the fact that even in case of multiple gene mapping only one probe
# should be present here. We still log a warning in case it is not the case
probe_id <- probe_ids[!is.na(probe_ids)] # TOP # beware if we loose a row, it might mess up with our data.
if (length(probe_id) > 1) {
message(paste0("[", Sys.time(), "] Gene ", gene$SYMBOL, " has multiple probes for dataset ",
dataset_name, " (", dataset_arraytype, "): ",
paste0(probe_ids, collapse = "-")))
}
rm(dataset, probeset_tag, dataset_arraytype, jj, probe_ids)
# We read the clinical data.
clin_data <- clin_data_all[[ii]] %>%
dplyr::mutate(gender_disease_status = paste(Gender, Disease.status, sep = ".")) %>%
dplyr::select("Disease.status", "Gender", "gender_disease_status")
# We read the expression data (expression levels).
# The dataset might not have a probe for that gene.
exp_data <- t(exp_data_all[[ii]][1, ])
exp_data[, 1] <- NA
if (!is.null(probe_id)) {
if (length(probe_id) > 0) {
if (!is.na(probe_id)) {
exp_data <- t(exp_data_all[[ii]][probe_id, ]) # TOP # loop over ids...
}
}
}
colnames(exp_data) <- "Expression" # TOP # rep "Expression" with NB_genes...
# We read the expression data (Z-scores).
# The dataset might not have a probe for that gene.
zexp_data <- t(zexp_data_all[[ii]][1, ])
zexp_data[, 1] <- NA
if (!is.null(probe_id)) {
if (length(probe_id) > 0) {
if (!is.na(probe_id)) {
zexp_data <- t(zexp_data_all[[ii]][probe_id, ]) # TOP # loop over ids...
}
}
}
colnames(zexp_data) <- "Expression" # TOP # rep "Expression" with NB_genes...
# We assemble clinical and expression and reformat.
local_block <- cbind(clin_data, exp_data) %>%
dplyr::mutate(dataset = dataset_name) %>%
dplyr::mutate(gene = gene)
colnames(local_block)[colnames(local_block) == probe_id] <- "Expression" # TOP # rep "Expression" with NB_genes...
zlocal_block <- cbind(clin_data, zexp_data) %>%
dplyr::mutate(dataset = dataset_name) %>%
dplyr::mutate(gene = gene)
colnames(zlocal_block)[colnames(zlocal_block) == probe_id] <- "Expression" # TOP # rep "Expression" with NB_genes...
rm(dataset_name, probe_id, clin_data, exp_data, zexp_data)
# We do not need to keep only the relevant samples (we want all samples this time).
full_profiles <- rbind(full_profiles, local_block)
full_zprofiles <- rbind(full_zprofiles, zlocal_block)
rm(local_block, zlocal_block)
} # End for all relevant datasets.
rm(ii)
} # end for each gene.
rm(gene)
# We check if the clinical factor is correctly formatted.
clinical_factor <- limma_analysis$factor
# We do not combine the Z-scores values to the regular values for plotting purposes.
full_profiles <- full_profiles %>%
filter(dataset %in% gene_datasets)
full_zprofiles <- full_zprofiles %>%
filter(dataset %in% gene_datasets) %>%
mutate(dataset = "ALL")
plot_profiles <- c()
if (plot_datasets) {
plot_profiles <- full_profiles
}
rm(full_profiles, full_zprofiles)
# If we have missing values, we clean them.
plot_profiles <- plot_profiles %>%
filter(!is.na(Expression))
# If there is no data at this point (all biomarkers are missing genes),
# we do not plot anything.
if (dim(plot_profiles)[1] == 0) {
message(paste0("[", Sys.time(), "] Plotting ", paste0(genes, collapse = ", "), " with clinical factor ",
clinical_factor))
return()
}
# We can create the plot.
message(paste0("[", Sys.time(), "] Plotting ", paste0(genes, collapse = ", "), " with clinical factor ",
clinical_factor))
myplot <- NULL
if (plot_all) {
# Not yet implemented...
} else {
myplot <- ggplot(plot_profiles, aes(x = gender_disease_status,
y = Expression,
colour = !!ensym(clinical_factor))) +
geom_boxplot(lwd = 1.2, outlier.shape = NA) +
geom_jitter(size = 3, width = 0.1, height = 0) +
facet_wrap(~ gene, scales = "free") +
theme(legend.position = "bottom",
axis.title.x = element_blank(),
axis.title.y = element_text(size = 16,
face = "bold"),
axis.text.x = element_blank(),
axis.text.y = element_text(size = 16,
face = "bold"),
plot.title = element_text(size = 16,
hjust = 0.5,
face = "bold"),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 1.5),
panel.grid.major = element_line(colour = "lightgrey"),
strip.background = element_rect(fill = "black"),
strip.text = element_text(colour = "white",
size = 12,
face = "bold"),
legend.text = element_text(size = 16,
face = "bold")) +
scale_color_manual(values = palette) +
ggtitle(paste0(paste0(gene_datasets, collapse = " - "), " (", title_name, ")"))
}
print(myplot)
rm(myplot, clinical_factor)
message(paste0("[", Sys.time(), "] Genes ", paste0(genes, collapse = ", "), " done."))
rm(genes, matching_data_gene, plot_profiles)
rm(pval_comb, file_prefix)
rm(o, results_data_df, matching_data, file_prefix_short)
} # End of function.
# ================================================================================================
# Main
# ================================================================================================
# We define the color palette we want to use (always all categories in this case).
# F.CTRL, F.PD, M.CTRL, M.PD
gds_plt <- c("#7da7de", "#00599d", "#c2e0ae", "#62a73b")
# The objective is simply to plot selected genes.
# Data are plotted per dataset (each one in an individual
# subplot) as well as all together (when the data are first transformed into z-scores).
# We create once for all the list of all datasets (microarray and RNA-seq).
datasets <- config$datasets
# We do only the SNage integration scheme.
i <- 3
integration <- config$integrations[[i]]
int_criteria <- str_split(integration$criteria, ",")[[1]]
# We use the first gene-probe selection method (max-avg).
l <- 1
selection <- config$selections[[l]]
# We loop over all datasets to read only the relevant datasets
exp_data_all <- list()
zexp_data_all <- list()
clin_data_all <- list()
m_indx <- 1
m_indx_map <- list()
for (m in seq_len(length(datasets))) {
# We extract the configuration of the current dataset.
dataset <- datasets[[m]]
dataset_name <- dataset$dataset_name
dataset_arraytype <- dataset$array_type
# We only read the dataset if it corresponds to the integration criteria.
satisfy_all <- TRUE
for (c in seq_len(length(int_criteria))) {
int_criterium <- str_split(int_criteria[c], ";")[[1]]
if (dataset[int_criterium[1]] != int_criterium[2]) {
satisfy_all <- FALSE
}
rm(int_criterium)
}
rm(c)
if (satisfy_all != TRUE) {
rm(satisfy_all, dataset, dataset_name, dataset_arraytype)
next
}
rm(satisfy_all)
# We define the dataset specific I/Os.
local_input_edata_dir <- input_edata_dir
local_input_data_dir <- input_data_dir
vsn_name <- config$modifications[[m]]$best_variance
file_suffix <- paste0("_", vsn_name, ".tsv")
file_long_suffix <- paste0("_", vsn_name, "_zscores.tsv")
exp_data_fn <- paste0(local_input_edata_dir, dataset_name, "_normalized", file_suffix)
zexp_data_fn <- paste0(local_input_data_dir, dataset_name, "_normalized", file_long_suffix)
clin_data_fn <- paste0(local_input_edata_dir, dataset_name, "_clinical.tsv")
rm(local_input_edata_dir, local_input_data_dir, vsn_name, file_suffix, file_long_suffix)
# We read the data.
clin_data_all[[m_indx]] <- read.table(clin_data_fn,
header = TRUE,
sep = "\t",
row.names = NULL,
as.is = TRUE)
exp_data_all[[m_indx]] <- read.table(exp_data_fn,
header = TRUE,
sep = "\t",
row.names = 1,
as.is = TRUE)
zexp_data_all[[m_indx]] <- read.table(zexp_data_fn,
header = TRUE,
sep = "\t",
row.names = 1,
as.is = TRUE)
m_indx_map[[m_indx]] <- m
m_indx <- m_indx + 1
rm(dataset, dataset_name, dataset_arraytype, exp_data_fn, zexp_data_fn, clin_data_fn)
} # End for each dataset.
rm(m)
# List of the datasets to consider.
common_datasets <- c("GSE20163", "GSE20164", "GSE20292", "GSE8397",
"GSE26927", "GSE20159", "GSE49036", "NBB")
# We want to run through the biomarkers cell type by cell type.
# So we re-organize the data.
x <- unlist(config$biomarkers)
dim(x) <- c(2, length(x) / 2 )
biomarkers <- data.frame(t(x), stringsAsFactors = FALSE)
rm(x)
names(biomarkers) <- c("Gene", "CellType")
# We loop through cell types.
for (celltype in unique(biomarkers$CellType)) {
# We define the current biomarkers
CT_markers <- biomarkers %>% filter(CellType == celltype)
for (biomarker in CT_markers$Gene) {
# Now we loop over the datasets to create one plot per dataset with all CT biomarkers.
for (i in seq_len(length(datasets))) {
dataset <- datasets[[i]]
if (! dataset$dataset_name %in% common_datasets) {
rm(dataset)
next;
}
plotfilename <- paste0(output_data_dir, "SNage_PDVsControl_females_max-avg_CT_biomarker_", biomarker, "_", celltype, "_",
dataset$dataset_name, ".png")
png(plotfilename, width = 450, height = 300, units = "px")
plot_gene(genes = biomarker,
gene_datasets = c(dataset$dataset_name),
output_data_dir = output_data_dir,
integration_name = integration$name,
limma_analysis = config$limma_analyses[[5]],
selection_name = selection$name,
palette = gds_plt,
clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config,
plot_all = FALSE, plot_datasets = TRUE,
title_name = celltype)
dev.off()
plotfilename <- paste0(output_data_dir, "SNage_PDVsControl_males_max-avg_CT_biomarker_", biomarker, "_", celltype, "_",
dataset$dataset_name, ".png")
png(plotfilename, width = 450, height = 300, units = "px")
plot_gene(genes = biomarker,
gene_datasets = c(dataset$dataset_name),
output_data_dir = output_data_dir,
integration_name = integration$name,
limma_analysis = config$limma_analyses[[6]],
selection_name = selection$name,
palette = gds_plt,
clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config,
plot_all = FALSE, plot_datasets = TRUE,
title_name = celltype)
dev.off()
rm(plotfilename)
message(paste0("[", celltype, "][", biomarker, "][", dataset$dataset_name, "] plotted."))
rm(dataset)
}
rm(i)
}
rm(biomarker, CT_markers)
}
rm(celltype, biomarkers, common_datasets)
rm(selection, exp_data_all, zexp_data_all, clin_data_all, m_indx, m_indx_map)
rm(l, gds_plt)
rm(integration, int_criteria)
rm(datasets)
# We log the session details for reproducibility.
rm(config, output_data_dir, input_data_dir, input_edata_dir)
sessionInfo()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment