Commit c9839b60 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Added support for preprocessed RNA-seq dataset (only NBB so far).

parent 765c19e7
......@@ -65,7 +65,7 @@ for (i in seq_len(length(config$datasets))) {
# Prepare the data.
percentage <- round(pca$sdev / sum(pca$sdev) * 100, 2)
percentage_strings <- paste(as.character(percentage), "%", ")", sep = "")
percentage <- paste(colnames(pca$x), "(", percentage_strings)
percentage <- paste(colnames(pca$x), " (", percentage_strings, sep = "")
pca_coords <- as.data.frame(pca$x)
pca_coords$group <- pheno_data$gender_disease_status
rm(pca, percentage_strings)
......
......@@ -76,10 +76,10 @@ plot_age_dist <- function(d, split_by, output_figure_file,
# We prepare the plots by computing the min and max ages across all datasets.
get_max_cmd <- paste0("cut -f 4 ", output_data_dir,
"*clinical* | grep -v Age | grep -v NA | sort -n | tail -n 1")
"*clinical.tsv | grep -v Age | grep -v NA | sort -n | tail -n 1")
max_age_across_datasets <- as.numeric(system(get_max_cmd, intern = TRUE)) + 1
get_min_cmd <- paste0("cut -f 4 ", output_data_dir,
"*clinical* | grep -v Age | grep -v NA | sort -n | head -n 1")
"*clinical.tsv | grep -v Age | grep -v NA | sort -n | head -n 1")
min_age_across_datasets <- as.numeric(system(get_min_cmd, intern = TRUE)) - 1
rm(get_max_cmd, get_min_cmd)
......
......@@ -13,6 +13,7 @@ library("hgug4112a.db")
library("hgfocus.db")
library("ArrayUtils")
library("tidyverse")
library("RColorBrewer")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
......@@ -106,8 +107,13 @@ for (i in seq_len(length(config$datasets))) {
dplyr::group_by(PROBEID) %>%
dplyr::summarize_at(.vars = c("SYMBOL", "ENSEMBL", "ENTREZID"),
.funs = collapser) %>%
dplyr::ungroup
dplyr::ungroup()
gene_annots <- gene_annots[order(match(gene_annots$PROBEID, rownames(exp_eset))), ]
# Just in case, for RNA-seq data, we use the following alternative to the above statment.
# This is only placed there for information since it was not used for miicroarray data.
# The current solution is equivalent.
#gene_annots <- merge(data.frame(PROBEID = rownames(exp_eset)),
# gene_annots, by = "PROBEID", all.x = TRUE, sort = FALSE)
rm(gene_annots_raw)
message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name,
"] Gene annotations ready."))
......@@ -115,9 +121,15 @@ for (i in seq_len(length(config$datasets))) {
# Additional QC: plot the MDS to check again for outliers.
mds_filename <- paste0(output_data_dir, dataset_name, "_mdsplot_", vsn$name, ".png")
png(mds_filename)
limma::plotMDS(exp_eset, labels = NULL, pch = 19)
col_group <- factor(pheno_data$gender_disease_status)
levels(col_group) <- RColorBrewer::brewer.pal(nlevels(col_group), "Set1")
col_group <- as.character(col_group)
limma::plotMDS(exp_eset,
labels = pheno_data$gender_disease_status,
col = col_group,
pch = 19)
dev.off()
rm(mds_filename)
rm(mds_filename, col_group)
message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name,
"] MDS plot created."))
......
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/14/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/14-Prepare_RNAseq_datasets/
clean:
@rm -rf *~
clean_outputs:
@rm -rf ${OUTPUT_FOLDER}*
data:
@sbatch ${CODE_FOLDER}clean_datasets.sh
check:
@sbatch ${CODE_FOLDER}check.sh
biomarker:
@sbatch ${CODE_FOLDER}plot_biomarkers.sh
doc:
@sbatch ${CODE_FOLDER}doc.sh
# Objectives
The first objectives of this step is clean the RNA-seq dataset, i.e., remove the transcripts that are not useful (i.e., full of zeros, or with a variance of zero). A second objective is to check the expression levels of pre-defined biomarkers as well as checking the clinical covariates.
# Details and instructions
There is no cleaning of the samples per say as the data was processed by EG prior to the project. We simply remove the useless rows (null values ro null deviation).
```
make clean_outputs
make data
```
The age and category balances are then checked. No statistics is derived but plots are created for manual inspection.
```
make check
```
Then, the expression levels of the various biomarkers are checked. We use the map between the probes (aka transcripts) and the genes to analyze the expression patterns of pre-defined biomarkers.
```
make biomarker
```
For manual inspection, a document that contains all figures is then generated.
```
make doc
```
# Prerequisites
The prerequisites are to have the preprocessed data for the RNA-seq dataset (from EG).
\ No newline at end of file
#!/bin/bash -l
#SBATCH -J geneder:14:check
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:03:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/14/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/14-Prepare_RNAseq_datasets/
# Loading modules.
mu
module load lang/R/3.6.0-foss-2018a-X11-20180131-bare
# Actual jobs.
Rscript --vanilla ${CODE_FOLDER}check_clinical_balance.R > ${OUTPUT_FOLDER}check_age_log.out 2> ${OUTPUT_FOLDER}check_age_log.err
Rscript --vanilla ${CODE_FOLDER}check_category_bias.R > ${OUTPUT_FOLDER}check_cat_log.out 2> ${OUTPUT_FOLDER}check_cat_log.err
# Moving the slurm log file to data
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("readr")
library("tidyverse")
library("ggfortify")
library("ArrayUtils")
library("ggpubr")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
raw_data_dir <- config$global_raw_data_dir
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Main
# ================================================================================================
# We do all datasets one by one.
for (i in seq_len(length(config$rs_datasets))) {
# We get the dataset details.
dataset <- config$rs_datasets[[i]]
dataset_name <- dataset$dataset_name
# We load the data (clinical).
pheno_data_fn <- paste0(dataset_name, "_clinical.tsv")
pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(output_data_dir,
pheno_data_fn,
use_factors = FALSE))
rm(pheno_data_fn)
# We prepare the clinical data for ggplot by adding an extra column (Gender * Disease.status).
pheno_data <- pheno_data %>%
dplyr::mutate(gender_disease_status = factor(paste(Gender, Disease.status, sep = ".")))
# We load the data (preproccesed data).
exp_data_fn <- paste0(output_data_dir, dataset_name, "_normalized_cpm.tsv")
exp_data <- ArrayUtils::read_eset(exp_data_fn, as_eset = FALSE)
message(paste0("[", Sys.time(), "][", dataset_name, "] Data ready."))
rm(exp_data_fn)
# Compute the PCA.
pca <- prcomp(t(exp_data), scale = FALSE)
rm(exp_data)
# Plots the first three PCS by pairs.
# Prepare the data.
percentage <- round(pca$sdev / sum(pca$sdev) * 100, 2)
percentage_strings <- paste(as.character(percentage), "%", ")", sep = "")
percentage <- paste(colnames(pca$x), " (", percentage_strings, sep = "")
pca_coords <- as.data.frame(pca$x)
pca_coords$group <- pheno_data$gender_disease_status
rm(pca, percentage_strings)
# First version, one colour per category.
# We prepare the subplots one by one.
p12 <- ggplot(pca_coords, aes(x = PC1, y = PC2, color = group)) +
geom_point(size = 5) +
theme(legend.position = "None") +
xlab(percentage[1]) +
ylab(percentage[2])
p13 <- ggplot(pca_coords, aes(x = PC1, y = PC3, color = group)) +
geom_point(size = 5) +
theme(legend.position = "bottom") +
xlab(percentage[1]) +
ylab(percentage[3])
p23 <- ggplot(pca_coords, aes(x = PC2, y = PC3, color = group)) +
geom_point(size = 5) +
theme(legend.position = "None") +
xlab(percentage[2]) +
ylab(percentage[3])
# We plot them all.
output_figure_file <- paste0(output_data_dir, dataset_name, "_category_bias.png")
png(output_figure_file, width = 3 * 480)
full_p <- ggarrange(p12, p13, p23, ncol = 3, nrow = 1, common.legend = TRUE, legend = "bottom")
print(full_p)
dev.off()
rm(p12, p13, p23, output_figure_file, full_p)
# Second, we highlight only one category versus the others.
all_categories <- unique(pheno_data$gender_disease_status)
for (k in seq_len(length(all_categories))) {
# We define the category of interest.
category <- as.character(all_categories[k])
# We refine the groups accordingly.
binary_groups <- rep("Others", length(pheno_data$gender_disease_status))
binary_groups[pheno_data$gender_disease_status == category] <- category
pca_coords$binary_groups <- binary_groups
# We prepare the subplots one by one.
p12 <- ggplot(pca_coords, aes(x = PC1, y = PC2, color = binary_groups)) +
geom_point(size = 5) +
theme(legend.position = "None") +
xlab(percentage[1]) +
ylab(percentage[2])
p13 <- ggplot(pca_coords, aes(x = PC1, y = PC3, color = binary_groups)) +
geom_point(size = 5) +
theme(legend.position = "bottom") +
xlab(percentage[1]) +
ylab(percentage[3])
p23 <- ggplot(pca_coords, aes(x = PC2, y = PC3, color = binary_groups)) +
geom_point(size = 5) +
theme(legend.position = "None") +
xlab(percentage[2]) +
ylab(percentage[3])
# We group the plots and save as png.
category_name <- str_replace(category, "[.]", "_")
output_figure_file <- paste0(output_data_dir, dataset_name, "_category_bias_",
category_name, ".png")
png(output_figure_file, width = 3 * 480)
full_p <- ggarrange(p12, p13, p23, ncol = 3, nrow = 1, common.legend = TRUE,
legend = "bottom")
print(full_p)
dev.off()
rm(p12, p13, p23, category_name, output_figure_file, full_p, category, binary_groups)
} # End for all categories to highlight.
rm(k, pca_coords, all_categories, percentage)
message(paste0("[", Sys.time(), "][", dataset_name, "] Plots created."))
rm(dataset, dataset_name, pheno_data)
} # End for each dataset.
rm(i, config, input_data_dir, output_data_dir, raw_data_dir)
# We log the session details for reproducibility.
sessionInfo()
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("readr")
library("tidyverse")
library("ArrayUtils")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
raw_data_dir <- config$global_raw_data_dir
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' Function that plots the age distribution (variable Age) of a dataset d, for various
#' sample categories defined by another variable (provided using split_by).
#'
#' Each age distribution is represented by a boxplot and the jitter points. There is no
#' return value.
#'
#' @param d The dataset to use for plotting.
#' @param split_by The name of thecolkumn to use to split the samples into several categories.
#' @param output_figure_file A string representing the path to the file which will
#' contains the saved plot.
#' @param plot_title The title of the plot, Default to "".
#' @param min_x The minimum value for the y axis. Default to NA (results in the default ggplot
#' behaviour).
#' @param max_x The maximum value for the y axis. Default to NA (results in the default ggplot
#' behaviour).
plot_age_dist <- function(d, split_by, output_figure_file,
plot_title = "",
min_y = NULL,
max_y = NULL) {
my_palette <- c("#7da7de", "#00599d", "#c2e0ae", "#62a73b")
png(output_figure_file)
myplot <- ggplot(d, aes(x = !!ensym(split_by), y = Age, colour = !!ensym(split_by))) +
geom_boxplot(lwd = 1.2, outlier.shape = NA) +
geom_jitter(size = 5, width = 0.2, height = 0.1) +
coord_cartesian(ylim = c(min_y, max_y)) +
theme(legend.position = "none",
axis.title.x = element_blank(),
axis.title.y = element_text(size = 20,
face = "bold"),
axis.text = element_text(size = 20,
face = "bold"),
plot.title = element_text(size = 22,
hjust = 0.5,
face = "bold"),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 2),
panel.grid.major = element_line(colour = "lightgrey"),
strip.background = element_rect(fill = "black"),
strip.text = element_text(colour = "white")) +
scale_color_manual(values = my_palette) +
ggtitle(plot_title)
print(myplot)
dev.off()
rm(myplot)
}
# ================================================================================================
# Main
# ================================================================================================
# We prepare the plots by computing the min and max ages across all datasets.
get_max_cmd <- paste0("cut -f 4 ", output_data_dir,
"*clinical.tsv | grep -v Age | grep -v NA | sort -n | tail -n 1")
max_age_across_datasets <- as.numeric(system(get_max_cmd, intern = TRUE)) + 1
get_min_cmd <- paste0("cut -f 4 ", output_data_dir,
"*clinical.tsv | grep -v Age | grep -v NA | sort -n | head -n 1")
min_age_across_datasets <- as.numeric(system(get_min_cmd, intern = TRUE)) - 1
rm(get_max_cmd, get_min_cmd)
# We collect the average age difference (AAD) for all datasets and both meaningful
# comparisons (disease status and gender).
aads_datasets <- vector()
aads_ds_n <- vector()
aads_ds_values <- vector()
aads_gd_n <- vector()
aads_gd_values <- vector()
# We do all datasets one by one.
for (i in seq_len(length(config$rs_datasets))) {
# We get the dataset details.
dataset <- config$rs_datasets[[i]]
dataset_name <- dataset$dataset_name
# Special case: if we don't have age values (only NAs),
# then we don't do anything.
if (dataset$has_age == FALSE) {
next
}
# We load the data (clinical).
pheno_data_fn <- paste0(dataset_name, "_clinical.tsv")
pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(output_data_dir,
pheno_data_fn,
use_factors = FALSE))
rm(pheno_data_fn)
# We prepare the data for ggplot by adding an extra column (Gender * Disease.status).
d <- pheno_data %>%
dplyr::mutate(gender_disease_status = factor(paste(Gender, Disease.status,
sep = ".")))
message(paste0("[", Sys.time(), "][", dataset_name, "] Data filtered."))
# Compute the AAD metrics (Average Age Difference) between the ages of
# two groups. Because these groups might not have the same size, we reduce the
# largest group to the size of the smallest group (best effort) and then computes
# the AAD (sort of MUE - Mean Unsquared Error).
# First, looking at the ages of the different disease statuses.
small_set <- sort(pheno_data$Age[pheno_data$Disease.status == "PD"])
large_set <- sort(pheno_data$Age[pheno_data$Disease.status == "Control"])
if (length(small_set) > length(large_set)) {
large_set <- sort(pheno_data$Age[pheno_data$Disease.status == "PD"])
small_set <- sort(pheno_data$Age[pheno_data$Disease.status == "Control"])
}
reduced_large_set <- reduce_set(large_set, length(small_set))
aads_ds_n <- c(aads_ds_n, length(small_set))
aads_ds_local <- mean(abs(small_set - reduced_large_set))
aads_ds_values <- c(aads_ds_values, aads_ds_local)
aads_datasets <- c(aads_datasets, dataset_name)
rm(small_set, large_set, reduced_large_set)
# Second, looking at the ages of the different genders.
small_set <- sort(pheno_data$Age[pheno_data$Gender == "F"])
large_set <- sort(pheno_data$Age[pheno_data$Gender == "M"])
if (length(small_set) > length(large_set)) {
large_set <- sort(pheno_data$Age[pheno_data$Gender == "F"])
small_set <- sort(pheno_data$Age[pheno_data$Gender == "M"])
}
reduced_large_set <- reduce_set(large_set, length(small_set))
aads_gd_n <- c(aads_gd_n, length(small_set))
aads_gd_local <- mean(abs(small_set - reduced_large_set))
aads_gd_values <- c(aads_gd_values, aads_gd_local)
rm(small_set, large_set, reduced_large_set)
# We first plot the gender vs age.
output_figure_file <- paste0(output_data_dir, dataset_name, "_agebias_wrt_gender.png")
raad <- round(aads_gd_local, 2)
plot_age_dist(d, "Gender", output_figure_file,
plot_title = paste0("[Gender] ", dataset_name, " [AAD=", raad, "]"),
min_y = min_age_across_datasets,
max_y = max_age_across_datasets)
rm(output_figure_file, raad)
# We then plot the disease status vs age.
output_figure_file <- paste0(output_data_dir, dataset_name,
"_agebias_wrt_disease_status.png")
raad <- round(aads_ds_local, 2)
plot_age_dist(d, "Disease.status", output_figure_file,
plot_title = paste0("[Disease status] ", dataset_name, " [AAD=", raad, "]"),
min_y = min_age_across_datasets,
max_y = max_age_across_datasets)
rm(output_figure_file, raad)
# We finally plot the gender * disease status vs age.
output_figure_file <- paste0(output_data_dir, dataset_name,
"_agebias_wrt_gender_and_disease_status.png")
plot_age_dist(d, "gender_disease_status", output_figure_file,
plot_title = paste0("[Gender * Disease status] ", dataset_name),
min_y = min_age_across_datasets,
max_y = max_age_across_datasets)
rm(output_figure_file)
message(paste0("[", Sys.time(), "][", dataset_name, "] Plots created."))
rm(dataset, dataset_name, aads_ds_local, aads_gd_local, pheno_data, d)
} # End for each dataset.
rm(i, min_age_across_datasets, max_age_across_datasets)
# Summary plot of the Average Age Differences for all datasets and the two
# comparisons of interest.
summary_plot_filename <- paste0(output_data_dir, "Average_age_differences.png")
png(summary_plot_filename)
plot(x = c(1, length(aads_ds_values)),
y = c(0, 1.1 * max(c(aads_ds_values, aads_gd_values))),
type = "n",
main = "Average age difference across groups and datasets.",
xlab = "",
ylab = "",
xaxt = "n", font = 2, cex.axis = 1.5, cex.main = 1.5)
mtext("Average age difference", side = 2, line = 3, cex = 1.5, font = 2)
mtext("Datasets", side = 1, line = 1, cex = 1.5, font = 2)
# axis(side = 1,
# las = 2,
# labels = aads_datasets,
# at = seq(1, length(aads_ds_values)))
# lines(x = c(0, 1 + length(aads_ds_values)),
# y = c(2, 2),
# lty = 5,
# lwd = 2,
# col = "darkgrey")
lines(x = c(0, 1 + length(aads_ds_values)),
y = c(5, 5),
lty = 5,
lwd = 2,
col = "grey")
lines(x = c(0, 1 + length(aads_ds_values)),
y = c(10, 10),
lty = 5,
lwd = 2,
col = "lightgrey")
points(x = seq(1, length(aads_ds_values)),
y = aads_ds_values,
col = "darkgreen",
pch = 19,
cex = 2)
points(x = seq(1, length(aads_ds_values)),
y = aads_gd_values,
col = "darkorange",
pch = 19,
cex = 2)
legend(x = "topleft",
legend = c("Disease status", "Gender"),
col = c("darkgreen", "darkorange"),
pch = 19,
cex = 1.5,
lwd = 2,
text.font = 2)
dev.off()
rm(summary_plot_filename, aads_datasets, aads_ds_n, aads_ds_values, aads_gd_n, aads_gd_values)
rm(config, input_data_dir, output_data_dir, raw_data_dir)
# We log the session details for reproducibility.
sessionInfo()
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("Biobase")
library("ArrayUtils")
library("edgeR")
source("../libs/conf/confR.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
raw_data_dir <- config$global_raw_data_dir
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Main
# ================================================================================================
# We do all datasets one by one.
for (i in seq_len(length(config$rs_datasets))) {
# We get the dataset details.
dataset <- config$rs_datasets[[i]]
dataset_name <- dataset$dataset_name
# We define the dataset specific I/Os.
raw_data_subdir <- paste0(raw_data_dir, dataset_name, "/")
batch_data_fn <- paste0(raw_data_dir, dataset_name, "/", "Batch.tsv")
# We load the data (clinical and batch).
pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(raw_data_subdir))
batch_data <- NULL
if (file.exists(batch_data_fn)) {
batch_data <- read.delim(batch_data_fn, row.names = 1, stringsAsFactors = FALSE)
}
rm(batch_data_fn)