Commit 93963df9 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

First part of Step 05 update (new configuration system and new datasets).

parent 71a23914
......@@ -32,7 +32,7 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# We do all datasets one by one.
for (i in seq_len(length(config$datasets))) {
# We get the dataset details (name, compressed, phenotype_groups)
# We get the dataset details.
dataset <- config$datasets[[i]]
dataset_name <- dataset$dataset_name
......
......@@ -34,7 +34,7 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# We do all datasets one by one.
for (i in seq_len(length(config$datasets))) {
# We get the dataset details (name, compressed, phenotype_groups)
# We get the dataset details.
dataset <- config$datasets[[i]]
dataset_name <- dataset$dataset_name
......
......@@ -16,7 +16,7 @@ options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
raw_data_dir <- config$global_raw_data_dir
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
# Optional command line parameter to process only one dataset.
args <- commandArgs(trailingOnly = TRUE)
......@@ -107,7 +107,7 @@ plot_expr_ratios <- function(mat_x, mat_y, pheno_data, plot_filename, verbose =
# We do all datasets one by one.
for (i in seq_len(length(config$datasets))) {
# We get the dataset details (name, compressed, phenotype_groups)
# We get the dataset details.
dataset <- config$datasets[[i]]
dataset_name <- dataset$dataset_name
......
......@@ -17,7 +17,7 @@ options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
raw_data_dir <- config$global_raw_data_dir
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
# Optional command line parameter to process only one dataset.
args <- commandArgs(trailingOnly = TRUE)
......@@ -139,7 +139,7 @@ aads_gd_values <- vector()
# We do all datasets one by one.
for (i in seq_len(length(config$datasets))) {
# We get the dataset details (name, compressed, phenotype_groups)
# We get the dataset details.
dataset <- config$datasets[[i]]
dataset_name <- dataset$dataset_name
......
......@@ -5,7 +5,6 @@
# ================================================================================================
library("yaml")
library("Biobase")
library("limma")
library("ArrayUtils")
source("../libs/conf/confR.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
......@@ -17,7 +16,7 @@ options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
raw_data_dir <- config$global_raw_data_dir
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
# Optional command line parameter to process only one dataset.
args <- commandArgs(trailingOnly = TRUE)
......@@ -38,7 +37,7 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# We do all datasets one by one.
for (i in seq_len(length(config$datasets))) {
# We get the dataset details (name, compressed, phenotype_groups)
# We get the dataset details.
dataset <- config$datasets[[i]]
dataset_name <- dataset$dataset_name
......
......@@ -8,4 +8,4 @@ clean_outputs:
run_limma:
@sbatch ${CODE_FOLDER}/run_limma.sh
doc:
@/bin/bash ${CODE_FOLDER}/doc.sh
@sbatch ${CODE_FOLDER}/doc.sh
# Objectives
The objectives of this step is to identify the differentially expressed genes of each dataset / meta-dataset, and for various comparisons of interest (e.g., female vs male, disease versus control).
This produces gene lists that can be used for pathway and network analyses.
The objectives of this step is to identify the differentially expressed genes of each dataset, and for various comparisons of interest (e.g., female vs male, disease versus control).
This produces gene lists that can be then used for pathway and network analyses.
# Details and instructions
The 9 datasets and 2 meta-datasets are processed one by one to identify differentially expressed genes (using limma). The following analyses are performed:
The datasets are processed one by one to identify differentially expressed genes (using limma). The following analyses are performed:
- Females vs males
- PD patients vs controls
- Female PD patients vs female controls
......@@ -18,10 +18,10 @@ make clean_outputs
make run_limma
```
A document that contains all figures can then be generated (locally, not on the HPC).
A document that contains all figures can then be generated.
```
make doc
```
# Prerequisites
The only prerequisite is to have the preprocessed and cleaned data for all datasets / meta-datasets (Step 04).
The only prerequisite is to have the preprocessed and cleaned data for all datasets (Step 04).
#!/bin/bash -l
#SBATCH -J geneder:05:doc
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --time=0-00:01:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# I/Os and parameters
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/05/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/05-Get_DEGs/
ANNEX=/home/leon/Projects/GeneDER/Documents/WorkReport/Annexes
# All datasets for LIMMA. GSE7307 is special (no female PD), all others are ok.
DATASETS=(GSE20163 GSE20164 GSE20292 GSE8397 Simunovic GSE20141 GSE7307 GSE7621 E-MEXP-1416 HG-U133A HG-U133_Plus_2)
DATASETS_OK=(GSE20163 GSE20164 GSE20292 GSE8397 Simunovic GSE20141 GSE7621 E-MEXP-1416 HG-U133A HG-U133_Plus_2)
DATASETS_SPECIAL=(GSE7307)
# Load configuration
source ../libs/conf/confSH.sh
create_variables ../Confs/dataset
# Clean start
rm -rf ${OUTPUT_FOLDER}/results_summary.*
......@@ -24,7 +39,7 @@ echo '\textsl{}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\begin{abstract}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'This document summarizes the results of the step 05-Get\_DEGs. Various plots' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'related to the Limma analyses are displayed for all datasets. The MDS plot ains at ' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'related to the Limma analyses are displayed for all datasets. The MDS plots aim at ' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'identifying whether there could still be a bias in the dataset (after pre-processing).' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'The MD plots represent the Mean versus the deviation globally for each comparison (significant genes' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'are highlighted in blue/red). The Venn diagrams indicate the overlap between related analyses' >> ${OUTPUT_FOLDER}/results_summary.tex
......@@ -37,82 +52,93 @@ echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# MDS plots.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
for i in "${DATASETS[@]}"
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<=$nbDatasets; i++ ))
do
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$i"'_mdsplot.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
datasetName=${datasets__dataset_name[$i]}
if [ "${datasetName}" != "" ]
then
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_mdsplot.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
fi
done
echo ' \caption{MDS plots of all datasets. From top to bottom and from left to right:' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ${DATASETS[@]} | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ${datasets__dataset_name[@]} | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '.}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
for i in "${DATASETS_OK[@]}"
for (( i=0; i<=$nbDatasets; i++ ))
do
# MD plots.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$i"'_MD_FemaleVsMale.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$i"'_MD_FemaleVsMale_PD.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$i"'_MD_FemaleVsMale_control.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$i"'_MD_PDVsControl.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$i"'_MD_PDVsControl_females.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$i"'_MD_PDVsControl_males.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$i"'_MD_Gender_disease_status.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$i"'_MD_Disease_status_gender.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{MD plots of all Limma analyses for dataset '"$i"'.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' Each plot represents the mean signal (x-axis) versus its deviation (y-axis).' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top left) Gender based analysis (Females vs males)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top center) Gender based analysis (Females vs males) in PD patients' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top right) Gender based analysis (Females vs males) in controls' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Middle left) Status based analysis (PD patients vs controls)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Middle center) Status based analysis (PD patients vs controls) for females' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Middle right) Status based analysis (PD patients vs controls) for males' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom left) First factorial analysis (focus on disease status differences for the different genders).' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom right) Second factorial analysis (focus on gender differences for the different disease status).' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' Note: the last two plots should be exactly the same.}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Venn diagrams.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$i"'_venn_FemaleVsMale.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$i"'_venn_PDVsControl.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$i"'_venn_Disease_status_gender.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$i"'_venn_Gender_disease_status.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Venn diagrams of the significant genes for all Limma analyses and for dataset '"$i"'.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top left) Gender based analysis (Females vs males)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top right) Status based analysis (PD patients vs controls)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom left) First factorial analysis (focus on disease status differences for the different genders) and its components.' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom right) Second factorial analysis (focus on gender differences for the different disease status) and its components.}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
done
# Special case for GSE7307.
for i in "${DATASETS_SPECIAL[@]}"
do
# MD plots.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$i"'_MD_FemaleVsMale.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$i"'_MD_PDVsControl.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{MD plots of all Limma analyses for dataset '"$i"'.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' Each plot represents the mean signal (x-axis) versus its deviation (y-axis).' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Left) Gender based analysis (Females vs males)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Right) Status based analysis (PD patients vs controls)}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Venn diagrams.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$i"'_venn_FemaleVsMale.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$i"'_venn_PDVsControl.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Venn diagrams of the significant genes for all Limma analyses and for dataset '"$i"'.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Left) Gender based analysis (Females vs males)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Right) Status based analysis (PD patients vs controls)}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
datasetName=${datasets__dataset_name[$i]}
if [ "${datasetName}" != "" ]
then
suitableForFact=${datasets__suitable_for_factorial_analysis[$i]}
if [ "${suitableForFact}" == "'TRUE'" ]
then
# Regular case, the factorial analysis was done.
# MD plots.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_MD_FemaleVsMale.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_MD_FemaleVsMale_PD.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_MD_FemaleVsMale_control.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_MD_PDVsControl.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_MD_PDVsControl_females.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_MD_PDVsControl_males.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_MD_Gender_disease_status.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_MD_Disease_status_gender.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{MD plots of all Limma analyses for dataset '"$datasetName"'.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' Each plot represents the mean signal (x-axis) versus its deviation (y-axis).' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top left) Gender based analysis (Females vs males)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top center) Gender based analysis (Females vs males) in PD patients' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top right) Gender based analysis (Females vs males) in controls' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Middle left) Status based analysis (PD patients vs controls)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Middle center) Status based analysis (PD patients vs controls) for females' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Middle right) Status based analysis (PD patients vs controls) for males' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom left) First factorial analysis (focus on disease status differences for the different genders).' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom right) Second factorial analysis (focus on gender differences for the different disease status).' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' Note: the last two plots should be exactly the same.}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Venn diagrams.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$datasetName"'_venn_FemaleVsMale.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$datasetName"'_venn_PDVsControl.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$datasetName"'_venn_Disease_status_gender.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$datasetName"'_venn_Gender_disease_status.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Venn diagrams of the significant genes for all Limma analyses and for dataset '"$datasetName"'.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top left) Gender based analysis (Females vs males)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top right) Status based analysis (PD patients vs controls)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom left) First factorial analysis (focus on disease status differences for the different genders) and its components.' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom right) Second factorial analysis (focus on gender differences for the different disease status) and its components.}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
else
# We have not done the factorial analysis so we have less plots to show.
# MD plots.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_MD_FemaleVsMale.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_MD_PDVsControl.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{MD plots of all Limma analyses for dataset '"$datasetName"'.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' Each plot represents the mean signal (x-axis) versus its deviation (y-axis).' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Left) Gender based analysis (Females vs males)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Right) Status based analysis (PD patients vs controls)}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Venn diagrams.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$datasetName"'_venn_FemaleVsMale.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$datasetName"'_venn_PDVsControl.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Venn diagrams of the significant genes for all Limma analyses and for dataset '"$datasetName"'.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Left) Gender based analysis (Females vs males)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Right) Status based analysis (PD patients vs controls)}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
fi
fi
done
# Print footer
......@@ -121,6 +147,8 @@ echo '\end{document}' >> ${OUTPUT_FOLDER}/results_summary.tex
# Compilation
pdflatex -synctex=1 -interaction=nonstopmode ${OUTPUT_FOLDER}/results_summary.tex
cp results_summary.pdf ${ANNEX}/05_summary_results.pdf
mv results_summary.pdf ${OUTPUT_FOLDER}/
rm results_summary*
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
......@@ -8,36 +8,27 @@ library("Biobase")
library("hgu133plus2.db")
library("hgu133a.db")
library("u133x3p.db")
library("tidyverse")
library("ArrayUtils")
library("tidyverse")
source("../libs/conf/confR.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
global_config <- yaml::read_yaml("../global_config.yml")
local_config <- yaml::read_yaml("./local_config.yml")
input_data_dir <- paste0(global_config$global_data_dir, local_config$local_input_data_dir)
output_data_dir <- paste0(global_config$global_data_dir, local_config$local_data_dir)
datasets_raw <- as.matrix(unlist(global_config$datasets))
dim(datasets_raw) <- c(length(global_config$datasets[[1]]), length(global_config$datasets))
datasets_config <- data.frame(t(datasets_raw), row.names = datasets_raw[1, ]) %>%
select(X2, X3, X4, X5, X6, X7) %>%
rename(array_type = X2,
is_compressed = X3,
clinical_descriptors = X4,
is_meta_dataset = X5,
has_female_PD = X6,
has_paired_samples = X7)
remove(datasets_raw)
# Optional command line parameters to process only one dataset.
config <- read_config(config_dirs = c("../Confs/", "./"))
raw_data_dir <- config$global_raw_data_dir
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
# Optional command line parameter to process only one dataset.
args <- commandArgs(trailingOnly = TRUE)
selected_dataset_name <- ""
if (length(args) > 0) {
selected_dataset_name <- args[1]
}
message(paste0("[", Sys.time(), "] Configuration read."))
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
......@@ -57,15 +48,13 @@ collapser <- function(x) {
# Main
# ================================================================================================
# We do all datasets one by one.
for (i in seq_len(length(row.names(datasets_config)))) {
for (i in seq_len(length(config$datasets))) {
# We get the dataset details (name)
dataset_name <- row.names(datasets_config[i, ])
dataset_array_type <- datasets_config[i, ]$array_type
dataset_has_pairs <- datasets_config[i, ]$has_paired_samples
dataset_has_fPD <- datasets_config[i, ]$has_female_PD
# We get the dataset details.
dataset <- config$datasets[[i]]
dataset_name <- dataset$dataset_name
# We run the quality control of the current dataset (if necessary).
# We run limma on the current dataset (if necessary).
if (selected_dataset_name == "" || selected_dataset_name == dataset_name) {
# We load the data (clinical and preproccesed data).
......@@ -77,28 +66,30 @@ for (i in seq_len(length(row.names(datasets_config)))) {
mutate(gender_disease_status = paste(pheno_data$Gender,
pheno_data$Disease.status,
sep = "."))
scan_data_fn <- paste0(input_data_dir, dataset_name, "_normalized_scan_clean.tsv")
scan_eset <- ArrayUtils::read_eset(scan_data_fn)
exp_data_fn <- paste0(input_data_dir, dataset_name, "_normalized_clean.tsv")
exp_eset <- ArrayUtils::read_eset(exp_data_fn)
# Annotate the eset with gene information for better outputs.
# First, we collect raw the gene annotations from bioconductor packages.
gene_annots_raw <- ArrayUtils::get_affy_annots(dataset_array_type, rownames(scan_eset))
gene_annots_raw <- ArrayUtils::get_affy_annots(dataset$array_type, rownames(exp_eset))
# TODO: here read instead the sorted GPL file for special case.
# Then we still need to collapose the rows and merge to the rownames of eset.
# Then, we make sure we deal with duplicates and reorder the df to match the eset.
gene_annots_raw %>%
group_by(PROBEID) %>%
summarize_at(.vars = c("SYMBOL", "ENSEMBL", "ENTREZID"), .funs = collapser) %>%
ungroup -> gene_annots
gene_annots <- gene_annots[order(match(gene_annots$PROBEID, rownames(scan_eset))), ]
gene_annots <- gene_annots[order(match(gene_annots$PROBEID, rownames(exp_eset))), ]
message(paste0("[", Sys.time(), "][", dataset_name, "] Gene annotations ready."))
# We only update the eset when the probe ids do match (otherwise, we issue a warning).
if (all(gene_annots$PROBEID == rownames(scan_eset))) {
if (all(gene_annots$PROBEID == rownames(exp_eset))) {
gene_annots_asdf <- new("AnnotatedDataFrame",
data = data.frame(gene_annots[, -1], stringsAsFactors = FALSE)
)
rownames(gene_annots_asdf) <- gene_annots$PROBEID
Biobase::featureData(scan_eset) <- gene_annots_asdf
rownames(gene_annots_asdf) <- gene_annots$PROBEID
Biobase::featureData(exp_eset) <- gene_annots_asdf
} else {
message(paste0("[", Sys.time(), "] Incorrect annotations for ", dataset_name, "."))
}
......@@ -107,26 +98,37 @@ for (i in seq_len(length(row.names(datasets_config)))) {
# Additional QC: plot the MDS to check again for outliers.
mds_filename <- paste0(output_data_dir, dataset_name, "_mdsplot.png")
png(mds_filename)
limma::plotMDS(scan_eset, labels = NULL, pch = 19)
limma::plotMDS(exp_eset, labels = NULL, pch = 19)
dev.off()
message(paste0("[", Sys.time(), "][", dataset_name, "] MDS plot created."))
# Loop over Limma analyses
for (j in seq_len(length(local_config$limma_analyses))) {
for (j in seq_len(length(config$limma_analyses))) {
# We extract the Limma parameters.
limma_parameters <- local_config$limma_analyses[[j]]
# We don't do the j>2 analyses (complex) when there is no female PD patients.
# This is because this will fail otherwise.
if ((j <= 2) | (dataset_has_fPD == TRUE)) {
fit <- ArrayUtils::run_limma(pheno_data, scan_eset, limma_parameters)
limma_parameters <- config$limma_analyses[[j]]
limma_is_factorial <- limma_parameters$factorial
limma_coeff_names <- limma_parameters$names
# We only do the factorial analyses when possible, this fails otherwise.
# For instance, when there is no female PD patients (GSE7307).
if ((!limma_is_factorial) | (dataset$suitable_for_factorial_analysis == "TRUE")) {
# We use a block_key and cofactor_name if necessary (that is when we have
# paired samples).
if (dataset$has_paired_samples) {
# By default, we use Tissue / Patient.
# This might be set as a parameter as well if we have more cases.
fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters,
cofactor_name = "Tissue",
block_key = "Patient")
} else {
fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters)
}
message(paste0("[", Sys.time(), "][", dataset_name, "][", j, "] Limma fit done."))
# We loop over the coefficients we want to analyse for that Limma analysis.
limma_coeff <- limma_parameters[[3]]
for (k in seq_len(length(limma_coeff))) {
ArrayUtils::extract_DEGs(fit, limma_coeff, k, output_data_dir, dataset_name)
for (k in seq_len(length(limma_coeff_names))) {
ArrayUtils::extract_DEGs(fit, limma_coeff_names, k, output_data_dir, dataset_name)
}
message(paste0("[", Sys.time(), "][", dataset_name, "][", j, "] DEG extraction done."))
}
......
local_input_data_dir: !!str '04/'
local_data_dir: !!str '05/'
local_code_dir: !!str '05-Get_DEGs/'
limma_analyses:
-
- Gender
- ["F - M"]
- ["FemaleVsMale"]
-
- Disease.status
- ["PD - Control"]
- ["PDVsControl"]
-
- gender_disease_status
- ["F.PD - F.Control", "M.PD - M.Control", "(F.PD - F.Control) - (M.PD - M.Control)"]
- ["PDVsControl_females", "PDVsControl_males", "Gender_disease_status"]
-
- gender_disease_status
- ["F.PD - M.PD", "F.Control - M.Control", "(F.PD - M.PD) - (F.Control - M.Control)"]
- ["FemaleVsMale_PD", "FemaleVsMale_control", "Disease_status_gender"]
......@@ -19,7 +19,8 @@ OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/05/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/05-Get_DEGs/
# Loading modules.
module load lang/R/3.4.4-intel-2018a-X11-20180131-bare
mu
module load lang/R/3.6.0-foss-2018a-X11-20180131-bare
# Actual job.
Rscript --vanilla ${CODE_FOLDER}/get_DEGs.R > ${OUTPUT_FOLDER}/run_log.out 2> ${OUTPUT_FOLDER}/run_log.err
......
......@@ -25,7 +25,7 @@ modifications:
update_arrays: NA
-
dataset_name: GSE8397
remove_arrays: GSM208564.cel,GSM208634.cel,GSM208637.cel,GSM208638.cel
remove_arrays: GSM208654.cel,GSM208634.cel,GSM208637.cel,GSM208638.cel
update_arrays: NA
-
dataset_name: Simunovic
......@@ -34,7 +34,7 @@ modifications:
-
dataset_name: GSE24378
remove_arrays: NA
update_arrays: NA
update_arrays: GSM601053.CEL;Gender;F
-
dataset_name: GSE20333
remove_arrays: GSM509556.CEL,GSM509557.CEL
......@@ -51,9 +51,9 @@ modifications:
dataset_name: GSE26927
remove_arrays: NA
update_arrays: NA
-
dataset_name: GSE49036
remove_arrays: NA
update_arrays: NA
# -
# dataset_name: GSE49036
# remove_arrays: NA
# update_arrays: NA
dataset_name platform array_type clinical_descriptors has_paired_samples has_batches cleaning has_age has_female_PD
dataset_name platform array_type clinical_descriptors has_paired_samples has_batches cleaning has_age suitable_for_factorial_analysis
GSE20141 Affymetrix HGU133Plus2 D FALSE TRUE FALSE TRUE TRUE
GSE20163 Affymetrix HGU133A D FALSE FALSE FALSE TRUE TRUE
GSE20164 Affymetrix HGU133A DG FALSE FALSE FALSE TRUE TRUE
......
......@@ -8,7 +8,7 @@ datasets:
has_batches: 'TRUE'
cleaning: 'FALSE'
has_age: 'TRUE'
has_female_PD: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
-
dataset_name: GSE20163
platform: Affymetrix
......@@ -18,7 +18,7 @@ datasets:
has_batches: 'FALSE'
cleaning: 'FALSE'
has_age: 'TRUE'
has_female_PD: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
-
dataset_name: GSE20164
platform: Affymetrix
......@@ -28,7 +28,7 @@ datasets:
has_batches: 'FALSE'
cleaning: 'FALSE'
has_age: 'TRUE'
has_female_PD: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
-
dataset_name: GSE20292
platform: Affymetrix
......@@ -38,7 +38,7 @@ datasets:
has_batches: 'FALSE'
cleaning: 'FALSE'
has_age: 'TRUE'
has_female_PD: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
-
dataset_name: GSE7307
platform: Affymetrix
......@@ -48,7 +48,7 @@ datasets:
has_batches: 'TRUE'
cleaning: 'FALSE'
has_age: 'FALSE'