Commit 96c62eee authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Refactoring step 05 to process all datasets at once.

parent a42c7580
......@@ -73,11 +73,11 @@ for (i in seq_len(length(config$platforms))) {
# We enrich the probe identifiers with the corresponding gene names using
# the prepared file (based on BioMart).
unique_probe_ids <- sort(unique(probe_ids))
unique_probe_ids <- sort(unique(probe_ids))
gene_annots_filename <- paste0(platform$platform_name, "_genenames.tsv")
gene_annots_raw <- ArrayUtils::get_gene_annots_from_file(output_data_dir,
gene_annots_filename,
unique_probe_ids)
gene_annots_raw <- ArrayUtils::get_gene_annots_from_file(output_data_dir,
gene_annots_filename,
unique_probe_ids)
rm(gene_annots_filename)
# Then, we make sure we deal with duplicates and reorder the df to match the list we have.
......
......@@ -7,5 +7,7 @@ clean_outputs:
@rm -rf ${OUTPUT_FOLDER}*
run_limma:
@sbatch ${CODE_FOLDER}run_limma.sh
biomarker:
@sbatch ${CODE_FOLDER}plot_biomarkers.sh
doc:
@sbatch ${CODE_FOLDER}doc.sh
# Objectives
The objectives of this step is to identify the differentially expressed genes of each dataset, and for various comparisons of interest (e.g., female vs male, disease versus control).
This produces gene lists that can be then used for pathway and network analyses.
This produces gene lists that can be then used for the pathway and network analyses.
# Details and instructions
The datasets are processed one by one to identify differentially expressed genes (using limma). The following analyses are performed:
......
......@@ -6,7 +6,7 @@
#SBATCH -n 1
#SBATCH --time=0-00:01:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......@@ -53,21 +53,15 @@ echo '\clearpage' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
# MDS plots.
nbVars=${#variance_methods__name[@]}
for (( j=0; j<$nbVars; j++ ))
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<$nbDatasets; i++ ))
do
varName=${variance_methods__name[$j]}
datasetName=${datasets__dataset_name[$i]}
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}results_summary.tex
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_mdsplot_'"$varName"'.png}' >> ${OUTPUT_FOLDER}results_summary.tex
done
echo ' \caption{MDS plots of all datasets ('"$varName"' data). From top to bottom and from left to right:' >> ${OUTPUT_FOLDER}results_summary.tex
echo ${datasets__dataset_name[@]} | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}results_summary.tex
echo '.}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_mdsplot_VSN.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.23]{'"$OUTPUT_FOLDER"''"$datasetName"'_mdsplot_noVSN.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \caption{MDS plot of dataset '"$datasetName"' (Left: VSN, Right: noVSN).}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
done
......@@ -209,6 +203,26 @@ do
echo '\clearpage' >> ${OUTPUT_FOLDER}results_summary.tex
done
echo '\clearpage' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
# Per biomarker analysis.
nbBiomarkers=${#biomarkers__name[@]}
for (( i=0; i<$nbBiomarkers; i++ ))
do
biomarkerName=${biomarkers__name[$i]}
biomarkerTissue=${biomarkers__tissue[$i]}
biomarkerTissueForFile=${biomarkerTissue/ /_}
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.43]{'"$OUTPUT_FOLDER"'Biomarker_'"${biomarkerTissueForFile}"'_'"${biomarkerName}"'.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \caption{Expression values for biomarker '"${biomarkerName}"' for both DA and SN tissues.' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' (Top) Usage of expression values as derived from pre-processing. (Bottom) Usage of Z-scores ' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' derived from the expression values. (Left) VSN correction applied. (Right) No VSN correction applied.}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
done
# Print footer
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{document}' >> ${OUTPUT_FOLDER}results_summary.tex
......
......@@ -14,6 +14,8 @@ library("hgfocus.db")
library("ArrayUtils")
library("tidyverse")
library("RColorBrewer")
library("huex10sttranscriptcluster.db")
library("hugene10sttranscriptcluster.db")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
......@@ -86,35 +88,22 @@ for (i in seq_len(length(config$datasets))) {
# either from the Bioconductor package or from the processed GPL file.
platform_config <- get_platform(config, dataset$array_type)
# First, from a bioconductor package (usual case).
gene_annots_raw <- NULL
if (platform_config$library_name != "NA") {
gene_annots_raw <- ArrayUtils::get_gene_annots_from_package(platform_config$library_name,
rownames(exp_eset))
} else {
# Here, we read instead the sorted GPL file for special cases with no Bioconductor library.
gpl_annot_folder <- paste0(raw_data_dir, "Platforms/")
gpl_annot_filename <- paste0(platform_config$geo_name, "_gene_annots.tsv")
gene_annots_raw <- ArrayUtils::get_gene_annots_from_file(gpl_annot_folder,
gpl_annot_filename,
rownames(exp_eset))
rm(gpl_annot_folder, gpl_annot_filename)
}
rm(platform_config)
# We enrich the probe identifiers with the corresponding gene names using
# the prepared file (based on BioMart).
probe_ids <- rownames(exp_eset)
gene_annots_filename <- paste0(platform_config$platform_name, "_genenames.tsv")
gene_annots_raw <- ArrayUtils::get_gene_annots_from_file(input_data_dir,
gene_annots_filename,
probe_ids)
rm(gene_annots_filename, probe_ids, platform_config)
# Then, we make sure we deal with duplicates and reorder the df to match the eset.
gene_annots <- gene_annots_raw %>%
dplyr::group_by(PROBEID) %>%
dplyr::summarize_at(.vars = c("SYMBOL", "ENSEMBL", "ENTREZID"),
dplyr::summarize_at(.vars = c("SYMBOL"),
.funs = collapser) %>%
dplyr::ungroup()
gene_annots <- gene_annots[order(match(gene_annots$PROBEID, rownames(exp_eset))), ]
# Just in case, for RNA-seq data, we use the following alternative to the above statment.
# This is only placed there for information since it was not used for microarray data.
# The current solution is equivalent.
#gene_annots <- merge(data.frame(PROBEID = rownames(exp_eset)),
# gene_annots, by = "PROBEID", all.x = TRUE, sort = FALSE)
rm(gene_annots_raw)
message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name,
"] Gene annotations ready."))
......@@ -166,10 +155,11 @@ for (i in seq_len(length(config$datasets))) {
# We have enough samples, now we need to decide which co-factors we are going to include in
# the limma analysis.
# First, we check if we need to consider paired samples, then we use a block_key and
# a cofactor_name. This is only valid for one dataset in our application.
# a cofactor_name. This is only valid for one dataset in our case.
age_correct <- TRUE
fit <- NULL
if (dataset$has_paired_samples) {
# By default, we use Tissue / Patient.
# This might be set as a parameter as well if we have more cases.
# We also set the age and batch co-factors.
......@@ -187,6 +177,7 @@ for (i in seq_len(length(config$datasets))) {
file_suffix = vsn$name,
verbose = TRUE)
} else {
# We do not have paired data. Let's then set the age and batch co-factors.
if (sum(!is.na(pheno_data$Age)) != length(pheno_data$Age)) {
age_correct <- FALSE
......
......@@ -105,13 +105,13 @@ select_most_suitable_probes <- function(gene_probes, exp_data_all) {
fill = NA, preserve_class = FALSE)
exp_data <- Reduce(cbind.data.frame, buffered_inputs)
row.names(exp_data) <- exp_data_rn
rm(exp_data_rn)
rm(exp_data_rn, local_inputs, local_maxlens, buffered_inputs)
}
}
rm(relevant_datasets, dataset_name)
# We then select the probe with the max average expression across all datasets.
max_value <- 0
max_value <- -10
max_name <- "NA"
for (probe in all_probes) {
local_mean <- mean(as.numeric(exp_data[probe, ]), na.rm = TRUE)
......@@ -136,11 +136,12 @@ select_most_suitable_probes <- function(gene_probes, exp_data_all) {
# Main
# ================================================================================================
# The objective is here to creates plots for biomarkers as to estimate the actual cell content of
# the SN samples (Substantia Nigra) when compared to the pure DA samples (Dopaminergic neurons).
# The objective is here to creates plots for biomarkers as to estimate the actual cell content of
# the SN samples (Substantia Nigra) when compared to the pure DA samples (Dopaminergic neurons)
# and the induced pluripotent stem cell induced dopaminergic neurons (iPSC-DA).
# We prepare the matching between probes and genes.
gene_probe_matching <- read.delim(paste0(output_data_dir, "Combined_probe_matching.tsv"),
gene_probe_matching <- read.delim(paste0(input_data_dir, "Combined_probe_matching.tsv"),
row.names = 2,
stringsAsFactors = FALSE)
gene_probe_matching["X"] <- NULL
......@@ -170,11 +171,11 @@ for (i in seq_len(length(config$datasets))) {
dataset <- config$datasets[[i]]
# We read the expression data.
exp_data_vsn <- read.delim(paste0(output_data_dir, dataset$dataset_name,
exp_data_vsn <- read.delim(paste0(input_data_dir, dataset$dataset_name,
"_normalized_VSN.tsv"),
row.names = 1,
stringsAsFactors = FALSE)
exp_data_novsn <- read.delim(paste0(output_data_dir, dataset$dataset_name,
exp_data_novsn <- read.delim(paste0(input_data_dir, dataset$dataset_name,
"_normalized_noVSN.tsv"),
row.names = 1,
stringsAsFactors = FALSE)
......
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:10:00
#SBATCH --time=0-00:12:00
#SBATCH -p batch
#SBATCH --qos=normal
......@@ -23,7 +23,7 @@ module load lang/R/3.6.0-foss-2019a-bare
# Actual jobs.
Rscript --vanilla ${CODE_FOLDER}plot_biomarkers.R > ${OUTPUT_FOLDER}plot_biomarkers_log.out 2> ${OUTPUT_FOLDER}plot_biomarkers_log.err
Rscript --vanilla ${CODE_FOLDER}plot_biomarkers_withdiseasestatus.R > ${OUTPUT_FOLDER}plot_biomarkers_ds_log.out 2> ${OUTPUT_FOLDER}plot_biomarkers_ds_log.err
#Rscript --vanilla ${CODE_FOLDER}plot_biomarkers_withdiseasestatus.R > ${OUTPUT_FOLDER}plot_biomarkers_ds_log.out 2> ${OUTPUT_FOLDER}plot_biomarkers_ds_log.err
# Moving the slurm log file to data
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
This diff is collapsed.
......@@ -4,9 +4,9 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --time=0-01:00:00
#SBATCH --time=0-01:15:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment