Commit 8b38640b authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent

Code refactoring and optimization for the second step (RNA-seq and array...

Code refactoring and optimization for the second step (RNA-seq and array datasets are now processed altogether).
parent 892ed6fc
@@ -7,11 +7,9 @@ clean_outputs:
@rm -rf ${OUTPUT_FOLDER}*
preprocess:
@sbatch ${CODE_FOLDER}preprocess.sh
vsn:
@sbatch ${CODE_FOLDER}stabilize_variance.sh
vsn_raw:
@sbatch ${CODE_FOLDER}stabilize_variance_raw.sh
get_log:
@sbatch ${CODE_FOLDER}get_log.sh
vsn:
@sbatch ${CODE_FOLDER}stabilize_variance.sh
doc:
@sbatch ${CODE_FOLDER}doc.sh
@sbatch ${CODE_FOLDER}doc.sh
\ No newline at end of file
@@ -2,27 +2,28 @@
The objectives of this step are to preprocess the raw data and to save them for further use.
# Details and instructions
All Affymetrix datasets are preprocessed using R and the 'SCAN.UPC' package. Illumina and Agilent datasets are preprocessed using dedicated R libraries (limma and beadarray). The Makefile contains the commands to launch all jobs on the cluster. The data are stored back as TSV files.
All Affymetrix datasets are preprocessed using SCAN and GC-RMA. Illumina and Agilent datasets are preprocessed using dedicated R libraries (limma and beadarray). The RNA-seq data are only post-processed since the raw data are not available. The Makefile contains the commands to launch all jobs on the cluster.
The data are stored back as TSV files.
```
make clean_outputs
make preprocess
```
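For illustration, here is a minimal sketch of what the SCAN part of the preprocessing does for a single Affymetrix dataset (the dataset name is hypothetical; the actual code is driven by the configuration files):
```
# Minimal sketch, not the actual pipeline code: SCAN-normalize one
# Affymetrix dataset and store the expression matrix back as a TSV file.
library(SCAN.UPC)
library(Biobase)

# Hypothetical dataset folder; the real list comes from the configuration.
eset <- SCAN("Data/GSE00000/RAW/*.CEL")
write.table(exprs(eset), file = "GSE00000.tsv",
            sep = "\t", quote = FALSE, col.names = NA)
```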
We can then make a plot of the preprocessing results (to identify the arrays for which SCAN modeling might have failed). The list of datasets is hard-coded in the bash script 'prepare_log.sh'. We don't do any post-processing of the normalization procedure for Illumina or Agilent arrays.
For the datasets analyzed through SCAN, we can then make a plot of the preprocessing results (to identify the arrays for which the SCAN modeling might have failed). We don't do any post-processing of the normalization procedure for Illumina or Agilent arrays (QC is run in all cases anyway).
```
make get_log
```
We can then investigate the effect of applying a variance stabilization method on the data
We can then investigate the effect of applying a variance stabilization method on the data (i.e., to control for heteroscedasticity).
```
make vsn
make vsn_raw
```
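As a rough sketch of what these targets inspect (mirroring the vsn calls used in the R scripts below), the mean-sd relationship is compared before and after stabilization; 'raw_exprs' is an assumed probes x samples intensity matrix:
```
# Minimal sketch: compare the mean-sd relationship before and after vsn.
library(vsn)
vsn::meanSdPlot(log2(raw_exprs))        # before: sd typically depends on the mean
raw_exprs_vsn <- vsn::vsn2(raw_exprs)   # variance stabilization
vsn::meanSdPlot(raw_exprs_vsn)          # after: the trend should be flat
```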
Finally, we create a report about the variance stabilization method so that we can manually check whether it is necessary to correct for variance bias.
Finally, we create a report about the processing and the variance stabilization methods so that we can manually check which normalization procedure makes the most sense or whether it is indeed necessary to correct for the variance bias.
```
make doc
```
# Prerequisites
In general, the only prerequisite is to have the raw data (mostly from GEO) in the Data folder. There should be one folder per dataset, with a '/RAW/' subfolder containing the raw data as CEL files. In particular, it is not necessary to run the quality control before running the preprocessing since arrays are preprocessed independently (unless one has many problematic arrays, which would waste CPU time). However, for Illumina and Agilent arrays, this is not the case, and the bad-quality arrays need to be filtered before pre-processing (arrays are not treated individually).
In general, the only prerequisite is to have the raw data (mostly from GEO) in the Data folder. For the array-based datasets, there should be one folder per dataset, with a '/RAW/' subfolder containing the raw data as CEL files. For the RNA-seq dataset, the pre-processed data should be available as TSV files containing read counts. For SCAN in particular, it is not necessary to run the quality control before running the preprocessing since arrays are preprocessed independently (unless one has many problematic arrays, which would waste CPU time). For all the other methods, however, this is not the case, and the bad-quality arrays need to be filtered before pre-processing (arrays are not treated individually).
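For reference, the expected layout of the Data folder would look roughly like this (dataset names are hypothetical):
```
Data/
  GSE00000/            # one folder per array-based dataset
    RAW/               # raw data as CEL files
      sample_01.CEL
      sample_02.CEL
  RNAseq_dataset/      # pre-processed read counts as TSV files
    counts.tsv
```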
@@ -44,6 +44,8 @@ for (i in seq_len(length(config$datasets))) {
dataset_groups <- c("Disease.status")
if (dataset$clinical_descriptors == "DG") {
dataset_groups <- c("Disease.status", "Gender")
} else if (dataset$clinical_descriptors == "DGA") {
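# Note: age is not used as a grouping variable, so DGA maps to the same groups as DG.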
dataset_groups <- c("Disease.status", "Gender")
}
# We preprocess the current dataset (if necessary).
......
@@ -28,6 +28,7 @@ do
# We get the correct CDF file based on the dataset / platform configurations.
datasetArrayType=${datasets__array_type[$i]}
cdfName="NA"
mpsName="NA"
nbPlatforms=${#platforms__platform_name[@]}
for (( j=0; j<$nbPlatforms; j++ ))
do
@@ -35,14 +36,22 @@ do
if [ "${platformName}" == "${datasetArrayType}" ]
then
cdfName=${platforms__cdf_name[$j]}
mpsName=${platforms__mps_name[$j]}
fi
done
# We prepare and run the APT command.
apt_cmd="${apt_script} -a ${apt_script_method} -d ${global_raw_data_dir}Platforms/${cdfName} -o ${OUTPUT_FOLDER}apt_gcrma_temp ${global_raw_data_dir}${datasetName}/RAW/*"
# We prepare the APT command. We take care of the cases for which we need to
# summarize the data at the transcript level (exon arrays). Otherwise, we summarize at
# the probeset level (by default).
if [ "${mpsName}" == "NA" ]
then
apt_cmd="${apt_script} -a ${apt_script_method} -d ${global_raw_data_dir}Platforms/${cdfName} -o ${OUTPUT_FOLDER}apt_gcrma_temp ${global_raw_data_dir}${datasetName}/RAW/*"
else
apt_cmd="${apt_script} -a ${apt_script_method} -d ${global_raw_data_dir}Platforms/${cdfName} -m ${mpsName} -o ${OUTPUT_FOLDER}apt_gcrma_temp ${global_raw_data_dir}${datasetName}/RAW/*"
fi
# We run the APT command.
eval "$apt_cmd"
# We rename and copy the result file to the real apt folder.
mv ${OUTPUT_FOLDER}apt_gcrma_temp/gc-correction.scale-intensities.rma-bg.quant-norm.pm-only.med-polish.summary.txt ${OUTPUT_FOLDER}apt_gcrma/${datasetName}.tsv
# We clean up the temporary folder for that run.
......
@@ -83,7 +83,7 @@ run_vsn <- function(output_data_subdir, dataset_name, normalization_name, batchc
rm(exp_data_majperc)
# We sometimes need to make several attempts since the GC-RMA data can lead to errors
# because they might contain many ties.
# because they might contain many ties (vsn internally relies on quantiles).
exp_data_vsn <- NULL
for (k in 1:5) {
exp_data_vsn <- tryCatch({
@@ -106,7 +106,7 @@ run_vsn <- function(output_data_subdir, dataset_name, normalization_name, batchc
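# If vsn failed (likely because of ties), we add a small random jitter to break the ties before retrying.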
dim(noise) <- c(dim(exp_data)[1], dim(exp_data)[2])
exp_data <- exp_data + noise
}
} # End of loop on k of 10 iterations.
} # End of loop on k of 5 iterations.
# We plot the mean vs sdev after variance stabilization.
output_file_name <- paste0(output_data_subdir, dataset_name, "_normalized_",
......
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("ArrayUtils")
library("Biobase")
library("vsn")
library("affy")
library("preprocessCore")
source("../libs/conf/confR.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
raw_data_dir <- config$global_raw_data_dir
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
# Optional command line parameter to process only one dataset.
args <- commandArgs(trailingOnly = TRUE)
selected_dataset_name <- ""
if (length(args) > 0) {
selected_dataset_name <- args[1]
}
rm(args)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Main
# ================================================================================================
# We do all datasets one by one.
for (i in seq_along(config$datasets)) {
# We get the dataset details.
dataset <- config$datasets[[i]]
dataset_name <- dataset$dataset_name
# We preprocess the current dataset (if necessary).
if (selected_dataset_name == "" || selected_dataset_name == dataset_name) {
# If the dataset is not an Affymetrix dataset, we skip it.
# For exploration purposes only, we will include ways to analyse Agilent and
# Illumina datasets should we decide to use vsn on raw data.
if (dataset$platform != "Affymetrix") {
next
}
# We define the I/Os.
raw_data_subdir <- paste0(raw_data_dir, dataset_name, "/", "RAW/")
output_data_subdir <- paste0(output_data_dir, dataset_name, "/")
# We have no batch, so only one run.
message(paste0("[", Sys.time(), "] [", dataset_name,
"] Starting the analysis of the raw data."))
# We read the expression data.
raw_file_list <- affy::list.celfiles(raw_data_subdir, full.names = TRUE)
batch <- affy::ReadAffy(filenames = raw_file_list)
raw_exprs <- exprs(batch)
rm(raw_file_list, batch)
# We plot the mean vs sdev before variance stabilization.
output_file_name <- paste0(output_data_subdir, dataset_name, "_raw_meansd_ranks.png")
png(output_file_name)
vsn::meanSdPlot(log2(raw_exprs))
dev.off()
output_file_name <- paste0(output_data_subdir, dataset_name, "_raw_meansd_vals.png")
png(output_file_name)
vsn::meanSdPlot(log2(raw_exprs), ranks = FALSE)
dev.off()
rm(output_file_name)
# We run vsn on the raw data (with affine calibration).
raw_exprs_vsn <- vsn::vsn2(raw_exprs, calib = "affine")
# We plot the mean vs sdev after variance stabilization.
output_file_name <- paste0(output_data_subdir, dataset_name, "_raw_meansd_ranks_vsn.png")
png(output_file_name)
vsn::meanSdPlot(raw_exprs_vsn)
dev.off()
output_file_name <- paste0(output_data_subdir, dataset_name, "_raw_meansd_vals_vsn.png")
png(output_file_name)
vsn::meanSdPlot(raw_exprs_vsn, ranks = FALSE)
dev.off()
rm(output_file_name)
rm(raw_exprs, raw_exprs_vsn)
message(paste0("[", Sys.time(), "] [", dataset_name, "] Ended."))
rm(raw_data_subdir, output_data_subdir)
} # End if dataset should be analysed.
rm(dataset, dataset_name)
} # End for each dataset.
# We log the session details for reproducibility.
rm(i, selected_dataset_name, raw_data_dir, output_data_dir, config)
sessionInfo()
#!/bin/bash -l
#SBATCH -J geneder:02:vsnraw
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 20
#SBATCH --time=0-10:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/02/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/02-Preprocessing/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
# Loading configuration.
source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# Actual jobs
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
echo "== Job $i started (${datasetName}) =="
Rscript --vanilla ${CODE_FOLDER}stabilize_variance_raw.R ${datasetName} > ${OUTPUT_FOLDER}${datasetName}/vsn_raw_log.out 2> ${OUTPUT_FOLDER}${datasetName}/vsn_raw_log.err
echo "== Job $i ended (${datasetName}) =="
done
# Moving the slurm log file to data
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}