Commit c726767e authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Added the APT-GCRMA preprocessing scripts, plus some refactoring for steps 01-04.

parent 6a8dbf82
......@@ -38,6 +38,7 @@ for (i in seq_len(length(config$datasets))) {
# We get the dataset details.
dataset <- config$datasets[[i]]
dataset_name <- dataset$dataset_name
raw_data_subdir <- paste0(raw_data_dir, dataset_name, "/")
# By default, we have the disease status, but we can also have the gender.
dataset_groups <- c("Disease.status")
......@@ -50,6 +51,7 @@ for (i in seq_len(length(config$datasets))) {
for (j in seq_len(length(config$normalizations))) {
# We define the current normalization procedure.
normalization <- config$normalizations[[j]]
normalization_method <- normalization$methods[normalization$platforms == dataset$platform]
......@@ -57,7 +59,6 @@ for (i in seq_len(length(config$datasets))) {
"] Started..."))
# We define the I/Os.
raw_data_subdir <- paste0(raw_data_dir, dataset_name, "/")
output_data_subdir <- paste0(output_data_dir, dataset_name, "/", normalization$name, "/")
dir.create(output_data_subdir, showWarnings = TRUE, recursive = FALSE, mode = "0777")
......@@ -74,13 +75,41 @@ for (i in seq_len(length(config$datasets))) {
normalization$name, "_nobatchcorrection.tsv"),
paste0(output_data_subdir, dataset_name, "_normalized_",
normalization$name, "_batchcorrection.tsv"))
esets <- ArrayUtils::preprocess_data(raw_data_subdir,
output_data_files,
platform = dataset$platform,
method = normalization_method,
batch_correction = "BOTH",
clean_samples = dataset$cleaning,
verbose = TRUE)
# Depending on the method, we will either use APT processed data (Affymetrix Power Tools)
# or the functions from the ArrayLib R library.
esets <- NULL
if (normalization_method == "APT-GCRMA") {
# We need to use GC-RMA data obtained via APT (was run before so files should exist)
# We read back the APT-GCRMA file (with already pre-processed data) and only use
# the library for the extra cleaning / batch correction / filtering steps.
exprs_fn <- paste0(output_data_dir, "apt_gcrma/", dataset_name, ".tsv")
exprs_raw <- read.table(exprs_fn,
header = TRUE,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
# We feed the pre-computed APT-GCRMA matrix to the ArrayUtils functions below.
esets <- ArrayUtils::preprocess_data(raw_data_subdir,
output_data_files,
platform = dataset$platform,
method = normalization_method,
exprs_raw = data.matrix(exprs_raw),
batch_correction = "BOTH",
clean_samples = dataset$cleaning,
verbose = TRUE)
rm(exprs_raw, exprs_fn)
} else {
# We are not running apt-gcrma so we rely on the ArrayLib functions.
esets <- ArrayUtils::preprocess_data(raw_data_subdir,
output_data_files,
platform = dataset$platform,
method = normalization_method,
batch_correction = "BOTH",
clean_samples = dataset$cleaning,
verbose = TRUE)
}
rm(output_data_files)
message(paste0("[", Sys.time(), "] [", dataset_name, "][", normalization_method,
"] Pre-processing (with batches) finished."))
......@@ -119,13 +148,41 @@ for (i in seq_len(length(config$datasets))) {
"] Starting pre-processing (without batch)."))
output_data_file <- paste0(output_data_subdir, dataset_name, "_normalized_",
normalization$name, "_nobatchcorrection.tsv")
eset <- ArrayUtils::preprocess_data(raw_data_subdir,
output_data_file,
platform = dataset$platform,
method = normalization_method,
batch_correction = "FALSE",
clean_samples = dataset$cleaning,
verbose = TRUE)
# Depending on the method, we will either use APT processed data (Affymetrix Power Tools)
# or the functions from the ArrayLib R library.
eset <- NULL
if (normalization_method == "APT-GCRMA") {
# We need to use GC-RMA data obtained via APT (was run before so files should exist)
# We read back the APT-GCRMA file (with already pre-processed data) and only use
# the library for the extra cleaning / batch correction / filtering steps.
exprs_fn <- paste0(output_data_dir, "apt_gcrma/", dataset_name, ".tsv")
exprs_raw <- read.table(exprs_fn,
header = TRUE,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
# We feed the pre-computed APT-GCRMA matrix to the ArrayUtils functions below.
eset <- ArrayUtils::preprocess_data(raw_data_subdir,
output_data_file,
platform = dataset$platform,
method = normalization_method,
exprs_raw = data.matrix(exprs_raw),
batch_correction = "FALSE",
clean_samples = dataset$cleaning,
verbose = TRUE)
rm(exprs_raw, exprs_fn)
} else {
# We are not running apt-gcrma so we rely on the ArrayLib functions.
eset <- ArrayUtils::preprocess_data(raw_data_subdir,
output_data_file,
platform = dataset$platform,
method = normalization_method,
batch_correction = "FALSE",
clean_samples = dataset$cleaning,
verbose = TRUE)
}
rm(output_data_file)
message(paste0("[", Sys.time(), "] [", dataset_name, "][", normalization_method,
"] Pre-processing (without batch) finished."))
......
......@@ -27,6 +27,11 @@ source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# Actual jobs
# I- CGRMA via Affymetrix Power Tools (tag: APT-GCRMA)
/bin/bash ${CODE_FOLDER}run_apt_gcrma.sh > ${OUTPUT_FOLDER}apt_log.out 2> ${OUTPUT_FOLDER}apt_log.err
# II- General workflow with all processing methods.
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<=$nbDatasets; i++ ))
do
......@@ -34,12 +39,12 @@ do
if [ "${datasetName}" != "" ]
then
echo "== Job $i started (${datasetName}) =="
rm -rf ${OUTPUT_FOLDER}/${datasetName}/
mkdir ${OUTPUT_FOLDER}/${datasetName}/
Rscript --vanilla ${CODE_FOLDER}/preprocess.R ${datasetName} > ${OUTPUT_FOLDER}/${datasetName}/preprocessing_log.out 2> ${OUTPUT_FOLDER}/${datasetName}/preprocessing_log.err
rm -rf ${OUTPUT_FOLDER}${datasetName}/
mkdir ${OUTPUT_FOLDER}${datasetName}/
Rscript --vanilla ${CODE_FOLDER}preprocess.R ${datasetName} > ${OUTPUT_FOLDER}${datasetName}/preprocessing_log.out 2> ${OUTPUT_FOLDER}${datasetName}/preprocessing_log.err
echo "== Job $i ended (${datasetName}) =="
fi
done
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
#!/bin/bash -l
# Runs Affymetrix Power Tools (APT) GC-RMA summarization for every Affymetrix
# dataset listed in the datasets configuration, writing one TSV of pre-processed
# expression values per dataset into ${OUTPUT_FOLDER}/apt_gcrma/.
# I/Os and parameters
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/02/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/02-Preprocessing/
# Load configuration
# confSH.sh provides create_variables, which flattens each YAML file into shell
# variables/arrays (e.g. datasets__dataset_name, platforms__cdf_name,
# apt_script, apt_script_method, global_raw_data_dir).
source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
create_variables ../Confs/project_config.yml
create_variables ../Confs/platforms_config.yml
# We set up the results folder that will contain all files for all datasets.
# NOTE(review): recreated from scratch on every run — previous results are lost.
rm -rf ${OUTPUT_FOLDER}/apt_gcrma
mkdir ${OUTPUT_FOLDER}/apt_gcrma
# We run the APT-GCRMA for all datasets (or at least all the Affymetrix ones).
nbDatasets=${#datasets__dataset_name[@]}
# NOTE(review): the loop runs one index past the array length (i<=nbDatasets);
# the out-of-range index yields an empty datasetName and is skipped by the
# guard below. Presumably deliberate to cope with the indexing convention of
# create_variables — confirm before changing to i<nbDatasets.
for (( i=0; i<=$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
# Skip empty slots (see loop-bound note above).
if [ "${datasetName}" != "" ]
then
platformName=${datasets__platform[$i]}
# Only Affymetrix arrays can be processed with APT; other platforms are skipped.
if [ "${platformName}" == "Affymetrix" ]
then
# We set up the temporary folder for that run.
rm -rf ${OUTPUT_FOLDER}/apt_gcrma_temp
mkdir ${OUTPUT_FOLDER}/apt_gcrma_temp
# We get the correct CDF file based on the dataset / platform configurations.
# Linear scan of the platforms config to map the dataset's array type to its
# CDF (chip description) file name.
datasetArrayType=${datasets__array_type[$i]}
cdfName="NA"
nbPlatforms=${#platforms__platform_name[@]}
# NOTE(review): same one-past-the-end bound as the outer loop; the extra
# iteration compares an empty platformName and never matches.
# This inner loop also reuses (clobbers) platformName; harmless because the
# Affymetrix check above has already been made, but worth renaming one day.
for (( j=0; j<=$nbPlatforms; j++ ))
do
platformName=${platforms__platform_name[$j]}
if [ "${platformName}" == "${datasetArrayType}" ]
then
cdfName=${platforms__cdf_name[$j]}
fi
done
# We prepare and run the APT command.
# NOTE(review): if no platform matched, cdfName stays "NA" and APT will fail
# on a nonexistent CDF path — there is no explicit error handling here.
# The raw CEL files are picked up by shell globbing of the dataset's RAW dir.
apt_cmd="${apt_script} -a ${apt_script_method} -d ${global_raw_data_dir}Platforms/${cdfName} -o ${OUTPUT_FOLDER}apt_gcrma_temp ${global_raw_data_dir}${datasetName}/RAW/*"
eval "$apt_cmd"
# We rename and copy the result file to the real apt folder.
# APT names its summary file after the chain of operations applied; we rename
# it to <dataset>.tsv so downstream R scripts can find it deterministically.
mv ${OUTPUT_FOLDER}apt_gcrma_temp/gc-correction.scale-intensities.rma-bg.quant-norm.pm-only.med-polish.summary.txt ${OUTPUT_FOLDER}apt_gcrma/${datasetName}.tsv
# We clean up the temporary folder for that run.
rm -rf ${OUTPUT_FOLDER}/apt_gcrma_temp
fi
fi
done
This diff is collapsed.
......@@ -44,7 +44,8 @@ for (i in seq_len(length(config$datasets))) {
if (selected_dataset_name == "" || selected_dataset_name == dataset_name) {
# If we have not an affy dataset, we skip.
# Only for testing purposes, should be removed if we decide to use vsn on raw data.
# Only for exploration purposes, we will include ways to analyse Agilent and
# Illumina datasets should we decide to use vsn on raw data.
if (dataset$platform != "Affymetrix") {
next;
}
......@@ -74,7 +75,7 @@ for (i in seq_len(length(config$datasets))) {
dev.off()
rm(output_file_name)
# We run vsn on teh raw data.
# We run vsn on the raw data.
raw_exprs_vsn <- vsn::vsn2(raw_exprs, calib = "affine")
# We plot the mean vs sdev after variance stabilization.
......
......@@ -283,11 +283,11 @@ p <- ggplot(loocv_all, aes(x = Age, y = Pred, color = abs(Age - Pred))) +
theme(legend.position = "None") +
facet_wrap( ~ dataset) +
coord_cartesian(xlim = c(local_min, local_max), ylim = c(local_min, local_max)) +
geom_abline(intercept = 0, slope = 1, colour = "darkgrey") +
geom_abline(intercept = -5, slope = 1, colour = "darkgrey", linetype = 2) +
geom_abline(intercept = 5, slope = 1, colour = "darkgrey", linetype = 2) +
geom_abline(intercept = 0, slope = 1, colour = "darkgrey") +
geom_abline(intercept = -5, slope = 1, colour = "darkgrey", linetype = 2) +
geom_abline(intercept = 5, slope = 1, colour = "darkgrey", linetype = 2) +
geom_abline(intercept = -10, slope = 1, colour = "darkgrey", linetype = 3) +
geom_abline(intercept = 10, slope = 1, colour = "darkgrey", linetype = 3) +
geom_abline(intercept = 10, slope = 1, colour = "darkgrey", linetype = 3) +
geom_point(size = 3)
print(p)
dev.off()
......
......@@ -156,8 +156,8 @@ for (i in seq_len(length(config$datasets))) {
array_yprobes <- setdiff(rownames(array_probes[[2]]), rownames(array_probes[[1]]))
# Subset the expression matrices with the X- and Y-chromosome probes.
array_xprobes <- intersect(array_xprobes, rownames(exp_data))
array_yprobes <- intersect(array_yprobes, rownames(exp_data))
array_xprobes <- intersect(array_xprobes, rownames(exp_data))
array_yprobes <- intersect(array_yprobes, rownames(exp_data))
exp_data_xonly <- exp_data[array_xprobes, ]
exp_data_yonly <- exp_data[array_yprobes, ]
rm(exp_data, array_xprobes, array_yprobes)
......
......@@ -52,8 +52,8 @@ for (i in seq_len(length(config$datasets))) {
mutate(gender_disease_status = factor(paste(Gender, Disease.status, sep = ".")))
# We load the data (preproccesed data).
scan_data_fn <- paste0(output_data_dir, dataset_name, "_normalized_clean.tsv")
exp_data <- ArrayUtils::read_eset(scan_data_fn, as_eset = FALSE)
exp_data_fn <- paste0(output_data_dir, dataset_name, "_normalized_clean.tsv")
exp_data <- ArrayUtils::read_eset(exp_data_fn, as_eset = FALSE)
message(paste0("[", Sys.time(), "][", dataset_name, "] Data ready."))
# Compute the PCA.
......@@ -123,7 +123,7 @@ for (i in seq_len(length(config$datasets))) {
ylab(percentage[3])
# We group the plots and save as png.
category_name <- str_replace(category, "[.]", "_")
category_name <- str_replace(category, "[.]", "_")
output_figure_file <- paste0(output_data_dir, dataset_name,
"_category_bias_", category_name, ".png")
png(output_figure_file, width = 3 * 480)
......
......@@ -53,7 +53,7 @@ plot_age_dist <- function(D, split_by, output_figure_file,
png(output_figure_file)
myplot <- ggplot(D, aes(x = !!ensym(split_by), y = Age, colour = !!ensym(split_by))) +
geom_boxplot(lwd = 1.2, outlier.shape = NA) +
geom_jitter(size = 3, width = 0.2, height = 0.1) +
geom_jitter(size = 3, width = 0.2, height = 0.1) +
coord_cartesian(ylim = c(min_y, max_y)) +
theme(legend.position = "none",
axis.title.x = element_blank(),
......@@ -105,9 +105,9 @@ for (i in seq_len(length(config$datasets))) {
# We load the data (clinical).
pheno_data_fn <- paste0(dataset_name, "_clinical_clean.tsv")
pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(output_data_dir,
pheno_data_fn,
use_factors = FALSE))
pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(output_data_dir,
pheno_data_fn,
use_factors = FALSE))
# We prepare the data for ggplot by adding an extra column (Gender * Disease.status).
pheno_data %>%
......@@ -119,8 +119,8 @@ for (i in seq_len(length(config$datasets))) {
# largest group to the size of the smallest group (best effort) and then computes
# the AAD (sort of MUE - Mean Unsquared Error).
# First, looking at the ages of the different disease statuses.
small_set <- sort(pheno_data$Age[pheno_data$Disease.status == "PD"])
large_set <- sort(pheno_data$Age[pheno_data$Disease.status == "Control"])
small_set <- sort(pheno_data$Age[pheno_data$Disease.status == "PD"])
large_set <- sort(pheno_data$Age[pheno_data$Disease.status == "Control"])
if (length(small_set) > length(large_set)) {
large_set <- sort(pheno_data$Age[pheno_data$Disease.status == "PD"])
small_set <- sort(pheno_data$Age[pheno_data$Disease.status == "Control"])
......@@ -132,8 +132,8 @@ for (i in seq_len(length(config$datasets))) {
aads_datasets <- c(aads_datasets, dataset_name)
# Second, looking at the ages of the different genders.
small_set <- sort(pheno_data$Age[pheno_data$Gender == "F"])
large_set <- sort(pheno_data$Age[pheno_data$Gender == "M"])
small_set <- sort(pheno_data$Age[pheno_data$Gender == "F"])
large_set <- sort(pheno_data$Age[pheno_data$Gender == "M"])
if (length(small_set) > length(large_set)) {
large_set <- sort(pheno_data$Age[pheno_data$Gender == "F"])
small_set <- sort(pheno_data$Age[pheno_data$Gender == "M"])
......
......@@ -145,7 +145,7 @@ for (i in seq_len(length(config$datasets))) {
limma_parameters <- config$limma_analyses[[j]]
# We only do the limma analysis when it is possible, that is when we have at least 2 samples
# foreach of the category we are comparing.For instance, when there is no female PD
# for each of the category we are comparing. For instance, when there is no female PD
# patients (GSE7307), we can not do any comparison that involves female PD patients.
if (counts[[limma_parameters$clinical_factor]] < 2) {
next;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment