Commit 4daf7830 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

New limma configuration and parameters.

parent fc6d5374
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:01:00
#SBATCH --time=0-00:02:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
......
......@@ -94,7 +94,7 @@ for (i in seq_len(length(config$datasets))) {
# unwanted samples).
batch_data_clean <- NULL
if (!is.null(batch_data)) {
batch_data_clean <- batch_data[!rownames(batch_data) %in% sample_delete_list, ]
batch_data_clean <- batch_data[!rownames(batch_data) %in% sample_delete_list, , drop = FALSE]
message(paste0("[", Sys.time(), "][", dataset_name, "] Batch data cleaned."))
}
remove(batch_data, sample_delete_list, sample_update_list)
......
......@@ -23,5 +23,7 @@ A document that contains all figures can then be generated.
make doc
```
Notice that not all datasets are analyzed using the exact same method. The co-factors might be different depending on whether there are replicates, potential batch effects, and complete clinical annotations (such as age). In addition, some datasets do not have enough samples in a given category (for instance, female PD patients) to perform the complex analyses that take this category into account.
# Prerequisites
The only prerequisite is to have the preprocessed and cleaned data for all datasets (Step 04).
......@@ -32,6 +32,7 @@ selected_dataset_name <- ""
if (length(args) > 0) {
selected_dataset_name <- args[1]
}
rm(args)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
......@@ -58,7 +59,34 @@ for (i in seq_len(length(config$datasets))) {
sep = "."))
exp_data_fn <- paste0(input_data_dir, dataset_name, "_normalized_clean.tsv")
exp_eset <- ArrayUtils::read_eset(exp_data_fn)
rm(pheno_data_fn, exp_data_fn)
# We get the batch data from the batch file if it exists.
batch_data_file <- paste0(input_data_dir, dataset_name, "_batch_clean.tsv")
if (file.exists(batch_data_file)) {
batch_data <- read.delim(batch_data_file, row.names = 1, stringsAsFactors = FALSE)
pheno_data <- data.frame(c(pheno_data, batch_data))
pheno_data$Batch <- paste0("Batch", pheno_data$Batch)
rm(batch_data)
}
rm(batch_data_file)
# We count the number of samples per category to only perform the limma analyses
# we can perform (aka, we do not perform the factorial ones if we have only 1 sample
# in any of the four categories).
counts <- list()
counts$Gender <- min(table(pheno_data$Gender))
counts$Disease_status <- min(table(pheno_data$Disease.status))
counts$Disease_status_females <- min(table((pheno_data %>%
filter(Gender == "F"))$Disease.status))
counts$Disease_status_males <- min(table((pheno_data %>%
filter(Gender == "M"))$Disease.status))
counts$Gender_PD <- min(table((pheno_data %>%
filter(Disease.status == "PD"))$Gender))
counts$Gender_Control <- min(table((pheno_data %>%
filter(Disease.status == "Control"))$Gender))
counts$Gender_disease_status <- min(counts$Gender_PD, counts$Gender_Control)
# Annotate the eset with gene information for better outputs.
# First, we collect raw the gene annotations and we have two options
# either from the Bioconductor package or from the processed GPL file.
......@@ -70,13 +98,15 @@ for (i in seq_len(length(config$datasets))) {
gene_annots_raw <- ArrayUtils::get_gene_annots_from_package(platform_config$library_name,
rownames(exp_eset))
} else {
# Here, we read instead the sorted GPL file for special cases with no Biomart library.
# Here, we read instead the sorted GPL file for special cases with no Bioconductor library.
gpl_annot_folder <- paste0(raw_data_dir, "Platforms/")
gpl_annot_filename <- paste0(platform_config$geo_name, "_gene_annots.tsv")
gene_annots_raw <- ArrayUtils::get_gene_annots_from_file(gpl_annot_folder,
gpl_annot_filename,
rownames(exp_eset))
rm(gpl_annot_folder, gpl_annot_filename)
}
rm(platform_config)
# Then, we make sure we deal with duplicates and reorder the df to match the eset.
gene_annots_raw %>%
......@@ -84,6 +114,7 @@ for (i in seq_len(length(config$datasets))) {
summarize_at(.vars = c("SYMBOL", "ENSEMBL", "ENTREZID"), .funs = collapser) %>%
ungroup -> gene_annots
gene_annots <- gene_annots[order(match(gene_annots$PROBEID, rownames(exp_eset))), ]
rm(gene_annots_raw)
message(paste0("[", Sys.time(), "][", dataset_name, "] Gene annotations ready."))
# We only update the eset when the probe ids do match (otherwise, we issue a warning).
......@@ -93,6 +124,7 @@ for (i in seq_len(length(config$datasets))) {
)
rownames(gene_annots_asdf) <- gene_annots$PROBEID
Biobase::featureData(exp_eset) <- gene_annots_asdf
rm(gene_annots_asdf, gene_annots)
} else {
message(paste0("[", Sys.time(), "] Incorrect annotations for ", dataset_name, "."))
}
......@@ -103,6 +135,7 @@ for (i in seq_len(length(config$datasets))) {
png(mds_filename)
limma::plotMDS(exp_eset, labels = NULL, pch = 19)
dev.off()
rm(mds_filename)
message(paste0("[", Sys.time(), "][", dataset_name, "] MDS plot created."))
# Loop over Limma analyses.
......@@ -110,38 +143,56 @@ for (i in seq_len(length(config$datasets))) {
# We extract the Limma parameters.
limma_parameters <- config$limma_analyses[[j]]
limma_is_factorial <- limma_parameters$factorial
limma_coeff_names <- limma_parameters$names
# We only do the factorial analyses when possible, this fails otherwise.
# For instance, when there is no female PD patients (GSE7307).
if (!limma_is_factorial | dataset$suitable_for_factorial_analysis == "TRUE") { # nolint
# We use a block_key and cofactor_name if necessary (that is when we have
# paired samples).
if (dataset$has_paired_samples) {
# By default, we use Tissue / Patient.
# This might be set as a parameter as well if we have more cases.
fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters,
cofactor_name = "Tissue",
block_key = "Patient")
} else {
age_correct <- TRUE
if (sum(!is.na(pheno_data$Age)) != length(pheno_data$Age)) {
age_correct <- FALSE
}
fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters, age_correct)
}
message(paste0("[", Sys.time(), "][", dataset_name, "][", j, "] Limma fit done."))
# We loop over the coefficients we want to analyse for that Limma analysis.
for (k in seq_len(length(limma_coeff_names))) {
ArrayUtils::extract_DEGs(fit, limma_coeff_names, k, output_data_dir, dataset_name)
# We only do the limma analysis when it is possible, that is when we have at least 2 samples
# foreach of the category we are comparing.For instance, when there is no female PD
# patients (GSE7307), we can not do any comparison that involves female PD patients.
if (counts[[limma_parameters$clinical_factor]] < 2) {
next;
}
# We have enough samples, now we need to decide which co-factors we are going to include in
# the limma analysis.
# First, we check if we need to consider paired samples, then we use a block_key and
# a cofactor_name. This is only valid for one dataset in our application.
age_correct <- TRUE
fit <- NULL
if (dataset$has_paired_samples) {
# By default, we use Tissue / Patient.
# This might be set as a parameter as well if we have more cases.
# We also set the age and batch co-factors.
if (sum(!is.na(pheno_data$Age)) != length(pheno_data$Age)) {
age_correct <- FALSE
}
fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters,
correct_for_age = age_correct,
correct_for_batch = dataset$has_batches,
cofactor_name = "Tissue",
block_key = "Patient",
verbose = TRUE)
} else {
# We do not have paired data. Let's then set the age and batch co-factors.
if (sum(!is.na(pheno_data$Age)) != length(pheno_data$Age)) {
age_correct <- FALSE
}
message(paste0("[", Sys.time(), "][", dataset_name, "][", j, "] DEG extraction done."))
fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters,
correct_for_age = age_correct,
correct_for_batch = dataset$has_batches,
verbose = TRUE)
}
rm(age_correct)
message(paste0("[", Sys.time(), "][", dataset_name, "][", j, "] Limma fit done."))
# We extract the DEGs for the coefficient of interest (for that limma analysis).
ArrayUtils::extract_DEGs(fit, limma_parameters$name, 1, output_data_dir, dataset_name)
rm(limma_parameters)
message(paste0("[", Sys.time(), "][", dataset_name, "][", j, "] DEG extraction done."))
} # End for each Limma analysis.
rm(j)
} # End if dataset should be analysed.
rm(dataset, dataset_name, counts, exp_eset, pheno_data)
} # End for each dataset.
rm(i, config, raw_data_dir, output_data_dir, input_data_dir, selected_dataset_name)
# We log the session details for reproducibility.
sessionInfo()
modifications:
-
dataset_name: GSE20141
remove_arrays: GSM503952_C_2829_SNc.CEL,GSM503959_PD_1401_SNc.CEL,GSM503966_PD_5138_SNc.CEL,GSM503950_C_1074p_SNc.CEL,GSM503951_C_1271p_SNc.CEL,GSM503956_C_3603_SNc.CEL
update_arrays: NA
-
dataset_name: GSE20163
remove_arrays: NA
update_arrays: GSM506004.CEL;Gender;M,GSM506010.CEL;Gender;F,GSM506011.CEL;Gender;M,GSM506012.CEL;Gender;F
-
dataset_name: GSE20164
remove_arrays: GSM506020.CEL
update_arrays: NA
-
dataset_name: GSE20292
remove_arrays: NA
update_arrays: NA
-
dataset_name: GSE7307
remove_arrays: GSM176405.CEL
update_arrays: NA
-
dataset_name: GSE7621
remove_arrays: NA
update_arrays: NA
-
dataset_name: GSE8397
remove_arrays: GSM208654.cel,GSM208634.cel,GSM208637.cel,GSM208638.cel
update_arrays: NA
-
dataset_name: Simunovic
remove_arrays: park_1143_01.cel,park_1148_01.cel,park_1152_01 .cel
update_arrays: NA
-
dataset_name: GSE24378
remove_arrays: NA
update_arrays: GSM601053.CEL;Gender;M
-
dataset_name: GSE20333
remove_arrays: GSM509556.CEL,GSM509557.CEL
update_arrays: NA
-
dataset_name: Moreira
remove_arrays: NA
update_arrays: NA
-
dataset_name: GSE20159
remove_arrays: NA
update_arrays: NA
-
dataset_name: GSE26927
remove_arrays: NA
update_arrays: NA
-
dataset_name: GSE49036
remove_arrays: NA
update_arrays: NA
dataset_name platform array_type clinical_descriptors has_paired_samples has_batches cleaning has_age suitable_for_factorial_analysis tissue
GSE20141 Affymetrix HGU133Plus2 D FALSE TRUE FALSE TRUE TRUE DA
GSE20163 Affymetrix HGU133A D FALSE FALSE FALSE TRUE TRUE SN
GSE20164 Affymetrix HGU133A DG FALSE FALSE FALSE TRUE TRUE SN
GSE20292 Affymetrix HGU133A DG FALSE FALSE FALSE TRUE TRUE SN
GSE7307 Affymetrix HGU133Plus2 DG FALSE TRUE FALSE FALSE FALSE SN
GSE7621 Affymetrix HGU133Plus2 DG FALSE FALSE FALSE FALSE TRUE SN
GSE8397 Affymetrix HGU133A DG TRUE FALSE FALSE TRUE TRUE SN
Simunovic Affymetrix HGU133A DG FALSE TRUE FALSE TRUE TRUE DA
GSE24378 Affymetrix U133_X3P D FALSE TRUE FALSE TRUE TRUE DA
GSE20333 Affymetrix HG-Focus DG FALSE FALSE FALSE TRUE TRUE SN
Moreira Agilent G4112F DG FALSE FALSE FALSE TRUE TRUE SN
GSE20159 Illumina HumanV3 DG FALSE FALSE FALSE TRUE TRUE SN
GSE26927 Illumina HumanV2 DGA FALSE FALSE TRUE TRUE TRUE SN
GSE49036 Affymetrix HGU133Plus2 DGA FALSE FALSE FALSE TRUE TRUE SN
\ No newline at end of file
dataset_name platform array_type clinical_descriptors has_paired_samples has_batches cleaning has_age tissue
GSE20141 Affymetrix HGU133Plus2 D FALSE TRUE FALSE TRUE DA
GSE20163 Affymetrix HGU133A D FALSE FALSE FALSE TRUE SN
GSE20164 Affymetrix HGU133A DG FALSE TRUE FALSE TRUE SN
GSE20292 Affymetrix HGU133A DG FALSE FALSE FALSE TRUE SN
GSE7307 Affymetrix HGU133Plus2 DG FALSE FALSE FALSE FALSE SN
GSE7621 Affymetrix HGU133Plus2 DG FALSE FALSE FALSE FALSE SN
GSE8397 Affymetrix HGU133A DG TRUE TRUE FALSE TRUE SN
Simunovic Affymetrix HGU133A DG FALSE TRUE FALSE TRUE DA
GSE24378 Affymetrix U133_X3P D FALSE TRUE FALSE TRUE DA
GSE20333 Affymetrix HG-Focus DG FALSE FALSE FALSE TRUE SN
Moreira Agilent G4112F DG FALSE FALSE FALSE TRUE SN
GSE20159 Illumina HumanV3 DG FALSE FALSE FALSE TRUE SN
GSE26927 Illumina HumanV2 DGA FALSE FALSE TRUE TRUE SN
GSE49036 Affymetrix HGU133Plus2 DGA FALSE FALSE FALSE TRUE SN
\ No newline at end of file
......@@ -8,7 +8,6 @@ datasets:
has_batches: 'TRUE'
cleaning: 'FALSE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: DA
-
dataset_name: GSE20163
......@@ -19,7 +18,6 @@ datasets:
has_batches: 'FALSE'
cleaning: 'FALSE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: SN
-
dataset_name: GSE20164
......@@ -27,10 +25,9 @@ datasets:
array_type: HGU133A
clinical_descriptors: DG
has_paired_samples: 'FALSE'
has_batches: 'FALSE'
has_batches: 'TRUE'
cleaning: 'FALSE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: SN
-
dataset_name: GSE20292
......@@ -41,7 +38,6 @@ datasets:
has_batches: 'FALSE'
cleaning: 'FALSE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: SN
-
dataset_name: GSE7307
......@@ -49,10 +45,9 @@ datasets:
array_type: HGU133Plus2
clinical_descriptors: DG
has_paired_samples: 'FALSE'
has_batches: 'TRUE'
has_batches: 'FALSE'
cleaning: 'FALSE'
has_age: 'FALSE'
suitable_for_factorial_analysis: 'FALSE'
tissue: SN
-
dataset_name: GSE7621
......@@ -63,7 +58,6 @@ datasets:
has_batches: 'FALSE'
cleaning: 'FALSE'
has_age: 'FALSE'
suitable_for_factorial_analysis: 'TRUE'
tissue: SN
-
dataset_name: GSE8397
......@@ -71,10 +65,9 @@ datasets:
array_type: HGU133A
clinical_descriptors: DG
has_paired_samples: 'TRUE'
has_batches: 'FALSE'
has_batches: 'TRUE'
cleaning: 'FALSE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: SN
-
dataset_name: Simunovic
......@@ -85,7 +78,6 @@ datasets:
has_batches: 'TRUE'
cleaning: 'FALSE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: DA
-
dataset_name: GSE24378
......@@ -96,7 +88,6 @@ datasets:
has_batches: 'TRUE'
cleaning: 'FALSE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: DA
-
dataset_name: GSE20333
......@@ -107,7 +98,6 @@ datasets:
has_batches: 'FALSE'
cleaning: 'FALSE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: SN
-
dataset_name: Moreira
......@@ -118,7 +108,6 @@ datasets:
has_batches: 'FALSE'
cleaning: 'FALSE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: SN
-
dataset_name: GSE20159
......@@ -129,7 +118,6 @@ datasets:
has_batches: 'FALSE'
cleaning: 'FALSE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: SN
-
dataset_name: GSE26927
......@@ -140,7 +128,6 @@ datasets:
has_batches: 'FALSE'
cleaning: 'TRUE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: SN
-
dataset_name: GSE49036
......@@ -151,5 +138,4 @@ datasets:
has_batches: 'FALSE'
cleaning: 'FALSE'
has_age: 'TRUE'
suitable_for_factorial_analysis: 'TRUE'
tissue: SN
......@@ -15,32 +15,39 @@ normalizations:
limma_analyses:
-
factor: Gender
coefficients: ["F - M"]
names: ["FemaleVsMale"]
factorial: FALSE
sample_groups: ["All"]
clinical_factors: ["Gender"]
coefficient: "F - M"
name: "FemaleVsMale"
clinical_factor: "Gender"
-
factor: Disease.status
coefficients: ["PD - Control"]
names: ["PDVsControl"]
factorial: FALSE
sample_groups: ["All"]
clinical_factors: ["Disease.status"]
coefficient: "PD - Control"
name: "PDVsControl"
clinical_factor: "Disease_status"
-
factor: gender_disease_status
coefficients: ["F.PD - F.Control", "M.PD - M.Control", "(F.PD - F.Control) - (M.PD - M.Control)"]
names: ["PDVsControl_females", "PDVsControl_males", "Gender_disease_status"]
factorial: TRUE
sample_groups: ["F", "M", "All"]
clinical_factors: ["Disease.status", "Disease.status", "Disease_status_gender"]
coefficient: "F.PD - F.Control"
name: "PDVsControl_females"
clinical_factor: "Disease_status_females"
-
factor: gender_disease_status
coefficients: ["F.PD - M.PD", "F.Control - M.Control", "(F.PD - M.PD) - (F.Control - M.Control)"]
names: ["FemaleVsMale_PD", "FemaleVsMale_control", "Disease_status_gender"]
factorial: TRUE
sample_groups: ["PD", "Control", "All"]
clinical_factors: ["Gender", "Gender", "Disease_status_gender"]
coefficient: "M.PD - M.Control"
name: "PDVsControl_males"
clinical_factor: "Disease_status_males"
-
factor: gender_disease_status
coefficient: "F.PD - M.PD"
name: "FemaleVsMale_PD"
clinical_factor: "Gender_PD"
-
factor: gender_disease_status
coefficient: "F.Control - M.Control"
name: "FemaleVsMale_control"
clinical_factor: "Gender_Control"
-
factor: gender_disease_status
coefficient: "(F.PD - F.Control) - (M.PD - M.Control)"
name: "Gender_disease_status"
clinical_factor: "Gender_disease_status"
# Integration schemes
nb_min_pval: 2
perc_min_pval: 0.33334
......
https://onlineyamltools.com/convert-csv-to-yaml
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment