#!/usr/bin/env Rscript # ================================================================================================ # Libraries # ================================================================================================ library("yaml") library("Biobase") library("hgu133plus2.db") library("hgu133a.db") library("u133x3p.db") library("illuminaHumanv3.db") library("hgug4112a.db") library("hgfocus.db") library("ArrayUtils") library("tidyverse") library("RColorBrewer") library("huex10sttranscriptcluster.db") library("hugene10sttranscriptcluster.db") source("../libs/conf/confR.R") source("../libs/utils/utils.R") message(paste0("[", Sys.time(), "] Libraries loaded.")) # ================================================================================================ # Configuration # ================================================================================================ options(bitmapType = "cairo") config <- read_config(config_dirs = c("../Confs/", "./")) raw_data_dir <- config$global_raw_data_dir output_data_dir <- paste0(config$global_data_dir, config$local_data_dir) input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir) message(paste0("[", Sys.time(), "] Configuration done.")) # ================================================================================================ # Main # ================================================================================================ # For the integration, we need to know how many samples there are in each dataset # and for each category (M.PD, F.PD, M.Control, F.Control). nb_samples_fn <- paste0(input_data_dir, "clinical_categories_summarized.tsv") nb_samples <- read.table(nb_samples_fn, header = TRUE, sep = "\t", as.is = TRUE, row.names = 1) remove(nb_samples_fn) # We do all datasets one by one. for (i in seq_len(length(config$datasets))) { # We get the dataset details. dataset <- config$datasets[[i]] dataset_name <- dataset$dataset_name # We load the clinical data. pheno_data_fn <- paste0(dataset_name, "_clinical.tsv") pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir, pheno_data_fn, use_factors = FALSE)) pheno_data <- pheno_data %>% dplyr::mutate(gender_disease_status = paste(pheno_data$Gender, pheno_data$Disease.status, sep = ".")) rm(pheno_data_fn) # We get the batch data from the batch file if it exists. batch_data_file <- paste0(input_data_dir, dataset_name, "_batch.tsv") if (file.exists(batch_data_file)) { batch_data <- read.delim(batch_data_file, row.names = 1, stringsAsFactors = FALSE) pheno_data <- data.frame(c(pheno_data, batch_data)) pheno_data$Batch <- paste0("Batch", pheno_data$Batch) #nolint rm(batch_data) } rm(batch_data_file) # We repeat the analysis for all VSN usages. for (j in seq_len(length(config$variance_methods))) { # We get the current VSN configuration. vsn <- config$variance_methods[[j]] # We get the expression data. exp_data_fn <- paste0(input_data_dir, dataset_name, "_normalized_", vsn$name, ".tsv") exp_eset <- ArrayUtils::read_eset(exp_data_fn) rm(exp_data_fn) # Annotate the eset with gene information for better outputs. # First, we collect raw the gene annotations and we have two options # either from the Bioconductor package or from the processed GEO plateform annotation file. platform_config <- get_platform(config, dataset$array_type) # We enrich the probe identifiers with the corresponding gene names using # the prepared file (based on BioMart). probe_ids <- rownames(exp_eset) gene_annots_filename <- paste0(platform_config$platform_name, "_genenames.tsv") gene_annots_raw <- ArrayUtils::get_gene_annots_from_file(input_data_dir, gene_annots_filename, probe_ids) rm(gene_annots_filename, probe_ids, platform_config) # Then, we make sure we deal with duplicates and reorder the df to match the eset. gene_annots <- gene_annots_raw %>% dplyr::group_by(PROBEID) %>% dplyr::summarize_at(.vars = c("SYMBOL"), .funs = collapser) %>% dplyr::ungroup() gene_annots <- gene_annots[order(match(gene_annots$PROBEID, rownames(exp_eset))), ] rm(gene_annots_raw) message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name, "] Gene annotations ready.")) # Additional QC: plot the MDS to check again for outliers. mds_filename <- paste0(output_data_dir, dataset_name, "_mdsplot_", vsn$name, ".png") png(mds_filename) col_group <- factor(pheno_data$gender_disease_status) levels(col_group) <- RColorBrewer::brewer.pal(nlevels(col_group), "Set1") col_group <- as.character(col_group) limma::plotMDS(exp_eset, labels = pheno_data$gender_disease_status, col = col_group, pch = 19) dev.off() rm(mds_filename, col_group) message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name, "] MDS plot created.")) # We update the eset with the gene annotations. # We only do it when the probe ids do match (otherwise, we issue a warning). if (all(gene_annots$PROBEID == rownames(exp_eset))) { gene_annots_asdf <- new("AnnotatedDataFrame", data = data.frame(gene_annots[, -1], stringsAsFactors = FALSE) ) rownames(gene_annots_asdf) <- gene_annots$PROBEID Biobase::featureData(exp_eset) <- gene_annots_asdf rm(gene_annots_asdf, gene_annots) } else { message(paste0("[", Sys.time(), "][", vsn$name, "] Incorrect annotations for ", dataset_name, ".")) } message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name, "] ExpressionSet updated.")) # Loop over Limma analyses. for (k in seq_len(length(config$limma_analyses))) { # We extract the Limma parameters. limma_parameters <- config$limma_analyses[[k]] # We only do the limma analysis when it is possible, that is when we have at least 2 samples # for each of the category we are comparing. For instance, when there is no female PD # patients (GSE7307), we can not do any comparison that involves female PD patients. if (nb_samples[dataset_name, limma_parameters$clinical_factor] < 2) { next; } # We have enough samples, now we need to decide which co-factors we are going to include in # the limma analysis. # First, we check if we need to consider paired samples, then we use a block_key and # a cofactor_name. This is only valid for one dataset in our case. age_correct <- TRUE fit <- NULL if (dataset$has_paired_samples) { # By default, we use Tissue / Patient. # This might be set as a parameter as well if we have more cases. # We also set the age and batch co-factors. if (sum(!is.na(pheno_data$Age)) != length(pheno_data$Age)) { age_correct <- FALSE } fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters, correct_for_age = age_correct, correct_for_batch = dataset$has_batches, cofactor_name = "Tissue", block_key = "Patient", run_topconfects = FALSE, output_data_dir = output_data_dir, file_prefix = dataset_name, file_suffix = vsn$name, verbose = TRUE) } else { # We do not have paired data. Let's then set the age and batch co-factors. if (sum(!is.na(pheno_data$Age)) != length(pheno_data$Age)) { age_correct <- FALSE } fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters, correct_for_age = age_correct, correct_for_batch = dataset$has_batches, run_topconfects = FALSE, output_data_dir = output_data_dir, file_prefix = dataset_name, file_suffix = vsn$name, verbose = TRUE) } rm(age_correct) message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name, "][", k, "] Limma fit done.")) # We extract the DEGs for the coefficient of interest (for that limma analysis). ArrayUtils::extract_degs(fit, limma_parameters$name, 1, output_data_dir = output_data_dir, file_prefix = dataset_name, file_suffix = vsn$name) rm(limma_parameters, fit) message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name, "][", k, "] DEG extraction done.")) } # End for each Limma analysis. rm(k, vsn, exp_eset) } # End of each VSN configuration. rm(j, dataset, dataset_name, pheno_data) } # End for each dataset. rm(i, config, raw_data_dir, output_data_dir, input_data_dir, nb_samples) # We log the session details for reproducibility. sessionInfo()