#' @title Preprocess an Illumina dataset with beadarray.
#'
#' @description This function preprocesses an Illumina dataset using beadarray and saves the
#' results in the given TSV file(s). In addition, it returns the ESET object(s).
#'
#' The function assumes that a folder containing the raw data exists (as a GEO series
#' matrix file).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param compressed A boolean representing whether the raw data files are compressed. This
#' is FALSE by default. Note: kept only for compatibility reasons (has no effect on the analysis).
#' @param batch_correction A string indicating whether batch correction should be performed.
#' Options are "TRUE", "FALSE" and "BOTH"; default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to "Batch.tsv".
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean indicating whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_illumina_beadarray <- function(input_data_dir,
                                               output_data_files,
                                               compressed = FALSE,
                                               batch_correction = "FALSE",
                                               batch_filename = "Batch.tsv",
                                               clean_samples = FALSE,
                                               verbose = TRUE) {

  # We define the I/Os.
  raw_data_input_dir <- paste0(input_data_dir, "RAW/")

  # We read the raw data from GEO (series matrix file).
  matrix_filename <- list.files(raw_data_input_dir, full.names = TRUE)[1]
  gse_eset <- GEOquery::getGEO(filename = matrix_filename)
  gse_data <- methods::as(gse_eset, "ExpressionSetIllumina")
  rm(raw_data_input_dir, matrix_filename, gse_eset)

  # We apply a different normalization depending on the array, that is whether we have
  # control probes (v3) or not (v2). The decision is based on the probe quality field.
  gse_data_filt <- NULL
  if (is.null(Biobase::fData(gse_data)$PROBEQUALITY)) {

    # We have a v2 dataset, simple quantile normalization.
    gse_data_norm <- beadarray::normaliseIllumina(gse_data, method = "quantile")

    # We log2-transform the data (using an offset for small to negative values).
    offset <- 0
    min_value <- min(Biobase::exprs(gse_data_norm))
    if (min_value < 1) {
      offset <- 1.11 - min_value
    }
    gse_data_filt <- log2(offset + Biobase::exprs(gse_data_norm))
    rm(gse_data, gse_data_norm)
  } else {

    # A bit of cleaning (specific to Illumina arrays).
    probe_status <- ifelse(Biobase::fData(gse_data)$PROBEQUALITY == "No match",
                           "negative", "regular")
    Biobase::fData(gse_data)$Status <- probe_status
    beadarray::Detection(gse_data) <- beadarray::calculateDetection(gse_data,
                                                                    status = probe_status)

    # We run the beadarray pre-processing method on the data.
    # Background correction and normalization at once.
    gse_data_norm <- beadarray::normaliseIllumina(gse_data,
                                                  method = "neqc",
                                                  status = probe_status)

    # Additional cleaning (after normalization - also Illumina specific).
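    # The probe-quality lookup below assumes an Illumina HumanHT-12 v3 platform and
    # requires the illuminaHumanv3.db annotation package; PROBEQUALITY grades each
    # probe (e.g., "Perfect", "Good", "Bad", "No match") so that unreliable probes
    # can be discarded.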
    ids <- as.character(Biobase::featureNames(gse_data_norm))
    qual <- unlist(AnnotationDbi::mget(ids,
                                       illuminaHumanv3.db::illuminaHumanv3PROBEQUALITY,
                                       ifnotfound = NA))
    rem <- qual == "No match" | qual == "Bad" | is.na(qual)
    gse_data_filt <- Biobase::exprs(gse_data_norm[!rem, ])
    rm(gse_data, gse_data_norm)
  }

  # We remove the probes that have zero variance across the samples.
  probe_vars <- apply(gse_data_filt, 1, stats::var)
  probe_var_0 <- names(probe_vars[probe_vars == 0])
  if (length(probe_var_0) > 0) {
    clean_probe_list <- setdiff(rownames(gse_data_filt), probe_var_0)
    gse_data_filt <- gse_data_filt[clean_probe_list, ]
    rm(clean_probe_list)
  }
  rm(probe_vars, probe_var_0)

  # We correct for the batch effect if necessary.
  gse_data_filt_bc <- NULL
  if (batch_correction != "FALSE") {
    gse_data_filt_bc <- correct_batch_effect(eset = gse_data_filt,
                                             input_data_dir = input_data_dir,
                                             is_eset = FALSE)
    if (batch_correction == "TRUE") {
      gse_data_filt <- gse_data_filt_bc
      rm(gse_data_filt_bc)
    }
  } else {
    rm(gse_data_filt_bc)
  }

  # If necessary, we remove the samples that do not have clinical data.
  if (clean_samples) {

    # We load the clinical data to get the samples to keep.
    samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
                                                                      verbose = FALSE)))

    # We only keep the samples with clinical data.
    gse_data_filt <- gse_data_filt[, samples]
    if (batch_correction == "BOTH") {
      gse_data_filt_bc <- gse_data_filt_bc[, samples]
    }
  }

  # We save the eset data as TSV file(s).
  utils::write.table(gse_data_filt, file = output_data_files[1], sep = "\t", quote = FALSE)
  eset <- Biobase::ExpressionSet(gse_data_filt)
  rm(gse_data_filt)
  eset_bc <- NULL
  if (batch_correction == "BOTH") {
    utils::write.table(gse_data_filt_bc, file = output_data_files[2], sep = "\t", quote = FALSE)
    eset_bc <- Biobase::ExpressionSet(gse_data_filt_bc)
    rm(gse_data_filt_bc)
  } else {
    rm(eset_bc)
  }

  # We log information.
  if (verbose) {
    message(paste0("[", Sys.time(), "] Expression data pre-processed with beadarray."))
  }

  # We return the created ESET(s).
  if (batch_correction == "BOTH") {
    return(list(eset, eset_bc))
  } else {
    return(eset)
  }
}
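# A minimal usage sketch, assuming a hypothetical "data/GSExxxxx/" folder whose "RAW/"
# sub-folder holds the GEO series matrix file (all paths below are placeholders):
#
# esets <- preprocess_data_illumina_beadarray(
#   input_data_dir    = "data/GSExxxxx/",
#   output_data_files = c("data/GSExxxxx/expression.tsv",
#                         "data/GSExxxxx/expression_bc.tsv"),
#   batch_correction  = "BOTH",
#   clean_samples     = FALSE
# )
# eset    <- esets[[1]]  # Normalized expression data.
# eset_bc <- esets[[2]]  # Normalized and batch-corrected expression data.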