#' @title Post preprocess a RNA-seq dataset. #' #' @description This function preprocess a dataset that was already pre-processed #' in order to perform the cleaning / filtering / batch corection steps. #' The results are saves in TSV file(s). In addition, it returns the ESET object(s). #' #' Note: the function does not check for the existence of folders or files. #' #' @param input_data_dir A string representing the folder that contains the input data. #' @param output_data_files An array of strings representing the files that should contain the #' preprocessed data. At least one value, maximum two if batch_correction is "BOTH". #' @param expression_filename A String indicating where the expression matrix can be found, #' default to 'Expdata.tsv'. #' @param batch_correction A String indicating whether batch correction should #' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE". #' @param batch_filename A string indicating where the batch information can be found, #' default to 'Batch.tsv'. Note: not yet suported. #' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing #' the samples that do not have clinical data. Default to FALSE. #' @param remove_low_counts A boolean indicating whether low counts should be removed prior #' to the normalization, TRUE by default. #' @param low_count_threshold A numeric representing the maximum number of counts associated to #' transcripts considered as not lowly expressed. Default value is 5. #' @param verbose A boolean representing whether the function should display log information. This #' is FALSE by default. #' @return The expression data as ESET objects. Potentially only one object (therefore unlisted). preprocess_data_rnaseq <- function(input_data_dir, output_data_files, expression_filename = "ExpData.tsv", batch_correction = "FALSE", batch_filename = "Batch.tsv", clean_samples = FALSE, remove_low_counts = TRUE, low_count_threshold = 5, verbose = FALSE) { # We read the pre-processed data (RNA-seq). exp_data_fn <- paste0(input_data_dir, expression_filename) exp_data <- ArrayUtils::read_eset(exp_data_fn, as_eset = FALSE) rm(exp_data_fn) # The RNA-seq data consists of counts so we need to post-process # it to get some kind of microarray compatible data (for instance on a log2 scale). # We first need to filter the low expressed transcripts. if (remove_low_counts) { keep <- rowSums(exp_data) > low_count_threshold exp_data <- exp_data[keep, ] rm(keep) } # We need to normalize the counts. exp_data_dge <- edgeR::DGEList(counts = exp_data) exp_data_dge <- edgeR::calcNormFactors(exp_data_dge) rm(exp_data) # CPM then does the normalization and the log-transformation. exp_data_norm <- limma::voom(exp_data_dge, normalize.method = "quantile")$E rm(exp_data_dge) # We log information. if (verbose == TRUE) { message(paste0("[", Sys.time(), "] Raw data read")) } # We remove the probes that have 0 variance accross the samples. probe_vars <- apply(exp_data_norm, 1, var) probe_var_0 <- names(probe_vars[probe_vars == 0]) eset <- NULL if (length(probe_var_0) > 0) { clean_probe_list <- setdiff(rownames(exp_data_norm), probe_var_0) eset <- Biobase::ExpressionSet(exp_data_norm[clean_probe_list, ]) rm(clean_probe_list) } else { eset <- Biobase::ExpressionSet(exp_data_norm) } # We clean up and log information. rm(exp_data_norm, probe_vars, probe_var_0) if (verbose == TRUE) { message(paste0("[", Sys.time(), "] Data cleaned (step I).")) } # We correct for the batch effect if necesary. eset_bc <- NULL if (batch_correction != "FALSE") { eset_bc <- correct_batch_effect(eset = eset, input_data_dir = input_data_dir, verbose = verbose) # We log some information. if (verbose == TRUE) { message(paste0("[", Sys.time(), "] Batch effect corrected.")) } if (batch_correction == "TRUE") { eset <- eset_bc rm(eset_bc) } } else { rm(eset_bc) } # If necessary, we remove the samples that do not have clinical data. if (clean_samples) { # We load the clinical data as to get the samples to keep. samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir, verbose = verbose))) # We only keep the samples with clinical data. eset <- eset[, samples] if (batch_correction == "BOTH") { eset_bc <- eset_bc[, samples] } # We clean up and log information. rm(samples) if (verbose == TRUE) { message(paste0("[", Sys.time(), "] Data cleaned (step II).")) } } # We save the eset data as TSV file. utils::write.table(Biobase::exprs(eset), file = output_data_files[1], sep = "\t", quote = FALSE) if (batch_correction == "BOTH") { utils::write.table(Biobase::exprs(eset_bc), file = output_data_files[2], sep = "\t", quote = FALSE) } # We log information. if (verbose == TRUE) { message(paste0("[", Sys.time(), "] Processed data written to files.")) } # We return the created ESET(s). if (batch_correction == "BOTH") { return(list(eset, eset_bc)) } else { return(eset) } }