preprocess_data_rnaseq.R 5.61 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
#' @title Post preprocess a RNA-seq dataset.
#'
#' @description This function preprocess a dataset that was already pre-processed
#' in order to perform the cleaning / filtering / batch corection steps.
#' The results are saves in TSV file(s). In addition, it returns the ESET object(s).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param expression_filename A String indicating where the expression matrix can be found,
#' default to 'Expdata.tsv'.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet suported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param remove_low_counts A boolean indicating whether low counts should be removed prior
#' to the normalization, TRUE by default.
#' @param low_count_threshold A numeric representing the maximum number of counts associated to
#' transcripts considered as not lowly expressed. Default value is 5.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_rnaseq <- function(input_data_dir, output_data_files,
                                   expression_filename = "ExpData.tsv",
                                   batch_correction    = "FALSE",
                                   batch_filename      = "Batch.tsv",
                                   clean_samples       = FALSE,
                                   remove_low_counts   = TRUE,
                                   low_count_threshold = 5,
                                   verbose             = FALSE) {

  # We read the pre-processed data (RNA-seq).
  exp_data_fn <- paste0(input_data_dir, expression_filename)
  exp_data    <- ArrayUtils::read_eset(exp_data_fn, as_eset = FALSE)
  rm(exp_data_fn)

  # The RNA-seq data consists of counts so we need to post-process
  # it to get some kind of microarray compatible data (for instance on a log2 scale).
  # We first need to filter the low expressed transcripts.
  if (remove_low_counts) {
    keep     <- rowSums(exp_data) > low_count_threshold
    exp_data <- exp_data[keep, ]
    rm(keep)
  }

  # We need to normalize the counts.
  exp_data_dge <- edgeR::DGEList(counts = exp_data)
  exp_data_dge <- edgeR::calcNormFactors(exp_data_dge)
  rm(exp_data)

  # CPM then does the normalization and the log-transformation.
  exp_data_norm <- limma::voom(exp_data_dge, normalize.method = "quantile")$E
  rm(exp_data_dge)

  # We log information.
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Raw data read"))
  }

  # We remove the probes that have 0 variance accross the samples.
  probe_vars  <- apply(exp_data_norm, 1, var)
  probe_var_0 <- names(probe_vars[probe_vars == 0])
  eset <- NULL
  if (length(probe_var_0) > 0) {
    clean_probe_list <- setdiff(rownames(exp_data_norm), probe_var_0)
    eset <- Biobase::ExpressionSet(exp_data_norm[clean_probe_list, ])
    rm(clean_probe_list)
  } else {
    eset <- Biobase::ExpressionSet(exp_data_norm)
  }

  # We clean up and log information.
  rm(exp_data_norm, probe_vars, probe_var_0)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Data cleaned (step I)."))
  }

  # We correct for the batch effect if necesary.
  eset_bc <- NULL
  if (batch_correction != "FALSE") {
    eset_bc <- correct_batch_effect(eset           = eset,
                                    input_data_dir = input_data_dir,
                                    verbose        = verbose)

    # We log some information.
    if (verbose == TRUE) {
      message(paste0("[", Sys.time(), "] Batch effect corrected."))
    }

    if (batch_correction == "TRUE") {
      eset <- eset_bc
      rm(eset_bc)
    }
  } else {
    rm(eset_bc)
  }

  # If necessary, we remove the samples that do not have clinical data.
  if (clean_samples) {
    # We load the clinical data as to get the samples to keep.
    samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
                                                                      verbose = verbose)))
    # We only keep the samples with clinical data.
    eset <- eset[, samples]
    if (batch_correction == "BOTH") {
      eset_bc <- eset_bc[, samples]
    }

    # We clean up and log information.
    rm(samples)
    if (verbose == TRUE) {
      message(paste0("[", Sys.time(), "] Data cleaned (step II)."))
    }
  }

  # We save the eset data as TSV file.
  utils::write.table(Biobase::exprs(eset),
                     file  = output_data_files[1],
                     sep   = "\t",
                     quote = FALSE)
  if (batch_correction == "BOTH") {
    utils::write.table(Biobase::exprs(eset_bc),
                       file  = output_data_files[2],
                       sep   = "\t",
                       quote = FALSE)
  }

  # We log information.
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Processed data written to files."))
  }

  # We return the created ESET(s).
  if (batch_correction == "BOTH") {
    return(list(eset, eset_bc))
  } else {
    return(eset)
  }
}