preprocess_data_agilent_limma.R 3.59 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#' @title Preprocess an Agilent dataset with limma.
#'
#' @description This function preprocess an Agilent dataset using limma and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#'
#' Note: the function does not check for the existence of folders or files. The bacth effect
#' correction is not yet supported.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#'  preprocessed data.
#' @param compressed A boolean representing whether the cel files are compressed. This
#'  is FALSE by default. Note: only for compatibility reasons (has no effect on analysis).
#' @param batch_correction A boolean indicating whether batch correction should
#'  be performed, default to FALSE. Note: not yet suported.
#' @param batch_filename A string indicating where the batch information can be found,
#'  default to 'Batch.tsv'. Note: not yet suported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#'  the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#'  is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data_agilent_limma <- function(input_data_dir, output_data_file,
                                          compressed       = FALSE,
                                          batch_correction = FALSE,
                                          batch_filename   = "Batch.tsv",
                                          clean_samples    = FALSE,
                                          verbose          = TRUE) {

  # We define the I/Os.
  raw_data_input_dir <- paste0(input_data_dir, "RAW/")

  # We read the data (aka agilent MA images).
  raw_file_list <- list.files(path = raw_data_input_dir)
  batch <- limma::read.maimages(files      = raw_file_list,
                                source     = "agilent",
                                path       = raw_data_input_dir,
                                green.only = TRUE,
                                verbose    = TRUE)
  batch_data = log2(batch$E)
  remove(raw_data_input_dir, batch)

  # We run the LIMMA pre-processing method on the data.
  # First, background correcttion.
  batch_data_bg <- limma::backgroundCorrect(batch_data,
                                            method         = "normexp",
                                            normexp.method = "mle",
                                            verbose        = TRUE)

  # Second, perform quantile normalization.
  batch_data_norm <- limma::normalizeBetweenArrays(batch_data_bg, method = "quantile")

  # If necessary, we remove the samples that do not have clinical data.
  if (clean_samples) {
    # We load the clinical data as to get the samples to keep.
    samples <- rownames(pData(ArrayUtils::load_clinical_data(input_data_dir, verbose = FALSE)))
    # We only keep the samples with clinical data.
    batch_data_norm <- batch_data_norm[, samples]
  }

  # We save the eset data as TSV file.
  eset <- methods::new("ExpressionSet", exprs = as.matrix(batch_data_norm))
  rm(batch_data_bg)
  utils::write.table(batch_data_norm, file = output_data_file, sep = "\t", quote = FALSE)
  rm(batch_data_norm)

  # We clean up and log information.
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Expression data pre-processed with LIMMA."))
  }

  # We return the created ESET.
  return(eset)
}