# preprocess_data.R
#' @title Preprocess an expression dataset.
#'
#' @description This function preprocesses a dataset and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists
#' (as CEL files). It currently supports Affymetrix, Illumina and Agilent
#' arrays. This function is just a handler over the platform dedicated
#' functions: it forwards its arguments unchanged to the function matching
#' the requested platform (and method, for Affymetrix).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the
#'  input data.
#' @param output_data_file A string representing the file that should contain
#'  the preprocessed data.
#' @param platform A string representing the array platform among
#'  "Affymetrix", "Illumina" and "Agilent". Defaults to "Affymetrix".
#' @param method A string representing the preprocessing method to use (for
#'  Affymetrix arrays, where "SCAN" and "GCRMA" are supported). Defaults to
#'  "SCAN". It is ignored for the other platforms.
#' @param compressed A boolean representing whether the CEL files are
#'  compressed. This is FALSE by default.
#' @param batch_correction A boolean indicating whether batch correction
#'  should be performed. Defaults to FALSE.
#' @param batch_filename A string indicating where the batch information can
#'  be found. Defaults to 'Batch.tsv'.
#' @param clean_samples A boolean indicating whether the dataset should be
#'  cleaned by removing the samples that do not have clinical data. Defaults
#'  to FALSE.
#' @param verbose A boolean representing whether the function should display
#'  log information. This is TRUE by default.
#' @return The expression data as an ESET object, or NULL (after logging a
#'  message) when the platform or the Affymetrix method is not supported.
preprocess_data <- function(input_data_dir, output_data_file,
                            platform         = "Affymetrix",
                            method           = "SCAN",
                            compressed       = FALSE,
                            batch_correction = FALSE,
                            batch_filename   = "Batch.tsv",
                            clean_samples    = FALSE,
                            verbose          = TRUE) {

  # We launch the correct function depending on the array platform and
  # desired method. The result stays NULL when nothing matches.
  eset <- NULL
  if (platform == "Affymetrix") {
    if (method == "SCAN") {
      eset <- preprocess_data_affymetrix_scan(input_data_dir,
                                              output_data_file,
                                              compressed       = compressed,
                                              batch_correction = batch_correction,
                                              batch_filename   = batch_filename,
                                              clean_samples    = clean_samples,
                                              verbose          = verbose)
    } else if (method == "GCRMA") {
      eset <- preprocess_data_affymetrix_gcrma(input_data_dir,
                                               output_data_file,
                                               compressed       = compressed,
                                               batch_correction = batch_correction,
                                               batch_filename   = batch_filename,
                                               clean_samples    = clean_samples,
                                               verbose          = verbose)
    } else {
      # Mirror the unsupported-platform branch instead of silently
      # returning NULL for an unknown Affymetrix method.
      message(paste0("[", Sys.time(), "] Method ", method,
                     " not yet supported for platform ", platform,
                     " (no preprocessing done)."))
    }
  } else if (platform == "Agilent") {
    eset <- preprocess_data_agilent_limma(input_data_dir,
                                          output_data_file,
                                          compressed       = compressed,
                                          batch_correction = batch_correction,
                                          batch_filename   = batch_filename,
                                          clean_samples    = clean_samples,
                                          verbose          = verbose)
  } else if (platform == "Illumina") {
    eset <- preprocess_data_illumina_beadarray(input_data_dir,
                                               output_data_file,
                                               compressed       = compressed,
                                               batch_correction = batch_correction,
                                               batch_filename   = batch_filename,
                                               clean_samples    = clean_samples,
                                               verbose          = verbose)
  } else {
    message(paste0("[", Sys.time(), "] Platform ", platform,
                   " not yet supported (no preprocessing done)."))
  }

  # We return the created ESET (NULL when no preprocessing was done).
  eset
}