preprocess_data.R 4.76 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
#' @title Preprocess an expression dataset.
#'
#' @description This function preprocesses a dataset and saves the
#' results in the given TSV file(s). In addition, it returns the ESET object(s).
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#' It currently supports Affymetrix, Illumina and Agilent arrays. This function is
#' just a handler over the platform dedicated functions: it selects the dedicated
#' function from \code{platform} (and \code{method} for Affymetrix arrays) and
#' forwards all the other arguments to it unchanged.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param platform A string representing the array platform among Affymetrix, Illumina and Agilent.
#' Default to Affymetrix.
#' @param method A string representing the preprocessing method to use (for Affymetrix arrays
#' where multiple methods are supported, i.e., SCAN and GCRMA). Default to SCAN for Affymetrix
#' data. It is ignored for the other platforms.
#' @param compressed A boolean representing whether the cel files are compressed. This
#'  is FALSE by default.
#' @param batch_correction A String indicating whether batch correction should
#'  be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#'  default to 'Batch.tsv'.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#'  the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#'  is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
#' NULL is returned (and a warning message is logged) when the platform, or the method for
#' Affymetrix arrays, is not supported, in which case no preprocessing is done.
preprocess_data <- function(input_data_dir, output_data_files,
                            platform         = "Affymetrix",
                            method           = "SCAN",
                            compressed       = FALSE,
                            batch_correction = "FALSE",
                            batch_filename   = "Batch.tsv",
                            clean_samples    = FALSE,
                            verbose          = FALSE) {

  # We select the dedicated function depending on the array platform and desired
  # method. It stays NULL when the platform / method combination is not supported.
  preprocess_fun <- NULL
  if (platform == "Affymetrix") {
    # Affymetrix is the only platform for which the method is taken into account.
    if (method == "SCAN") {
      preprocess_fun <- preprocess_data_affymetrix_scan
    } else if (method == "GCRMA") {
      preprocess_fun <- preprocess_data_affymetrix_gcrma
    } else {
      # We log unsupported methods the same way we log unsupported platforms,
      # instead of silently returning NULL.
      message(paste0("[", Sys.time(), "] [WARNING] Method ", method,
                     " not yet supported for platform ", platform,
                     " (no preprocessing done)."))
    }
  } else if (platform == "Agilent") {
    preprocess_fun <- preprocess_data_agilent_limma
  } else if (platform == "Illumina") {
    preprocess_fun <- preprocess_data_illumina_beadarray
  } else {
    message(paste0("[", Sys.time(), "] [WARNING] Platform ", platform,
                   " not yet supported (no preprocessing done)."))
  }

  # Nothing to run: the caller gets NULL, as no ESET was created.
  if (is.null(preprocess_fun)) {
    return(NULL)
  }

  # All the dedicated functions share the same signature, so a single call is
  # enough. We return the created ESET(s).
  preprocess_fun(input_data_dir,
                 output_data_files,
                 compressed       = compressed,
                 batch_correction = batch_correction,
                 batch_filename   = batch_filename,
                 clean_samples    = clean_samples,
                 verbose          = verbose)
}