preprocess_data.R 5.56 KB
Newer Older
1
2
3
4
5
#' @title Preprocess an expression dataset.
#'
#' @description This function preprocesses a dataset and saves the
#' results in the given TSV file(s). In addition, it returns the ESET object(s).
#'
#' The function assumes that a folder containing the raw data exists (with cel files).
#' It currently supports Affymetrix, Illumina and Agilent arrays. This function is
#' just a handler over the platform dedicated functions: it forwards its arguments
#' to the function matching the requested platform (and method, for Affymetrix).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param platform A string representing the array platform among Affymetrix, Illumina and Agilent.
#' Default to Affymetrix.
#' @param method A string representing the preprocessing method to use (for Affymetrix arrays
#' where multiple methods are supported, i.e., "SCAN", "GCRMA" or "APT-GCRMA"). Default to SCAN
#' for Affymetrix data. It is ignored for the other platforms.
#' @param exprs_raw A matrix corresponding to the pre-processed data (in the case of APT-GCRMA).
#' Default to NULL since in most cases, we do not have pre-processed data already. It is only
#' forwarded when method is "APT-GCRMA".
#' @param compressed A boolean representing whether the cel files are compressed. This
#'  is FALSE by default. It is not forwarded when method is "APT-GCRMA".
#' @param batch_correction A String indicating whether batch correction should
#'  be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#'  default to 'Batch.tsv'.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#'  the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#'  is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
#' NULL is returned (and a warning message is logged) when the platform, or the method for
#' Affymetrix arrays, is not supported.
preprocess_data <- function(input_data_dir, output_data_files,
                            platform         = "Affymetrix",
                            method           = "SCAN",
                            exprs_raw        = NULL,
                            compressed       = FALSE,
                            batch_correction = "FALSE",
                            batch_filename   = "Batch.tsv",
                            clean_samples    = FALSE,
                            verbose          = FALSE) {

  # We launch the correct function depending on the array platform and desired method.
  # The result stays NULL when no preprocessing function matches.
  esets <- NULL
  if (platform == "Affymetrix") {
    if (method == "SCAN") {
      esets <- preprocess_data_affymetrix_scan(input_data_dir,
                                               output_data_files,
                                               compressed       = compressed,
                                               batch_correction = batch_correction,
                                               batch_filename   = batch_filename,
                                               clean_samples    = clean_samples,
                                               verbose          = verbose)
    } else if (method == "GCRMA") {
      esets <- preprocess_data_affymetrix_gcrma(input_data_dir,
                                                output_data_files,
                                                compressed       = compressed,
                                                batch_correction = batch_correction,
                                                batch_filename   = batch_filename,
                                                clean_samples    = clean_samples,
                                                verbose          = verbose)
    } else if (method == "APT-GCRMA") {
      # This method starts from already pre-processed data (exprs_raw), which is
      # why it is the only call that receives it (and that does not receive the
      # compressed flag).
      esets <- preprocess_data_affymetrix_aptgcrma(exprs_raw,
                                                   input_data_dir,
                                                   output_data_files,
                                                   batch_correction = batch_correction,
                                                   batch_filename   = batch_filename,
                                                   clean_samples    = clean_samples,
                                                   verbose          = verbose)
    } else {
      # We do not fail silently when the method is unknown (same behaviour as
      # for an unknown platform).
      message(paste0("[", Sys.time(), "] [WARNING] Method ", method,
                     " not yet supported for platform ", platform,
                     " (no preprocessing done)."))
    }
  } else if (platform == "Agilent") {
    esets <- preprocess_data_agilent_limma(input_data_dir,
                                           output_data_files,
                                           compressed       = compressed,
                                           batch_correction = batch_correction,
                                           batch_filename   = batch_filename,
                                           clean_samples    = clean_samples,
                                           verbose          = verbose)
  } else if (platform == "Illumina") {
    esets <- preprocess_data_illumina_beadarray(input_data_dir,
                                                output_data_files,
                                                compressed       = compressed,
                                                batch_correction = batch_correction,
                                                batch_filename   = batch_filename,
                                                clean_samples    = clean_samples,
                                                verbose          = verbose)
  } else {
    message(paste0("[", Sys.time(), "] [WARNING] Platform ", platform,
                   " not yet supported (no preprocessing done)."))
  }

  # We return the created ESET(s).
  return(esets)
}