# preprocess_data_affymetrix_scan.R
#' @title Preprocess an Affymetrix dataset with SCAN.
#'
#' @description This function preprocesses an Affymetrix dataset using SCAN and
#' saves the results in the given TSV file(s). In addition, it returns the ESET
#' object(s).
#'
#' The function assumes that a folder containing the raw data exists (as cel
#' files), namely the `RAW/` sub-folder of `input_data_dir`.
#'
#' The steps are, in order: SCAN normalisation, removal of the probes that have
#' zero variance across the samples, optional batch-effect correction and
#' optional removal of the samples without clinical data. SCAN first writes its
#' own output to `output_data_files[1]`; once all steps are done, that file is
#' overwritten with the final expression matrix so that the file on disk
#' matches the returned object.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the
#'  input data. `"RAW/"` is appended verbatim, so the value is expected to end
#'  with a path separator.
#' @param output_data_files An array of strings representing the files that
#'  should contain the preprocessed data. At least one value, two if
#'  batch_correction is "BOTH" (the second file receives the batch-corrected
#'  data).
#' @param compressed A boolean representing whether the cel files are
#'  compressed. This is FALSE by default. NOTE(review): this argument is not
#'  referenced in the function body.
#' @param batch_correction A String indicating whether batch correction should
#'  be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be
#'  found, default to 'Batch.tsv'. NOTE(review): this argument is not referenced
#'  in the function body and is not forwarded to `correct_batch_effect()`;
#'  confirm against that function's signature.
#' @param clean_samples A boolean indicating whether the dataset should be
#'  cleaned by removing the samples that do not have clinical data. Default to
#'  FALSE.
#' @param verbose A boolean representing whether the function should display
#'  log information. This is FALSE by default.
#' @return The expression data as ESET objects. If batch_correction is "BOTH",
#'  a list with the batch-corrected ESET first and the uncorrected ESET second;
#'  otherwise a single ESET object (therefore unlisted).
preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
                                            compressed       = FALSE,
                                            batch_correction = "FALSE",
                                            batch_filename   = "Batch.tsv",
                                            clean_samples    = FALSE,
                                            verbose          = FALSE) {

  # We validate the arguments up front, so that a bad value fails before the
  # (expensive) SCAN run rather than after it.
  batch_correction <- as.character(batch_correction)
  if (length(batch_correction) != 1L ||
      !(batch_correction %in% c("TRUE", "FALSE", "BOTH"))) {
    stop("'batch_correction' must be one of \"TRUE\", \"FALSE\" or \"BOTH\".",
         call. = FALSE)
  }
  required_outputs <- if (batch_correction == "BOTH") 2L else 1L
  if (length(output_data_files) < required_outputs) {
    stop("'output_data_files' must contain at least ", required_outputs,
         " file name(s) when batch_correction is \"", batch_correction, "\".",
         call. = FALSE)
  }

  # Helper that prints a time-stamped log line when verbose is enabled.
  is_verbose <- isTRUE(as.logical(verbose))
  log_step <- function(text) {
    if (is_verbose) {
      message(paste0("[", Sys.time(), "] ", text))
    }
  }

  # We define the I/Os.
  raw_data_input_dir <- paste0(input_data_dir, "RAW/")

  # We run the SCAN pre-processing method on the data.
  # We do not run the fast analysis (by default).
  # SCAN also writes its own (unfiltered) output to the first output file.
  input_data_regexp <- paste0(raw_data_input_dir, "*")
  eset <- SCAN.UPC::SCAN(input_data_regexp, outFilePath = output_data_files[1])
  log_step("Raw data processed.")

  # We remove the probes that have 0 variance across the samples.
  # The mask is NA-safe (a probe with an undefined variance is kept) and
  # drop = FALSE keeps a matrix even when one probe or one sample remains.
  exp_data   <- Biobase::exprs(eset)
  probe_vars <- apply(exp_data, 1, stats::var)
  zero_var   <- !is.na(probe_vars) & probe_vars == 0
  if (any(zero_var)) {
    # The ESET is rebuilt from the filtered expression matrix.
    eset <- Biobase::ExpressionSet(exp_data[!zero_var, , drop = FALSE])
  }

  # We clean up and log information.
  rm(exp_data, probe_vars, zero_var)
  log_step("Data cleaned (step I).")

  # We correct for the batch effect if necessary.
  eset_bc <- NULL
  if (batch_correction != "FALSE") {
    eset_bc <- correct_batch_effect(eset           = eset,
                                    input_data_dir = input_data_dir,
                                    verbose        = verbose)
    log_step("Batch effect corrected.")

    # With "TRUE", only the corrected data is kept; with "BOTH", both the
    # corrected (eset_bc) and uncorrected (eset) data are kept.
    if (batch_correction == "TRUE") {
      eset    <- eset_bc
      eset_bc <- NULL
    }
  }

  # If necessary, we remove the samples that do not have clinical data.
  if (clean_samples) {
    # We load the clinical data as to get the samples to keep.
    clinical_data <- ArrayUtils::load_clinical_data(input_data_dir,
                                                    verbose = verbose)
    samples <- rownames(Biobase::pData(clinical_data))

    # We only keep the samples with clinical data.
    eset <- eset[, samples]
    if (batch_correction == "BOTH") {
      eset_bc <- eset_bc[, samples]
    }

    # We clean up and log information.
    rm(clinical_data, samples)
    log_step("Data cleaned (step II).")
  }

  # We save the final data as TSV file(s). The file written by SCAN predates
  # the probe filtering, the batch correction and the sample cleaning, so it is
  # overwritten here to match the returned object.
  utils::write.table(Biobase::exprs(eset),
                     file  = output_data_files[1],
                     sep   = "\t",
                     quote = FALSE)
  if (batch_correction == "BOTH") {
    utils::write.table(Biobase::exprs(eset_bc),
                       file  = output_data_files[2],
                       sep   = "\t",
                       quote = FALSE)
  }

  # We log information.
  log_step("Processed data written to files.")

  # We return the created ESET(s).
  if (batch_correction == "BOTH") {
    list(eset_bc, eset)
  } else {
    eset
  }
}