# preprocess_data_illumina_beadarray.R
#' @title Preprocess an Illumina dataset with beadarray.
#'
#' @description This function preprocesses an Illumina dataset using beadarray and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#'
#' Note: the function does not check for the existence of folders or files. The batch effect
#' correction is not yet supported.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param compressed A boolean representing whether the cel files are compressed. This
#'  is FALSE by default. Note: only for compatibility reasons (has no effect on analysis).
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#'  default to 'Batch.tsv'.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#'  the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#'  is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_files,
                                               compressed       = FALSE,
                                               batch_correction = "FALSE",
                                               batch_filename   = "Batch.tsv",
                                               clean_samples    = FALSE,
                                               verbose          = FALSE) {

  # NOTE(review): `compressed` and `batch_filename` are accepted for interface
  # compatibility only - neither is used in this function body.

  # We define the I/Os.
  raw_data_input_dir <- paste0(input_data_dir, "RAW/")

  # Read raw data from GEO (matrix file). We take the first file of the RAW
  # folder; existence is not checked (see the function description).
  matrix_filename <- list.files(raw_data_input_dir, full.names = TRUE)[1]
  gse_eset <- GEOquery::getGEO(filename = matrix_filename)
  gse_data <- methods::as(gse_eset, "ExpressionSetIllumina")

  # We clean up and log information.
  rm(raw_data_input_dir, matrix_filename, gse_eset)
  if (verbose) {
    message(paste0("[", Sys.time(), "] Raw data read."))
  }

  # We do a different normalization depending on the array, that is whether we
  # have control probes (v3) or not (v2). The decision is based on the
  # PROBEQUALITY feature-data field, which is absent on v2 arrays.
  gse_data_filt <- NULL
  if (is.null(Biobase::fData(gse_data)$PROBEQUALITY)) {

    # We have a v2 dataset, simple quantile normalization.
    gse_data_norm <- beadarray::normaliseIllumina(gse_data, method = "quantile")

    # We log the data (using an offset for small to negative values, so that
    # every value fed to log2 is >= 1.11 and the result stays positive).
    offset    <- 0
    min_value <- min(Biobase::exprs(gse_data_norm))
    if (min_value < 1) {
      offset <- 1.11 - min_value
    }
    gse_data_filt <- log2(offset + Biobase::exprs(gse_data_norm))

    # We clean up and log information.
    rm(gse_data, gse_data_norm, offset, min_value)
    if (verbose) {
      message(paste0("[", Sys.time(), "] Raw data processed."))
    }
  } else {

    # A bit of cleaning (specific to Illumina arrays): tag the probes that do
    # not match anything as negative controls so detection can use them.
    probe_status <- ifelse(Biobase::fData(gse_data)$PROBEQUALITY == "No match",
                           "negative",
                           "regular")
    Biobase::fData(gse_data)$Status <- probe_status
    beadarray::Detection(gse_data)  <- beadarray::calculateDetection(gse_data,
                                                                     status = probe_status)

    # We run the beadarray pre-processing method on the data.
    # Background correction and normalization at once ("neqc").
    gse_data_norm <- beadarray::normaliseIllumina(gse_data,
                                                  method = "neqc",
                                                  status = probe_status)

    # Additional cleaning (after normalization - also Illumina specific):
    # drop probes annotated as "No match" / "Bad" or without annotation.
    # NOTE(review): the v3 annotation package is hard-coded here
    # (illuminaHumanv3PROBEQUALITY) and must be attached by the caller.
    ids  <- as.character(Biobase::featureNames(gse_data_norm))
    qual <- unlist(mget(ids, get("illuminaHumanv3PROBEQUALITY"), ifnotfound = NA))
    rem  <- qual == "No match" | qual == "Bad" | is.na(qual)
    gse_data_filt <- Biobase::exprs(gse_data_norm[!rem, ])

    # We clean up and log information.
    rm(gse_data, gse_data_norm, probe_status, ids, qual, rem)
    if (verbose) {
      message(paste0("[", Sys.time(), "] Raw data processed."))
    }
  }

  # We remove the probes that have 0 variance across the samples (they carry
  # no information and break downstream scaling).
  probe_vars  <- apply(gse_data_filt, 1, var)
  probe_var_0 <- names(probe_vars[probe_vars == 0])
  if (length(probe_var_0) > 0) {
    clean_probe_list <- setdiff(rownames(gse_data_filt), probe_var_0)
    gse_data_filt <- gse_data_filt[clean_probe_list, ]
    rm(clean_probe_list)
  }

  # We clean up and log information.
  rm(probe_vars, probe_var_0)
  if (verbose) {
    message(paste0("[", Sys.time(), "] Data cleaned (step I)."))
  }

  # We correct for the batch effect if necessary.
  gse_data_filt_bc <- NULL
  if (batch_correction != "FALSE") {
    gse_data_filt_bc <- correct_batch_effect(eset           = gse_data_filt,
                                             input_data_dir = input_data_dir,
                                             verbose        = verbose,
                                             is_eset        = FALSE)

    # We log some information.
    if (verbose) {
      message(paste0("[", Sys.time(), "] Batch effect corrected."))
    }

    # "TRUE" keeps only the corrected matrix; "BOTH" keeps both versions.
    if (batch_correction == "TRUE") {
      gse_data_filt <- gse_data_filt_bc
      gse_data_filt_bc <- NULL
    }
  }

  # If necessary, we remove the samples that do not have clinical data.
  if (clean_samples) {
    # We load the clinical data as to get the samples to keep.
    samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
                                                                      verbose = FALSE)))

    # We only keep the samples with clinical data.
    gse_data_filt <- gse_data_filt[, samples]
    if (batch_correction == "BOTH") {
      gse_data_filt_bc <- gse_data_filt_bc[, samples]
    }

    # We clean up and log information.
    rm(samples)
    if (verbose) {
      message(paste0("[", Sys.time(), "] Data cleaned (step II)."))
    }
  }

  # We save the eset data as TSV file.
  utils::write.table(gse_data_filt, file = output_data_files[1], sep = "\t", quote = FALSE)
  eset <- Biobase::ExpressionSet(gse_data_filt)
  rm(gse_data_filt)

  # For "BOTH", the corrected matrix goes to the second output file.
  eset_bc <- NULL
  if (batch_correction == "BOTH") {
    utils::write.table(gse_data_filt_bc, file = output_data_files[2], sep = "\t", quote = FALSE)
    eset_bc <- Biobase::ExpressionSet(gse_data_filt_bc)
    rm(gse_data_filt_bc)
  }

  # We log information.
  if (verbose) {
    message(paste0("[", Sys.time(), "] Processed data written to files."))
  }

  # We return the created ESET(s): a list (corrected first) for "BOTH",
  # otherwise the single ESET unlisted.
  if (batch_correction == "BOTH") {
    return(list(eset_bc, eset))
  } else {
    return(eset)
  }
}