# preprocess_data_illumina_beadarray.R
#' @title Preprocess an Illumina dataset with beadarray.
#'
#' @description This function preprocesses an Illumina dataset using beadarray and saves
#' the results in the given TSV file(s). In addition, it returns the ESET object(s).
#'
#' The function assumes that a "RAW/" subfolder of \code{input_data_dir} exists and
#' contains the raw data as a GEO series matrix file (read with \code{GEOquery::getGEO}).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain
#' the preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param compressed A boolean representing whether the cel files are compressed. This
#'  is FALSE by default. Note: only for compatibility reasons (has no effect on analysis).
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#'  default to 'Batch.tsv'. Note: currently unused by this function; the batch data are
#'  resolved by \code{correct_batch_effect} from \code{input_data_dir} - TODO confirm.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by
#'  removing the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log
#'  information. This is TRUE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore
#'  unlisted).
preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_files,
                                               compressed       = FALSE,
                                               batch_correction = "FALSE",
                                               batch_filename   = "Batch.tsv",
                                               clean_samples    = FALSE,
                                               verbose          = TRUE) {

  # We define the I/Os.
  # Note: input_data_dir is assumed to end with a path separator (plain paste0 here).
  raw_data_input_dir <- paste0(input_data_dir, "RAW/")

  # Read raw data from GEO (matrix file). Only the first file of RAW/ is considered.
  matrix_filename <- list.files(raw_data_input_dir, full.names = TRUE)[1]
  gse_eset <- GEOquery::getGEO(filename = matrix_filename)
  gse_data <- methods::as(gse_eset, "ExpressionSetIllumina")
  rm(raw_data_input_dir, matrix_filename, gse_eset)

  # We do different normalization depending on the array. That is whether we have
  # control probes (v3) or not (v2). The decision is based on the probe quality field.
  gse_data_filt <- NULL
  if (is.null(Biobase::fData(gse_data)$PROBEQUALITY)) {

    # We have a v2 dataset, simple quantile normalization.
    gse_data_norm <- beadarray::normaliseIllumina(gse_data, method = "quantile")

    # We log the data (using an offset for small to negative values so that the
    # minimum value passed to log2 is 1.11, i.e. strictly positive).
    offset    <- 0
    min_value <- min(Biobase::exprs(gse_data_norm))
    if (min_value < 1) {
      offset <- 1.11 - min_value
    }
    gse_data_filt <- log2(offset + Biobase::exprs(gse_data_norm))
    rm(gse_data, gse_data_norm)
  } else {

    # A bit of cleaning (specific to Illumina arrays): tag "No match" probes as
    # negative controls so they can drive detection and neqc normalization.
    probe_status <- ifelse(Biobase::fData(gse_data)$PROBEQUALITY == "No match",
                           "negative",
                           "regular")
    Biobase::fData(gse_data)$Status <- probe_status
    beadarray::Detection(gse_data)  <- beadarray::calculateDetection(gse_data,
                                                                     status = probe_status)

    # We run the beadarray pre-processing method on the data.
    # Background correction and normalization at once (neqc).
    gse_data_norm <- beadarray::normaliseIllumina(gse_data, method = "neqc", status = probe_status)

    # Additional cleaning (after normalization - also Illumina specific): drop probes
    # annotated as "No match" / "Bad" (or without annotation) in illuminaHumanv3.
    ids  <- as.character(Biobase::featureNames(gse_data_norm))
    qual <- unlist(mget(ids, get("illuminaHumanv3PROBEQUALITY"), ifnotfound = NA))
    rem  <- qual == "No match" | qual == "Bad" | is.na(qual)
    gse_data_filt <- Biobase::exprs(gse_data_norm[!rem, ])
    rm(gse_data, gse_data_norm)
  }

  # We remove the probes that have 0 variance across the samples.
  probe_vars  <- apply(gse_data_filt, 1, stats::var)
  probe_var_0 <- names(probe_vars[probe_vars == 0])
  if (length(probe_var_0) > 0) {
    clean_probe_list <- setdiff(rownames(gse_data_filt), probe_var_0)
    gse_data_filt <- gse_data_filt[clean_probe_list, ]
    rm(clean_probe_list)
  }
  rm(probe_vars, probe_var_0)

  # We correct for the batch effect if necessary ("TRUE" replaces the data,
  # "BOTH" keeps corrected and uncorrected versions side by side).
  gse_data_filt_bc <- NULL
  if (batch_correction != "FALSE") {
    gse_data_filt_bc <- correct_batch_effect(eset           = gse_data_filt,
                                             input_data_dir = input_data_dir,
                                             is_eset        = FALSE)
    if (batch_correction == "TRUE") {
      gse_data_filt <- gse_data_filt_bc
      rm(gse_data_filt_bc)
    }
  } else {
    rm(gse_data_filt_bc)
  }

  # If necessary, we remove the samples that do not have clinical data.
  if (clean_samples) {
    # We load the clinical data as to get the samples to keep.
    samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
                                                                      verbose = FALSE)))

    # We only keep the samples with clinical data.
    gse_data_filt <- gse_data_filt[, samples]
    if (batch_correction == "BOTH") {
      gse_data_filt_bc <- gse_data_filt_bc[, samples]
    }
  }

  # We save the eset data as TSV file.
  utils::write.table(gse_data_filt, file = output_data_files[1], sep = "\t", quote = FALSE)
  eset <- Biobase::ExpressionSet(gse_data_filt)
  rm(gse_data_filt)

  # Same for the batch-corrected version when both are requested (second output file).
  eset_bc <- NULL
  if (batch_correction == "BOTH") {
    utils::write.table(gse_data_filt_bc, file = output_data_files[2], sep = "\t", quote = FALSE)
    eset_bc <- Biobase::ExpressionSet(gse_data_filt_bc)
    rm(gse_data_filt_bc)
  } else {
    rm(eset_bc)
  }

  # We clean up and log information.
  if (verbose) {
    message(paste0("[", Sys.time(), "] Expression data pre-processed with beadarray."))
  }

  # We return the created ESET(s).
  if (batch_correction == "BOTH") {
    return(list(eset, eset_bc))
  } else {
    return(eset)
  }
}