run_limma.R 9.92 KB
Newer Older
1
2
3
4
5
6
7
8
#' @title Run a limma analysis.
#'
#' @description This function prepares and runs a limma analysis up until the fitting procedure.
#' The Limma analysis can be simple (A vs B) or more complex ((A vs B) vs (C vs D)). One single
#' cofactor can also be taken into account in the limma. It returns the final fit that can then
#' be used to detect differentially expressed genes.
#'
#' @param pheno_data The data-frame that contains the phenotypic data (aka clinical data).
9
10
#' @param eset The ExpressionSet object that contains the pre-processed data. Alternatively,
#' this can be an EList object (limma package), especially for RNA-seq data.
11
12
13
14
#' @param limma_parameters A list of useful Limma parameters that include (1) the main clinical
#' factor used to create sample categories (i.e., Disease status) and (2) the Limma coefficients
#' provided as formulas (similar to the limma::makeContrasts function). Additional parameters are
#' ignored for the time being.
15
#' @param correct_for_age A boolean indicating whether we should correct for age in the limma
16
#' design matrix. This expects that a clinical factor termed Age exists. Default to FALSE.
17
#' @param correct_for_batch A boolean indicating whether we should correct for a potential batch
18
19
#' effect in the limma design matrix. This expects that the actual batch information exists in
#' the object pheno_data (as field Batch). Default to FALSE.
20
21
22
#' @param correct_for_gender A boolean indicating whether we should correct for sample gender
#' in the limma design matrix. This expects that a clinical factor termed Gender exists. Default
#' to FALSE.
23
#' @param cofactor_name A string indicating a potential cofactor (included in the phenotypic data).
24
25
#' The default is NULL, and no cofactor is considered. This will replace the age correction if
#' both were deemed necessary.
26
27
28
29
30
31
32
33
34
35
36
37
#' @param block_key A string indicating which column in the phenotyypic data should be used to
#' build blocks if there is a cofactor to take into account. For instance, if there are several
#' replicates per patient, the block_key might be "Patient" and the cofactor might be "Replicate".
#' This will replace the age correction if both were deemed necessary.
#' @param run_topconfects A boolean indicating whether the analysis should be complemented with a
#' TopConfects analysis that bases the decision on confident fold changes and effect size. Default
#' to FALSE.
#' @param output_data_dir A string representing the folder in which the output files
#' are going to be stored (for TopConfects only). Default to "".
#' @param file_prefix A string used to prefix the file names (for TopConfects only). Default to "".
#' @param file_suffix A string used as a suffix for the file names (for TopConfects only). Default
#' to "".
38
39
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
40
41
#' @return The limma fit.
run_limma <- function(pheno_data, eset, limma_parameters,
42
43
44
45
46
47
48
49
50
51
                      correct_for_age    = FALSE,
                      correct_for_batch  = FALSE,
                      correct_for_gender = FALSE,
                      cofactor_name      = NULL,
                      block_key          = NULL,
                      run_topconfects    = FALSE,
                      output_data_dir    = "",
                      file_prefix        = "",
                      file_suffix        = "",
                      verbose            = FALSE) {
52
53

  # We extract the limma analysis parameters
54
55
56
  coefficients    <- limma_parameters[[2]]
  has_block       <- !is.null(cofactor_name)

57
58
59
60
61
62
63
64
  # We refine the prefix / suffic for the file names.
  if (file_prefix != "") {
    file_prefix <- paste0(file_prefix, "_")
  }
  if (file_suffix != "") {
    file_suffix <- paste0("_", file_suffix)
  }

65
66
67
68
69
  # We create the design matrix (regular case - no block).
  # Depending on the co-factors, we have several possibilities.
  # We sill decide based on (i) a block based on a co-factor (paired samples),
  # (ii) the age clincal descriptor and (iii) the experimental batches.
  # If we need to add a block based on a co-factor.
70
71
72
73
74
75
76
  design <- create_design(pheno_data         = pheno_data,
                          limma_parameters   = limma_parameters,
                          correct_for_age    = correct_for_age,
                          correct_for_batch  = correct_for_batch,
                          correct_for_gender = correct_for_gender,
                          cofactor_name      = cofactor_name,
                          verbose            = verbose)
77

78
  # We build the contrast (only one so far).
79
  contrast <- eval(substitute(limma::makeContrasts(coeff1 = c1, levels = design),
80
                              list(c1 = coefficients)))
81
  if (run_topconfects) {
82
    design_topconfects <- limma::contrastAsCoef(design, contrast)$design
83
  }
84
85

  # We log information.
86
  rm(coefficients)
87
88
89
90
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Contrast matrix created."))
  }

91
  # We fit the data using the above models and the contrasts
92
  # to prepare the differential analyses.
93
94
  lmfit             <- NULL
  lmfit_topconfects <- NULL
95

96
97
  # We create a block if necessary.
  if (has_block) {
98
99
    corfit <- limma::duplicateCorrelation(eset, design, block = pheno_data[[block_key]])
    lmfit  <- limma::lmFit(eset, design, block = pheno_data[[block_key]],
100
                                 correlation   = corfit$consensus)
101
102
103
104
105
    if (run_topconfects) {
      lmfit_topconfects <- limma::lmFit(eset, design_topconfects,
                                        block       = pheno_data[[block_key]],
                                        correlation = corfit$consensus)
    }
106
107

    # We log information.
108
    rm(corfit)
109
110
111
112
113
    if (verbose == TRUE) {
      message(paste0("[", Sys.time(), "] Linear fit performed (with blocks)."))
    }
  } else {
    lmfit <- limma::lmFit(eset, design)
114
115
116
    if (run_topconfects) {
      lmfit_topconfects <- limma::lmFit(eset, design_topconfects)
    }
117
118
119
120
121

    # We log information.
    if (verbose == TRUE) {
      message(paste0("[", Sys.time(), "] Linear fit performed."))
    }
122
123
  }
  fit <- limma::contrasts.fit(lmfit, contrast)
124
  rm(has_block, design, contrast, lmfit)
125
  fit <- limma::eBayes(fit)
126

127
128
  # We now finish the TopConfects analysis if necesary.
  if (run_topconfects) {
129

130
131
132
133
134
    # We first run TopConfects on the dedicated lmfit object.
    confects <- topconfects::limma_confects(lmfit_topconfects, coef = 1, fdr = 0.05)

    # We need to check whether we have any confect at all (if no effect is > 0 at FDR = 0.05,
    # then all confects are set to NA values).
135
    confects_genes <- confects$table %>% filter(!is.na(confect)) #nolint
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
    if (nrow(confects_genes) > 0) {

      # Log info.
      if (verbose == TRUE) {
        message(paste0("[", Sys.time(), "] [TopConfects] ", nrow(confects_genes),
                       " genes identified with TopConfects."))
      }

      # We save the list of genes
      confects_table_fn <- paste0(output_data_dir, file_prefix, "topconfects_",
                                  limma_parameters$name, file_suffix, ".tsv")
      utils::write.table(confects_genes,
                         file      = confects_table_fn,
                         sep       = "\t",
                         quote     = FALSE,
                         col.names = NA)

      # First plot, the topconfects plot.
      tcplot_fn <- paste0(output_data_dir, file_prefix, "topconfects_",
                          limma_parameters$name, file_suffix, ".png")
      grDevices::png(tcplot_fn)
157
158
      local_n   <- min(50, nrow(confects_genes))
      p         <- topconfects::confects_plot(confects, n = local_n)
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
      print(p)
      grDevices::dev.off()
      rm(p, tcplot_fn)

      # Second plot, the me plot (mean - effect).
      meplot_fn <- paste0(output_data_dir, file_prefix, "topconfects_meplot_",
                          limma_parameters$name, file_suffix, ".png")
      grDevices::png(meplot_fn)
      p <- topconfects::confects_plot_me(confects)
      print(p)
      grDevices::dev.off()
      rm(p, meplot_fn)

      # Third plot, the comparison between topconfects and pure limma with
      # a rank rank plot.
      limma_top <- limma::topTable(fit, coef = 1, p.value = 0.05, n = Inf)
      if (nrow(limma_top) > 0) {
176

177
178
179
180
181
182
183
184
185
        # Log info.
        if (verbose == TRUE) {
          message(paste0("[", Sys.time(), "] [TopConfects] ", nrow(limma_top),
                         " genes identified with Limma."))
        }

        rrplot_fn <- paste0(output_data_dir, file_prefix, "topconfects_rrplot_limma_",
                            limma_parameters$name, file_suffix, ".png")
        grDevices::png(rrplot_fn)
186
187
188
189
190
        p <- topconfects::rank_rank_plot(confects_genes$name,
                                         rownames(limma_top),
                                         "TopConfects",
                                         "Limma",
                                         n = local_n)
191
192
193
194
195
196
197
198
199
        print(p)
        grDevices::dev.off()
        rm(p, rrplot_fn)

        # Fourth plot, the MD plot with both TopConfect results and Limma results highlighted.
        mdplot_fn <- paste0(output_data_dir, file_prefix, "topconfects_mdplot_limma_",
                            limma_parameters$name, file_suffix, ".png")
        grDevices::png(mdplot_fn)
        p <- limma::plotMD(fit, legend = "bottomleft", status = paste0(
200
201
          ifelse(rownames(fit) %in% rownames(limma_top), "Limma ", ""),
          ifelse(rownames(fit) %in% confects_genes$name, "TopConfects ", "")))
202
203
204
205
        print(p)
        grDevices::dev.off()
        rm(p, mdplot_fn)
      } else {
206

207
208
209
210
211
212
        # Log info.
        if (verbose == TRUE) {
          message(paste0("[", Sys.time(), "] [TopConfects] no gene identified with Limma."))
        }
      } # End of the plotting (if there are Limma genes).
    } else {
213

214
215
216
217
218
219
220
      # Log info.
      if (verbose == TRUE) {
        message(paste0("[", Sys.time(), "] [TopConfects] no gene identified with TopConfects."))
      }
    } # End of the plotting (if there are TopConfect genes).
  } # End of the TopConfects section.

221
  # We return the fit.
222
223
  return(fit)
}