preprocess_data_rnaseq.R 5.61 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#' @title Post preprocess a RNA-seq dataset.
#'
#' @description This function preprocess a dataset that was already pre-processed
#' in order to perform the cleaning / filtering / batch corection steps.
#' The results are saves in TSV file(s). In addition, it returns the ESET object(s).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param expression_filename A String indicating where the expression matrix can be found,
#' default to 'Expdata.tsv'.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet suported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param remove_low_counts A boolean indicating whether low counts should be removed prior
#' to the normalization, TRUE by default.
#' @param low_count_threshold A numeric representing the maximum number of counts associated to
#' transcripts considered as not lowly expressed. Default value is 5.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_rnaseq <- function(input_data_dir, output_data_files,
                                   expression_filename = "ExpData.tsv",
                                   batch_correction    = "FALSE",
                                   batch_filename      = "Batch.tsv",
                                   clean_samples       = FALSE,
                                   remove_low_counts   = TRUE,
                                   low_count_threshold = 5,
                                   verbose             = FALSE) {

  # We read the pre-processed data (RNA-seq).
  exp_data_fn <- paste0(input_data_dir, expression_filename)
  exp_data    <- ArrayUtils::read_eset(exp_data_fn, as_eset = FALSE)
  rm(exp_data_fn)

  # The RNA-seq data consists of counts so we need to post-process
  # it to get some kind of microarray compatible data (for instance on a log2 scale).
  # We first need to filter the low expressed transcripts.
  if (remove_low_counts) {
    keep     <- rowSums(exp_data) > low_count_threshold
    exp_data <- exp_data[keep, ]
    rm(keep)
  }

  # We need to normalize the counts.
  exp_data_dge <- edgeR::DGEList(counts = exp_data)
  exp_data_dge <- edgeR::calcNormFactors(exp_data_dge)
  rm(exp_data)

  # CPM then does the normalization and the log-transformation.
  exp_data_norm <- limma::voom(exp_data_dge, normalize.method = "quantile")$E
  rm(exp_data_dge)

  # We log information.
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Raw data read"))
  }

  # We remove the probes that have 0 variance accross the samples.
  probe_vars  <- apply(exp_data_norm, 1, var)
  probe_var_0 <- names(probe_vars[probe_vars == 0])
  eset <- NULL
  if (length(probe_var_0) > 0) {
    clean_probe_list <- setdiff(rownames(exp_data_norm), probe_var_0)
    eset <- Biobase::ExpressionSet(exp_data_norm[clean_probe_list, ])
    rm(clean_probe_list)
  } else {
    eset <- Biobase::ExpressionSet(exp_data_norm)
  }

  # We clean up and log information.
  rm(exp_data_norm, probe_vars, probe_var_0)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Data cleaned (step I)."))
  }

  # We correct for the batch effect if necesary.
  eset_bc <- NULL
  if (batch_correction != "FALSE") {
    eset_bc <- correct_batch_effect(eset           = eset,
                                    input_data_dir = input_data_dir,
                                    verbose        = verbose)

    # We log some information.
    if (verbose == TRUE) {
      message(paste0("[", Sys.time(), "] Batch effect corrected."))
    }

    if (batch_correction == "TRUE") {
      eset <- eset_bc
      rm(eset_bc)
    }
  } else {
    rm(eset_bc)
  }

  # If necessary, we remove the samples that do not have clinical data.
  if (clean_samples) {
    # We load the clinical data as to get the samples to keep.
    samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
                                                                      verbose = verbose)))
    # We only keep the samples with clinical data.
    eset <- eset[, samples]
    if (batch_correction == "BOTH") {
      eset_bc <- eset_bc[, samples]
    }

    # We clean up and log information.
    rm(samples)
    if (verbose == TRUE) {
      message(paste0("[", Sys.time(), "] Data cleaned (step II)."))
    }
  }

  # We save the eset data as TSV file.
  utils::write.table(Biobase::exprs(eset),
                     file  = output_data_files[1],
                     sep   = "\t",
                     quote = FALSE)
  if (batch_correction == "BOTH") {
    utils::write.table(Biobase::exprs(eset_bc),
                       file  = output_data_files[2],
                       sep   = "\t",
                       quote = FALSE)
  }

  # We log information.
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Processed data written to files."))
  }

  # We return the created ESET(s).
  if (batch_correction == "BOTH") {
    return(list(eset, eset_bc))
  } else {
    return(eset)
  }
}