get_DEGs.R 9.8 KB
Newer Older
1
2
3
4
5
6
7
#!/usr/bin/env Rscript

# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("Biobase")
8
9
10
library("hgu133plus2.db")
library("hgu133a.db")
library("u133x3p.db")
11
12
13
library("illuminaHumanv3.db")
library("hgug4112a.db")
library("hgfocus.db")
14
library("ArrayUtils")
15
library("tidyverse")
16
library("RColorBrewer")
17
18
library("huex10sttranscriptcluster.db")
library("hugene10sttranscriptcluster.db")
19
source("../libs/conf/confR.R")
20
source("../libs/utils/utils.R")
21
22
23
24
25
26
message(paste0("[", Sys.time(), "] Libraries loaded."))

# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
27
28
29
30
31
config          <- read_config(config_dirs = c("../Confs/", "./"))
raw_data_dir    <- config$global_raw_data_dir
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir  <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
32
33
34
35

# ================================================================================================
# Main
# ================================================================================================
36
37
38
39
40
41
42
43
44
45
46

# For the integration, we need to know how many samples there are in each dataset
# and for each category (M.PD, F.PD, M.Control, F.Control).
nb_samples_fn <- paste0(input_data_dir, "clinical_categories_summarized.tsv")
nb_samples    <- read.table(nb_samples_fn,
                            header    = TRUE,
                            sep       = "\t",
                            as.is     = TRUE,
                            row.names = 1)
remove(nb_samples_fn)

47
# We do all datasets one by one.
48
for (i in seq_len(length(config$datasets))) {
Leon-Charles Tranchevent's avatar
Leon-Charles Tranchevent committed
49

50
51
52
  # We get the dataset details.
  dataset      <- config$datasets[[i]]
  dataset_name <- dataset$dataset_name
Leon-Charles Tranchevent's avatar
Leon-Charles Tranchevent committed
53

54
55
56
57
58
59
  # We load the clinical data.
  pheno_data_fn <- paste0(dataset_name, "_clinical.tsv")
  pheno_data    <- Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
                                                                 pheno_data_fn,
                                                                 use_factors = FALSE))
  pheno_data   <- pheno_data %>%
60
61
62
    dplyr::mutate(gender_disease_status = paste(pheno_data$Gender,
                                                pheno_data$Disease.status,
                                                sep = "."))
63
64
65
66
67
68
69
  rm(pheno_data_fn)

  # We get the batch data from the batch file if it exists.
  batch_data_file <- paste0(input_data_dir, dataset_name, "_batch.tsv")
  if (file.exists(batch_data_file)) {
    batch_data       <- read.delim(batch_data_file, row.names = 1, stringsAsFactors = FALSE)
    pheno_data       <- data.frame(c(pheno_data, batch_data))
70
    pheno_data$Batch <- paste0("Batch", pheno_data$Batch) #nolint
71
72
73
74
75
76
77
78
79
80
81
82
    rm(batch_data)
  }
  rm(batch_data_file)

  # We repeat the analysis for all VSN usages.
  for (j in seq_len(length(config$variance_methods))) {

    # We get the current VSN configuration.
    vsn <- config$variance_methods[[j]]

    # We get the expression data.
    exp_data_fn <- paste0(input_data_dir, dataset_name, "_normalized_", vsn$name, ".tsv")
83
    exp_eset    <- ArrayUtils::read_eset(exp_data_fn)
84
85
    rm(exp_data_fn)

86
    # Annotate the eset with gene information for better outputs.
87
    # First, we collect raw the gene annotations and we have two options
88
    # either from the Bioconductor package or from the processed GEO plateform annotation file.
89
    platform_config <- get_platform(config, dataset$array_type)
Leon-Charles Tranchevent's avatar
Leon-Charles Tranchevent committed
90

91
92
93
94
95
96
97
98
    # We enrich the probe identifiers with the corresponding gene names using
    # the prepared file (based on BioMart).
    probe_ids            <- rownames(exp_eset)
    gene_annots_filename <- paste0(platform_config$platform_name, "_genenames.tsv")
    gene_annots_raw      <- ArrayUtils::get_gene_annots_from_file(input_data_dir,
                                                                  gene_annots_filename,
                                                                  probe_ids)
    rm(gene_annots_filename, probe_ids, platform_config)
99

100
    # Then, we make sure we deal with duplicates and reorder the df to match the eset.
101
    gene_annots <- gene_annots_raw %>%
102
      dplyr::group_by(PROBEID) %>%
103
      dplyr::summarize_at(.vars = c("SYMBOL"),
104
                          .funs = collapser) %>%
105
      dplyr::ungroup()
106
    gene_annots <- gene_annots[order(match(gene_annots$PROBEID, rownames(exp_eset))), ]
107
    rm(gene_annots_raw)
108
109
110
111
112
113
    message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name,
                   "] Gene annotations ready."))

    # Additional QC: plot the MDS to check again for outliers.
    mds_filename <- paste0(output_data_dir, dataset_name, "_mdsplot_", vsn$name, ".png")
    png(mds_filename)
114
115
116
117
118
119
120
    col_group         <- factor(pheno_data$gender_disease_status)
    levels(col_group) <- RColorBrewer::brewer.pal(nlevels(col_group), "Set1")
    col_group         <- as.character(col_group)
    limma::plotMDS(exp_eset,
                   labels = pheno_data$gender_disease_status,
                   col    = col_group,
                   pch    = 19)
121
    dev.off()
122
    rm(mds_filename, col_group)
123
124
    message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name,
                   "] MDS plot created."))
Leon-Charles Tranchevent's avatar
Leon-Charles Tranchevent committed
125

126
127
    # We update the eset with the gene annotations.
    # We only do it when the probe ids do match (otherwise, we issue a warning).
128
    if (all(gene_annots$PROBEID == rownames(exp_eset))) {
129
130
131
      gene_annots_asdf <- new("AnnotatedDataFrame",
                              data = data.frame(gene_annots[, -1], stringsAsFactors = FALSE)
      )
132
133
      rownames(gene_annots_asdf)     <- gene_annots$PROBEID
      Biobase::featureData(exp_eset) <- gene_annots_asdf
134
      rm(gene_annots_asdf, gene_annots)
135
    } else {
136
137
      message(paste0("[", Sys.time(), "][", vsn$name,
                     "] Incorrect annotations for ", dataset_name, "."))
138
    }
139
140
    message(paste0("[", Sys.time(), "][", dataset_name, "][",
                   vsn$name, "] ExpressionSet updated."))
Leon-Charles Tranchevent's avatar
Leon-Charles Tranchevent committed
141
142

    # Loop over Limma analyses.
143
    for (k in seq_len(length(config$limma_analyses))) {
Leon-Charles Tranchevent's avatar
Leon-Charles Tranchevent committed
144

145
      # We extract the Limma parameters.
146
      limma_parameters   <- config$limma_analyses[[k]]
Leon-Charles Tranchevent's avatar
Leon-Charles Tranchevent committed
147

148
      # We only do the limma analysis when it is possible, that is when we have at least 2 samples
149
      # for each of the category we are comparing. For instance, when there is no female PD
150
      # patients (GSE7307), we can not do any comparison that involves female PD patients.
151
      if (nb_samples[dataset_name, limma_parameters$clinical_factor] < 2) {
152
153
        next;
      }
154

155
156
157
      # We have enough samples, now we need to decide which co-factors we are going to include in
      # the limma analysis.
      # First, we check if we need to consider paired samples, then we use a block_key and
158
      # a cofactor_name. This is only valid for one dataset in our case.
159
160
161
      age_correct <- TRUE
      fit         <- NULL
      if (dataset$has_paired_samples) {
162

163
164
165
166
167
168
169
170
171
172
173
        # By default, we use Tissue / Patient.
        # This might be set as a parameter as well if we have more cases.
        # We also set the age and batch co-factors.
        if (sum(!is.na(pheno_data$Age)) != length(pheno_data$Age)) {
          age_correct <- FALSE
        }
        fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters,
                                     correct_for_age   = age_correct,
                                     correct_for_batch = dataset$has_batches,
                                     cofactor_name     = "Tissue",
                                     block_key         = "Patient",
174
                                     run_topconfects   = FALSE,
175
176
                                     output_data_dir   = output_data_dir,
                                     file_prefix       = dataset_name,
177
                                     file_suffix       = vsn$name,
178
179
                                     verbose           = TRUE)
      } else {
180

181
182
183
        # We do not have paired data. Let's then set the age and batch co-factors.
        if (sum(!is.na(pheno_data$Age)) != length(pheno_data$Age)) {
          age_correct <- FALSE
184
        }
185
186
187
        fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters,
                                     correct_for_age   = age_correct,
                                     correct_for_batch = dataset$has_batches,
188
                                     run_topconfects   = FALSE,
189
190
                                     output_data_dir   = output_data_dir,
                                     file_prefix       = dataset_name,
191
                                     file_suffix       = vsn$name,
192
                                     verbose           = TRUE)
193
      }
194
      rm(age_correct)
195
196
      message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name,
                     "][", k, "] Limma fit done."))
197
198

      # We extract the DEGs for the coefficient of interest (for that limma analysis).
199
      ArrayUtils::extract_degs(fit, limma_parameters$name, 1,
200
                               output_data_dir = output_data_dir,
201
202
203
204
205
                               file_prefix     = dataset_name,
                               file_suffix     = vsn$name)
      rm(limma_parameters, fit)
      message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name,
                     "][", k, "] DEG extraction done."))
206
    } # End for each Limma analysis.
207
208
    rm(k, vsn, exp_eset)
  } # End of each VSN configuration.
209
  rm(j, dataset, dataset_name, pheno_data)
210
} # End for each dataset.
211
rm(i, config, raw_data_dir, output_data_dir, input_data_dir, nb_samples)
212
213
214

# We log the session details for reproducibility.
sessionInfo()