run_quality_control_on_raw_agilent.R 5.76 KB
Newer Older
1
2
3
#' @title Executes a quality control of a given Agilent microarray dataset (raw data).
#'
#' @description This function executes a quality control of the dataset defined by the input parameters.
4
5
#' It creates a heatmap of the samples (with a dendogram), pca plots (first two components),
#' boxplots of the array densities and array images. It does not return any value.
6
7
8
9
10
11
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param compressed  A boolean representing whether the raw data are compressed or not. This is
12
#'  FALSE by default. Note: only present for compatibility reasons, it no effect on the analysis.
13
#' @param phenotype_groups A list of phenotype factor names that can be used to highlight the
14
15
#'  samples in the QC report. This is none by default. Note: only present for compatibility
#'  reasons, has no effect on the analysis.
16
17
18
19
20
#' @param verbose A boolean representing whether the function should display log information. This
#'  is TRUE by default.
#' @return NULL
run_quality_control_on_raw_agilent <- function(input_data_dir,
                                               output_data_dir,
21
                                               compressed       = FALSE,
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
                                               phenotype_groups = vector(),
                                               verbose          = TRUE) {

  # We define the I/Os.
  raw_data_input_dir <- paste0(input_data_dir, "RAW/")

  # We load the clinical data as to annotate the QC.
  pheno_data_raw <- ArrayUtils::load_clinical_data(input_data_dir, verbose = verbose)
  pheno_data     <- Biobase::pData(pheno_data_raw)

  # We load the data (aka agilent MA images).
  raw_file_list <- list.files(path = raw_data_input_dir)
  batch <- limma::read.maimages(files      = raw_file_list,
                                source     = "agilent",
                                path       = raw_data_input_dir,
                                green.only = TRUE,
                                verbose    = TRUE)
  batch$targets <- pheno_data
  batch_log_data = log2(batch$E)

  # We clean up and log information.
  rm(raw_file_list, raw_data_input_dir, pheno_data_raw)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Expression data read."))
  }

  # We do some kind of quality control (ourselves - no external packages).
  dir.create(output_data_dir)
  # A- heatmap.
  # Compute correlations.
  cor_mat <- round(cor(batch_log_data, method = "pearson"), 2)
  # Create the heatmap.
  heatmap_filename <- paste0(output_data_dir, "hm.html")
  heatmaply::heatmaply(cor_mat,
                       file            = heatmap_filename,
                       showticklabels  = c(FALSE, TRUE),
                       row_side_colors = pheno_data[, 1:2])
  dev.off()
  # We clean up.
  rm(cor_mat, heatmap_filename)

  # B- PCA plots.
  # Compute the PCA.
  pca <- prcomp(t(batch_log_data), scale = FALSE)
  # Prepare the colors for the disease status and the gender.
  ds_colors <- rep(heatmaply::RdBu(2)[1], dim(pheno_data)[1])
  gd_colors <- rep(heatmaply::PuOr(2)[1], dim(pheno_data)[1])
  ds_colors[pheno_data$Disease.status == "Control"] <- heatmaply::RdBu(2)[2]
  gd_colors[pheno_data$Gender         == "F"]       <- heatmaply::PuOr(2)[2]
  # Plots the first two PCs.
  pca_ds_filename <- paste0(output_data_dir, "pca_status.png")
  png(pca_ds_filename)
  pca_plot_ds <- ggplot2::autoplot(pca, colour = ds_colors, size = 5)
  print(pca_plot_ds)
  dev.off()
  pca_gd_filename <- paste0(output_data_dir, "pca_gender.png")
  png(pca_gd_filename)
  pca_plot_gd <- ggplot2::autoplot(pca, colour = gd_colors, size=5)
  print(pca_plot_gd)
  dev.off()
  # We clean up.
  rm(pca, ds_colors, gd_colors, pca_ds_filename, pca_gd_filename)

  # C- boxplots (aka densities).
  # We melt the data frame for ggplot2.
  df_melted <- reshape2::melt(t(batch_log_data))
  # We plot the boxplots (one per array).
  boxplot_filename <- paste0(output_data_dir, "bx.png")
  png(boxplot_filename)
  boxplot_arrays <- ggplot2::ggplot(df_melted, ggplot2::aes(Var1, value)) +
    ggplot2::geom_boxplot(lwd = 1.2, outlier.alpha = 0.1, ggplot2::aes(colour = Var1)) +
    ggplot2::theme(legend.position  = "none",
          axis.title.x     = ggplot2::element_blank(),
          panel.background = ggplot2::element_blank(),
          axis.line        = ggplot2::element_line(colour = "black"),
          panel.grid.major = ggplot2::element_line(colour = "lightgrey"),
          axis.text.x      = ggplot2::element_blank())
  print(boxplot_arrays)
  dev.off()
  # We clean up.
  rm(df_melted, boxplot_filename, boxplot_arrays)

  # D- array images.
  # We have to extract the layout of the array from the batch object.
  # Basically to associate the correct value from the array (1..n) to
  # the pixel (i, j).
  rows <- batch$genes$Row
  cols <- batch$genes$Col
  nrows <- max(rows)
  ncols <- max(cols)
  array_data <- rep(NA, nrows * ncols)
  i <- (rows - 1) * ncols + cols
  # We now plot the arrays one by one.
  for (j in seq_len(dim(batch_log_data)[2])) {
    array_data[i]  <- batch_log_data[ ,j]
    array_filename <- paste0(output_data_dir, "array_", j, ".png")
    png(array_filename)
    limma::imageplot(array_data,
              main = rownames(df)[j],
              layout = list(ngrid.r = 1,
                            ngrid.c = 1,
                            nspot.r = nrows,
                            nspot.c = ncols))
    dev.off()
  }
  # We clean up.
  rm(rows, cols, nrows, ncols, array_data, i, j, array_filename)

  # We clean up and log information.
  rm(batch, batch_log_data, pheno_data)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] QC analysis performed (raw data)."))
  }
}