#' @title Executes a quality control of a given Agilent microarray dataset (raw data). #' #' @description This function executes a quality control of the dataset defined by the input parameters. #' It creates a heatmap of the samples (with a dendogram), pca plots (first two components), #' boxplots of the array densities and array images. It does not return any value. #' #' Note: the function does not check for the existence of folders or files. #' #' @param input_data_dir A string representing the folder that contains the input data. #' @param output_data_dir A string representing the folder that contains the output data. #' @param compressed A boolean representing whether the raw data are compressed or not. This is #' FALSE by default. Note: only present for compatibility reasons, it no effect on the analysis. #' @param phenotype_groups A list of phenotype factor names that can be used to highlight the #' samples in the QC report. This is none by default. Note: only present for compatibility #' reasons, has no effect on the analysis. #' @param verbose A boolean representing whether the function should display log information. This #' is TRUE by default. #' @return NULL run_quality_control_on_raw_agilent <- function(input_data_dir, output_data_dir, compressed = FALSE, phenotype_groups = vector(), verbose = TRUE) { # We define the I/Os. raw_data_input_dir <- paste0(input_data_dir, "RAW/") # We load the clinical data as to annotate the QC. pheno_data_raw <- ArrayUtils::load_clinical_data(input_data_dir, verbose = verbose) pheno_data <- Biobase::pData(pheno_data_raw) # We load the data (aka agilent MA images). raw_file_list <- list.files(path = raw_data_input_dir) batch <- limma::read.maimages(files = raw_file_list, source = "agilent", path = raw_data_input_dir, green.only = TRUE, verbose = TRUE) batch$targets <- pheno_data batch_log_data = log2(batch$E) # We clean up and log information. rm(raw_file_list, raw_data_input_dir, pheno_data_raw) if (verbose == TRUE) { message(paste0("[", Sys.time(), "] Expression data read.")) } # We do some kind of quality control (ourselves - no external packages). dir.create(output_data_dir) # A- heatmap. # Compute correlations. cor_mat <- round(cor(batch_log_data, method = "pearson"), 2) # Create the heatmap. heatmap_filename <- paste0(output_data_dir, "hm.html") heatmaply::heatmaply(cor_mat, file = heatmap_filename, showticklabels = c(FALSE, TRUE), row_side_colors = pheno_data[, 1:2]) dev.off() # We clean up. rm(cor_mat, heatmap_filename) # B- PCA plots. # Compute the PCA. pca <- prcomp(t(batch_log_data), scale = FALSE) # Prepare the colors for the disease status and the gender. ds_colors <- rep(heatmaply::RdBu(2)[1], dim(pheno_data)[1]) gd_colors <- rep(heatmaply::PuOr(2)[1], dim(pheno_data)[1]) ds_colors[pheno_data$Disease.status == "Control"] <- heatmaply::RdBu(2)[2] gd_colors[pheno_data$Gender == "F"] <- heatmaply::PuOr(2)[2] # Plots the first two PCs. pca_ds_filename <- paste0(output_data_dir, "pca_status.png") png(pca_ds_filename) pca_plot_ds <- ggplot2::autoplot(pca, colour = ds_colors, size = 5) print(pca_plot_ds) dev.off() pca_gd_filename <- paste0(output_data_dir, "pca_gender.png") png(pca_gd_filename) pca_plot_gd <- ggplot2::autoplot(pca, colour = gd_colors, size=5) print(pca_plot_gd) dev.off() # We clean up. rm(pca, ds_colors, gd_colors, pca_ds_filename, pca_gd_filename) # C- boxplots (aka densities). # We melt the data frame for ggplot2. df_melted <- reshape2::melt(t(batch_log_data)) # We plot the boxplots (one per array). boxplot_filename <- paste0(output_data_dir, "bx.png") png(boxplot_filename) boxplot_arrays <- ggplot2::ggplot(df_melted, ggplot2::aes(Var1, value)) + ggplot2::geom_boxplot(lwd = 1.2, outlier.alpha = 0.1, ggplot2::aes(colour = Var1)) + ggplot2::theme(legend.position = "none", axis.title.x = ggplot2::element_blank(), panel.background = ggplot2::element_blank(), axis.line = ggplot2::element_line(colour = "black"), panel.grid.major = ggplot2::element_line(colour = "lightgrey"), axis.text.x = ggplot2::element_blank()) print(boxplot_arrays) dev.off() # We clean up. rm(df_melted, boxplot_filename, boxplot_arrays) # D- array images. # We have to extract the layout of the array from the batch object. # Basically to associate the correct value from the array (1..n) to # the pixel (i, j). rows <- batch$genes$Row cols <- batch$genes$Col nrows <- max(rows) ncols <- max(cols) array_data <- rep(NA, nrows * ncols) i <- (rows - 1) * ncols + cols # We now plot the arrays one by one. for (j in seq_len(dim(batch_log_data)[2])) { array_data[i] <- batch_log_data[ ,j] array_filename <- paste0(output_data_dir, "array_", j, ".png") png(array_filename) limma::imageplot(array_data, main = rownames(df)[j], layout = list(ngrid.r = 1, ngrid.c = 1, nspot.r = nrows, nspot.c = ncols)) dev.off() } # We clean up. rm(rows, cols, nrows, ncols, array_data, i, j, array_filename) # We clean up and log information. rm(batch, batch_log_data, pheno_data) if (verbose == TRUE) { message(paste0("[", Sys.time(), "] QC analysis performed (raw data).")) } }