Commit 13644b47 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Added Agilent support for quality control.

parent 64af219d
......@@ -2,3 +2,4 @@
.Rbuildignore
ArrayUtils.Rproj
man/*
.Rproj.user
......@@ -20,5 +20,9 @@ Imports:
limma,
stringr,
AnnotationDbi,
statmod
statmod,
heatmaply,
ggplot2,
ggfortify,
reshape2
RoxygenNote: 6.1.1
exportPattern("^[[:alpha:]]+")
importFrom("grDevices", "dev.off", "png")
importFrom("stats", "cor", "df", "prcomp")
......@@ -38,7 +38,7 @@ load_clinical_data <- function(data_dir,
# We clean up and log information.
rm(clinical_data_file)
if (verbose == TRUE) {
data_dimensions = paste0(dim(pheno_data), collapse = " * ")
data_dimensions <- paste0(dim(pheno_data), collapse = " * ")
message(paste0("[", Sys.time(), "] Clinical data read (", data_dimensions, ")."))
}
......
#' @title Executes a quality control of a given microarray dataset (raw data).
#'
#' @description This function executes a quality control of the dataset defined by the input parameters.
#' It starts by loading the clinical data associated with the dataset, then loads the raw data and
#' last runs the quality control of the annotated data. The function assumes that a folder with
#' both the clinical data and the raw data exists (in a subfolder '/RAW/'). It then creates a report
#' that contains various quality indicators and is stored as an HTML document. It does not return
#' any value.
#' It currently supports Affymetrix and Agilent arrays although the QC for Agilent produces only
#' a short report currently (mostly array images). This function is just a handler over the platform
#' dedicated functions.
#'
#' For Affymettrix arrays, it starts by loading the clinical data associated with the dataset, then
#' loads the raw data and last runs the quality control of the annotated data. The function assumes
#' that a folder with both the clinical data and the raw data exists (in a subfolder '/RAW/'). It
#' then creates a report that contains various quality indicators and is stored as an HTML document.
#' It does not return any value.
#'
#' For Agilent arrays, it ... TODO
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' @param platform A string representing the array platform among Affymetrix and Agilent.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' TRUE by default.
#' @param phenotype_groups A list of phenotype factor names that can be used to highlight the
#' samples in the QC report. This is none by default.
......@@ -19,38 +26,26 @@
#' is TRUE by default.
#' @return NULL
run_quality_control_on_raw <- function(input_data_dir,
output_data_dir,
compressed = TRUE,
phenotype_groups = vector(),
verbose = TRUE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
# We load the clinical data as to annotate the AffyBatch object and make QC more useful.
pheno_data <- ArrayUtils::load_clinical_data(input_data_dir, verbose = verbose)
# We load the CEL files to create the affyBatch object and then attach the clinical data.
raw_file_list <- affy::list.celfiles(raw_data_input_dir, full.names = TRUE)
batch <- affy::ReadAffy(filenames = raw_file_list, compress = compressed, verbose = verbose)
Biobase::phenoData(batch) <- pheno_data
# We clean up and log information.
rm(raw_file_list, pheno_data, raw_data_input_dir)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Expression data read."))
}
# We run the quality control itself.
arrayQualityMetrics::arrayQualityMetrics(expressionset = batch,
outdir = output_data_dir,
force = TRUE,
do.logtransform = TRUE,
intgroup = phenotype_groups)
output_data_dir,
platform,
compressed = TRUE,
phenotype_groups = vector(),
verbose = TRUE) {
# We clean up and log information.
rm(batch)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] QC analysis performed (raw data)."))
# We launch the correct function depending on the array platform.
if (platform == "Affymetrix") {
run_quality_control_on_raw_affymetrix(input_data_dir,
output_data_dir,
compressed,
phenotype_groups,
verbose)
} else if (platform == "Agilent") {
run_quality_control_on_raw_agilent(input_data_dir,
output_data_dir,
compressed,
phenotype_groups,
verbose)
} else {
message(paste0("[", Sys.time(), "] Platform ", platform, " not yet supported (no QC done)."))
}
}
#' @title Executes a quality control of a given Affymetrix microarray dataset (raw data).
#'
#' @description This function executes a quality control of the dataset defined by the input parameters.
#' It starts by loading the clinical data associated with the dataset, then
#' loads the raw data and last runs the quality control of the annotated data. The function assumes
#' that a folder with both the clinical data and the raw data exists (in a subfolder '/RAW/'). It
#' then creates a report that contains various quality indicators and is stored as an HTML document.
#' It does not return any value.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' TRUE by default.
#' @param phenotype_groups A list of phenotype factor names that can be used to highlight the
#' samples in the QC report. This is none by default.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return NULL
run_quality_control_on_raw_affymetrix <- function(input_data_dir,
output_data_dir,
compressed = TRUE,
phenotype_groups = vector(),
verbose = TRUE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
# We load the clinical data as to annotate the AffyBatch object and make QC more useful.
pheno_data <- ArrayUtils::load_clinical_data(input_data_dir, verbose = verbose)
# We load the CEL files to create the affyBatch object and then attach the clinical data.
raw_file_list <- affy::list.celfiles(raw_data_input_dir, full.names = TRUE)
batch <- affy::ReadAffy(filenames = raw_file_list, compress = compressed, verbose = verbose)
Biobase::phenoData(batch) <- pheno_data
# We clean up and log information.
rm(raw_file_list, pheno_data, raw_data_input_dir)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Expression data read."))
}
# We run the quality control itself.
arrayQualityMetrics::arrayQualityMetrics(expressionset = batch,
outdir = output_data_dir,
force = TRUE,
do.logtransform = TRUE,
intgroup = phenotype_groups)
# We clean up and log information.
rm(batch)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] QC analysis performed (raw data)."))
}
}
#' @title Executes a quality control of a given Agilent microarray dataset (raw data).
#'
#' @description This function executes a quality control of the dataset defined by the input parameters.
#' It ... TODO
#' It does not return any value.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' TRUE by default.
#' @param phenotype_groups A list of phenotype factor names that can be used to highlight the
#' samples in the QC report. This is none by default.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return NULL
run_quality_control_on_raw_agilent <- function(input_data_dir,
output_data_dir,
compressed = TRUE,
phenotype_groups = vector(),
verbose = TRUE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
# We load the clinical data as to annotate the QC.
pheno_data_raw <- ArrayUtils::load_clinical_data(input_data_dir, verbose = verbose)
pheno_data <- Biobase::pData(pheno_data_raw)
# We load the data (aka agilent MA images).
raw_file_list <- list.files(path = raw_data_input_dir)
batch <- limma::read.maimages(files = raw_file_list,
source = "agilent",
path = raw_data_input_dir,
green.only = TRUE,
verbose = TRUE)
batch$targets <- pheno_data
batch_log_data = log2(batch$E)
# We clean up and log information.
rm(raw_file_list, raw_data_input_dir, pheno_data_raw)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Expression data read."))
}
# We do some kind of quality control (ourselves - no external packages).
dir.create(output_data_dir)
# A- heatmap.
# Compute correlations.
cor_mat <- round(cor(batch_log_data, method = "pearson"), 2)
# Create the heatmap.
heatmap_filename <- paste0(output_data_dir, "hm.html")
heatmaply::heatmaply(cor_mat,
file = heatmap_filename,
showticklabels = c(FALSE, TRUE),
row_side_colors = pheno_data[, 1:2])
dev.off()
# We clean up.
rm(cor_mat, heatmap_filename)
# B- PCA plots.
# Compute the PCA.
pca <- prcomp(t(batch_log_data), scale = FALSE)
# Prepare the colors for the disease status and the gender.
ds_colors <- rep(heatmaply::RdBu(2)[1], dim(pheno_data)[1])
gd_colors <- rep(heatmaply::PuOr(2)[1], dim(pheno_data)[1])
ds_colors[pheno_data$Disease.status == "Control"] <- heatmaply::RdBu(2)[2]
gd_colors[pheno_data$Gender == "F"] <- heatmaply::PuOr(2)[2]
# Plots the first two PCs.
pca_ds_filename <- paste0(output_data_dir, "pca_status.png")
png(pca_ds_filename)
pca_plot_ds <- ggplot2::autoplot(pca, colour = ds_colors, size = 5)
print(pca_plot_ds)
dev.off()
pca_gd_filename <- paste0(output_data_dir, "pca_gender.png")
png(pca_gd_filename)
pca_plot_gd <- ggplot2::autoplot(pca, colour = gd_colors, size=5)
print(pca_plot_gd)
dev.off()
# We clean up.
rm(pca, ds_colors, gd_colors, pca_ds_filename, pca_gd_filename)
# C- boxplots (aka densities).
# We melt the data frame for ggplot2.
df_melted <- reshape2::melt(t(batch_log_data))
# We plot the boxplots (one per array).
boxplot_filename <- paste0(output_data_dir, "bx.png")
png(boxplot_filename)
boxplot_arrays <- ggplot2::ggplot(df_melted, ggplot2::aes(Var1, value)) +
ggplot2::geom_boxplot(lwd = 1.2, outlier.alpha = 0.1, ggplot2::aes(colour = Var1)) +
ggplot2::theme(legend.position = "none",
axis.title.x = ggplot2::element_blank(),
panel.background = ggplot2::element_blank(),
axis.line = ggplot2::element_line(colour = "black"),
panel.grid.major = ggplot2::element_line(colour = "lightgrey"),
axis.text.x = ggplot2::element_blank())
print(boxplot_arrays)
dev.off()
# We clean up.
rm(df_melted, boxplot_filename, boxplot_arrays)
# D- array images.
# We have to extract the layout of the array from the batch object.
# Basically to associate the correct value from the array (1..n) to
# the pixel (i, j).
rows <- batch$genes$Row
cols <- batch$genes$Col
nrows <- max(rows)
ncols <- max(cols)
array_data <- rep(NA, nrows * ncols)
i <- (rows - 1) * ncols + cols
# We now plot the arrays one by one.
for (j in seq_len(dim(batch_log_data)[2])) {
array_data[i] <- batch_log_data[ ,j]
array_filename <- paste0(output_data_dir, "array_", j, ".png")
png(array_filename)
limma::imageplot(array_data,
main = rownames(df)[j],
layout = list(ngrid.r = 1,
ngrid.c = 1,
nspot.r = nrows,
nspot.c = ncols))
dev.off()
}
# We clean up.
rm(rows, cols, nrows, ncols, array_data, i, j, array_filename)
# We clean up and log information.
rm(batch, batch_log_data, pheno_data)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] QC analysis performed (raw data)."))
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment