Commit 79e9bae7 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Agilent/Illumina support for quality control and preprocessing.

parent 13644b47
......@@ -24,5 +24,9 @@ Imports:
heatmaply,
ggplot2,
ggfortify,
reshape2
reshape2,
GEOquery,
beadarray,
methods,
illuminaHumanv3.db
RoxygenNote: 6.1.1
......@@ -2,3 +2,5 @@ exportPattern("^[[:alpha:]]+")
importFrom("grDevices", "dev.off", "png")
importFrom("stats", "cor", "df", "prcomp")
importClassesFrom("AnnotationDbi")
importMethodsFrom("AnnotationDbi", "mget")
#' @title Preprocess an expression dataset.
#'
#' @description This function preprocesses a dataset and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#' It currently supports Affymetrix, Illumina and Agilent arrays. This function is
#' just a handler over the platform dedicated functions.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param platform A string representing the array platform among Affymetrix, Illumina and Agilent.
#' Default to Affymetrix.
#' @param method A string representing the preprocessing method to use (for Affymetrix arrays
#' where multiple methods are supported). Default to SCAN for Affymetrix data.
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param batch_correction A boolean indicating whether batch correction should
#' be performed, default to FALSE.
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as an ESET object, or NULL when the platform (or the
#' Affymetrix method) is not supported.
preprocess_data <- function(input_data_dir, output_data_file,
                            platform = "Affymetrix",
                            method = "SCAN",
                            compressed = FALSE,
                            batch_correction = FALSE,
                            batch_filename = "Batch.tsv",
                            clean_samples = FALSE,
                            verbose = TRUE) {
  # We launch the correct function depending on the array platform and desired method.
  # Note: the previous default platform value ("Affymetix", misspelled) matched no
  # branch, so calling the function with defaults silently did nothing.
  eset <- NULL
  if (platform == "Affymetrix") {
    if (method == "SCAN") {
      eset <- preprocess_data_affymetrix_scan(input_data_dir,
                                              output_data_file,
                                              compressed = compressed,
                                              batch_correction = batch_correction,
                                              batch_filename = batch_filename,
                                              clean_samples = clean_samples,
                                              verbose = verbose)
    } else if (method == "RMA") {
      eset <- preprocess_data_affymetrix_rma(input_data_dir,
                                             output_data_file,
                                             compressed = compressed,
                                             batch_correction = batch_correction,
                                             batch_filename = batch_filename,
                                             clean_samples = clean_samples,
                                             verbose = verbose)
    } else {
      # Unknown Affymetrix method: log it instead of silently returning NULL.
      message(paste0("[", Sys.time(), "] Method ", method,
                     " not yet supported for Affymetrix (no preprocessing done)."))
    }
  } else if (platform == "Agilent") {
    eset <- preprocess_data_agilent_limma(input_data_dir,
                                          output_data_file,
                                          compressed = compressed,
                                          batch_correction = batch_correction,
                                          batch_filename = batch_filename,
                                          clean_samples = clean_samples,
                                          verbose = verbose)
  } else if (platform == "Illumina") {
    eset <- preprocess_data_illumina_beadarray(input_data_dir,
                                               output_data_file,
                                               compressed = compressed,
                                               batch_correction = batch_correction,
                                               batch_filename = batch_filename,
                                               clean_samples = clean_samples,
                                               verbose = verbose)
  } else {
    message(paste0("[", Sys.time(), "] Platform ", platform, " not yet supported (no preprocessing done)."))
  }
  # We return the created ESET (NULL when nothing was done).
  return(eset)
}
#' @title Preprocess a dataset with RMA.
#' @title Preprocess an Affymetrix dataset with RMA.
#'
#' @description This function preprocess a dataset using RMA and saves the results in
#' a given TSV file. In addition, it returns the ESET object.
#' @description This function preprocesses an Affymetrix dataset using RMA and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#'
#' Note: the function does not check for the existence of folders or files.
#' Note: the function does not check for the existence of folders or files. The batch effect
#' correction is currently not yet supported.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param batch_correction A boolean indicating whether batch correction should
#' be performed, default to FALSE. Note: not yet supported.
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet supported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data_rma <- function(input_data_dir, output_data_file,
compressed = TRUE,
verbose = TRUE) {
preprocess_data_affymetrix_rma <- function(input_data_dir, output_data_file,
compressed = FALSE,
batch_correction = FALSE,
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
......@@ -28,6 +38,14 @@ preprocess_data_rma <- function(input_data_dir, output_data_file,
batch <- affy::ReadAffy(filenames = input_data_files, compress = compressed, verbose = verbose)
eset <- affy::rma(batch)
# If necessary, we remove the samples that do not have clinical data.
if (clean_samples) {
# We load the clinical data as to get the samples to keep.
samples <- rownames(pData(ArrayUtils::load_clinical_data(input_data_dir, verbose = FALSE)))
# We only keep the samples with clinical data.
eset <- eset[, samples]
}
# We save the eset data as TSV file.
utils::write.table(Biobase::exprs(eset), file = output_data_file, sep = "\t", quote = FALSE)
......
#' @title Preprocess a dataset with SCAN.
#' @title Preprocess an Affymetrix dataset with SCAN.
#'
#' @description This function preprocess a dataset using SCAN and saves the results in
#' a given TSV file. In addition, it returns the ESET object.
#' @description This function preprocesses an Affymetrix dataset using SCAN and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#'
......@@ -10,17 +10,24 @@
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param correct_for_batch_effect A boolean indicating whether batch correction should
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param batch_correction A boolean indicating whether batch correction should
#' be performed, default to FALSE.
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE. Note: the cleaning is not yet
#' supported for SCAN normalization.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data_scan <- function(input_data_dir, output_data_file,
correct_for_batch_effect = FALSE,
batch_filename = "Batch.tsv",
verbose = TRUE) {
preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_file,
compressed = FALSE,
batch_correction = FALSE,
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
......@@ -30,7 +37,7 @@ preprocess_data_scan <- function(input_data_dir, output_data_file,
input_data_regexp <- paste0(raw_data_input_dir, "*")
remove(raw_data_input_dir)
eset <- vector()
if (correct_for_batch_effect == FALSE) {
if (batch_correction == FALSE) {
eset <- SCAN.UPC::SCAN(input_data_regexp, outFilePath = output_data_file)
} else {
# We define the I/Os (for batch).
......
#' @title Preprocess an Agilent dataset with limma.
#'
#' @description This function preprocesses an Agilent dataset using limma and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#'
#' Note: the function does not check for the existence of folders or files. The batch effect
#' correction is not yet supported.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default. Note: only for compatibility reasons (has no effect on analysis).
#' @param batch_correction A boolean indicating whether batch correction should
#' be performed, default to FALSE. Note: not yet supported.
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet supported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data_agilent_limma <- function(input_data_dir, output_data_file,
                                          compressed = FALSE,
                                          batch_correction = FALSE,
                                          batch_filename = "Batch.tsv",
                                          clean_samples = FALSE,
                                          verbose = TRUE) {
  # We define the I/Os.
  raw_data_input_dir <- paste0(input_data_dir, "RAW/")
  # We read the data (aka Agilent MA images), forwarding the verbose flag instead
  # of hardcoding it.
  raw_file_list <- list.files(path = raw_data_input_dir)
  batch <- limma::read.maimages(files = raw_file_list,
                                source = "agilent",
                                path = raw_data_input_dir,
                                green.only = TRUE,
                                verbose = verbose)
  batch_data <- log2(batch$E)
  remove(raw_data_input_dir, raw_file_list, batch)
  # We run the LIMMA pre-processing method on the data.
  # First, background correction (normexp convolution model, MLE parameter fit).
  batch_data_bg <- limma::backgroundCorrect(batch_data,
                                            method = "normexp",
                                            normexp.method = "mle",
                                            verbose = verbose)
  # Second, perform quantile normalization across arrays.
  batch_data_norm <- limma::normalizeBetweenArrays(batch_data_bg, method = "quantile")
  rm(batch_data_bg)
  # If necessary, we remove the samples that do not have clinical data.
  if (clean_samples) {
    # We load the clinical data so as to get the samples to keep.
    samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
                                                                      verbose = FALSE)))
    # We only keep the samples with clinical data.
    batch_data_norm <- batch_data_norm[, samples]
  }
  # We build the ESET and save the normalized data as a TSV file.
  eset <- methods::new("ExpressionSet", exprs = as.matrix(batch_data_norm))
  utils::write.table(batch_data_norm, file = output_data_file, sep = "\t", quote = FALSE)
  rm(batch_data_norm)
  # We log information.
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Expression data pre-processed with LIMMA."))
  }
  # We return the created ESET.
  return(eset)
}
#' @title Preprocess an Illumina dataset with beadarray.
#'
#' @description This function preprocesses an Illumina dataset using beadarray and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (the first file
#' found in the "RAW/" sub-folder is read as a GEO series matrix file).
#'
#' Note: the function does not check for the existence of folders or files. The batch effect
#' correction is not yet supported.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param compressed A boolean representing whether the raw files are compressed. This
#' is FALSE by default. Note: only for compatibility reasons (has no effect on analysis).
#' @param batch_correction A boolean indicating whether batch correction should
#' be performed, default to FALSE. Note: not yet supported.
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet supported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_file,
compressed = FALSE,
batch_correction = FALSE,
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
# Read raw data from GEO (series matrix file); only the first file found is used.
matrix_filename <- list.files(raw_data_input_dir, full.names = TRUE)[1]
gse_eset <- GEOquery::getGEO(filename = matrix_filename)
gse_data <- methods::as(gse_eset, "ExpressionSetIllumina")
rm(raw_data_input_dir, matrix_filename, gse_eset)
# We do a different normalization depending on the array version, that is whether we
# have control probes (v3) or not (v2). The decision is based on the presence of the
# PROBEQUALITY feature-data field.
gse_data_filt <- NULL
if (is.null(Biobase::fData(gse_data)$PROBEQUALITY)) {
# We have a v2 dataset, simple quantile normalization.
gse_data_norm <- beadarray::normaliseIllumina(gse_data, method = "quantile")
# We log2 the data, using an offset so that all values are shifted strictly above
# zero when the minimum is below 1 (avoids log of negative or tiny values).
offset <- 0
min_value <- min(Biobase::exprs(gse_data_norm))
if (min_value < 1) {
offset <- 1.11 - min_value
}
gse_data_filt <- log2(offset + Biobase::exprs(gse_data_norm))
rm(gse_data, gse_data_norm)
} else {
# A bit of cleaning (specific to Illumina arrays): probes whose quality is
# "No match" are flagged as negative controls, all others as regular probes.
probe_status <- ifelse(Biobase::fData(gse_data)$PROBEQUALITY == "No match",
"negative",
"regular")
Biobase::fData(gse_data)$Status <- probe_status
beadarray::Detection(gse_data) <- beadarray::calculateDetection(gse_data,
status = probe_status)
# We run the beadarray pre-processing method on the data.
# Background correction and normalization at once ("neqc" method).
gse_data_norm <- beadarray::normaliseIllumina(gse_data, method = "neqc", status = probe_status)
# Additional cleaning (after normalization - also Illumina specific): drop probes
# whose annotation quality is "No match", "Bad" or unknown (NA).
ids <- as.character(Biobase::featureNames(gse_data_norm))
# NOTE(review): the get() call assumes illuminaHumanv3PROBEQUALITY is findable on
# the search path (i.e. illuminaHumanv3.db attached) — TODO confirm; the direct
# namespaced accessor is kept commented out for reference.
#qual <- unlist(mget(ids, illuminaHumanv3.db::illuminaHumanv3PROBEQUALITY, ifnotfound = NA))
qual <- unlist(mget(ids, get("illuminaHumanv3PROBEQUALITY"), ifnotfound = NA))
rem <- qual == "No match" | qual == "Bad" | is.na(qual)
gse_data_filt <- Biobase::exprs(gse_data_norm[!rem,])
rm(gse_data, gse_data_norm)
}
# If necessary, we remove the samples that do not have clinical data.
if (clean_samples) {
# We load the clinical data so as to get the samples to keep.
samples <- rownames(pData(ArrayUtils::load_clinical_data(input_data_dir, verbose = FALSE)))
# We only keep the samples with clinical data.
gse_data_filt <- gse_data_filt[, samples]
}
# We save the filtered expression matrix as a TSV file and wrap it in an ESET.
utils::write.table(gse_data_filt, file = output_data_file, sep = "\t", quote = FALSE)
eset <- Biobase::ExpressionSet(gse_data_filt)
rm(gse_data_filt)
# We clean up and log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Expression data pre-processed with beadarray."))
}
# We return the created ESET.
return(eset)
}
......@@ -29,11 +29,10 @@ run_quality_control_on_preprocessed <- function(eset, input_data_dir, output_dat
arrayQualityMetrics::arrayQualityMetrics(expressionset = eset,
outdir = output_data_dir,
force = TRUE,
do.logtransform = TRUE,
do.logtransform = FALSE,
intgroup = phenotype_groups)
# We clean up and log information.
rm(eset)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] QC analysis performed (preprocessed data)."))
}
......
......@@ -11,15 +11,18 @@
#' then creates a report that contains various quality indicators and is stored as an HTML document.
#' It does not return any value.
#'
#' For Agilent arrays, it ... TODO
#' For Agilent arrays, it creates a heatmap of the samples (with a dendrogram), PCA plots
#' (first two components), boxplots of the array densities and array images. It does
#' not return any value either.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param platform A string representing the array platform among Affymetrix and Agilent.
#' Default to Affymetrix.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' TRUE by default.
#' FALSE by default.
#' @param phenotype_groups A list of phenotype factor names that can be used to highlight the
#' samples in the QC report. This is none by default.
#' @param verbose A boolean representing whether the function should display log information. This
......@@ -27,8 +30,8 @@
#' @return NULL
run_quality_control_on_raw <- function(input_data_dir,
output_data_dir,
platform,
compressed = TRUE,
platform = "Affymetrix",
compressed = FALSE,
phenotype_groups = vector(),
verbose = TRUE) {
......@@ -36,15 +39,15 @@ run_quality_control_on_raw <- function(input_data_dir,
if (platform == "Affymetrix") {
run_quality_control_on_raw_affymetrix(input_data_dir,
output_data_dir,
compressed,
phenotype_groups,
verbose)
compressed = compressed,
phenotype_groups = phenotype_groups,
verbose = verbose)
} else if (platform == "Agilent") {
run_quality_control_on_raw_agilent(input_data_dir,
output_data_dir,
compressed,
phenotype_groups,
verbose)
compressed = compressed,
phenotype_groups = phenotype_groups,
verbose = verbose)
} else {
message(paste0("[", Sys.time(), "] Platform ", platform, " not yet supported (no QC done)."))
}
......
......@@ -12,7 +12,7 @@
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' TRUE by default.
#' FALSE by default.
#' @param phenotype_groups A list of phenotype factor names that can be used to highlight the
#' samples in the QC report. This is none by default.
#' @param verbose A boolean representing whether the function should display log information. This
......@@ -20,7 +20,7 @@
#' @return NULL
run_quality_control_on_raw_affymetrix <- function(input_data_dir,
output_data_dir,
compressed = TRUE,
compressed = FALSE,
phenotype_groups = vector(),
verbose = TRUE) {
......
#' @title Executes a quality control of a given Agilent microarray dataset (raw data).
#'
#' @description This function executes a quality control of the dataset defined by the input parameters.
#' It ... TODO
#' It does not return any value.
#' It creates a heatmap of the samples (with a dendrogram), PCA plots (first two components),
#' boxplots of the array densities and array images. It does not return any value.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' TRUE by default.
#' FALSE by default. Note: only present for compatibility reasons, it has no effect on the analysis.
#' @param phenotype_groups A list of phenotype factor names that can be used to highlight the
#' samples in the QC report. This is none by default.
#' samples in the QC report. This is none by default. Note: only present for compatibility
#' reasons, has no effect on the analysis.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return NULL
run_quality_control_on_raw_agilent <- function(input_data_dir,
output_data_dir,
compressed = TRUE,
compressed = FALSE,
phenotype_groups = vector(),
verbose = TRUE) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment