Commit 6d95399a authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Added batch correction for all non-SCAN pre-processing methods. Changed Limma...

Added batch correction for all non-SCAN pre-processing methods. Changed Limma to add age as a co-factor when possible.
parent 7cb20fd2
......@@ -17,6 +17,7 @@ Imports:
gcrma,
arrayQualityMetrics,
SCAN.UPC,
sva,
massiR,
limma,
stringr,
......
......@@ -4,3 +4,6 @@ importFrom("grDevices", "dev.off", "png")
importFrom("stats", "cor", "df", "prcomp")
importClassesFrom("AnnotationDbi")
importMethodsFrom("AnnotationDbi", "mget")
importFrom("stats", "model.matrix")
importFrom("utils", "read.delim")
importFrom("stats", "var")
#' @title Correct for a batch effect.
#'
#' @description This function runs ComBat to correct for a batch effect. It also
#' takes into account clinical factors as to not kill the real biological signal.
#' It returns the corrected ESET object.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param eset The original expressionSet object that contains the (uncorrected) data.
#' @param input_data_dir A string representing the folder that contains the clinical
#' and batch data.
#' @param is_eset A boolean representing whether the input data is an ExpressionSet object
#' (TRUE) or a simple matrix with the expression values (FALSE). This is TRUE by default.
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The corrected expression data as an ESET object (or as a matrix when
#' is_eset is FALSE, i.e., the same representation as the input).
correct_batch_effect <- function(eset,
                                 input_data_dir,
                                 is_eset = TRUE,
                                 batch_filename = "Batch.tsv",
                                 verbose = TRUE) {
  # We read the clinical and batch data.
  batch_data_file <- paste0(input_data_dir, batch_filename)
  batch_data <- utils::read.delim(file = batch_data_file,
                                  row.names = 1)
  pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
                                                              verbose = FALSE))
  remove(batch_data_file)

  # If necessary, we convert the input matrix to an ExpressionSet.
  if (!is_eset) {
    eset <- Biobase::ExpressionSet(eset)
  }

  # We build the mod matrix that contains the clinically relevant co-factors,
  # to guide ComBat to correct for the batch effect without removing the
  # biologically relevant signal. The disease status is always included;
  # gender and age are each added independently whenever they are fully
  # observed (no NA). Note: previously, age was only considered when gender
  # was also complete; age is now used whenever it is available, which
  # matches the intent of adding age as a co-factor "when possible".
  cofactors <- "Disease.status"
  if (sum(is.na(pheno_data$Gender)) == 0) {
    cofactors <- c(cofactors, "Gender")
  }
  if (sum(is.na(pheno_data$Age)) == 0) {
    # Age is stored as a factor/character in the clinical data; it is
    # converted to a numeric covariate inside the formula.
    cofactors <- c(cofactors, "as.numeric(as.character(Age))")
  }
  mod_formula <- stats::as.formula(paste0("~", paste(cofactors, collapse = " + ")))
  mod_data <- stats::model.matrix(mod_formula, data = pheno_data)

  # ComBat corrects for the batch effect.
  data_bc <- sva::ComBat(dat = Biobase::exprs(eset),
                         batch = batch_data$Batch,
                         mod = mod_data)
  eset_bc <- Biobase::ExpressionSet(data_bc)

  # We clean up and log information.
  remove(batch_data, pheno_data, mod_data)
  if (verbose) {
    message(paste0("[", Sys.time(), "] Batch effect corrected."))
  }

  # We return the corrected ExpressionSet or the matrix (same as input).
  if (!is_eset) {
    return(Biobase::exprs(eset_bc))
  } else {
    return(eset_bc)
  }
}
......@@ -10,73 +10,73 @@
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param platform A string representing the array platform among Affymetrix, Illumina and Agilent.
#' Default to Affymetrix.
#' @param method A string representing the preprocessing method to use (for Affymetrix arrays
#' where multiple methods are supported). Default to SCAN for Affymetrix data.
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param batch_correction A boolean indicating whether batch correction should
#' be performed, default to FALSE.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data <- function(input_data_dir, output_data_file,
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data <- function(input_data_dir, output_data_files,
platform = "Affymetix",
method = "SCAN",
compressed = FALSE,
batch_correction = FALSE,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
# We launch the correct function depending on the array platform and desired method.
eset <- NULL
esets <- NULL
if (platform == "Affymetrix") {
if (method == "SCAN") {
eset <- preprocess_data_affymetrix_scan(input_data_dir,
output_data_file,
compressed = compressed,
batch_correction = batch_correction,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
} else if (method == "GCRMA") {
eset <- preprocess_data_affymetrix_gcrma(input_data_dir,
output_data_file,
esets <- preprocess_data_affymetrix_scan(input_data_dir,
output_data_files,
compressed = compressed,
batch_correction = batch_correction,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
} else if (method == "GCRMA") {
esets <- preprocess_data_affymetrix_gcrma(input_data_dir,
output_data_files,
compressed = compressed,
batch_correction = batch_correction,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
}
} else if (platform == "Agilent") {
eset <- preprocess_data_agilent_limma(input_data_dir,
output_data_file,
compressed = compressed,
batch_correction = batch_correction,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
esets <- preprocess_data_agilent_limma(input_data_dir,
output_data_files,
compressed = compressed,
batch_correction = batch_correction,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
} else if (platform == "Illumina") {
eset <- preprocess_data_illumina_beadarray(input_data_dir,
output_data_file,
compressed = compressed,
batch_correction = batch_correction,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
esets <- preprocess_data_illumina_beadarray(input_data_dir,
output_data_files,
compressed = compressed,
batch_correction = batch_correction,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
} else {
message(paste0("[", Sys.time(), "] Platform ", platform,
" not yet supported (no preprocessing done)."))
}
# We return the created ESET.
return(eset)
# We return the created ESET(s).
return(esets)
}
......@@ -5,29 +5,28 @@
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#'
#' Note: the function does not check for the existence of folders or files. The batch effect
#' correction is currently not yet supported.
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param batch_correction A boolean indicating whether batch correction should
#' be performed, default to FALSE. Note: not yet suported.
#' is FALSE by default.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet suported.
#' default to 'Batch.tsv'. Note: not yet supported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_file,
compressed = FALSE,
batch_correction = FALSE,
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
#' is TRUE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_files,
compressed = FALSE,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
......@@ -38,6 +37,30 @@ preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_file,
batch <- affy::ReadAffy(filenames = input_data_files, compress = compressed, verbose = verbose)
eset <- gcrma::gcrma(batch)
# We remove the probes that have 0 variance across the samples.
exp_data <- Biobase::exprs(eset)
probe_vars <- apply(exp_data, 1, var)
probe_var_0 <- names(probe_vars[probe_vars == 0])
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(exp_data), probe_var_0)
eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
remove(clean_probe_list)
}
remove(exp_data, probe_vars, probe_var_0)
# We correct for the batch effect if necessary.
eset_bc <- NULL
if (batch_correction != "FALSE") {
eset_bc <- correct_batch_effect(eset = eset,
input_data_dir = input_data_dir)
if (batch_correction == "TRUE") {
eset <- eset_bc
remove(eset_bc)
}
} else {
remove(eset_bc)
}
# If necessary, we remove the samples that do not have clinical data.
if (clean_samples) {
# We load the clinical data as to get the samples to keep.
......@@ -45,10 +68,19 @@ preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_file,
verbose = FALSE)))
# We only keep the samples with clinical data.
eset <- eset[, samples]
if (batch_correction == "BOTH") {
eset_bc <- eset_bc[, samples]
}
}
# We save the eset data as TSV file.
utils::write.table(Biobase::exprs(eset), file = output_data_file, sep = "\t", quote = FALSE)
utils::write.table(Biobase::exprs(eset), file = output_data_files[1], sep = "\t", quote = FALSE)
if (batch_correction == "BOTH") {
utils::write.table(Biobase::exprs(eset_bc),
file = output_data_files[2],
sep = "\t",
quote = FALSE)
}
# We clean up and log information.
remove(input_data_files, batch)
......@@ -56,6 +88,10 @@ preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_file,
message(paste0("[", Sys.time(), "] Expression data pre-processed with RMA."))
}
# We return the created ESET.
return(eset)
# We return the created ESET(s).
if (batch_correction == "BOTH") {
return(list(eset, eset_bc))
} else {
return(eset)
}
}
......@@ -8,23 +8,22 @@
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param batch_correction A boolean indicating whether batch correction should
#' be performed, default to FALSE.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE. Note: the cleaning is not yet
#' suported for SCAN normalization.
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_file,
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
compressed = FALSE,
batch_correction = FALSE,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
......@@ -36,16 +35,50 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_file,
# We do not run the fast analysis (by default).
input_data_regexp <- paste0(raw_data_input_dir, "*")
remove(raw_data_input_dir)
eset <- vector()
if (batch_correction == FALSE) {
eset <- SCAN.UPC::SCAN(input_data_regexp, outFilePath = output_data_file)
eset <- SCAN.UPC::SCAN(input_data_regexp, outFilePath = output_data_files[1])
# We remove the probes that have 0 variance across the samples.
exp_data <- Biobase::exprs(eset)
probe_vars <- apply(exp_data, 1, var)
probe_var_0 <- names(probe_vars[probe_vars == 0])
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(exp_data), probe_var_0)
eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
remove(clean_probe_list)
}
remove(exp_data, probe_vars, probe_var_0)
# We correct for the batch effect if necessary.
eset_bc <- NULL
if (batch_correction != "FALSE") {
eset_bc <- correct_batch_effect(eset = eset,
input_data_dir = input_data_dir)
if (batch_correction == "TRUE") {
eset <- eset_bc
remove(eset_bc)
}
} else {
# We define the I/Os (for batch).
batch_data_file <- paste0(input_data_dir, batch_filename)
eset <- SCAN.UPC::SCAN(input_data_regexp,
outFilePath = output_data_file,
batchFilePath = batch_data_file)
remove(batch_data_file)
remove(eset_bc)
}
# If necessary, we remove the samples that do not have clinical data.
if (clean_samples) {
# We load the clinical data as to get the samples to keep.
samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
verbose = FALSE)))
# We only keep the samples with clinical data.
eset <- eset[, samples]
if (batch_correction == "BOTH") {
eset_bc <- eset_bc[, samples]
}
}
# We save the eset_bc data as TSV file. ESET was already done as part of SCAN.
if (batch_correction == "BOTH") {
utils::write.table(Biobase::exprs(eset_bc),
file = output_data_files[2],
sep = "\t",
quote = FALSE)
}
# We clean up and log information.
......@@ -54,6 +87,10 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_file,
message(paste0("[", Sys.time(), "] Expression data pre-processed with SCAN."))
}
# We return the created ESET.
return(eset)
# We return the created ESET(s).
if (batch_correction == "BOTH") {
return(list(eset, eset_bc))
} else {
return(eset)
}
}
......@@ -5,26 +5,25 @@
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#'
#' Note: the function does not check for the existence of folders or files. The bacth effect
#' correction is not yet supported.
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default. Note: only for compatibility reasons (has no effect on analysis).
#' @param batch_correction A boolean indicating whether batch correction should
#' be performed, default to FALSE. Note: not yet suported.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet suported.
#' default to 'Batch.tsv'.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data_agilent_limma <- function(input_data_dir, output_data_file,
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_agilent_limma <- function(input_data_dir, output_data_files,
compressed = FALSE,
batch_correction = FALSE,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
......@@ -54,7 +53,36 @@ preprocess_data_agilent_limma <- function(input_data_dir, output_data_file,
# Second, perform quantile normalization.
batch_data_norm <- limma::normalizeBetweenArrays(batch_data_bg, method = "quantile")
rm(batch_data_bg)
# We remove the duplicate rows (based on the row names only).
probe_id_counts <- table(row.names(batch_data))
unique_probe_ids <- setdiff(names(probe_id_counts), names(probe_id_counts[probe_id_counts > 1]))
batch_data_norm <- batch_data_norm[unique_probe_ids, ]
# We remove the probes that have 0 variance across the samples.
probe_vars <- apply(batch_data_norm, 1, var)
probe_var_0 <- names(probe_vars[probe_vars == 0])
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(batch_data_norm), probe_var_0)
batch_data_norm <- batch_data_norm[clean_probe_list, ]
remove(clean_probe_list)
}
remove(probe_vars, probe_var_0)
# We correct for the batch effect if necessary.
batch_data_norm_bc <- NULL
if (batch_correction != "FALSE") {
batch_data_norm_bc <- correct_batch_effect(eset = batch_data_norm,
input_data_dir = input_data_dir,
is_eset = FALSE)
if (batch_correction == "TRUE") {
batch_data_norm <- batch_data_norm_bc
remove(batch_data_norm_bc)
}
} else {
remove(batch_data_norm_bc)
}
# If necessary, we remove the samples that do not have clinical data.
if (clean_samples) {
# We load the clinical data as to get the samples to keep.
......@@ -62,24 +90,33 @@ preprocess_data_agilent_limma <- function(input_data_dir, output_data_file,
verbose = FALSE)))
# We only keep the samples with clinical data.
batch_data_norm <- batch_data_norm[, samples]
if (batch_correction == "BOTH") {
batch_data_norm_bc <- batch_data_norm_bc[, samples]
}
}
# We remove the duplicate rows (based on the row names only).
probe_id_counts <- table(row.names(batch_data))
unique_probe_ids <- setdiff(names(probe_id_counts), names(probe_id_counts[probe_id_counts > 1]))
batch_data_norm <- batch_data_norm[unique_probe_ids, ]
# We save the eset data as TSV file.
eset <- methods::new("ExpressionSet", exprs = as.matrix(batch_data_norm))
rm(batch_data_bg)
utils::write.table(batch_data_norm, file = output_data_file, sep = "\t", quote = FALSE)
utils::write.table(batch_data_norm, file = output_data_files[1], sep = "\t", quote = FALSE)
rm(batch_data_norm)
eset_bc <- NULL
if (batch_correction == "BOTH") {
eset_bc <- methods::new("ExpressionSet", exprs = as.matrix(batch_data_norm_bc))
utils::write.table(batch_data_norm_bc, file = output_data_files[2], sep = "\t", quote = FALSE)
rm(batch_data_norm_bc)
} else {
remove(eset_bc)
}
# We clean up and log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Expression data pre-processed with LIMMA."))
}
# We return the created ESET.
return(eset)
# We return the created ESET(s).
if (batch_correction == "BOTH") {
return(list(eset, eset_bc))
} else {
return(eset)
}
}
......@@ -9,22 +9,22 @@
#' correction is not yet supported.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default. Note: only for compatibility reasons (has no effect on analysis).
#' @param batch_correction A boolean indicating whether batch correction should
#' be performed, default to FALSE. Note: not yet suported.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet suported.
#' default to 'Batch.tsv'.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_file,
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_files,
compressed = FALSE,
batch_correction = FALSE,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
......@@ -42,9 +42,11 @@ preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_file,
# controls probes (v3) or not (v2). The decision is based on the probe_quality field.
gse_data_filt <- NULL
if (is.null(Biobase::fData(gse_data)$PROBEQUALITY)) {
# We have a v2 dataset, simple quantile normalization.
gse_data_norm <- beadarray::normaliseIllumina(gse_data, method = "quantile")
# We log the data (using an offset for small to negatove values)
# We log the data (using an offset for small to negative values)
offset <- 0
min_value <- min(Biobase::exprs(gse_data_norm))
if (min_value < 1) {
......@@ -74,6 +76,30 @@ preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_file,
rm(gse_data, gse_data_norm)
}
# We remove the probes that have 0 variance across the samples.
probe_vars <- apply(gse_data_filt, 1, var)
probe_var_0 <- names(probe_vars[probe_vars == 0])
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(gse_data_filt), probe_var_0)
gse_data_filt <- gse_data_filt[clean_probe_list, ]
remove(clean_probe_list)
}
remove(probe_vars, probe_var_0)
# We correct for the batch effect if necessary.
gse_data_filt_bc <- NULL
if (batch_correction != "FALSE") {
gse_data_filt_bc <- correct_batch_effect(eset = gse_data_filt,
input_data_dir = input_data_dir,
is_eset = FALSE)
if (batch_correction == "TRUE") {
gse_data_filt <- gse_data_filt_bc
remove(gse_data_filt_bc)
}
} else {
remove(gse_data_filt_bc)
}