Commit 688d482a authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Added the support for GCRMA data generated using APT (Affymetrix Power Tools).

parent be579cbe
......@@ -3,7 +3,7 @@
#' @description This function preprocess a dataset and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#' The function assumes that a folder containing the raw data exists (with cel files).
#' It currently supports Affymetrix, Illumina and Agilent arrays. This function is
#' just a handler over the platform dedicated functions.
#'
......@@ -16,6 +16,8 @@
#' Default to Affymetrix.
#' @param method A string representing the preprocessing method to use (for Affymetrix arrays
#' where multiple methods are supported). Default to SCAN for Affymetrix data.
#' @param exprs_raw A matrix corresponding to the pre-processed data (in the case of APT-GCRMA).
#' Default to NULL since in most cases, we do not have pre-processed data already.
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param batch_correction A String indicating whether batch correction should
......@@ -30,6 +32,7 @@
preprocess_data <- function(input_data_dir, output_data_files,
platform = "Affymetix",
method = "SCAN",
exprs_raw = NULL,
compressed = FALSE,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
......@@ -55,6 +58,14 @@ preprocess_data <- function(input_data_dir, output_data_files,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
} else if (method == "APT-GCRMA") {
esets <- preprocess_data_affymetrix_aptgcrma(exprs_raw,
input_data_dir,
output_data_files,
batch_correction = batch_correction,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
}
} else if (platform == "Agilent") {
esets <- preprocess_data_agilent_limma(input_data_dir,
......
#' @title Preprocess an Affymetrix dataset with APT-GC-RMA.
#'
#' @description This function preprocess an Affymetrix dataset that was already analysed
#' with APT-GCRMA in order to perform the cleaning / filtering / batch corection steps.
#' The results are saves in TSV file(s). In addition, it returns the ESET object(s).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param exprs_raw A matrix corresponding to the pre-processed APT-GCRMA data.
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet suported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_affymetrix_aptgcrma <- function(exprs_raw, input_data_dir, output_data_files,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = FALSE) {
# We create an ESET object from the existing expression matrix.
eset <- Biobase::ExpressionSet(exprs_raw)
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Raw data read"))
}
# We remove the probes that have 0 variance accross the samples.
exp_data <- Biobase::exprs(eset)
probe_vars <- apply(exp_data, 1, var)
probe_var_0 <- names(probe_vars[probe_vars == 0])
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(exp_data), probe_var_0)
eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
rm(clean_probe_list)
}
# We clean up and log information.
rm(exp_data, probe_vars, probe_var_0)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data cleaned (step I)."))
}
# We correct for the batch effect if necesary.
eset_bc <- NULL
if (batch_correction != "FALSE") {
eset_bc <- correct_batch_effect(eset = eset,
input_data_dir = input_data_dir,
verbose = verbose)
# We log some information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Batch effect corrected."))
}
if (batch_correction == "TRUE") {
eset <- eset_bc
rm(eset_bc)
}
} else {
rm(eset_bc)
}
# If necessary, we remove the samples that do not have clinical data.
if (clean_samples) {
# We load the clinical data as to get the samples to keep.
samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
verbose = verbose)))
# We only keep the samples with clinical data.
eset <- eset[, samples]
if (batch_correction == "BOTH") {
eset_bc <- eset_bc[, samples]
}
# We clean up and log information.
rm(samples)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data cleaned (step II)."))
}
}
# We save the eset data as TSV file.
utils::write.table(Biobase::exprs(eset),
file = output_data_files[1],
sep = "\t",
quote = FALSE)
if (batch_correction == "BOTH") {
utils::write.table(Biobase::exprs(eset_bc),
file = output_data_files[2],
sep = "\t",
quote = FALSE)
}
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Processed data written to files."))
}
# We return the created ESET(s).
if (batch_correction == "BOTH") {
return(list(eset, eset_bc))
} else {
return(eset)
}
}
#' @title Preprocess an Affymetrix dataset with GC-RMA.
#'
#' @description This function preprocess an Affymetrix dataset using RMA and saves the
#' @description This function preprocess an Affymetrix dataset using GCRMA and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
......@@ -48,7 +48,7 @@ preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_files,
probe_var_0 <- names(probe_vars[probe_vars == 0])
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(exp_data), probe_var_0)
eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
rm(clean_probe_list)
}
......
......@@ -34,7 +34,7 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
# We run the SCAN pre-processing method on the data.
# We do not run the fast analysis (by default).
input_data_regexp <- paste0(raw_data_input_dir, "*")
eset <- SCAN.UPC::SCAN(input_data_regexp)
eset <- SCAN.UPC::SCAN(input_data_regexp)
# We clean up and log information.
rm(raw_data_input_dir, input_data_regexp)
......@@ -48,7 +48,7 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
probe_var_0 <- names(probe_vars[probe_vars == 0])
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(exp_data), probe_var_0)
eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
rm(clean_probe_list)
}
......
......@@ -77,7 +77,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created."))
}
} else { # correct_for_batch == TRUE
} else {
# Here we have correct_for_batch = TRUE
# We do not have a block, we do not include age but we do include batch.
batches <- factor(pheno_data[["Batch"]])
design <- stats::model.matrix(~ 0 + clinical + batches)
......@@ -89,7 +90,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
message(paste0("[", Sys.time(), "] Design matrix created (batches)."))
}
}
} else { # correct_for_age == TRUE
} else {
# Here we have correct_for_age = TRUE
if (correct_for_batch == FALSE) {
# We do not have a block, we do include age, but not batch.
ages <- as.numeric(pheno_data[["Age"]])
......@@ -101,7 +103,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (ages)."))
}
} else { # correct_for_batch == TRUE
} else {
# Here we have correct_for_batch = TRUE
# We do not have a block, we do include age and batch.
ages <- as.numeric(pheno_data[["Age"]])
batches <- factor(pheno_data[["Batch"]])
......@@ -115,7 +118,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
}
}
}
} else { # has_block == TRUE
} else {
# Here we have has_block = TRUE
if (correct_for_age == FALSE) {
if (correct_for_batch == FALSE) {
# We do have a block, we do not include age, nor batch.
......@@ -128,7 +132,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs)."))
}
} else { # correct_for_batch == TRUE
} else {
# Here we have correct_for_batch = TRUE
# We do have a block, we do not include age but we do include batch.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
batches <- factor(pheno_data[["Batch"]])
......@@ -143,7 +148,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
message(paste0("[", Sys.time(), "] Design matrix created (pairs, batches)."))
}
}
} else { # correct_for_age == TRUE
} else {
# Here we have correct_for_age = TRUE
if (correct_for_batch == FALSE) {
# We do have a block, we do include age, but not batch.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
......@@ -156,12 +162,14 @@ run_limma <- function(pheno_data, eset, limma_parameters,
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, ages)."))
}
} else { # correct_for_batch == TRUE
} else {
# Here we have correct_for_batch = TRUE
# We do have a block, we do include age and batch.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
ages <- as.numeric(pheno_data[["Age"]])
batches <- factor(pheno_data[["Batch"]])
design <- stats::model.matrix(~ 0 + clinical + ages + batches + clinical_cofactor)
design <- stats::model.matrix(~ 0 + clinical + ages + batches +
clinical_cofactor)
colnames(design) <- c(levels(clinical),
"Age",
levels(batches)[2:length(levels(batches))],
......@@ -181,7 +189,7 @@ run_limma <- function(pheno_data, eset, limma_parameters,
contrast <- eval(substitute(limma::makeContrasts(coeff1 = c1, levels = design),
list(c1 = coefficients)))
if (run_topconfects) {
design_topconfects <-limma::contrastAsCoef(design, contrast)$design
design_topconfects <- limma::contrastAsCoef(design, contrast)$design
}
# We log information.
......@@ -233,7 +241,7 @@ run_limma <- function(pheno_data, eset, limma_parameters,
# We need to check whether we have any confect at all (if no effect is > 0 at FDR = 0.05,
# then all confects are set to NA values).
confects_genes <- confects$table %>% filter(!is.na(confect))
confects_genes <- confects$table %>% filter(!is.na(confect)) #nolint
if (nrow(confects_genes) > 0) {
# Log info.
......@@ -255,8 +263,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
tcplot_fn <- paste0(output_data_dir, file_prefix, "topconfects_",
limma_parameters$name, file_suffix, ".png")
grDevices::png(tcplot_fn)
local_n <- min(50, nrow(confects_genes))
p <- topconfects::confects_plot(confects, n = local_n)
local_n <- min(50, nrow(confects_genes))
p <- topconfects::confects_plot(confects, n = local_n)
print(p)
grDevices::dev.off()
rm(p, tcplot_fn)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment