Commit 673b161e authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Some refactoring, better logs, improved GC / mem usage.

parent 24846b85
......@@ -14,38 +14,43 @@
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' is FALSE by default.
#' @return The corrected expression data as an ESET object.
correct_batch_effect <- function(eset,
input_data_dir,
is_eset = TRUE,
batch_filename = "Batch.tsv",
verbose = TRUE) {
verbose = FALSE) {
# We read the clinical and batch data.
batch_data_file <- paste0(input_data_dir, batch_filename)
batch_data <- utils::read.delim(file = batch_data_file,
row.names = 1)
pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
verbose = FALSE))
remove(batch_data_file)
verbose = verbose))
# If necessary, we convert the input matrix to an ExpressionSet.
if (!is_eset) {
eset <- Biobase::ExpressionSet(eset)
}
# We clean up and log information.
rm(batch_data_file)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data read."))
}
# We then build the mod matrix that contains the clinically relevant co-factors.
# By default, we consider we have only the disease status, but we then also
# try to add the gender and age as co-factors to guide Combat to correct for batch effect
# without removing the biologically relevant signal.
mod_data <- stats::model.matrix(~Disease.status, data = pheno_data)
mod_data <- stats::model.matrix(~Disease.status, data = pheno_data)
if (sum(is.na(pheno_data$Gender)) == 0) {
if (sum(is.na(pheno_data$Age)) == 0) {
mod_data <- stats::model.matrix(~Disease.status + Gender + as.numeric(as.character(Age)),
mod_data <- stats::model.matrix(~Disease.status + Gender + as.numeric(as.character(Age)),
data = pheno_data)
} else {
mod_data <- stats::model.matrix(~Disease.status + Gender, data = pheno_data)
mod_data <- stats::model.matrix(~Disease.status + Gender, data = pheno_data)
}
}
......@@ -55,26 +60,31 @@ correct_batch_effect <- function(eset,
clean_probe_list <- rownames(exp_data)
for (b in unique(batch_data$Batch)) {
# We take care of batch b.
samples_in_batch <- which(batch_data$Batch == b)
exp_data_batch <- exp_data[, samples_in_batch]
probe_vars <- apply(exp_data_batch, 1, var)
probe_var_0 <- names(probe_vars[probe_vars == 0])
samples_in_batch <- which(batch_data$Batch == b)
exp_data_batch <- exp_data[, samples_in_batch]
probe_vars <- apply(exp_data_batch, 1, var)
probe_var_0 <- names(probe_vars[probe_vars == 0])
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(clean_probe_list, probe_var_0)
}
}
remove(b, samples_in_batch, exp_data_batch, probe_vars, probe_var_0)
rm(b, samples_in_batch, exp_data_batch, probe_vars, probe_var_0)
eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
remove(exp_data, clean_probe_list)
# We clean up and log information.
rm(exp_data, clean_probe_list)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data ready."))
}
# ComBat corrects for the batch effect.
data_bc <- sva::ComBat(dat = Biobase::exprs(eset),
batch = batch_data$Batch,
mod = mod_data)
data_bc <- sva::ComBat(dat = Biobase::exprs(eset),
batch = batch_data$Batch,
mod = mod_data)
eset_bc <- Biobase::ExpressionSet(data_bc)
# We clean up and log information.
remove(batch_data, pheno_data, mod_data)
rm(batch_data, pheno_data, mod_data)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Batch effect corrected."))
}
......
......@@ -16,11 +16,14 @@
#' @param file_prefix A string used to prefix the file names. Default to "".
#' @param pval_adjust_method A string code indicating the multiple testing correction
#' method to use. Default to BH.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return NULL
extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
file_suffix = "",
file_prefix = "",
pval_adjust_method = "BH") {
pval_adjust_method = "BH",
verbose = FALSE) {
# We create the output file names.
if (file_suffix != "") {
......@@ -29,12 +32,12 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
if (file_prefix != "") {
file_prefix <- paste0( "_", file_prefix)
}
venn_filename <- paste0(output_data_dir, file_suffix, "venn_",
limma_coeffs[k], file_prefix, ".png")
md_fn <- paste0(output_data_dir, file_suffix, "MD_",
limma_coeffs[k], file_prefix, ".png")
table_fn <- paste0(output_data_dir, file_suffix, "toptable_",
limma_coeffs[k], file_prefix, ".tsv")
venn_fn <- paste0(output_data_dir, file_suffix, "venn_",
limma_coeffs[k], file_prefix, ".png")
md_fn <- paste0(output_data_dir, file_suffix, "MD_",
limma_coeffs[k], file_prefix, ".png")
table_fn <- paste0(output_data_dir, file_suffix, "toptable_",
limma_coeffs[k], file_prefix, ".tsv")
# We extract the DEGs using toptable.
table <- limma::topTable(fit,
......@@ -49,9 +52,14 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
coefficient = paste0("coeff", k),
adjust.method = pval_adjust_method)
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Differentially expressed genes extracted."))
}
# We plot Venn diagrams that summarize the results and show to which
# extend the analyses overlap.
grDevices::png(venn_filename)
grDevices::png(venn_fn)
limma::vennDiagram(results,
names = limma_coeffs,
include = c("up", "down"),
......@@ -62,6 +70,12 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
show.include = TRUE)
grDevices::dev.off()
# We clean and log information.
rm(venn_fn)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Venn diagram created."))
}
# We then plot the Mean vs Deviation of the comparisons and highlight the significant genes.
grDevices::png(md_fn)
limma::plotMD(fit,
......@@ -73,6 +87,18 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
hl.cex = c(0.9, 0.9))
grDevices::dev.off()
# We clean and log information.
rm(md_fn, results)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] MD plot created."))
}
# Save the top tables to TSV files.
utils::write.table(table, file = table_fn, sep = "\t", quote = FALSE, col.names = NA)
# We clean and log information.
rm(table, table_fn)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Differentially expressed gene list saved."))
}
}
......@@ -18,6 +18,7 @@ get_gene_annots_from_file <- function(folder, filename, entities) {
# We read the file into a data-frame.
gene_annots <- read.delim(file_path, row.names = NULL, stringsAsFactors = FALSE)
rm(file_path)
# We do a bit of cleaning.
gene_annots[is.na(gene_annots)] <- ""
......
......@@ -23,5 +23,7 @@ get_gene_annots_from_package <- function(affy_library_name, entities,
columns = columns,
keytype = keytype), list(local_db = as.name(affy_library_name)))
eval(cmd)
rm(cmd)
return(gene_annots)
}
......@@ -10,12 +10,12 @@
#' @param clinical_file_name A string containing the file name. By default, this is 'ClinicalData.tsv'
#' @param use_factors A boolean stating whether the columns should be read as factors (default FALSE).
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' is FALSE by default.
#' @return An annotated data-frame that contains the clinical data.
load_clinical_data <- function(data_dir,
clinical_file_name = "ClinicalData.tsv",
use_factors = TRUE,
verbose = TRUE) {
verbose = FALSE) {
# We define the I/Os.
clinical_data_file <- paste0(data_dir, clinical_file_name)
......@@ -40,6 +40,7 @@ load_clinical_data <- function(data_dir,
if (verbose == TRUE) {
data_dimensions <- paste0(dim(pheno_data), collapse = " * ")
message(paste0("[", Sys.time(), "] Clinical data read (", data_dimensions, ")."))
rm(data_dimensions)
}
# We return the clinical data.
......
......@@ -25,7 +25,7 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data <- function(input_data_dir, output_data_files,
platform = "Affymetix",
......@@ -34,7 +34,7 @@ preprocess_data <- function(input_data_dir, output_data_files,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
verbose = FALSE) {
# We launch the correct function depending on the array platform and desired method.
esets <- NULL
......@@ -73,7 +73,7 @@ preprocess_data <- function(input_data_dir, output_data_files,
clean_samples = clean_samples,
verbose = verbose)
} else {
message(paste0("[", Sys.time(), "] Platform ", platform,
message(paste0("[", Sys.time(), "] [WARNING] Platform ", platform,
" not yet supported (no preprocessing done)."))
}
......
......@@ -19,24 +19,29 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_files,
compressed = FALSE,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
verbose = FALSE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
# We run the RMA pre-processing method on the data.
input_data_files <- affy::list.celfiles(raw_data_input_dir, full.names = TRUE)
remove(raw_data_input_dir)
batch <- affy::ReadAffy(filenames = input_data_files, compress = compressed, verbose = verbose)
eset <- gcrma::gcrma(batch)
# We clean up and log information.
rm(raw_data_input_dir, input_data_files, batch)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Raw data processed."))
}
# We remove the probes that have 0 variance accross the samples.
exp_data <- Biobase::exprs(eset)
probe_vars <- apply(exp_data, 1, var)
......@@ -44,33 +49,51 @@ preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_files,
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(exp_data), probe_var_0)
eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
remove(clean_probe_list)
rm(clean_probe_list)
}
# We clean up and log information.
rm(exp_data, probe_vars, probe_var_0)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data cleaned (step I)."))
}
remove(exp_data, probe_vars, probe_var_0)
# We correct for the batch effect if necesary.
eset_bc <- NULL
if (batch_correction != "FALSE") {
eset_bc <- correct_batch_effect(eset = eset,
input_data_dir = input_data_dir)
input_data_dir = input_data_dir,
verbose = verbose)
# We log some information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Batch effect corrected."))
}
if (batch_correction == "TRUE") {
eset <- eset_bc
remove(eset_bc)
rm(eset_bc)
}
} else {
remove(eset_bc)
rm(eset_bc)
}
# If necessary, we remove the samples that do not have clinical data.
if (clean_samples) {
# We load the clinical data as to get the samples to keep.
samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
verbose = FALSE)))
verbose = verbose)))
# We only keep the samples with clinical data.
eset <- eset[, samples]
if (batch_correction == "BOTH") {
eset_bc <- eset_bc[, samples]
}
# We clean up and log information.
rm(samples)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data cleaned (step II)."))
}
}
# We save the eset data as TSV file.
......@@ -82,10 +105,9 @@ preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_files,
quote = FALSE)
}
# We clean up and log information.
remove(input_data_files, batch)
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Expression data pre-processed with RMA."))
message(paste0("[", Sys.time(), "] Processed data written to files."))
}
# We return the created ESET(s).
......
......@@ -19,14 +19,14 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
compressed = FALSE,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
verbose = FALSE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
......@@ -34,9 +34,14 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
# We run the SCAN pre-processing method on the data.
# We do not run the fast analysis (by default).
input_data_regexp <- paste0(raw_data_input_dir, "*")
remove(raw_data_input_dir)
eset <- SCAN.UPC::SCAN(input_data_regexp, outFilePath = output_data_files[1])
# We clean up and log information.
rm(raw_data_input_dir, input_data_regexp)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Raw data processed."))
}
# We remove the probes that have 0 variance accross the samples.
exp_data <- Biobase::exprs(eset)
probe_vars <- apply(exp_data, 1, var)
......@@ -44,33 +49,51 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(exp_data), probe_var_0)
eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
remove(clean_probe_list)
rm(clean_probe_list)
}
# We clean up and log information.
rm(exp_data, probe_vars, probe_var_0)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data cleaned (step I)."))
}
remove(exp_data, probe_vars, probe_var_0)
# We correct for the batch effect if necesary.
eset_bc <- NULL
if (batch_correction != "FALSE") {
eset_bc <- correct_batch_effect(eset = eset,
input_data_dir = input_data_dir)
input_data_dir = input_data_dir,
verbose = verbose)
# We log some information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Batch effect corrected."))
}
if (batch_correction == "TRUE") {
eset <- eset_bc
remove(eset_bc)
rm(eset_bc)
}
} else {
remove(eset_bc)
rm(eset_bc)
}
# If necessary, we remove the samples that do not have clinical data.
if (clean_samples) {
# We load the clinical data as to get the samples to keep.
samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
verbose = FALSE)))
verbose = verbose)))
# We only keep the samples with clinical data.
eset <- eset[, samples]
if (batch_correction == "BOTH") {
eset_bc <- eset_bc[, samples]
}
# We clean up and log information.
rm(samples)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data cleaned (step II)."))
}
}
# We save the eset_bc data as TSV file. ESET was already done as part of SCAN.
......@@ -81,10 +104,9 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
quote = FALSE)
}
# We clean up and log information.
rm(input_data_regexp)
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Expression data pre-processed with SCAN."))
message(paste0("[", Sys.time(), "] Processed data written to files."))
}
# We return the created ESET(s).
......
......@@ -19,30 +19,35 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_agilent_limma <- function(input_data_dir, output_data_files,
compressed = FALSE,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = TRUE) {
verbose = FALSE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
# We read the data (aka agilent MA images).
raw_file_list <- list.files(path = raw_data_input_dir)
batch <- limma::read.maimages(files = raw_file_list,
source = "agilent",
path = raw_data_input_dir,
green.only = TRUE,
verbose = TRUE)
batch <- limma::read.maimages(files = raw_file_list,
source = "agilent",
path = raw_data_input_dir,
green.only = TRUE,
verbose = TRUE)
batch_data <- log2(batch$E)
# We change the probe ids (1 to 45015) to be the probe names instead (A_XX_PXXXX).
# We change the probe ids (e.g., 1 to 45015) to be the probe names instead (e.g., A_XX_PXXXX).
rownames(batch_data) <- (batch$genes)$ProbeName # nolint
remove(raw_data_input_dir, batch)
# We clean up and log information.
rm(raw_data_input_dir, raw_file_list, batch)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Raw data read."))
}
# We run the LIMMA pre-processing method on the data.
# First, background correcttion.
......@@ -53,46 +58,71 @@ preprocess_data_agilent_limma <- function(input_data_dir, output_data_files,
# Second, perform quantile normalization.
batch_data_norm <- limma::normalizeBetweenArrays(batch_data_bg, method = "quantile")
# We clean up and log information.
rm(batch_data_bg)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Raw data processed."))
}
# We remove the duplicate rows (based on the row names only).
probe_id_counts <- table(row.names(batch_data))
unique_probe_ids <- setdiff(names(probe_id_counts), names(probe_id_counts[probe_id_counts > 1]))
batch_data_norm <- batch_data_norm[unique_probe_ids, ]
rm(probe_id_counts, unique_probe_ids, batch_data)
# We remove the probes that have 0 variance accross the samples.
probe_vars <- apply(batch_data_norm, 1, var)
probe_var_0 <- names(probe_vars[probe_vars == 0])
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(batch_data_norm), probe_var_0)
batch_data_norm <- batch_data_norm[clean_probe_list, ]
remove(clean_probe_list)
batch_data_norm <- batch_data_norm[clean_probe_list, ]
rm(clean_probe_list)
}
# We clean up and log information.
rm(probe_vars, probe_var_0)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data cleaned (step I)."))
}
remove(probe_vars, probe_var_0)
# We correct for the batch effect if necesary.
batch_data_norm_bc <- NULL
if (batch_correction != "FALSE") {
batch_data_norm_bc <- correct_batch_effect(eset = batch_data_norm,
input_data_dir = input_data_dir,
verbose = verbose,
is_eset = FALSE)
# We log some information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Batch effect corrected."))
}
if (batch_correction == "TRUE") {
batch_data_norm <- batch_data_norm_bc
remove(batch_data_norm_bc)
rm(batch_data_norm_bc)
}
} else {
remove(batch_data_norm_bc)
rm(batch_data_norm_bc)
}
# If necessary, we remove the samples that do not have clinical data.
if (clean_samples) {
# We load the clinical data as to get the samples to keep.
samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
verbose = FALSE)))
verbose = verbose)))
# We only keep the samples with clinical data.
batch_data_norm <- batch_data_norm[, samples]
if (batch_correction == "BOTH") {
batch_data_norm_bc <- batch_data_norm_bc[, samples]
}
# We clean up and log information.
rm(samples)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data cleaned (step II)."))
}
}
# We save the eset data as TSV file.
......@@ -105,12 +135,12 @@ preprocess_data_agilent_limma <- function(input_data_dir, output_data_files,
utils::write.table(batch_data_norm_bc, file = output_data_files[2], sep = "\t", quote = FALSE)
rm(batch_data_norm_bc)
} else {
remove(eset_bc)
rm(eset_bc)
}
# We clean up and log information.
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Expression data pre-processed with LIMMA."))
message(paste0("[", Sys.time(), "] Processed data written to files."))
}
# We return the created ESET(s).
......
......@@ -20,14 +20,14 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' is FALSE by default.