Commit 056b87fa authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Added RNA-seq support.

parent 4299aa24
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
#' Default to NULL since in most cases, we do not have pre-processed data already. #' Default to NULL since in most cases, we do not have pre-processed data already.
#' @param compressed A boolean representing whether the cel files are compressed. This #' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default. #' is FALSE by default.
#' @param expression_filename A String indicating where the expression matrix can be found,
#' default to 'Expdata.tsv'.
#' @param batch_correction A String indicating whether batch correction should #' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE". #' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found, #' @param batch_filename A string indicating where the batch information can be found,
...@@ -30,14 +32,15 @@ ...@@ -30,14 +32,15 @@
#' is FALSE by default. #' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted). #' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data <- function(input_data_dir, output_data_files, preprocess_data <- function(input_data_dir, output_data_files,
platform = "Affymetix", platform = "Affymetix",
method = "SCAN", method = "SCAN",
exprs_raw = NULL, exprs_raw = NULL,
compressed = FALSE, compressed = FALSE,
batch_correction = "FALSE", expression_filename = "ExpData.tsv",
batch_filename = "Batch.tsv", batch_correction = "FALSE",
clean_samples = FALSE, batch_filename = "Batch.tsv",
verbose = FALSE) { clean_samples = FALSE,
verbose = FALSE) {
# We launch the correct function depending on the array platform and desired method. # We launch the correct function depending on the array platform and desired method.
esets <- NULL esets <- NULL
...@@ -83,6 +86,14 @@ preprocess_data <- function(input_data_dir, output_data_files, ...@@ -83,6 +86,14 @@ preprocess_data <- function(input_data_dir, output_data_files,
batch_filename = batch_filename, batch_filename = batch_filename,
clean_samples = clean_samples, clean_samples = clean_samples,
verbose = verbose) verbose = verbose)
} else if (platform == "RNAseq") {
esets <- preprocess_data_rnaseq(input_data_dir,
output_data_files,
expression_filename = expression_filename,
batch_correction = batch_correction,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
} else { } else {
message(paste0("[", Sys.time(), "] [WARNING] Platform ", platform, message(paste0("[", Sys.time(), "] [WARNING] Platform ", platform,
" not yet supported (no preprocessing done).")) " not yet supported (no preprocessing done)."))
......
#' @title Post preprocess a RNA-seq dataset.
#'
#' @description This function preprocess a dataset that was already pre-processed
#' in order to perform the cleaning / filtering / batch corection steps.
#' The results are saves in TSV file(s). In addition, it returns the ESET object(s).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param expression_filename A String indicating where the expression matrix can be found,
#' default to 'Expdata.tsv'.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet suported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param remove_low_counts A boolean indicating whether low counts should be removed prior
#' to the normalization, TRUE by default.
#' @param low_count_threshold A numeric representing the maximum number of counts associated to
#' transcripts considered as not lowly expressed. Default value is 5.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_rnaseq <- function(input_data_dir, output_data_files,
expression_filename = "ExpData.tsv",
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
remove_low_counts = TRUE,
low_count_threshold = 5,
verbose = FALSE) {
# We read the pre-processed data (RNA-seq).
exp_data_fn <- paste0(input_data_dir, expression_filename)
exp_data <- ArrayUtils::read_eset(exp_data_fn, as_eset = FALSE)
rm(exp_data_fn)
# The RNA-seq data consists of counts so we need to post-process
# it to get some kind of microarray compatible data (for instance on a log2 scale).
# We first need to filter the low expressed transcripts.
if (remove_low_counts) {
keep <- rowSums(exp_data) > low_count_threshold
exp_data <- exp_data[keep, ]
rm(keep)
}
# We need to normalize the counts.
exp_data_dge <- edgeR::DGEList(counts = exp_data)
exp_data_dge <- edgeR::calcNormFactors(exp_data_dge)
rm(exp_data)
# CPM then does the normalization and the log-transformation.
exp_data_norm <- limma::voom(exp_data_dge, normalize.method = "quantile")$E
rm(exp_data_dge)
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Raw data read"))
}
# We remove the probes that have 0 variance accross the samples.
probe_vars <- apply(exp_data_norm, 1, var)
probe_var_0 <- names(probe_vars[probe_vars == 0])
eset <- NULL
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(exp_data_norm), probe_var_0)
eset <- Biobase::ExpressionSet(exp_data_norm[clean_probe_list, ])
rm(clean_probe_list)
} else {
eset <- Biobase::ExpressionSet(exp_data_norm)
}
# We clean up and log information.
rm(exp_data_norm, probe_vars, probe_var_0)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data cleaned (step I)."))
}
# We correct for the batch effect if necesary.
eset_bc <- NULL
if (batch_correction != "FALSE") {
eset_bc <- correct_batch_effect(eset = eset,
input_data_dir = input_data_dir,
verbose = verbose)
# We log some information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Batch effect corrected."))
}
if (batch_correction == "TRUE") {
eset <- eset_bc
rm(eset_bc)
}
} else {
rm(eset_bc)
}
# If necessary, we remove the samples that do not have clinical data.
if (clean_samples) {
# We load the clinical data as to get the samples to keep.
samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
verbose = verbose)))
# We only keep the samples with clinical data.
eset <- eset[, samples]
if (batch_correction == "BOTH") {
eset_bc <- eset_bc[, samples]
}
# We clean up and log information.
rm(samples)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Data cleaned (step II)."))
}
}
# We save the eset data as TSV file.
utils::write.table(Biobase::exprs(eset),
file = output_data_files[1],
sep = "\t",
quote = FALSE)
if (batch_correction == "BOTH") {
utils::write.table(Biobase::exprs(eset_bc),
file = output_data_files[2],
sep = "\t",
quote = FALSE)
}
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Processed data written to files."))
}
# We return the created ESET(s).
if (batch_correction == "BOTH") {
return(list(eset, eset_bc))
} else {
return(eset)
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment