Gitlab is now using https://gitlab.lcsb.uni.lu as its primary address. Please update your bookmarks. FAQ.

Commit 6df04fcd authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Merge branch 'develop' into master for v0.4.

parents c2828e2d 688d482a
......@@ -2,3 +2,4 @@
.Rbuildignore
ArrayUtils.Rproj
man/*
.Rproj.user
Package: ArrayUtils
Type: Package
Title: Utils For Array Processing
Version: 0.1.1
Version: 0.3.2
Author: Leon-Charles Tranchevent
Maintainer: Leon-Charles Tranchevent <leon-charles.tranchevent@uni.lu>
Description: This package contains functions to analyse microarray data.
......@@ -14,11 +14,22 @@ Imports:
utils,
stats,
affy,
gcrma,
arrayQualityMetrics,
SCAN.UPC,
sva,
massiR,
limma,
stringr,
AnnotationDbi,
statmod
statmod,
heatmaply,
ggplot2,
ggfortify,
reshape2,
GEOquery,
beadarray,
methods,
illuminaHumanv3.db,
topconfects
RoxygenNote: 6.1.1
exportPattern("^[[:alpha:]]+")
importFrom("grDevices", "dev.off", "png")
importFrom("stats", "cor", "df", "prcomp")
importClassesFrom("AnnotationDbi")
importMethodsFrom("AnnotationDbi", "mget")
importFrom("stats", "model.matrix")
importFrom("utils", "read.delim")
importFrom("stats", "var")
#' @title Correct for a batch effect.
#'
#' @description This function runs ComBat to correct for a batch effect. It also
#' takes into account clinical factors so as not to remove the real biological signal.
#' It returns the corrected ESET object.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param eset The original expressionSet object that contains the (uncorrected) data.
#' @param input_data_dir A string representing the folder that contains the clinical
#' and batch data.
#' @param is_eset A boolean representing whether the input data is an ExpressionSet object
#' (TRUE) or a simple matrix with the expression values (FALSE). This is TRUE by default.
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The corrected expression data as an ESET object (or as a matrix when
#' is_eset is FALSE, i.e., the same container type as the input).
correct_batch_effect <- function(eset,
                                 input_data_dir,
                                 is_eset = TRUE,
                                 batch_filename = "Batch.tsv",
                                 verbose = FALSE) {
  # We read the clinical and batch data.
  batch_data_file <- paste0(input_data_dir, batch_filename)
  batch_data <- utils::read.delim(file = batch_data_file,
                                  row.names = 1)
  pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
                                                              verbose = verbose))
  # If necessary, we convert the input matrix to an ExpressionSet.
  if (!is_eset) {
    eset <- Biobase::ExpressionSet(eset)
  }
  # We clean up and log information.
  rm(batch_data_file)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Data read."))
  }
  # We then build the mod matrix that contains the clinically relevant co-factors.
  # By default, we consider we have only the disease status, but we then also
  # try to add the gender and age as co-factors to guide ComBat to correct for the
  # batch effect without removing the biologically relevant signal. Gender and age
  # are only added when they are available for every sample.
  mod_data <- stats::model.matrix(~Disease.status, data = pheno_data)
  if (sum(is.na(pheno_data$Gender)) == 0) {
    if (sum(is.na(pheno_data$Age)) == 0) {
      mod_data <- stats::model.matrix(~Disease.status + Gender + as.numeric(as.character(Age)),
                                      data = pheno_data)
    } else {
      mod_data <- stats::model.matrix(~Disease.status + Gender, data = pheno_data)
    }
  }
  # We remove the probes that have 0 variance across the samples for each batch.
  # Not doing so will prevent ComBat from running.
  exp_data <- Biobase::exprs(eset)
  clean_probe_list <- rownames(exp_data)
  for (b in unique(batch_data$Batch)) {
    # We take care of batch b. drop = FALSE keeps a one-sample batch as a
    # one-column matrix so that apply() still works.
    samples_in_batch <- which(batch_data$Batch == b)
    exp_data_batch <- exp_data[, samples_in_batch, drop = FALSE]
    probe_vars <- apply(exp_data_batch, 1, stats::var)
    probe_var_0 <- names(probe_vars[probe_vars == 0])
    if (length(probe_var_0) > 0) {
      clean_probe_list <- setdiff(clean_probe_list, probe_var_0)
    }
  }
  # drop = FALSE protects the case where a single probe remains.
  eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, , drop = FALSE])
  # We clean up and log information.
  rm(exp_data, clean_probe_list)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Data ready."))
  }
  # ComBat corrects for the batch effect.
  data_bc <- sva::ComBat(dat = Biobase::exprs(eset),
                         batch = batch_data$Batch,
                         mod = mod_data)
  eset_bc <- Biobase::ExpressionSet(data_bc)
  # We clean up and log information.
  rm(batch_data, pheno_data, mod_data)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Batch effect corrected."))
  }
  # We return the corrected ExpressionSet or the matrix (same as input).
  if (!is_eset) {
    return(Biobase::exprs(eset_bc))
  } else {
    return(eset_bc)
  }
}
......@@ -12,29 +12,32 @@
#' @param k The index of the current coefficient (from limma_coeffs).
#' @param output_data_dir A string representing the folder in which the output files
#' are going to be stored.
#' @param file_suffix A string used as a suffix for the file names. Default to "".
#' @param file_prefix A string used to prefix the file names. Default to "".
#' @param file_suffix A string used as a suffix for the file names. Default to "".
#' @param pval_adjust_method A string code indicating the multiple testing correction
#' method to use. Default to BH.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return NULL
extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
file_suffix = "",
file_prefix = "",
pval_adjust_method = "BH") {
file_suffix = "",
pval_adjust_method = "BH",
verbose = FALSE) {
# We create the output file names.
if (file_suffix != "") {
file_suffix <- paste0(file_suffix, "_")
}
if (file_prefix != "") {
file_prefix <- paste0( "_", file_prefix)
file_prefix <- paste0(file_prefix, "_")
}
if (file_suffix != "") {
file_suffix <- paste0("_", file_suffix)
}
venn_filename <- paste0(output_data_dir, file_suffix, "venn_",
limma_coeffs[k], file_prefix, ".png")
md_fn <- paste0(output_data_dir, file_suffix, "MD_",
limma_coeffs[k], file_prefix, ".png")
table_fn <- paste0(output_data_dir, file_suffix, "toptable_",
limma_coeffs[k], file_prefix, ".tsv")
venn_fn <- paste0(output_data_dir, file_prefix, "venn_",
limma_coeffs[k], file_suffix, ".png")
md_fn <- paste0(output_data_dir, file_prefix, "MD_",
limma_coeffs[k], file_suffix, ".png")
table_fn <- paste0(output_data_dir, file_prefix, "toptable_",
limma_coeffs[k], file_suffix, ".tsv")
# We extract the DEGs using toptable.
table <- limma::topTable(fit,
......@@ -49,9 +52,14 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
coefficient = paste0("coeff", k),
adjust.method = pval_adjust_method)
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Differentially expressed genes extracted."))
}
# We plot Venn diagrams that summarize the results and show to which
# extend the analyses overlap.
grDevices::png(venn_filename)
grDevices::png(venn_fn)
limma::vennDiagram(results,
names = limma_coeffs,
include = c("up", "down"),
......@@ -62,6 +70,12 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
show.include = TRUE)
grDevices::dev.off()
# We clean and log information.
rm(venn_fn)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Venn diagram created."))
}
# We then plot the Mean vs Deviation of the comparisons and highlight the significant genes.
grDevices::png(md_fn)
limma::plotMD(fit,
......@@ -73,6 +87,18 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
hl.cex = c(0.9, 0.9))
grDevices::dev.off()
# We clean and log information.
rm(md_fn, results)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] MD plot created."))
}
# Save the top tables to TSV files.
utils::write.table(table, file = table_fn, sep = "\t", quote = FALSE, col.names = NA)
# We clean and log information.
rm(table, table_fn)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Differentially expressed gene list saved."))
}
}
#' @title Get gene annotations from a text file.
#'
#' @description This function reads a TSV file and extracts the relevant
#' information into a data-frame.
#'
#' So far, the key column (containing the probe_id) is always the first one and
#' the other columns are all considered to contain relevant annotations. The
#' first row is considered as the header row.
#'
#' @param folder A path to the folder where the file resides (it is concatenated
#' directly with the file name, so it should end with a path separator).
#' @param filename The name of the file.
#' @param entities The actual set of object ids for which we want gene annotations.
#' @return A data-frame of the provided entities, annotated, with rows in the same
#' order as the entities and plain sequential row names.
get_gene_annots_from_file <- function(folder, filename, entities) {
  # We define the full path.
  file_path <- paste0(folder, filename)
  # We read the file into a data-frame (explicit namespace, as done elsewhere
  # in the package).
  gene_annots <- utils::read.delim(file_path, row.names = NULL, stringsAsFactors = FALSE)
  rm(file_path)
  # We do a bit of cleaning: missing annotations become empty strings.
  gene_annots[is.na(gene_annots)] <- ""
  # We index by probe id and subset to the requested entities (this also
  # reorders the rows to match the entities).
  rownames(gene_annots) <- gene_annots$PROBEID
  gene_annots <- gene_annots[entities, ]
  # seq_len() is safe when no entity is requested (seq(1, 0) would yield c(1, 0)
  # and fail on a zero-row data-frame).
  rownames(gene_annots) <- seq_len(nrow(gene_annots))
  return(gene_annots)
}
#' @title Get Gene annotations for affy arrays.
#' @title Get gene annotations from a BioConductor package.
#'
#' @description This function reads the library defined by the array type and
#' extract the relevant information into a data-frame.
#'
#' @param array_type A string representing the array type.
#' @param affy_library_name A string representing the library name (usually the array
#' type followed by ".db".
#' @param entities The actual set of object ids for which we want gene annotations.
#' It can for instance be probe ids or gene ids. The parameter keytype has to be
#' modified accordingly. By default, the expected type is probe id.
......@@ -12,16 +13,17 @@
#' @param keytype The type of key used. It will be used to map between the annotations
#' and the provided list of entities. The default is probe id.
#' @return A data-frame of the provided entities, annotated.
get_affy_annots <- function(array_type, entities,
get_gene_annots_from_package <- function(affy_library_name, entities,
columns = c("PROBEID", "SYMBOL", "ENSEMBL", "ENTREZID"),
keytype = "PROBEID") {
# We define the library name (removing underscores and adding .db).
affy_library_name <- paste0(stringr::str_to_lower(stringr::str_remove(array_type, "_")), ".db")
# We load the library given its name.
cmd <- substitute(gene_annots <- AnnotationDbi::select(
x = local_db,
keys = entities,
columns = columns,
keytype = keytype), list(local_db = as.name(affy_library_name)))
eval(cmd)
rm(cmd)
return(gene_annots)
}
......@@ -10,12 +10,12 @@
#' @param clinical_file_name A string containing the file name. By default, this is 'ClinicalData.tsv'
#' @param use_factors A boolean stating whether the columns should be read as factors (default FALSE).
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' is FALSE by default.
#' @return An annotated data-frame that contains the clinical data.
load_clinical_data <- function(data_dir,
clinical_file_name = "ClinicalData.tsv",
use_factors = TRUE,
verbose = TRUE) {
verbose = FALSE) {
# We define the I/Os.
clinical_data_file <- paste0(data_dir, clinical_file_name)
......@@ -38,8 +38,9 @@ load_clinical_data <- function(data_dir,
# We clean up and log information.
rm(clinical_data_file)
if (verbose == TRUE) {
data_dimensions = paste0(dim(pheno_data), collapse = " * ")
data_dimensions <- paste0(dim(pheno_data), collapse = " * ")
message(paste0("[", Sys.time(), "] Clinical data read (", data_dimensions, ")."))
rm(data_dimensions)
}
# We return the clinical data.
......
#' @title Preprocess an expression dataset.
#'
#' @description This function preprocesses a dataset and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (with cel files).
#' It currently supports Affymetrix, Illumina and Agilent arrays. This function is
#' just a handler over the platform dedicated functions.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param platform A string representing the array platform among Affymetrix, Illumina and Agilent.
#' Default to Affymetrix.
#' @param method A string representing the preprocessing method to use (for Affymetrix arrays
#' where multiple methods are supported). Default to SCAN for Affymetrix data.
#' @param exprs_raw A matrix corresponding to the pre-processed data (in the case of APT-GCRMA).
#' Default to NULL since in most cases, we do not have pre-processed data already.
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
#' NULL when the platform (or the platform / method combination) is not supported.
preprocess_data <- function(input_data_dir, output_data_files,
                            platform = "Affymetrix",
                            method = "SCAN",
                            exprs_raw = NULL,
                            compressed = FALSE,
                            batch_correction = "FALSE",
                            batch_filename = "Batch.tsv",
                            clean_samples = FALSE,
                            verbose = FALSE) {
  # We launch the correct function depending on the array platform and desired method.
  # Note: the default platform was previously misspelled "Affymetix", which made the
  # documented default silently fall through to the "not supported" branch.
  esets <- NULL
  if (platform == "Affymetrix") {
    if (method == "SCAN") {
      esets <- preprocess_data_affymetrix_scan(input_data_dir,
                                               output_data_files,
                                               compressed = compressed,
                                               batch_correction = batch_correction,
                                               batch_filename = batch_filename,
                                               clean_samples = clean_samples,
                                               verbose = verbose)
    } else if (method == "GCRMA") {
      esets <- preprocess_data_affymetrix_gcrma(input_data_dir,
                                                output_data_files,
                                                compressed = compressed,
                                                batch_correction = batch_correction,
                                                batch_filename = batch_filename,
                                                clean_samples = clean_samples,
                                                verbose = verbose)
    } else if (method == "APT-GCRMA") {
      esets <- preprocess_data_affymetrix_aptgcrma(exprs_raw,
                                                   input_data_dir,
                                                   output_data_files,
                                                   batch_correction = batch_correction,
                                                   batch_filename = batch_filename,
                                                   clean_samples = clean_samples,
                                                   verbose = verbose)
    }
  } else if (platform == "Agilent") {
    esets <- preprocess_data_agilent_limma(input_data_dir,
                                           output_data_files,
                                           compressed = compressed,
                                           batch_correction = batch_correction,
                                           batch_filename = batch_filename,
                                           clean_samples = clean_samples,
                                           verbose = verbose)
  } else if (platform == "Illumina") {
    esets <- preprocess_data_illumina_beadarray(input_data_dir,
                                                output_data_files,
                                                compressed = compressed,
                                                batch_correction = batch_correction,
                                                batch_filename = batch_filename,
                                                clean_samples = clean_samples,
                                                verbose = verbose)
  } else {
    message(paste0("[", Sys.time(), "] [WARNING] Platform ", platform,
                   " not yet supported (no preprocessing done)."))
  }
  # We return the created ESET(s).
  return(esets)
}
#' @title Preprocess an Affymetrix dataset with APT-GC-RMA.
#'
#' @description This function preprocesses an Affymetrix dataset that was already analysed
#' with APT-GCRMA in order to perform the cleaning / filtering / batch correction steps.
#' The results are saved in TSV file(s). In addition, it returns the ESET object(s).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param exprs_raw A matrix corresponding to the pre-processed APT-GCRMA data.
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet supported (this parameter is currently not
#' forwarded to correct_batch_effect, which therefore uses its own default).
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted);
#' a list of two (uncorrected, corrected) when batch_correction is "BOTH".
preprocess_data_affymetrix_aptgcrma <- function(exprs_raw, input_data_dir, output_data_files,
                                                batch_correction = "FALSE",
                                                batch_filename = "Batch.tsv",
                                                clean_samples = FALSE,
                                                verbose = FALSE) {
  # We create an ESET object from the existing expression matrix.
  eset <- Biobase::ExpressionSet(exprs_raw)
  # We log information.
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Raw data read"))
  }
  # We remove the probes that have 0 variance across the samples, as they would
  # otherwise make the downstream ComBat call fail.
  exp_data <- Biobase::exprs(eset)
  probe_vars <- apply(exp_data, 1, var)
  probe_var_0 <- names(probe_vars[probe_vars == 0])
  if (length(probe_var_0) > 0) {
    clean_probe_list <- setdiff(rownames(exp_data), probe_var_0)
    # NOTE(review): if only a single probe remained, this subset would drop to a
    # vector (no drop = FALSE) — presumably never the case in practice; confirm.
    eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
    rm(clean_probe_list)
  }
  # We clean up and log information.
  rm(exp_data, probe_vars, probe_var_0)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Data cleaned (step I)."))
  }
  # We correct for the batch effect if necessary. After this block, eset_bc only
  # still exists when batch_correction is "BOTH" (it is removed or merged into
  # eset in every other branch).
  eset_bc <- NULL
  if (batch_correction != "FALSE") {
    eset_bc <- correct_batch_effect(eset = eset,
                                    input_data_dir = input_data_dir,
                                    verbose = verbose)
    # We log some information.
    if (verbose == TRUE) {
      message(paste0("[", Sys.time(), "] Batch effect corrected."))
    }
    if (batch_correction == "TRUE") {
      # Only the corrected data is kept.
      eset <- eset_bc
      rm(eset_bc)
    }
  } else {
    rm(eset_bc)
  }
  # If necessary, we remove the samples that do not have clinical data.
  if (clean_samples) {
    # We load the clinical data as to get the samples to keep.
    samples <- rownames(Biobase::pData(ArrayUtils::load_clinical_data(input_data_dir,
                                                                      verbose = verbose)))
    # We only keep the samples with clinical data (both objects when "BOTH").
    eset <- eset[, samples]
    if (batch_correction == "BOTH") {
      eset_bc <- eset_bc[, samples]
    }
    # We clean up and log information.
    rm(samples)
    if (verbose == TRUE) {
      message(paste0("[", Sys.time(), "] Data cleaned (step II)."))
    }
  }
  # We save the eset data as TSV file (second output file only used for "BOTH").
  utils::write.table(Biobase::exprs(eset),
                     file = output_data_files[1],
                     sep = "\t",
                     quote = FALSE)
  if (batch_correction == "BOTH") {
    utils::write.table(Biobase::exprs(eset_bc),
                       file = output_data_files[2],
                       sep = "\t",
                       quote = FALSE)
  }
  # We log information.
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Processed data written to files."))
  }
  # We return the created ESET(s).
  if (batch_correction == "BOTH") {
    return(list(eset, eset_bc))
  } else {
    return(eset)
  }
}
#' @title Preprocess an Affymetrix dataset with GC-RMA.
#'
#' @description This function preprocess an Affymetrix dataset using GCRMA and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet supported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_files,
compressed = FALSE,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = FALSE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
# We run the RMA pre-processing method on the data.
input_data_files <- affy::list.celfiles(raw_data_input_dir, full.names = TRUE)
batch <- affy::ReadAffy(filenames = input_data_files, compress = compressed, verbose = verbose)
eset <- gcrma::gcrma(batch)
# We clean up and log information.
rm(raw_data_input_dir, input_data_files, batch)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Raw data processed."))
}
# We remove the probes that have 0 variance accross the samples.
exp_data <- Biobase::exprs(eset)
probe_vars <- apply(exp_data, 1, var)
probe_var_0 <- names(probe_vars[probe_vars == 0])
if (length(probe_var_0) > 0) {
clean_probe_list <- setdiff(rownames(exp_data), probe_var_0)
eset <- Biobase::ExpressionSet(exp_data[clean_probe_list, ])
rm(clean_probe_list)
}