Commit c2828e2d authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Merge branch 'develop' into 'master' for v0.2.

parents 1d0ee0bb 64af219d
Package: ArrayUtils
Type: Package
Title: Utils For Array Processing
Version: 0.0.1
Version: 0.1.1
Author: Leon-Charles Tranchevent
Maintainer: Leon-Charles Tranchevent <>
Description: This package contains functions to analyse microarray data.
It is more a set of useful functions than a real package.
License: The Unlicense (see LICENSE)
License: file LICENSE
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.1
#' @title Extract DEGs from a Limma fit.
#' @description The function extracts the differentially expressed genes from an existing
#' limma fit and for a predefined analysis (aka a coefficient). It saves the data in a TSV file
#' and creates several plots in the process (Venn diagrams, MD plots).
#' @param fit A fit as provided by a complete Limma analysis (for instance using
#' ArrayUtils::run_limma).
#' @param limma_coeffs A list of all limma coefficient names associated with that fit.
#' It will be used to name the output files, and to identify the results in the Venn
#' diagrams.
#' @param k The index of the current coefficient (from limma_coeffs).
#' @param output_data_dir A string representing the folder in which the output files
#' are going to be stored.
#' @param file_suffix A string used as a suffix for the file names. Default to "".
#' @param file_prefix A string used to prefix the file names. Default to "".
#' @param pval_adjust_method A string code indicating the multiple testing correction
#' method to use. Default to BH.
#' @return NULL
extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
file_suffix = "",
file_prefix = "",
pval_adjust_method = "BH") {
# We create the output file names.
if (file_suffix != "") {
file_suffix <- paste0(file_suffix, "_")
if (file_prefix != "") {
file_prefix <- paste0( "_", file_prefix)
venn_filename <- paste0(output_data_dir, file_suffix, "venn_",
limma_coeffs[k], file_prefix, ".png")
md_fn <- paste0(output_data_dir, file_suffix, "MD_",
limma_coeffs[k], file_prefix, ".png")
table_fn <- paste0(output_data_dir, file_suffix, "toptable_",
limma_coeffs[k], file_prefix, ".tsv")
# We extract the DEGs using toptable.
table <- limma::topTable(fit,
coef = paste0("coeff", k),
n = Inf,
adjust.method = pval_adjust_method,
confint = TRUE)
# Another way to extract the differentially expressed genes (in parallel to topTable).
# This is only used to then create the Venn diagrams.
results <- limma::decideTests(fit,
coefficient = paste0("coeff", k),
adjust.method = pval_adjust_method)
# We plot Venn diagrams that summarize the results and show to which
# extend the analyses overlap.
names = limma_coeffs,
include = c("up", "down"),
circle.col = c("darkorange", "darkorange2", "darkorange4"),
counts.col = c("blue", "green"),
lwd = 3,
cex = c(1.0, 0.1, 0.1),
show.include = TRUE)
# We then plot the Mean vs Deviation of the comparisons and highlight the significant genes.
coef = paste0("coeff", k),
status = results[, k],
values = c(1, -1),
hl.col = c("red", "blue"),
hl.pch = c(16, 16),
hl.cex = c(0.9, 0.9))
# Save the top tables to TSV files.
utils::write.table(table, file = table_fn, sep = "\t", quote = FALSE, col.names = NA)
#' @title Get Gene annotations for affy arrays.
#' @description This function reads the library defined by the array type and
#' extract the relevant information into a data-frame.
#' @param array_type A string representing the array type.
#' @param entities The actual set of object ids for which we want gene annotations.
#' It can for instance be probe ids or gene ids. The parameter keytype has to be
#' modified accordingly. By default, the expected type is probe id.
#' @param columns An array of column names to be returned. By default, it includes
#' the probe id, the gene official symbol, the EnsEMBL id and the Entrez-gene id.
#' @param keytype The type of key used. It will be used to map between the annotations
#' and the provided list of entities. The default is probe id.
#' @return A data-frame of the provided entities, annotated.
get_affy_annots <- function(array_type, entities,
columns = c("PROBEID", "SYMBOL", "ENSEMBL", "ENTREZID"),
keytype = "PROBEID") {
# We define the library name (removing underscores and adding .db).
affy_library_name <- paste0(stringr::str_to_lower(stringr::str_remove(array_type, "_")), ".db")
cmd <- substitute(gene_annots <- AnnotationDbi::select(
x = local_db,
keys = entities,
columns = columns,
keytype = keytype), list(local_db =
......@@ -8,11 +8,13 @@
#' @param data_dir A string representing the folder that contains the clinical data.
#' @param clinical_file_name A string containing the file name. By default, this is 'ClinicalData.tsv'
#' @param use_factors A boolean stating whether the columns should be read as factors (default FALSE).
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return An annotated data-frame that contains the clinical data.
load_clinical_data <- function(data_dir,
clinical_file_name = "ClinicalData.tsv",
use_factors = TRUE,
verbose = TRUE) {
# We define the I/Os.
......@@ -22,9 +24,16 @@ load_clinical_data <- function(data_dir,
# We load the clinical data.
pheno_data <- Biobase::AnnotatedDataFrame(utils::read.delim(file = clinical_data_file,
row.names = 1,
colClasses = "factor"))
pheno_data <- NULL
if (use_factors) {
pheno_data <- Biobase::AnnotatedDataFrame(utils::read.delim(file = clinical_data_file,
row.names = 1,
colClasses = "factor"))
} else {
# We do not use factors.
pheno_data <- Biobase::AnnotatedDataFrame(utils::read.delim(file = clinical_data_file,
row.names = 1))
# We clean up and log information.
#' @title Run a limma analysis.
#' @description This function prepares and runs a limma analysis up until the fitting procedure.
#' The Limma analysis can be simple (A vs B) or more complex ((A vs B) vs (C vs D)). One single
#' cofactor can also be taken into account in the limma. It returns the final fit that can then
#' be used to detect differentially expressed genes.
#' @param pheno_data The data-frame that contains the phenotypic data (aka clinical data).
#' @param eset The ExpressionSet object that contains the pre-processed data.
#' @param limma_parameters A list of useful Limma parameters that include (1) the main clinical
#' factor used to create sample categories (i.e., Disease status) and (2) the Limma coefficients
#' provided as formulas (similar to the limma::makeContrasts function). Additional parameters are
#' ignored for the time being.
#' @param cofactor_name A string indicating a potential cofactor (included in the phenotypic data).
#' The default is NULL, and no cofactor is considered.
#' @param block_key A string indicating which column in the phenotyypic data should be used to build
#' blocks if there is a cofactor to take into account. For instance, if there are several replicates
#' per patient, the block_key might be "Patient" and the cofactor might be "Replicate".
#' @return The limma fit.
run_limma <- function(pheno_data, eset, limma_parameters,
cofactor_name = NULL,
block_key = NULL) {
# We extract the limma analysis parameters
clinical_factor <- limma_parameters[[1]]
coefficients <- limma_parameters[[2]]
has_block <- !is.null(cofactor_name)
is_factorial_limma <- length(coefficients) == 3
# We create the design matrix (regular case - no block).
clinical <- factor(pheno_data[[clinical_factor]])
design <- stats::model.matrix(~ 0 + clinical)
colnames(design) <- levels(clinical)
# If necessary we add a block based on a co-factor.
# For instance, if we have paired samples.
if (has_block) {
clinical_cofactor <- factor(pheno_data[[cofactor_name]])
design <- stats::model.matrix(~ 0 + clinical + clinical_cofactor)
colnames(design) <- c(levels(clinical), clinical_cofactor)
# We build the contrast (default only one).
contrast <- eval(substitute(limma::makeContrasts(coeff1 = c1, levels = design),
list(c1 = coefficients[1])))
# For larger lists of coefficients, we add them all.
if (is_factorial_limma) {
contrast <- eval(substitute(limma::makeContrasts(coeff1 = c1,
coeff2 = c2,
coeff3 = c3,
levels = design),
list(c1 = coefficients[1],
c2 = coefficients[2],
c3 = coefficients[3])))
# We fit the data using the above models and the contrats
# to prepare the differential analyses.
# Default to no block.
lmfit <- limma::lmFit(eset, design)
# We create a block if necessary.
if (has_block) {
corfit <- limma::duplicateCorrelation(eset, design, block = pheno_data[[block_key]])
lmfit_gender <- limma::lmFit(eset, design, block = pheno_data[[block_key]],
correlation = corfit$consensus)
fit <-, contrast)
fit <- limma::eBayes(fit)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment