Commit d767e271 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Merge branch 'develop' into 'master'.

parents 6df04fcd 60786b39
Package: ArrayUtils
Type: Package
Title: Utils For Array Processing
Version: 0.3.2
Version: 0.4.0
Author: Leon-Charles Tranchevent
Maintainer: Leon-Charles Tranchevent <leon-charles.tranchevent@uni.lu>
Description: This package contains functions to analyse microarray data.
......@@ -14,12 +14,14 @@ Imports:
utils,
stats,
affy,
oligo,
gcrma,
arrayQualityMetrics,
SCAN.UPC,
sva,
massiR,
limma,
edgeR,
stringr,
AnnotationDbi,
statmod,
......@@ -32,4 +34,4 @@ Imports:
methods,
illuminaHumanv3.db,
topconfects
RoxygenNote: 6.1.1
RoxygenNote: 7.1.1
This is free and unencumbered software released into the public domain.
Copyright (c) 2019-2021 Biomedical Data Science (BDS), Luxembourg Centre for
Systems Biomedicine (LCSB), University of Luxembourg, Luxembourg
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org>
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
\ No newline at end of file
......@@ -7,3 +7,5 @@ importMethodsFrom("AnnotationDbi", "mget")
importFrom("stats", "model.matrix")
importFrom("utils", "read.delim")
importFrom("stats", "var")
importFrom("stats", "filter")
importFrom("methods", "as")
#' @title Creates a design matrix for a limma analysis.
#'
#' @description This function creates a design matrix for a limma analysis (without actually
#' running the analysis. The function parameters define which factors are included and how.
#'
#' @param pheno_data The data-frame that contains the phenotypic data (aka clinical data).
#' @param limma_parameters A list of useful Limma parameters that include (1) the main clinical
#' factor used to create sample categories (i.e., Disease status) and (2) the Limma coefficients
#' provided as formulas (similar to the limma::makeContrasts function). Additional parameters are
#' ignored for the time being.
#' @param correct_for_age A boolean indicating whether we should correct for age in the limma
#' design matrix. This expects that a clinical factor termed Age exists. Default to FALSE.
#' @param correct_for_batch A boolean indicating whether we should correct for a poential batch
#' effect in the limma design matrix. This expects that the actual batch information exists in
#' the object pheno_data (as field Batch). Default to FALSE.
#' @param correct_for_gender A boolean indicating whether we should correct for sample gender
#' in the limma design matrix. This expects that a clinical factor termed Gender exists. Default
#' to FALSE.
#' @param cofactor_name A string indicating a potential cofactor (included in the phenotypic data).
#' The default is NULL, and no cofactor is considered. This will replace the age correction if
#' both were deemed necessary.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The limma fit.
create_design <- function(pheno_data, limma_parameters,
correct_for_age = FALSE,
correct_for_batch = FALSE,
correct_for_gender = FALSE,
cofactor_name = NULL,
verbose = FALSE) {
# We extract the limma analysis parameters
clinical_factor <- limma_parameters[[1]]
has_block <- !is.null(cofactor_name)
clinical <- factor(pheno_data[[clinical_factor]])
rm(clinical_factor)
# We create the design matrix (regular case - no block).
# Depending on the co-factors, we have several possibilities.
# We sill decide based on (i) a block based on a co-factor (paired samples),
# (ii) the age clincal descriptor and (iii) the experimental batches.
# If we need to add a block based on a co-factor.
design <- NULL
if (has_block == FALSE) {
if (correct_for_age == FALSE) {
if (correct_for_batch == FALSE) {
if (correct_for_gender == FALSE) {
# We do not have a block, we do not include age, nor batch, nor gender.
design <- stats::model.matrix(~ 0 + clinical)
colnames(design) <- levels(clinical)
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created."))
}
} else {
# We do not have a block, we do not include age, nor batch, but we include gender.
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + genders)
colnames(design) <- c(levels(clinical), levels(genders)[2:length(levels(genders))])
# We log information.
rm(genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (genders)."))
}
}
} else {
# Here we have correct_for_batch = TRUE
if (correct_for_gender == FALSE) {
# We do not have a block, we do not include age, nor gender but we do include batch.
batches <- factor(pheno_data[["Batch"]])
design <- stats::model.matrix(~ 0 + clinical + batches)
colnames(design) <- c(levels(clinical), levels(batches)[2:length(levels(batches))])
# We log information.
rm(batches)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (batches)."))
}
} else {
# We do not have a block, we do not include age but we do include batch and gender.
batches <- factor(pheno_data[["Batch"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + batches + genders)
colnames(design) <- c(levels(clinical), levels(batches)[2:length(levels(batches))],
levels(genders)[2:length(levels(genders))])
# We log information.
rm(batches, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (batches, genders)."))
}
}
}
} else {
# Here we have correct_for_age = TRUE
if (correct_for_batch == FALSE) {
if (correct_for_gender == FALSE) {
# We do not have a block, we do include age, but not batch, nor gender.
ages <- as.numeric(pheno_data[["Age"]])
design <- stats::model.matrix(~ 0 + clinical + ages)
colnames(design) <- c(levels(clinical), "Age")
# We log information.
rm(ages)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (ages)."))
}
} else {
# We do not have a block, we do include age and gender but not batch.
ages <- as.numeric(pheno_data[["Age"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + ages + genders)
colnames(design) <- c(levels(clinical), "Age", levels(genders)[2:length(levels(genders))])
# We log information.
rm(ages, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (ages, genders)."))
}
}
} else {
# Here we have correct_for_batch = TRUE
if (correct_for_gender == FALSE) {
# We do not have a block, we do include age and batch but not gender.
ages <- as.numeric(pheno_data[["Age"]])
batches <- factor(pheno_data[["Batch"]])
design <- stats::model.matrix(~ 0 + clinical + ages + batches)
colnames(design) <- c(levels(clinical), "Age", levels(batches)[2:length(levels(batches))])
# We log information.
rm(ages, batches)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (ages, batches)."))
}
} else {
# We do not have a block, we do include age, batch and gender.
ages <- as.numeric(pheno_data[["Age"]])
batches <- factor(pheno_data[["Batch"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + ages + batches + genders)
colnames(design) <- c(levels(clinical), "Age", levels(batches)[2:length(levels(batches))],
levels(genders)[2:length(levels(genders))])
# We log information.
rm(ages, batches, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (ages, batches, genders)."))
}
}
}
}
} else {
# Here we have has_block = TRUE
if (correct_for_age == FALSE) {
if (correct_for_batch == FALSE) {
if (correct_for_gender == FALSE) {
# We do have a block, we do not include age, nor batch, nor gender.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
design <- stats::model.matrix(~ 0 + clinical + clinical_cofactor)
colnames(design) <- c(levels(clinical), cofactor_name)
# We log information.
rm(clinical_cofactor)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs)."))
}
} else {
# We do have a block, we do not include age, nor batch, but we include gender.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + genders + clinical_cofactor)
colnames(design) <- c(levels(clinical), levels(genders)[2:length(levels(genders))],
cofactor_name)
# We log information.
rm(clinical_cofactor, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, genders)."))
}
}
} else {
# Here we have correct_for_batch = TRUE
if (correct_for_gender == FALSE) {
# We do have a block, we do not include age, nor gender but we do include batch.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
batches <- factor(pheno_data[["Batch"]])
design <- stats::model.matrix(~ 0 + clinical + batches + clinical_cofactor)
colnames(design) <- c(levels(clinical),
levels(batches)[2:length(levels(batches))],
cofactor_name)
# We log information.
rm(clinical_cofactor, batches)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, batches)."))
}
} else {
# We do have a block, we do not include age, nor gender but we do include batch.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
batches <- factor(pheno_data[["Batch"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + batches + genders +
clinical_cofactor)
colnames(design) <- c(levels(clinical),
levels(batches)[2:length(levels(batches))],
levels(genders)[2:length(levels(genders))],
cofactor_name)
# We log information.
rm(clinical_cofactor, batches, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, batches, genders)."))
}
}
}
} else {
# Here we have correct_for_age = TRUE
if (correct_for_batch == FALSE) {
if (correct_for_gender == FALSE) {
# We do have a block, we do include age, but not batch, nor gender.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
ages <- as.numeric(pheno_data[["Age"]])
design <- stats::model.matrix(~ 0 + clinical + ages + clinical_cofactor)
colnames(design) <- c(levels(clinical), "Age", cofactor_name)
# We log information.
rm(clinical_cofactor, ages)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, ages)."))
}
} else {
# We do have a block, we do include age and gender but not batch.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
ages <- as.numeric(pheno_data[["Age"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + ages + genders +
clinical_cofactor)
colnames(design) <- c(levels(clinical), "Age",
levels(genders)[2:length(levels(genders))], cofactor_name)
# We log information.
rm(clinical_cofactor, ages, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, ages, genders)."))
}
}
} else {
# Here we have correct_for_batch = TRUE
if (correct_for_gender == FALSE) {
# We do have a block, we do include age and batch but not gender.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
ages <- as.numeric(pheno_data[["Age"]])
batches <- factor(pheno_data[["Batch"]])
design <- stats::model.matrix(~ 0 + clinical + ages + batches +
clinical_cofactor)
colnames(design) <- c(levels(clinical),
"Age",
levels(batches)[2:length(levels(batches))],
cofactor_name)
# We log information.
rm(clinical_cofactor, ages, batches)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, ages, batches)."))
}
} else {
# We do have a block, we do include age, batch and gender.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
ages <- as.numeric(pheno_data[["Age"]])
batches <- factor(pheno_data[["Batch"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + ages + batches + genders +
clinical_cofactor)
colnames(design) <- c(levels(clinical),
"Age",
levels(batches)[2:length(levels(batches))],
levels(genders)[2:length(levels(genders))],
cofactor_name)
# We log information.
rm(clinical_cofactor, ages, batches, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(),
"] Design matrix created (pairs, ages, batches, genders)."))
}
}
}
}
}
rm(clinical)
# We return the design.
return(design)
}
......@@ -19,7 +19,7 @@
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return NULL
extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
extract_degs <- function(fit, limma_coeffs, k, output_data_dir,
file_prefix = "",
file_suffix = "",
pval_adjust_method = "BH",
......@@ -44,7 +44,8 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
coef = paste0("coeff", k),
n = Inf,
adjust.method = pval_adjust_method,
confint = TRUE)
confint = TRUE,
sort.by = "P")
# Another way to extract the differentially expressed genes (in parallel to topTable).
# This is only used to then create the Venn diagrams.
......
......@@ -17,14 +17,14 @@ get_gene_annots_from_file <- function(folder, filename, entities) {
file_path <- paste0(folder, filename)
# We read the file into a data-frame.
gene_annots <- read.delim(file_path, row.names = NULL, stringsAsFactors = FALSE)
gene_annots <- read.delim(file_path, header = FALSE, row.names = NULL, stringsAsFactors = FALSE)
rm(file_path)
# We do a bit of cleaning.
gene_annots[is.na(gene_annots)] <- ""
rownames(gene_annots) <- gene_annots$PROBEID
gene_annots <- gene_annots[entities, ]
rownames(gene_annots) <- seq(1, length(rownames(gene_annots)))
names(gene_annots) <- c("PROBEID", "SYMBOL")
entities_asdf <- data.frame("PROBEID" = entities)
gene_annots_clean <- merge(entities_asdf, gene_annots, by = "PROBEID", all.x = TRUE)
rm(entities_asdf, gene_annots)
return(gene_annots)
return(gene_annots_clean)
}
#' @title Loads a table containing clinical data.
#'
#' @description This function loads the clinical data associated with a dataset. It returns an annotated
#' data-frame that contains the clinical data.
#' @description This function loads the clinical data associated with a dataset. It returns an
#' annotated data-frame that contains the clinical data.
#'
#' Note: the function assumes that a TSV file containing the clinical data exists. In
#' particular, it does not check for the existence of folders or files.
#'
#' @param data_dir A string representing the folder that contains the clinical data.
#' @param clinical_file_name A string containing the file name. By default, this is 'ClinicalData.tsv'
#' @param use_factors A boolean stating whether the columns should be read as factors (default FALSE).
#' @param clinical_file_name A string containing the file name. By default, this is
#' 'ClinicalData.tsv'
#' @param use_factors A boolean stating whether the columns should be read as
#' factors (default FALSE).
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' is FALSE by default.
#' @return An annotated data-frame that contains the clinical data.
load_clinical_data <- function(data_dir,
clinical_file_name = "ClinicalData.tsv",
......@@ -35,6 +37,14 @@ load_clinical_data <- function(data_dir,
row.names = 1))
}
# Special case: when gender is set ot F for all rows, the system reads it as a boolean.
raw_df <- as.data.frame(Biobase::pData(pheno_data))
if (typeof(raw_df$Gender) == "logical") {
raw_df$Gender[raw_df$Gender == FALSE] <- "F"
}
pheno_data <- as(raw_df, "AnnotatedDataFrame")
rm(raw_df)
# We clean up and log information.
rm(clinical_data_file)
if (verbose == TRUE) {
......
......@@ -20,6 +20,8 @@
#' Default to NULL since in most cases, we do not have pre-processed data already.
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param expression_filename A String indicating where the expression matrix can be found,
#' default to 'Expdata.tsv'.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
......@@ -30,14 +32,15 @@
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data <- function(input_data_dir, output_data_files,
platform = "Affymetix",
method = "SCAN",
exprs_raw = NULL,
compressed = FALSE,
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = FALSE) {
platform = "Affymetix",
method = "SCAN",
exprs_raw = NULL,
compressed = FALSE,
expression_filename = "ExpData.tsv",
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
verbose = FALSE) {
# We launch the correct function depending on the array platform and desired method.
esets <- NULL
......@@ -83,6 +86,14 @@ preprocess_data <- function(input_data_dir, output_data_files,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
} else if (platform == "RNAseq") {
esets <- preprocess_data_rnaseq(input_data_dir,
output_data_files,
expression_filename = expression_filename,
batch_correction = batch_correction,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
} else {
message(paste0("[", Sys.time(), "] [WARNING] Platform ", platform,
" not yet supported (no preprocessing done)."))
......
......@@ -70,7 +70,7 @@ preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_files
probe_status <- ifelse(Biobase::fData(gse_data)$PROBEQUALITY == "No match",
"negative",
"regular")
Biobase::fData(gse_data)$Status <- probe_status
Biobase::fData(gse_data)$Status <- probe_status # nolint
beadarray::Detection(gse_data) <- beadarray::calculateDetection(gse_data,
status = probe_status)
......
#' @title Post preprocess a RNA-seq dataset.
#'
#' @description This function preprocess a dataset that was already pre-processed
#' in order to perform the cleaning / filtering / batch corection steps.
#' The results are saves in TSV file(s). In addition, it returns the ESET object(s).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param expression_filename A String indicating where the expression matrix can be found,
#' default to 'Expdata.tsv'.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet suported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param remove_low_counts A boolean indicating whether low counts should be removed prior
#' to the normalization, TRUE by default.
#' @param low_count_threshold A numeric representing the maximum number of counts associated to
#' transcripts considered as not lowly expressed. Default value is 5.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_rnaseq <- function(input_data_dir, output_data_files,
expression_filename = "ExpData.tsv",
batch_correction = "FALSE",
batch_filename = "Batch.tsv",
clean_samples = FALSE,
remove_low_counts = TRUE,
low_count_threshold = 5,
verbose = FALSE) {
# We read the pre-processed data (RNA-seq).
exp_data_fn <- paste0(input_data_dir, expression_filename)
exp_data <- ArrayUtils::read_eset(exp_data_fn, as_eset = FALSE)
rm(exp_data_fn)
# The RNA-seq data consists of counts so we need to post-process
# it to get some kind of microarray compatible data (for instance on a log2 scale).
# We first need to filter the low expressed transcripts.
if (remove_low_counts) {
keep <- rowSums(exp_data) > low_count_threshold
exp_data <- exp_data[keep, ]
rm(keep)
}
# We need to normalize the counts.
exp_data_dge <- edgeR::DGEList(counts = exp_data)
exp_data_dge <- edgeR::calcNormFactors(exp_data_dge)
rm(exp_data)
# CPM then does the normalization and the log-transformation.
exp_data_norm <- limma::voom(exp_data_dge, normalize.method = "quantile")$E
rm(exp_data_dge)
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Raw data read"))