Gitlab is now using https://gitlab.lcsb.uni.lu as it's primary address. Please update your bookmarks. FAQ.

Commit d767e271 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Merge branch 'develop' into 'master'.

parents 6df04fcd 60786b39
Package: ArrayUtils Package: ArrayUtils
Type: Package Type: Package
Title: Utils For Array Processing Title: Utils For Array Processing
Version: 0.3.2 Version: 0.4.0
Author: Leon-Charles Tranchevent Author: Leon-Charles Tranchevent
Maintainer: Leon-Charles Tranchevent <leon-charles.tranchevent@uni.lu> Maintainer: Leon-Charles Tranchevent <leon-charles.tranchevent@uni.lu>
Description: This package contains functions to analyse microarray data. Description: This package contains functions to analyse microarray data.
...@@ -14,12 +14,14 @@ Imports: ...@@ -14,12 +14,14 @@ Imports:
utils, utils,
stats, stats,
affy, affy,
oligo,
gcrma, gcrma,
arrayQualityMetrics, arrayQualityMetrics,
SCAN.UPC, SCAN.UPC,
sva, sva,
massiR, massiR,
limma, limma,
edgeR,
stringr, stringr,
AnnotationDbi, AnnotationDbi,
statmod, statmod,
...@@ -32,4 +34,4 @@ Imports: ...@@ -32,4 +34,4 @@ Imports:
methods, methods,
illuminaHumanv3.db, illuminaHumanv3.db,
topconfects topconfects
RoxygenNote: 6.1.1 RoxygenNote: 7.1.1
This is free and unencumbered software released into the public domain. Copyright (c) 2019-2021 Biomedical Data Science (BDS), Luxembourg Centre for
Systems Biomedicine (LCSB), University of Luxembourg, Luxembourg
Anyone is free to copy, modify, publish, use, compile, sell, or Permission is hereby granted, free of charge, to any person obtaining a copy
distribute this software, either in source code form or as a compiled of this software and associated documentation files (the "Software"), to deal
binary, for any purpose, commercial or non-commercial, and by any in the Software without restriction, including without limitation the rights
means. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
In jurisdictions that recognize copyright laws, the author or authors The above copyright notice and this permission notice shall be included in all
of this software dedicate any and all copyright interest in the copies or substantial portions of the Software.
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
OTHER DEALINGS IN THE SOFTWARE. SOFTWARE.
\ No newline at end of file
For more information, please refer to <http://unlicense.org>
...@@ -7,3 +7,5 @@ importMethodsFrom("AnnotationDbi", "mget") ...@@ -7,3 +7,5 @@ importMethodsFrom("AnnotationDbi", "mget")
importFrom("stats", "model.matrix") importFrom("stats", "model.matrix")
importFrom("utils", "read.delim") importFrom("utils", "read.delim")
importFrom("stats", "var") importFrom("stats", "var")
importFrom("stats", "filter")
importFrom("methods", "as")
#' @title Creates a design matrix for a limma analysis.
#'
#' @description This function creates a design matrix for a limma analysis (without actually
#' running the analysis. The function parameters define which factors are included and how.
#'
#' @param pheno_data The data-frame that contains the phenotypic data (aka clinical data).
#' @param limma_parameters A list of useful Limma parameters that include (1) the main clinical
#' factor used to create sample categories (i.e., Disease status) and (2) the Limma coefficients
#' provided as formulas (similar to the limma::makeContrasts function). Additional parameters are
#' ignored for the time being.
#' @param correct_for_age A boolean indicating whether we should correct for age in the limma
#' design matrix. This expects that a clinical factor termed Age exists. Default to FALSE.
#' @param correct_for_batch A boolean indicating whether we should correct for a poential batch
#' effect in the limma design matrix. This expects that the actual batch information exists in
#' the object pheno_data (as field Batch). Default to FALSE.
#' @param correct_for_gender A boolean indicating whether we should correct for sample gender
#' in the limma design matrix. This expects that a clinical factor termed Gender exists. Default
#' to FALSE.
#' @param cofactor_name A string indicating a potential cofactor (included in the phenotypic data).
#' The default is NULL, and no cofactor is considered. This will replace the age correction if
#' both were deemed necessary.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The limma fit.
create_design <- function(pheno_data, limma_parameters,
correct_for_age = FALSE,
correct_for_batch = FALSE,
correct_for_gender = FALSE,
cofactor_name = NULL,
verbose = FALSE) {
# We extract the limma analysis parameters
clinical_factor <- limma_parameters[[1]]
has_block <- !is.null(cofactor_name)
clinical <- factor(pheno_data[[clinical_factor]])
rm(clinical_factor)
# We create the design matrix (regular case - no block).
# Depending on the co-factors, we have several possibilities.
# We sill decide based on (i) a block based on a co-factor (paired samples),
# (ii) the age clincal descriptor and (iii) the experimental batches.
# If we need to add a block based on a co-factor.
design <- NULL
if (has_block == FALSE) {
if (correct_for_age == FALSE) {
if (correct_for_batch == FALSE) {
if (correct_for_gender == FALSE) {
# We do not have a block, we do not include age, nor batch, nor gender.
design <- stats::model.matrix(~ 0 + clinical)
colnames(design) <- levels(clinical)
# We log information.
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created."))
}
} else {
# We do not have a block, we do not include age, nor batch, but we include gender.
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + genders)
colnames(design) <- c(levels(clinical), levels(genders)[2:length(levels(genders))])
# We log information.
rm(genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (genders)."))
}
}
} else {
# Here we have correct_for_batch = TRUE
if (correct_for_gender == FALSE) {
# We do not have a block, we do not include age, nor gender but we do include batch.
batches <- factor(pheno_data[["Batch"]])
design <- stats::model.matrix(~ 0 + clinical + batches)
colnames(design) <- c(levels(clinical), levels(batches)[2:length(levels(batches))])
# We log information.
rm(batches)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (batches)."))
}
} else {
# We do not have a block, we do not include age but we do include batch and gender.
batches <- factor(pheno_data[["Batch"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + batches + genders)
colnames(design) <- c(levels(clinical), levels(batches)[2:length(levels(batches))],
levels(genders)[2:length(levels(genders))])
# We log information.
rm(batches, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (batches, genders)."))
}
}
}
} else {
# Here we have correct_for_age = TRUE
if (correct_for_batch == FALSE) {
if (correct_for_gender == FALSE) {
# We do not have a block, we do include age, but not batch, nor gender.
ages <- as.numeric(pheno_data[["Age"]])
design <- stats::model.matrix(~ 0 + clinical + ages)
colnames(design) <- c(levels(clinical), "Age")
# We log information.
rm(ages)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (ages)."))
}
} else {
# We do not have a block, we do include age and gender but not batch.
ages <- as.numeric(pheno_data[["Age"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + ages + genders)
colnames(design) <- c(levels(clinical), "Age", levels(genders)[2:length(levels(genders))])
# We log information.
rm(ages, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (ages, genders)."))
}
}
} else {
# Here we have correct_for_batch = TRUE
if (correct_for_gender == FALSE) {
# We do not have a block, we do include age and batch but not gender.
ages <- as.numeric(pheno_data[["Age"]])
batches <- factor(pheno_data[["Batch"]])
design <- stats::model.matrix(~ 0 + clinical + ages + batches)
colnames(design) <- c(levels(clinical), "Age", levels(batches)[2:length(levels(batches))])
# We log information.
rm(ages, batches)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (ages, batches)."))
}
} else {
# We do not have a block, we do include age, batch and gender.
ages <- as.numeric(pheno_data[["Age"]])
batches <- factor(pheno_data[["Batch"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + ages + batches + genders)
colnames(design) <- c(levels(clinical), "Age", levels(batches)[2:length(levels(batches))],
levels(genders)[2:length(levels(genders))])
# We log information.
rm(ages, batches, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (ages, batches, genders)."))
}
}
}
}
} else {
# Here we have has_block = TRUE
if (correct_for_age == FALSE) {
if (correct_for_batch == FALSE) {
if (correct_for_gender == FALSE) {
# We do have a block, we do not include age, nor batch, nor gender.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
design <- stats::model.matrix(~ 0 + clinical + clinical_cofactor)
colnames(design) <- c(levels(clinical), cofactor_name)
# We log information.
rm(clinical_cofactor)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs)."))
}
} else {
# We do have a block, we do not include age, nor batch, but we include gender.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + genders + clinical_cofactor)
colnames(design) <- c(levels(clinical), levels(genders)[2:length(levels(genders))],
cofactor_name)
# We log information.
rm(clinical_cofactor, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, genders)."))
}
}
} else {
# Here we have correct_for_batch = TRUE
if (correct_for_gender == FALSE) {
# We do have a block, we do not include age, nor gender but we do include batch.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
batches <- factor(pheno_data[["Batch"]])
design <- stats::model.matrix(~ 0 + clinical + batches + clinical_cofactor)
colnames(design) <- c(levels(clinical),
levels(batches)[2:length(levels(batches))],
cofactor_name)
# We log information.
rm(clinical_cofactor, batches)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, batches)."))
}
} else {
# We do have a block, we do not include age, nor gender but we do include batch.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
batches <- factor(pheno_data[["Batch"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + batches + genders +
clinical_cofactor)
colnames(design) <- c(levels(clinical),
levels(batches)[2:length(levels(batches))],
levels(genders)[2:length(levels(genders))],
cofactor_name)
# We log information.
rm(clinical_cofactor, batches, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, batches, genders)."))
}
}
}
} else {
# Here we have correct_for_age = TRUE
if (correct_for_batch == FALSE) {
if (correct_for_gender == FALSE) {
# We do have a block, we do include age, but not batch, nor gender.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
ages <- as.numeric(pheno_data[["Age"]])
design <- stats::model.matrix(~ 0 + clinical + ages + clinical_cofactor)
colnames(design) <- c(levels(clinical), "Age", cofactor_name)
# We log information.
rm(clinical_cofactor, ages)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, ages)."))
}
} else {
# We do have a block, we do include age and gender but not batch.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
ages <- as.numeric(pheno_data[["Age"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + ages + genders +
clinical_cofactor)
colnames(design) <- c(levels(clinical), "Age",
levels(genders)[2:length(levels(genders))], cofactor_name)
# We log information.
rm(clinical_cofactor, ages, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, ages, genders)."))
}
}
} else {
# Here we have correct_for_batch = TRUE
if (correct_for_gender == FALSE) {
# We do have a block, we do include age and batch but not gender.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
ages <- as.numeric(pheno_data[["Age"]])
batches <- factor(pheno_data[["Batch"]])
design <- stats::model.matrix(~ 0 + clinical + ages + batches +
clinical_cofactor)
colnames(design) <- c(levels(clinical),
"Age",
levels(batches)[2:length(levels(batches))],
cofactor_name)
# We log information.
rm(clinical_cofactor, ages, batches)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Design matrix created (pairs, ages, batches)."))
}
} else {
# We do have a block, we do include age, batch and gender.
clinical_cofactor <- factor(pheno_data[[cofactor_name]]) # nolint
ages <- as.numeric(pheno_data[["Age"]])
batches <- factor(pheno_data[["Batch"]])
genders <- factor(pheno_data[["Gender"]])
design <- stats::model.matrix(~ 0 + clinical + ages + batches + genders +
clinical_cofactor)
colnames(design) <- c(levels(clinical),
"Age",
levels(batches)[2:length(levels(batches))],
levels(genders)[2:length(levels(genders))],
cofactor_name)
# We log information.
rm(clinical_cofactor, ages, batches, genders)
if (verbose == TRUE) {
message(paste0("[", Sys.time(),
"] Design matrix created (pairs, ages, batches, genders)."))
}
}
}
}
}
rm(clinical)
# We return the design.
return(design)
}
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
#' @param verbose A boolean representing whether the function should display log information. This #' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default. #' is FALSE by default.
#' @return NULL #' @return NULL
extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir, extract_degs <- function(fit, limma_coeffs, k, output_data_dir,
file_prefix = "", file_prefix = "",
file_suffix = "", file_suffix = "",
pval_adjust_method = "BH", pval_adjust_method = "BH",
...@@ -44,7 +44,8 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir, ...@@ -44,7 +44,8 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
coef = paste0("coeff", k), coef = paste0("coeff", k),
n = Inf, n = Inf,
adjust.method = pval_adjust_method, adjust.method = pval_adjust_method,
confint = TRUE) confint = TRUE,
sort.by = "P")
# Another way to extract the differentially expressed genes (in parallel to topTable). # Another way to extract the differentially expressed genes (in parallel to topTable).
# This is only used to then create the Venn diagrams. # This is only used to then create the Venn diagrams.
......
...@@ -17,14 +17,14 @@ get_gene_annots_from_file <- function(folder, filename, entities) { ...@@ -17,14 +17,14 @@ get_gene_annots_from_file <- function(folder, filename, entities) {
file_path <- paste0(folder, filename) file_path <- paste0(folder, filename)
# We read the file into a data-frame. # We read the file into a data-frame.
gene_annots <- read.delim(file_path, row.names = NULL, stringsAsFactors = FALSE) gene_annots <- read.delim(file_path, header = FALSE, row.names = NULL, stringsAsFactors = FALSE)
rm(file_path) rm(file_path)
# We do a bit of cleaning. # We do a bit of cleaning.
gene_annots[is.na(gene_annots)] <- "" names(gene_annots) <- c("PROBEID", "SYMBOL")
rownames(gene_annots) <- gene_annots$PROBEID entities_asdf <- data.frame("PROBEID" = entities)
gene_annots <- gene_annots[entities, ] gene_annots_clean <- merge(entities_asdf, gene_annots, by = "PROBEID", all.x = TRUE)
rownames(gene_annots) <- seq(1, length(rownames(gene_annots))) rm(entities_asdf, gene_annots)
return(gene_annots) return(gene_annots_clean)
} }
#' @title Loads a table containing clinical data. #' @title Loads a table containing clinical data.
#' #'
#' @description This function loads the clinical data associated with a dataset. It returns an annotated #' @description This function loads the clinical data associated with a dataset. It returns an
#' data-frame that contains the clinical data. #' annotated data-frame that contains the clinical data.
#' #'
#' Note: the function assumes that a TSV file containing the clinical data exists. In #' Note: the function assumes that a TSV file containing the clinical data exists. In
#' particular, it does not check for the existence of folders or files. #' particular, it does not check for the existence of folders or files.
#' #'
#' @param data_dir A string representing the folder that contains the clinical data. #' @param data_dir A string representing the folder that contains the clinical data.
#' @param clinical_file_name A string containing the file name. By default, this is 'ClinicalData.tsv' #' @param clinical_file_name A string containing the file name. By default, this is
#' @param use_factors A boolean stating whether the columns should be read as factors (default FALSE). #' 'ClinicalData.tsv'
#' @param use_factors A boolean stating whether the columns should be read as
#' factors (default FALSE).
#' @param verbose A boolean representing whether the function should display log information. This #' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default. #' is FALSE by default.
#' @return An annotated data-frame that contains the clinical data. #' @return An annotated data-frame that contains the clinical data.
load_clinical_data <- function(data_dir, load_clinical_data <- function(data_dir,
clinical_file_name = "ClinicalData.tsv", clinical_file_name = "ClinicalData.tsv",
...@@ -35,6 +37,14 @@ load_clinical_data <- function(data_dir, ...@@ -35,6 +37,14 @@ load_clinical_data <- function(data_dir,
row.names = 1)) row.names = 1))
} }
# Special case: when gender is set ot F for all rows, the system reads it as a boolean.
raw_df <- as.data.frame(Biobase::pData(pheno_data))
if (typeof(raw_df$Gender) == "logical") {
raw_df$Gender[raw_df$Gender == FALSE] <- "F"
}
pheno_data <- as(raw_df, "AnnotatedDataFrame")
rm(raw_df)
# We clean up and log information. # We clean up and log information.
rm(clinical_data_file) rm(clinical_data_file)
if (verbose == TRUE) { if (verbose == TRUE) {
......
...@@ -20,6 +20,8 @@ ...@@ -20,6 +20,8 @@
#' Default to NULL since in most cases, we do not have pre-processed data already. #' Default to NULL since in most cases, we do not have pre-processed data already.
#' @param compressed A boolean representing whether the cel files are compressed. This #' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default. #' is FALSE by default.
#' @param expression_filename A String indicating where the expression matrix can be found,
#' default to 'Expdata.tsv'.
#' @param batch_correction A String indicating whether batch correction should #' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE". #' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found, #' @param batch_filename A string indicating where the batch information can be found,
...@@ -30,14 +32,15 @@ ...@@ -30,14 +32,15 @@
#' is FALSE by default. #' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted). #' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data <- function(input_data_dir, output_data_files, preprocess_data <- function(input_data_dir, output_data_files,
platform = "Affymetix", platform = "Affymetix",
method = "SCAN", method = "SCAN",
exprs_raw = NULL, exprs_raw = NULL,
compressed = FALSE, compressed = FALSE,
batch_correction = "FALSE", expression_filename = "ExpData.tsv",
batch_filename = "Batch.tsv", batch_correction = "FALSE",
clean_samples = FALSE, batch_filename = "Batch.tsv",
verbose = FALSE) { clean_samples = FALSE,
verbose = FALSE) {
# We launch the correct function depending on the array platform and desired method. # We launch the correct function depending on the array platform and desired method.
esets <- NULL esets <- NULL
...@@ -83,6 +86,14 @@ preprocess_data <- function(input_data_dir, output_data_files, ...@@ -83,6 +86,14 @@ preprocess_data <- function(input_data_dir, output_data_files,
batch_filename = batch_filename, batch_filename = batch_filename,
clean_samples = clean_samples, clean_samples = clean_samples,
verbose = verbose) verbose = verbose)
} else if (platform == "RNAseq") {
esets <- preprocess_data_rnaseq(input_data_dir,
output_data_files,
expression_filename = expression_filename,
batch_correction = batch_correction,
batch_filename = batch_filename,
clean_samples = clean_samples,
verbose = verbose)
} else { } else {
message(paste0("[", Sys.time(), "] [WARNING] Platform ", platform, message(paste0("[", Sys.time(), "] [WARNING] Platform ", platform,
" not yet supported (no preprocessing done).")) " not yet supported (no preprocessing done)."))
......
...@@ -70,7 +70,7 @@ preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_files ...@@ -70,7 +70,7 @@ preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_files
probe_status <- ifelse(Biobase::fData(gse_data)$PROBEQUALITY == "No match", probe_status <- ifelse(Biobase::fData(gse_data)$PROBEQUALITY == "No match",
"negative", "negative",
"regular") "regular")
Biobase::fData(gse_data)$Status <- probe_status Biobase::fData(gse_data)$Status <- probe_status # nolint
beadarray::Detection(gse_data) <- beadarray::calculateDetection(gse_data, beadarray::Detection(gse_data) <- beadarray::calculateDetection(gse_data,
status = probe_status) status = probe_status)
......
#' @title Post preprocess a RNA-seq dataset.