Commit ada4dbce authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent

Preprocessing functions added.

parent 53aad4aa
@@ -6,9 +6,9 @@ Author: Leon-Charles Tranchevent
Maintainer: Leon-Charles Tranchevent <leon-charles.tranchevent@uni.lu>
Description: This package contains functions to analyse microarray data.
It is more a set of useful functions than a real package.
-License: The Unlicense
+License: The Unlicense (see LICENSE)
Encoding: UTF-8
LazyData: true
Imports:
-    utils,affy,Biobase,arrayQualityMetrics
+    utils,affy,Biobase,arrayQualityMetrics,SCAN.UPC,doParallel
RoxygenNote: 6.1.1
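The packages added to Imports (SCAN.UPC from Bioconductor, doParallel from CRAN) must be installed before the new functions can run; a minimal installation sketch, assuming BiocManager is the installer, could look like this:

# Installation sketch (editor's illustration, not part of the commit); assumes
# BiocManager is used for the Bioconductor dependencies.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install(c("SCAN.UPC", "affy", "Biobase", "arrayQualityMetrics"))
install.packages("doParallel")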
#' @title Loads a table containing clinical data.
#'
#' @description This function loads the clinical data associated with a dataset. It returns an annotated
#' data-frame that contains the clinical data.
#'
#' Note: the function assumes that a TSV file containing the clinical data exists. In
#' particular, it does not check for the existence of folders or files.
#'
#' @param data_dir A string representing the folder that contains the clinical data.
#' @param clinical_file_name A string containing the file name. By default, this is 'ClinicalData.tsv'.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return An annotated data-frame that contains the clinical data.
load_clinical_data <- function(data_dir,
                               clinical_file_name = "ClinicalData.tsv",
                               verbose = TRUE) {
  # We define the I/Os.
  clinical_data_file <- paste0(data_dir, clinical_file_name)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] File set to ", clinical_data_file))
  }
  # We load the clinical data.
  pheno_data <- Biobase::AnnotatedDataFrame(utils::read.delim(file = clinical_data_file,
                                                              row.names = 1,
                                                              colClasses = "factor"))
  # We clean up and log information.
  rm(clinical_data_file)
  if (verbose == TRUE) {
    data_dimensions <- paste0(dim(pheno_data), collapse = " * ")
    message(paste0("[", Sys.time(), "] Clinical data read (", data_dimensions, ")."))
  }
  # We return the clinical data.
  return(pheno_data)
}
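# Usage sketch (editor's illustration, not part of the committed source): the folder
# "data/GSE20333/" and its ClinicalData.tsv file are hypothetical; the function does
# not check that they exist.
if (FALSE) {
  pheno_data <- load_clinical_data(data_dir = "data/GSE20333/")
  # The returned AnnotatedDataFrame can be inspected with the Biobase accessors.
  Biobase::pData(pheno_data)
  Biobase::varLabels(pheno_data)
}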
#' @title Preprocesses a dataset with SCAN.
#'
#' @description This function preprocesses a dataset using SCAN and saves the results in
#' a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data (as CEL files) exists.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param correct_for_batch_effect A boolean indicating whether batch correction should
#' be performed. This is FALSE by default.
#' @param batch_filename A string indicating where the batch information can be found.
#' This is 'Batch.tsv' by default.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data_scan <- function(input_data_dir, output_data_file,
                                 correct_for_batch_effect = FALSE,
                                 batch_filename = "Batch.tsv",
                                 verbose = TRUE) {
  # We define the I/Os.
  raw_data_input_dir <- paste0(input_data_dir, "RAW/")
  # We run the SCAN pre-processing method on the data.
  # We do not run the fast analysis (by default).
  input_data_regexp <- paste0(raw_data_input_dir, "*")
  remove(raw_data_input_dir)
  eset <- vector()
  if (correct_for_batch_effect == FALSE) {
    eset <- SCAN.UPC::SCAN(input_data_regexp, outFilePath = output_data_file)
  } else {
    # We define the I/Os (for batch).
    batch_data_file <- paste0(input_data_dir, batch_filename)
    eset <- SCAN.UPC::SCAN(input_data_regexp,
                           outFilePath = output_data_file,
                           batchFilePath = batch_data_file)
    remove(batch_data_file)
  }
  # We clean up and log information.
  rm(input_data_regexp)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] Expression data pre-processed with SCAN."))
  }
  # We return the created ESET.
  return(eset)
}
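# Usage sketch (editor's illustration, not part of the committed source): the paths are
# hypothetical; "data/GSE20333/" is assumed to contain the CEL files in a RAW/ subfolder
# and, when batch correction is requested, a Batch.tsv file.
if (FALSE) {
  # Without batch correction (default).
  eset <- preprocess_data_scan(input_data_dir   = "data/GSE20333/",
                               output_data_file = "output/GSE20333_scan.tsv")
  # With batch correction, reading the batch assignments from Batch.tsv.
  eset_bc <- preprocess_data_scan(input_data_dir           = "data/GSE20333/",
                                  output_data_file         = "output/GSE20333_scan_bc.tsv",
                                  correct_for_batch_effect = TRUE)
  # The preprocessed intensities are available through the Biobase accessors.
  Biobase::exprs(eset)[1:5, 1:3]
}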
#' @title Executes a quality control of a given microarray dataset (preprocessed data).
#'
#' @description This function executes a quality control of the dataset defined by the input parameters.
#' It starts by loading the associated clinical data to annotate the preprocessed data and
#' then runs the quality control of the annotated data. The function assumes that a folder with
#' the clinical data exists. It then creates a report that contains various quality
#' indicators and is stored as an HTML document. It does not return any value.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param eset An ESET object that contains the preprocessed expression data.
#' @param input_data_dir A string representing the folder that contains the input data (clinical data).
#' @param output_data_dir A string representing the folder that will contain the output of the QC.
#' @param phenotype_groups A list of phenotype factor names that can be used to highlight the
#' samples in the QC report. This is empty by default.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return NULL
run_quality_control_on_preprocessed <- function(eset, input_data_dir, output_data_dir,
                                                phenotype_groups = vector(),
                                                verbose = TRUE) {
  # We load the clinical data to annotate the ESET object and make the QC more useful.
  pheno_data <- ArrayUtils::load_clinical_data(input_data_dir, verbose = verbose)
  Biobase::phenoData(eset) <- pheno_data
  remove(pheno_data)
  # Now, we do the QC on the normalized data.
  arrayQualityMetrics::arrayQualityMetrics(expressionset = eset,
                                           outdir = output_data_dir,
                                           force = TRUE,
                                           do.logtransform = TRUE,
                                           intgroup = phenotype_groups)
  # We clean up and log information.
  rm(eset)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "] QC analysis performed (preprocessed data)."))
  }
}
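# Usage sketch (editor's illustration, not part of the committed source): the paths and
# the "Disease.status" factor are hypothetical; the input folder is assumed to contain
# ClinicalData.tsv with a column of that name, and eset is an ExpressionSet such as the
# one returned by preprocess_data_scan().
if (FALSE) {
  run_quality_control_on_preprocessed(eset,
                                      input_data_dir   = "data/GSE20333/",
                                      output_data_dir  = "output/GSE20333_QC/",
                                      phenotype_groups = c("Disease.status"))
}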
-#' @title Executes a quality control of a given micro-array dataset.
+#' @title Executes a quality control of a given microarray dataset (raw data).
#'
#' @description This function executes a quality control of the dataset defined by the input parameters.
#' It starts by loading the clinical data associated with the dataset, then loads the raw data and
-#' last runs the quality control of the annotated data.
-#'
-#' The function assumes that the dataset is associated with a unique name (e.g., GEO identifier)
-#' and that a folder with this name exists and contains both the clinical data ('ClinicalData.tsv')
-#' and the raw data (as cel files in a '/RAW/' folder).
+#' last runs the quality control of the annotated data. The function assumes that a folder with
+#' both the clinical data and the raw data exists (in a subfolder '/RAW/'). It then creates a report
+#' that contains various quality indicators and is stored as an HTML document. It does not return
+#' any value.
#'
-#' It then creates a report that contains various quality indicators and is stored as an HTML
-#' document. It does not return any value.
+#' Note: the function does not check for the existence of folders or files.
#'
-#' Note: the function does not check for the existence of folders and files (not a real package
-#' function).
#'
-#' @param dataset_name A string representing the name of the dataset to analyse.
-#' @param raw_data_dir A string representing the folder that contains the input data.
+#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' TRUE by default.
@@ -22,29 +17,18 @@
#' samples in the QC report. This is empty by default.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
-#' @return NULL
-run_quality_control <- function(dataset_name,
-raw_data_dir,
+#' @return NULL
+run_quality_control_on_raw <- function(input_data_dir,
output_data_dir,
compressed = TRUE,
-phenotype_groups = c(),
+phenotype_groups = vector(),
verbose = TRUE) {
# We define the I/Os.
-clinical_data_file <- paste0(raw_data_dir, dataset_name, "/", "ClinicalData.tsv")
-raw_data_input_dir <- paste0(raw_data_dir, dataset_name, "/", "RAW/")
-data_output_dir <- paste0(output_data_dir, dataset_name, "/")
+raw_data_input_dir <- paste0(input_data_dir, "RAW/")
# We load the clinical data to annotate the AffyBatch object and make the QC more useful.
-pheno_data <- Biobase::AnnotatedDataFrame(utils::read.delim(file = clinical_data_file,
-row.names = 1,
-colClasses = "factor"))
-# We clean up and log information.
-rm(clinical_data_file)
-if (verbose == TRUE) {
-message(paste0("[", Sys.time(), "][", dataset_name, "] Phenotypic data read."))
-}
+pheno_data <- ArrayUtils::load_clinical_data(input_data_dir, verbose = verbose)
# We load the CEL files to create the affyBatch object and then attach the clinical data.
raw_file_list <- affy::list.celfiles(raw_data_input_dir, full.names = TRUE)
@@ -54,19 +38,19 @@ run_quality_control <- function(dataset_name,
# We clean up and log information.
rm(raw_file_list, pheno_data, raw_data_input_dir)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "][", dataset_name, "] Expression data read."))
message(paste0("[", Sys.time(), "] Expression data read."))
}
# We run the quality control itself.
arrayQualityMetrics::arrayQualityMetrics(expressionset = batch,
-outdir = data_output_dir,
+outdir = output_data_dir,
force = TRUE,
do.logtransform = TRUE,
intgroup = phenotype_groups)
# We clean up and log information.
-rm(data_output_dir, batch)
+rm(batch)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "][", dataset_name, "] QC analysis performed."))
message(paste0("[", Sys.time(), "] QC analysis performed (raw data)."))
}
}
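# Usage sketch (editor's illustration, not part of the commit): the paths and the
# "Disease.status" factor are hypothetical; the input folder is assumed to contain
# ClinicalData.tsv and a RAW/ subfolder with the CEL files.
if (FALSE) {
  run_quality_control_on_raw(input_data_dir   = "data/GSE20333/",
                             output_data_dir  = "output/GSE20333_QC_raw/",
                             phenotype_groups = c("Disease.status"))
}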