Commit 1d0ee0bb authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Added more processing functions and gender prediction methods.

parent ada4dbce
......@@ -10,5 +10,10 @@ License: The Unlicense (see LICENSE)
Encoding: UTF-8
LazyData: true
Imports:
utils,affy,Biobase,arrayQualityMetrics,SCAN.UPC,doParallel
Biobase,
utils,
affy,
arrayQualityMetrics,
SCAN.UPC,
massiR
RoxygenNote: 6.1.1
#' @title Preprocess a dataset with RMA.
#'
#' @description This function preprocess a dataset using RMA and saves the results in
#' a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_file A string representing the file that should contain the
#' preprocessed data.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is TRUE by default.
#' @return The expression data as an ESET object.
preprocess_data_rma <- function(input_data_dir, output_data_file,
compressed = TRUE,
verbose = TRUE) {
# We define the I/Os.
raw_data_input_dir <- paste0(input_data_dir, "RAW/")
# We run the RMA pre-processing method on the data.
input_data_files <- affy::list.celfiles(raw_data_input_dir, full.names = TRUE)
remove(raw_data_input_dir)
batch <- affy::ReadAffy(filenames = input_data_files, compress = compressed, verbose = verbose)
eset <- affy::rma(batch)
# We save the eset data as TSV file.
utils::write.table(Biobase::exprs(eset), file = output_data_file, sep = "\t", quote = FALSE)
# We clean up and log information.
remove(input_data_files, batch)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Expression data pre-processed with RMA."))
}
# We return the created ESET.
return(eset)
}
#' @title Read an expression set from a TSV file.
#'
#' @description This function loads an expression matrix and returns an eset. The
#' function assumes a folder with the data exists and does not
#' check for its existence.
#'
#' @param data_file A string representing the file that contains the expression data.
#' @param header A boolean indicating whether the file contains a header (default to TRUE).
#' @param sep A string that is used as a field separator to read the data (default to
#' tab for TSV files).
#' @param row_names The index of the row names (default to 1).
#' @param as_is A boolean indicating whether R should keep the data as they are in the
#' file (default to TRUE).
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return An ExpressionSet containing the preprocessed expression data.
read_eset <- function(data_file,
header = TRUE,
sep = "\t",
row_names = 1,
as_is = TRUE,
verbose = TRUE) {
# We load the matrix and creates the associated eset object.
exprs_mat <- as.matrix(utils::read.table(data_file,
header = header,
sep = sep,
row.names = row_names,
as.is = as_is))
exprs_eset <- Biobase::ExpressionSet(assayData = exprs_mat)
# We clean up and log information.
rm(exprs_mat)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Expression data read."))
}
# We return the eset.
return(exprs_eset)
}
#' @title Reads probe lists.
#'
#' @description This function loads the lists of probes from TSV files (for instance,
#' from Biomart). It then returns a list of data-frames as to be able to use them with massiR.
#'
#' Note: the function does not check for the existence of folders and files.
#'
#' @param probe_list_names A list of strings representing the names of the files to be loaded (without
#' their eventual prefixes / suffixes).
#' @param filename_prefix A prefix used by all files (e.g., "mart_export_").
#' @param filename_suffix A suffix used by all files (e.g., ".tsv").
#' @param data_dir A string representing the folder that contains the probe data (as TSV files).
#' @param header A boolean indicating whether the file contains a header. This is FALSE by default.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return A list data-frames containing the probes (as row.names only).
read_probe_lists <- function(probe_list_names, filename_prefix = "", filename_suffix = "",
data_dir, header = FALSE, verbose = TRUE) {
# We define the I/Os.
probe_list_fullnames <- paste0(data_dir, filename_prefix, probe_list_names, filename_suffix)
# We then load the files one by one.
to_return <- vector("list", length = length(probe_list_fullnames))
for (i in seq_len(length(probe_list_fullnames))) {
# Regular file reading.
probes <- utils::read.csv(probe_list_fullnames[i], sep = "\t", header = header)
# We reformat the set of probes.
to_return[[i]] <- data.frame(row.names = probes[[colnames(probes)[1]]])
remove(probes)
}
# We clean up and log information.
rm(probe_list_fullnames)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] Probe lists retrieved."))
}
# We return the probe lists (as a list of data-frames).
return(to_return)
}
#' @title Predicts the gender of all samples in a given expression set.
#'
#' @description This function runs massiR on an eset object. The objective is to predict the gender
#' associated with the samples from the expression of a subset of probes (i.e., typically
#' on probes for the Y chomosome).
#'
#' @param eset The ExpressionSet object to analyze.
#' @param probes The set of probes to use to discriminate males and females (most probably
#' these would be Y-chromosome probes).
#' @param pheno_data The clinical data of the dataset (if known) as to estimate the accuracy of
#' the massiR predictions.
#' @param massir_threshold Am integer representing the massiR threshold (between 1 and 4). This
#' controls the number of probes that are considered as suitable (based on the quartiles) The
#' default is 4 (most conservative).
#' @param plot A boolean representing whether control plots need to be created. This is set to
#' FALSE by default.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return A matrix with a subset of the original signal based on the optimnal probes selected
#' by massiR.
run_massir <- function(eset, probes, pheno_data,
massir_threshold = 4,
plot = FALSE,
verbose = TRUE) {
# We start with some control plots.
if (plot == TRUE) {
# We subset the data with the given probes (using massiR).
eset_subset <- massiR::massi_y(eset, probes)
# If necessary, we plot the signal (for visual control).
massiR::massi_y_plot(eset_subset)
# Cleaning
rm(eset_subset)
}
# We select the most suitable probes (i.e., most varying) and subset the data
# again (for latter use).
selected_probes <- massiR::massi_select(eset, probes, threshold = massir_threshold)
eset_subset_best <- eset[rownames(selected_probes), ]
# We perform the clutering on the 'best' probe subset.
cluster_results <- massiR::massi_cluster(selected_probes)
# If necessary, we check the clustering quality and predictibility.
if (plot == TRUE) {
massiR::massi_cluster_plot(selected_probes, cluster_results)
}
# We also log the results of the dip analysis.
if (verbose == TRUE) {
dip_result <- massiR::massi_dip(selected_probes)
rm(dip_result)
}
# We compute simple stats.
cases <- table(data.frame(
Predictions <- cluster_results[[2]]$sex,
Observed <- Biobase::pData(pheno_data)$Gender
))
acc <- (cases[1, 1] + cases[2, 2]) / sum(cases)
# We log the simple stats.
if (verbose == TRUE) {
cases
acc
}
# We clean up and log information.
rm(selected_probes, cluster_results, cases, acc)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "] massiR analysis performed."))
}
# We return the subset of the data for later use.
return(eset_subset_best)
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment