Commit 1c8a956e authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Scripts to clean (and update) datasets.

parent 4c50b52e
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/04-Clean_datasets/
clean:
@rm -rf *~
cleanO:
@rm -rf ${OUTPUT_FOLDER}/*
run:
@Rscript --vanilla ${CODE_FOLDER}/clean_datasets.R > ${OUTPUT_FOLDER}/rlog.out 2> ${OUTPUT_FOLDER}/rlog.err
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("zeallot")
library("Biobase")
library("limma")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
global_config <- yaml::read_yaml("../global_config.yml")
local_config <- yaml::read_yaml("./local_config.yml")
raw_data_dir <- global_config$global_raw_data_dir
input_data_dir <- paste0(global_config$global_data_dir, local_config$local_input_data_dir)
output_data_dir <- paste0(global_config$global_data_dir, local_config$local_data_dir)
# Optional command line parameters to process only one dataset.
args <- commandArgs(trailingOnly = TRUE)
selected_dataset_name <- ""
if (length(args) > 0) {
selected_dataset_name <- args[1]
}
# ================================================================================================
# Functions
# ================================================================================================
# COPY OF THE FUNCTION OF STEP 03 --- PUT THEM IN LIB
#' This function loads the clinical data associated with a dataset. It returns an annotated
#' data-frame that contains the clinical data.
#'
#' The function assumes that the dataset is associated with a unique name (e.g., GEO identifier)
#' and that a folder with this name exists and contains the clinical data ('ClinicalData.tsv').
#'
#' Note: the function does not check for the existence of folders and files (this is not a package
#' function).
#'
#' @param dataset_name A string representing the name of the dataset for which the clinical data
#' are required.
#' @param raw_data_dir A string representing the folder that contains the input data.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return A annotated data-frame that contains the clinical data.
get_clinical_data <- function(dataset_name, raw_data_dir, verbose = TRUE) {
# We define the I/Os.
clinical_data_file <- paste0(raw_data_dir, dataset_name, "/", "ClinicalData.tsv")
# We load the clinical data as to make QC more useful (again).
pheno_data <- Biobase::AnnotatedDataFrame(read.delim(file = clinical_data_file,
row.names = 1,
colClasses = "factor"))
# We clean up and log information.
rm(clinical_data_file)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "][", dataset_name, "] Phenotypic data read."))
}
# We return the clinical data.
return(pheno_data)
}
# COPY OF THE FUNCTION OF STEP 03 --- PUT THEM IN LIB
#' This function loads the preprocessed data (obtained using SCAN) associated with a dataset.
#' It returns a matrix and an eset of the data.
#'
#' The function assumes that the dataset is associated with a unique name (e.g., GEO identifier)
#' and that a folder with this name exists and contains the processed data.
#'
#' Note: the function does not check for the existence of folders and files (this is not a package
#' function).
#'
#' @param dataset_name A string representing the name of the dataset to load.
#' @param data_dir A string representing the folder that contains the processed data.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return A list of two objects (matrix and ExpressionSet) containing the preprocessed
#' expression data.
get_scan_data <- function(dataset_name, data_dir, verbose = TRUE) {
# We define the I/Os.
processed_data_file <- paste0(data_dir, dataset_name, "/", dataset_name, "_normalized.tsv")
# We load the SCAN matrix and creates the associated eset object.
exprs_scan_mat <- as.matrix(read.table(processed_data_file,
header = TRUE,
sep = "\t",
row.names = 1,
as.is = TRUE))
exprs_scan_eset <- Biobase::ExpressionSet(assayData = exprs_scan_mat)
# We clean up and log information.
rm(processed_data_file)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "][", dataset_name, "] SCAN expression data read."))
}
# We return the data.
return(list(exprs_scan_mat, exprs_scan_eset))
}
# ================================================================================================
# Main
# ================================================================================================
# We do all datasets one by one.
for (i in 1:length(local_config$datasets)) {
# We get the dataset details (name, compress, array_type)
dataset_config <- local_config$datasets[[i]]
# We run the quality control of the current dataset (if necessary).
if (selected_dataset_name == "" || selected_dataset_name == dataset_config[[1]]) {
# We load the data (clinical and preproccesed data).
pheno_data <- Biobase::pData(get_clinical_data(dataset_config[1], raw_data_dir))
c(exprs_mat, exprs_eset) %<-% get_scan_data(dataset_config[1], input_data_dir)
message(paste0("[", Sys.time(), "][", dataset_config[1], "] Data loaded."))
# We prepare the lists of samples to clean or update.
sample_delete_list <- strsplit(dataset_config[2], ",")[[1]]
sample_delete_list_nogz <- gsub(".gz", "", sample_delete_list, fixed = TRUE)
sample_update_list <- strsplit(dataset_config[3], ",")[[1]]
message(paste0("[", Sys.time(), "][", dataset_config[1], "] Lists prepared."))
# We start by cleaning the eset (i.e., removing columns corresponding to
# unwanted samnples).
exprs_mat_clean <- exprs_mat[, !colnames(exprs_mat) %in% sample_delete_list]
message(paste0("[", Sys.time(), "][", dataset_config[1], "] Expression data cleaned."))
# We continue by cleaning the clinical data (i.e., removing the rows of the
# unwanted samples).
pheno_data_clean <- pheno_data[!rownames(pheno_data) %in% sample_delete_list_nogz, ]
message(paste0("[", Sys.time(), "][", dataset_config[1], "] Clinical data cleaned."))
# We update the clinical information if necessary.
if (length(sample_update_list) > 0) {
for (j in 1:length(sample_update_list)) {
update_info <- strsplit(sample_update_list[j], ";")[[1]]
pheno_data_clean[update_info[1], update_info[2]] <- as.factor(update_info[3])
}
message(paste0("[", Sys.time(), "][", dataset_config[1], "] Clinical data updated."))
}
# Save clean data as TSV files.
exprs_mat_clean_filename <- paste0(output_data_dir, dataset_config[1], "_normalized_clean.tsv")
write.table(exprs_mat_clean, file = exprs_mat_clean_filename, sep = "\t", quote = FALSE)
pheno_data_clean_filename <- paste0(output_data_dir, dataset_config[1], "_clinical_clean.tsv")
write.table(pheno_data_clean,
file = pheno_data_clean_filename,
sep = "\t",
quote = FALSE,
col.names = NA)
message(paste0("[", Sys.time(), "][", dataset_config[1], "] Data saved."))
}
}
# We log the session details for reproducibility.
sessionInfo()
local_input_data_dir: !!str '02/'
local_data_dir: !!str '04/'
local_code_dir: !!str '04-Clean_datasets/'
datasets:
- ['GSE20141', 'GSM503950_C_1074p_SNc.CEL.gz,GSM503951_C_1271p_SNc.CEL.gz,GSM503952_C_2829_SNc.CEL.gz,GSM503956_C_3603_SNc.CEL.gz,GSM503959_PD_1401_SNc.CEL.gz,GSM503966_PD_5138_SNc.CEL.gz', '']
- ['GSE20163', '', 'GSM506004.CEL;Gender;M,GSM506010.CEL;Gender;F,GSM506011.CEL;Gender;M,GSM506012.CEL;Gender;F']
- ['GSE20164', 'GSM506020.CEL.gz', '']
- ['GSE20292', '', '']
- ['GSE7307', 'GSM176405.CEL.gz', '']
- ['GSE7621', '', '']
- ['GSE8397', 'GSM208634.cel.gz,GSM208637.cel.gz,GSM208638.cel.gz,GSM208654.cel.gz', '']
- ['Simunovic', 'park_1143_01.cel,park_1148_01.cel', '']
- ['E-MEXP-1416', '', '']
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/CodeCompliance/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/Code_style_check/
clean:
@rm -rf *~
run:
check:
@Rscript --vanilla ${CODE_FOLDER}/lint_r_scripts.R
......@@ -18,5 +18,8 @@ lint("../02-Preprocessing/plot_scan_log.R", linters = with_defaults(line_length_
# Step 03
lint("../03-Predict_gender/predict_gender.R", linters = with_defaults(line_length_linter(100)))
# Step 04
lint("../04-Clean_datasets/clean_datasets.R", linters = with_defaults(line_length_linter(100)))
# Self lintr
lint("./lint_r_scripts.R", linters = with_defaults(line_length_linter(100)))
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment