clean_datasets.R 7.38 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env Rscript

# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("zeallot")
library("Biobase")
library("limma")
message(paste0("[", Sys.time(), "] Libraries loaded."))

# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
global_config <- yaml::read_yaml("../global_config.yml")
local_config <- yaml::read_yaml("./local_config.yml")
raw_data_dir <- global_config$global_raw_data_dir
input_data_dir <- paste0(global_config$global_data_dir, local_config$local_input_data_dir)
output_data_dir <- paste0(global_config$global_data_dir, local_config$local_data_dir)
# Optional command line parameters to process only one dataset.
args <- commandArgs(trailingOnly = TRUE)
selected_dataset_name <- ""
if (length(args) > 0) {
  selected_dataset_name <- args[1]
}

# ================================================================================================
# Functions
# ================================================================================================

# COPY OF THE FUNCTION OF STEP 03 --- PUT THEM IN LIB
#' This function loads the clinical data associated with a dataset. It returns an annotated
#' data-frame that contains the clinical data.
#' 
#' The function assumes that the dataset is associated with a unique name (e.g., GEO identifier)
#' and that a folder with this name exists and contains the clinical data ('ClinicalData.tsv').
#' 
#' Note: the function does not check for the existence of folders and files (this is not a package
#' function).
#' 
#' @param dataset_name A string representing the name of the dataset for which the clinical data
#'  are required.
#' @param raw_data_dir A string representing the folder that contains the input data.
#' @param verbose A boolean representing whether the function should display log information. This
#'  is TRUE by default.
#' @return A annotated data-frame that contains the clinical data.
get_clinical_data <- function(dataset_name, raw_data_dir, verbose = TRUE) {

  # We define the I/Os.
  clinical_data_file <- paste0(raw_data_dir, dataset_name, "/", "ClinicalData.tsv")

  # We load the clinical data as to make QC more useful (again).
  pheno_data <- Biobase::AnnotatedDataFrame(read.delim(file = clinical_data_file,
                                                       row.names = 1,
                                                       colClasses = "factor"))

  # We clean up and log information.
  rm(clinical_data_file)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "][", dataset_name, "] Phenotypic data read."))
  }

  # We return the clinical data.
  return(pheno_data)
}

# COPY OF THE FUNCTION OF STEP 03 --- PUT THEM IN LIB
#' This function loads the preprocessed data (obtained using SCAN) associated with a dataset.
#' It returns a matrix and an eset of the data.
#' 
#' The function assumes that the dataset is associated with a unique name (e.g., GEO identifier)
#' and that a folder with this name exists and contains the processed data.
#' 
#' Note: the function does not check for the existence of folders and files (this is not a package
#' function).
#' 
#' @param dataset_name A string representing the name of the dataset to load.
#' @param data_dir A string representing the folder that contains the processed data.
#' @param verbose A boolean representing whether the function should display log information. This
#'  is TRUE by default.
#' @return A list of two objects (matrix and ExpressionSet) containing the preprocessed 
#'  expression data.
get_scan_data <- function(dataset_name, data_dir, verbose = TRUE) {

  # We define the I/Os.
  processed_data_file <- paste0(data_dir, dataset_name, "/", dataset_name, "_normalized.tsv")

  # We load the SCAN matrix and creates the associated eset object.
  exprs_scan_mat <- as.matrix(read.table(processed_data_file,
                                         header = TRUE,
                                         sep = "\t",
                                         row.names = 1,
                                         as.is = TRUE))
  exprs_scan_eset <- Biobase::ExpressionSet(assayData = exprs_scan_mat)

  # We clean up and log information.
  rm(processed_data_file)
  if (verbose == TRUE) {
    message(paste0("[", Sys.time(), "][", dataset_name, "] SCAN expression data read."))
  }

  # We return the data.
  return(list(exprs_scan_mat, exprs_scan_eset))
}

# ================================================================================================
# Main
# ================================================================================================
# We do all datasets one by one.
for (i in 1:length(local_config$datasets)) {

  # We get the dataset details (name, compress, array_type)
  dataset_config <- local_config$datasets[[i]]

  # We run the quality control of the current dataset (if necessary).
  if (selected_dataset_name == "" || selected_dataset_name == dataset_config[[1]]) {

    # We load the data (clinical and preproccesed data).
    pheno_data <- Biobase::pData(get_clinical_data(dataset_config[1], raw_data_dir))
    c(exprs_mat, exprs_eset) %<-% get_scan_data(dataset_config[1], input_data_dir)
    message(paste0("[", Sys.time(), "][", dataset_config[1], "] Data loaded."))

    # We prepare the lists of samples to clean or update.
    sample_delete_list <- strsplit(dataset_config[2], ",")[[1]]
    sample_delete_list_nogz <- gsub(".gz", "", sample_delete_list, fixed = TRUE)
    sample_update_list <- strsplit(dataset_config[3], ",")[[1]]
    message(paste0("[", Sys.time(), "][", dataset_config[1], "] Lists prepared."))

    # We start by cleaning the eset (i.e., removing columns corresponding to
    # unwanted samnples).
    exprs_mat_clean <- exprs_mat[, !colnames(exprs_mat) %in% sample_delete_list]
    message(paste0("[", Sys.time(), "][", dataset_config[1], "] Expression data cleaned."))

    # We continue by cleaning the clinical data (i.e., removing the rows of the
    # unwanted samples).
    pheno_data_clean <- pheno_data[!rownames(pheno_data) %in% sample_delete_list_nogz, ]
    message(paste0("[", Sys.time(), "][", dataset_config[1], "] Clinical data cleaned."))

    # We update the clinical information if necessary.
    if (length(sample_update_list) > 0) {
      for (j in 1:length(sample_update_list)) {
        update_info <- strsplit(sample_update_list[j], ";")[[1]]
        pheno_data_clean[update_info[1], update_info[2]] <- as.factor(update_info[3])
      }
      message(paste0("[", Sys.time(), "][", dataset_config[1], "] Clinical data updated."))
    }

    # Save clean data as TSV files.
    exprs_mat_clean_filename <- paste0(output_data_dir, dataset_config[1], "_normalized_clean.tsv")
    write.table(exprs_mat_clean, file = exprs_mat_clean_filename, sep = "\t", quote = FALSE)
    pheno_data_clean_filename <- paste0(output_data_dir, dataset_config[1], "_clinical_clean.tsv")
    write.table(pheno_data_clean,
                file = pheno_data_clean_filename,
                sep = "\t",
                quote = FALSE,
                col.names = NA)
    message(paste0("[", Sys.time(), "][", dataset_config[1], "] Data saved."))

  }
}

# We log the session details for reproducibility.
sessionInfo()