Commit e8dec793 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Added age prediction to samples with missing patient age, changed limma to...

Added age prediction to samples with missing patient age, changed limma to include age as co-factor, adapted pre-processing to add BC for GC-RMA and to keep no BC (BC: Batch Correction).
parent e40fa6b8
......@@ -56,23 +56,57 @@ for (i in seq_len(length(config$datasets))) {
# We define the I/Os.
raw_data_subdir <- paste0(raw_data_dir, dataset_name, "/")
output_data_subdir <- paste0(output_data_dir, dataset_name, "/", normalization$name, "/")
output_data_file <- paste0(output_data_subdir, dataset_name, "_normalized_",
normalization$name, ".tsv")
dir.create(output_data_subdir, showWarnings = TRUE, recursive = FALSE, mode = "0777")
# We pre-process the data using the default method of each platform.
eset <- ArrayUtils::preprocess_data(raw_data_subdir,
output_data_file,
platform = dataset$platform,
method = normalization_method,
batch_correction = dataset$has_batches,
clean_samples = dataset$cleaning)
# When possible, we keep two ExpressionSet objects, one obtained with batch correction,
# one obtained without batch correction.
# Otherwise, we just have one object and the associated QC report.
if (dataset$has_batches) {
# There are batches (and potentially batch effect), so we create two expression matrices
# and two QC reports. By default, batch correction is applied.
output_data_files <- c(paste0(output_data_subdir, dataset_name, "_normalized_",
normalization$name, "_default.tsv"),
paste0(output_data_subdir, dataset_name, "_normalized_",
normalization$name, "_nobatchcorrection.tsv"))
esets <- ArrayUtils::preprocess_data(raw_data_subdir,
output_data_files,
platform = dataset$platform,
method = normalization_method,
batch_correction = "BOTH",
clean_samples = dataset$cleaning)
# We run another QC.
ArrayUtils::run_quality_control_on_preprocessed(eset,
raw_data_subdir,
output_data_subdir,
phenotype_groups = dataset_groups)
# We run QC after preprocessing.
# Default case in the same folder for consistency.
ArrayUtils::run_quality_control_on_preprocessed(esets[[1]],
raw_data_subdir,
output_data_subdir,
phenotype_groups = dataset_groups)
# The nobatchcorrection case in its specific folder (just in case we need it).
output_data_subdir_nobc <- paste0(output_data_subdir, "nobatchcorrection/")
dir.create(output_data_subdir_nobc, showWarnings = TRUE, recursive = FALSE, mode = "0777")
ArrayUtils::run_quality_control_on_preprocessed(esets[[2]],
raw_data_subdir,
output_data_subdir_nobc,
phenotype_groups = dataset_groups)
} else {
# We have no batch, so only one preprocessing run and one QC.
output_data_file <- paste0(output_data_subdir, dataset_name, "_normalized_",
normalization$name, "_default.tsv")
eset <- ArrayUtils::preprocess_data(raw_data_subdir,
output_data_file,
platform = dataset$platform,
method = normalization_method,
batch_correction = "FALSE",
clean_samples = dataset$cleaning)
# We run QC after preprocessing.
ArrayUtils::run_quality_control_on_preprocessed(eset,
raw_data_subdir,
output_data_subdir,
phenotype_groups = dataset_groups)
}
}
} # End if dataset should be analysed.
} # End for each dataset.
......
......@@ -8,4 +8,4 @@ clean_outputs:
predict:
@sbatch ${CODE_FOLDER}/predict.sh
doc:
@sbatch ${CODE_FOLDER}/doc.sh
\ No newline at end of file
@sbatch ${CODE_FOLDER}/doc.sh
......@@ -2,7 +2,7 @@
The objective of this step is to predict the gender of the patients whose gender is not indicated in the clinical annotations.
# Details and instructions
All datasets are used regardless of whether there exists samples without associated gender. This is motivated by the fact that we also want to etimate the overall accuracy of the gender predictions. The Makefile contains the commands to get the data from Biomart and then make the predictions. Plots are made and whether the predicted data should be used is left to the user to decide (manually).
All datasets are used regardless of whether there exists samples with missing clinical annotations. This is motivated by the fact that we also want to estimate the overall accuracy of the predictions. The Makefile contains the commands to get the data from Biomart and then make the predictions. Plots are made and whether the predicted data should be used is left to the user to decide (manually).
```
make predict
```
......@@ -13,4 +13,4 @@ make doc
```
# Prerequisites
The prerequisites are to have the raw data (mostly from GEO) in the Data folder and the pre-processed data from step 02. There should be one folder per dataset, with a '/RAW/' folder with the raw data as CEL files and a "ClinicalData.tsv" file for the annotations.
The prerequisites are to have the raw data (mostly from GEO) in the Data folder and the pre-processed data from step 02. There should be one folder per dataset, with a '/RAW/' folder with the raw data as CEL files and a "ClinicalData.tsv" file for the annotations (including of course gender).
......@@ -38,7 +38,7 @@ echo '\maketitle' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\textsl{}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\begin{abstract}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'This document summarizes the results of the step 03-Gender\_prediction. Heatmaps of the expression' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'This document summarizes the results of the step 03-Predict\_gender. Heatmaps of the expression' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'signal of Y-chromosome probes are displayed for all datasets. Clustering results are displayed on top of' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'the heatmaps to see if samples with shared gender are clustered together. In addition, signal from' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'Y-chromosome probes versus signal from X-chromosome probes is are also plotted. Blue samples are' >> ${OUTPUT_FOLDER}/results_summary.tex
......
......@@ -7,6 +7,8 @@ clean_outputs:
@rm -rf ${OUTPUT_FOLDER}/*
clean_data:
@sbatch ${CODE_FOLDER}/clean_data.sh
predict_age:
@sbatch ${CODE_FOLDER}/predict.sh
check:
@sbatch ${CODE_FOLDER}/check.sh
doc:
......
# Objectives
The objective of this step is to clean the datasets, i.e., to remove the arrays that have been flagged during the previous steps due to various errors (referred to as QC-I, SCAN-no-converg, QC-II or CLIN-no-gender).
This produces clean datasets that can be used for further analyses.
This produces clean datasets that can be used for further analyses. In addition, we also predict the age for the datasets where patient age has not been provided.
# Details and instructions
The datasets are processed one by one to remove the bad arrays and update the clinical files accordingly. The clinical data is also updated to include gender predictions. The necessary information is stored in the local configuration file, which needs to be manually updated based on the outputs of the previous steps. Please note that the 'Batch.tsv' file is not updated since it is not used after preprocessing.
......@@ -9,6 +9,11 @@ make clean_outputs
make clean_data
```
The missing age values are then predicted and automatically updated.
```
make predict_age
```
The age balance is then checked for all datasets. No statistics are derived, but plots are created for manual inspection.
```
make check
......
......@@ -42,7 +42,7 @@ for (i in seq_len(length(config$datasets))) {
dataset_name <- dataset$dataset_name
# We load the data (clinical).
pheno_data_fn <- paste0(dataset_name, "_clinical_clean.tsv")
pheno_data_fn <- paste0(dataset_name, "_clinical_updated.tsv")
pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(output_data_dir,
pheno_data_fn,
use_factors = FALSE))
......
......@@ -104,7 +104,7 @@ for (i in seq_len(length(config$datasets))) {
}
# We load the data (clinical).
pheno_data_fn <- paste0(dataset_name, "_clinical_clean.tsv")
pheno_data_fn <- paste0(dataset_name, "_clinical_updated.tsv")
pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(output_data_dir,
pheno_data_fn,
use_factors = FALSE))
......
#!/bin/bash -l
#SBATCH -J geneder:04:predAge
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:30:00
#SBATCH -p batch
#SBATCH --qos=qos-batch

# Slurm launcher for the age-prediction step (04-Clean_datasets).
# Runs predict_age.R and collects its logs in the step output folder.

# Log basic job information for provenance / debugging.
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""

# Defining global parameters.
# NOTE(review): GEO_PLATFORM_FOLDER (typo fixed from GEO_PLAFORM_FOLDER) is not
# referenced below; kept for consistency with the sibling launcher scripts.
GEO_PLATFORM_FOLDER=/home/users/ltranchevent/Data/GeneDER/Original/Platforms/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/04-Clean_datasets/

# Loading modules.
mu
module load lang/R/3.6.0-foss-2018a-X11-20180131-bare

# Load configuration (exposes the YAML entries as shell variables).
source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
create_variables ../Confs/platforms_config.yml

# Run the age prediction job (comment previously said "gender" by mistake).
Rscript --vanilla ${CODE_FOLDER}/predict_age.R > ${OUTPUT_FOLDER}/log_predictage.out 2> ${OUTPUT_FOLDER}/log_predictage.err

# Moving the slurm log file to data.
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
#!/usr/bin/env Rscript
# Predicts missing patient ages from expression profiles (weighted k-NN on
# Spearman correlations) and writes updated clinical annotation files.
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("ArrayUtils")
library("tidyverse")
library("affy")
# Project helper that provides read_config() below.
source("../libs/conf/confR.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
# Cairo bitmap backend so plotting works on headless cluster nodes.
options(bitmapType = "cairo")
# Global config is merged with the step-local config ("./").
config <- read_config(config_dirs = c("../Confs/", "./"))
raw_data_dir <- config$global_raw_data_dir
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
# (No local functions: all helpers come from ArrayUtils / Biobase / tidyverse.)

# ================================================================================================
# Main
# ================================================================================================
# For each dataset, predict the age of samples whose patient age is missing,
# using a weighted k-NN (k <= 3) on Spearman correlations between the
# preprocessed expression profiles of samples sharing gender and disease status.
for (i in seq_along(config$datasets)) {

  # We get the dataset details.
  dataset <- config$datasets[[i]]
  dataset_name <- dataset$dataset_name

  # We define the I/Os.
  clinical_data_fn <- paste0(dataset_name, "_clinical_clean.tsv")

  # We load the data (clinical and preprocessed expression data).
  pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(
    output_data_dir, clinical_file_name = clinical_data_fn))
  exp_data_fn <- paste0(output_data_dir, dataset_name, "_normalized_clean.tsv")
  exp_data <- ArrayUtils::read_eset(exp_data_fn, as_eset = FALSE)
  message(paste0("[", Sys.time(), "][", dataset_name, "] Data read."))

  # We might not have any missing age, in which case we do nothing.
  if (anyNA(pheno_data$Age)) {

    # If there is no patient age at all, we skip the dataset since we can not
    # make any prediction.
    if (all(is.na(pheno_data$Age))) {
      next
    }

    # Samples whose patient age is not known.
    ids_missing_samples <- which(is.na(pheno_data$Age))
    # Preallocate so that predictions[l] always pairs with ids_missing_samples[l],
    # even when a sample has to be skipped. (Fixes a misalignment bug: the former
    # code grew `predictions` with c() and skipped samples via `next`, shifting
    # every subsequent prediction onto the wrong sample.)
    predictions <- rep(NA_real_, length(ids_missing_samples))

    for (l in seq_along(ids_missing_samples)) {
      j <- ids_missing_samples[l]

      # We gather the relevant samples to predict the age of sample {j}.
      # Samples are relevant when they are of the same gender and disease status
      # and the patient age is known.
      relevant_samples <- pheno_data %>%
        mutate(cel_name = row.names(pheno_data)) %>%
        filter(Disease.status == pheno_data$Disease.status[j]) %>%
        filter(Gender == pheno_data$Gender[j]) %>%
        filter(!is.na(Age)) %>%
        select(cel_name)

      # We only continue if we have enough data (at least one relevant sample);
      # otherwise the sample keeps an NA prediction.
      n_relevant <- nrow(relevant_samples)
      if (n_relevant < 1) {
        next
      }

      # We now compute the weighted k-NN based on the processed expression profiles.
      reference_profile <- exp_data[, row.names(pheno_data)[j]]
      all_cors <- numeric(n_relevant)
      all_ages <- character(n_relevant)
      for (k in seq_len(n_relevant)) {
        current_profile <- exp_data[, relevant_samples[k, ]]
        all_cors[k] <- cor(reference_profile, current_profile, method = "spearman")
        current_age <- pheno_data %>%
          mutate(cel_name = row.names(pheno_data)) %>%
          filter(cel_name == relevant_samples[k, ]) %>%
          select(Age)
        # Age is read back through its factor levels — assumes load_clinical_data
        # returns Age as a factor (TODO confirm; use_factors is not set here).
        all_ages[k] <- levels(current_age$Age)[current_age$Age]
      }
      knn_raw_data <- data.frame(cor = all_cors, age = all_ages, stringsAsFactors = FALSE)

      # We define the k we can use (at least 1 and at most 3). Renamed from `k`
      # to avoid shadowing the inner loop index above.
      k_nn <- min(3, length(all_cors))
      # Keep the k_nn most correlated neighbours; map correlations from [-1, 1]
      # to [0, 1] so they can act as non-negative weights, then normalize.
      knn_data <- knn_raw_data %>%
        arrange(-cor) %>%
        head(k_nn) %>%
        mutate(adj_cor = (1 + cor) / 2)
      sum_of_adj_cors <- sum(knn_data$adj_cor)
      knn_data <- knn_data %>% mutate(weight = adj_cor / sum_of_adj_cors)

      # We finally compute the predicted age (weighted average, rounded to years).
      predictions[l] <- round(sum(knn_data$weight * as.numeric(knn_data$age)), 0)
    } # End for each sample whose patient age is not known.
    message(paste0("[", Sys.time(), "][", dataset_name, "] Predictions done."))

    # We now update the clinical data with the predictions.
    pheno_data$Age <- as.numeric(as.character(pheno_data$Age))
    for (l in seq_along(ids_missing_samples)) {
      # Samples for which no prediction could be made keep their NA age.
      if (!is.na(predictions[l])) {
        pheno_data[ids_missing_samples[l], "Age"] <- predictions[l]
        message(paste0("[", Sys.time(), "][", dataset_name, "] Update sample #",
                       ids_missing_samples[l], " with age ", predictions[l]))
      }
    }

    # Save the updated clinical data as a TSV file.
    pheno_data_clean_filename <- paste0(output_data_dir, dataset_name, "_clinical_updated.tsv")
    write.table(pheno_data,
                file = pheno_data_clean_filename,
                sep = "\t",
                quote = FALSE,
                col.names = NA)
    message(paste0("[", Sys.time(), "][", dataset_name, "] Clinical data updated."))
  } # End if there exist/s sample/s with no age information.
} # End for each dataset.

# We log the session details for reproducibility.
sessionInfo()
......@@ -125,7 +125,11 @@ for (i in seq_len(length(config$datasets))) {
cofactor_name = "Tissue",
block_key = "Patient")
} else {
fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters)
age_correct <- TRUE
if (sum(!is.na(pheno_data$Age)) != length(pheno_data$Age)) {
age_correct <- FALSE
}
fit <- ArrayUtils::run_limma(pheno_data, exp_eset, limma_parameters, age_correct)
}
message(paste0("[", Sys.time(), "][", dataset_name, "][", j, "] Limma fit done."))
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment