Commit 9b1ebb03 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Refactoring step 03 to process all datasets at once (microarray and RNA-seq).

parent 20994b82
......@@ -5,7 +5,7 @@
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --mem=12GB
#SBATCH --time=0-00:55:00
#SBATCH --time=0-01:30:00
#SBATCH -p batch
#SBATCH --qos=normal
......
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --time=0-00:01:00
#SBATCH --time=0-00:02:00
#SBATCH -p batch
#SBATCH --qos=normal
......
......@@ -14,4 +14,4 @@ make doc
```
# Prerequisites
The prerequisites are to have the raw data (mostly from GEO) in the Data folder and the pre-processed data from step 02. There should be one folder per dataset, with a '/RAW/' folder with the raw data as CEL files and a "ClinicalData.tsv" file for the annotations (including of course gender and age).
The prerequisites are to have the raw data (mostly from GEO) in the Data folder and the pre-processed data from step 02. There should be one folder per dataset, with a '/RAW/' folder with the raw data as CEL files (array data) or a TSV file with the pre-processed data (RNA-seq). In addition , a "ClinicalData.tsv" file should contain the clinical annotations (including of course gender and age).
......@@ -6,7 +6,7 @@
#SBATCH -n 1
#SBATCH --time=0-00:01:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......
......@@ -2,7 +2,7 @@ local_input_data_dir: !!str '02/'
local_data_dir: !!str '03/'
local_code_dir: !!str '03-Predict_missing/'
# Paramters for the age predicton models.
# Parameters for the age predicton models.
age_model_feat_min_cor: 0.65
age_model_feat_min_nb: 250
age_model_feat_max_nb: 1000
......
......@@ -35,7 +35,8 @@ message(paste0("[", Sys.time(), "] Configuration done."))
#'
#' @description This function computes two correlations (Pearson and Spearman) between a feature
#' vector and a response vector. It then return the maximum absolute value among the two
#' (as to take into account both correlation and anti-correlation).
#' (as to take into account both correlation and anti-correlation as well as linear and non linear
#' correlations').
#'
#' @param x The feature vector.
#' @param resp The response vector.
......@@ -118,6 +119,7 @@ for (i in seq_len(length(config$datasets))) {
dplyr::filter(!is.na(Age)) %>%
dplyr::select(cel_name, Age, Gender, Disease.status)
nomissing_samples$cel_name <- gsub(".txt", "", nomissing_samples$cel_name)
nomissing_samples$cel_name <- gsub("-", ".", nomissing_samples$cel_name)
nomissing_ages <- nomissing_samples$Age
nomissing_genders <- nomissing_samples$Gender
nomissing_statuses <- nomissing_samples$Disease.status
......
......@@ -6,7 +6,7 @@
#SBATCH -n 4
#SBATCH --time=0-10:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......
......@@ -4,9 +4,9 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:05:00
#SBATCH --time=0-00:10:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......@@ -42,9 +42,16 @@ do
wget -O ${OUTPUT_FOLDER}${platformName}_chromY.tsv 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6"><Dataset name = "hsapiens_gene_ensembl" interface = "default"><Filter name = "chromosome_name" value = "Y"/><Attribute name = "'${platformBiomartName}'"/></Dataset></Query>'
sleep 2s
else
# We use the GEO data
cut -f 1 ${GEO_PLAFORM_FOLDER}${platformGEOName}_chromX.tsv > ${OUTPUT_FOLDER}${platformName}_chromX.tsv
cut -f 1 ${GEO_PLAFORM_FOLDER}${platformGEOName}_chromY.tsv > ${OUTPUT_FOLDER}${platformName}_chromY.tsv
if [ "${platformGEOName}" != "NA" ]
then
# We use the GEO data
cut -f 1 ${GEO_PLAFORM_FOLDER}${platformGEOName}_chromX.tsv > ${OUTPUT_FOLDER}${platformName}_chromX.tsv
cut -f 1 ${GEO_PLAFORM_FOLDER}${platformGEOName}_chromY.tsv > ${OUTPUT_FOLDER}${platformName}_chromY.tsv
else
# We use manually curated data.
cut -f 1 ${GEO_PLAFORM_FOLDER}${platformName}_chromX.tsv > ${OUTPUT_FOLDER}${platformName}_chromX.tsv
cut -f 1 ${GEO_PLAFORM_FOLDER}${platformName}_chromY.tsv > ${OUTPUT_FOLDER}${platformName}_chromY.tsv
fi
fi
done
......
......@@ -2,7 +2,7 @@
The first objectives of this step is clean the datasets, i.e., remove the arrays that have been flagged during the previous steps due to various errors (referred to as QC-I, PROC-no-converg, QC-II-SCAN, QC-II-GCRMA, CLIN-no-age or CLIN-no-gender). A second objective is to check the expression levels of pre-defined biomarkers.
# Details and instructions
The datasets are processed one by one to remove the bad arrays and update the clinical files accordingly. The clinical data is also updated to include gender predictions. The necessary information is sorted in the local configuration file, that needs to be manually updated based on the outputs of the previous steps. Please note that the 'Batch.tsv' file is also updated since it might be used after preprocessing (for limma).
The datasets are processed one by one to remove the bad arrays and update the clinical files accordingly. The clinical data is also updated to include gender predictions. The necessary information is described in the local configuration file, that needs to be manually updated based on the outputs of the previous steps. Please note that the 'Batch.tsv' file is also updated since it might be used after preprocessing (for limma).
```
make clean_outputs
make data
......
......@@ -6,7 +6,7 @@
#SBATCH -n 2
#SBATCH --time=0-00:03:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......
......@@ -105,6 +105,7 @@ for (i in seq_len(length(config$datasets))) {
vsn <- config$variance_methods[[j]]
# We define the dataset specific I/Os that depends on VSN.
# We also only take care of the normG00 scheme witout batch correction.
exp_data_fn <- paste0(input_data_dir, dataset_name, "/", "normG00/", dataset_name,
"_normalized_normG00_nobatchcorrection", vsn$file_tag, ".tsv")
......
......@@ -6,7 +6,7 @@
#SBATCH -n 2
#SBATCH --time=0-00:02:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......
......@@ -12,6 +12,8 @@ library("u133x3p.db")
library("illuminaHumanv3.db")
library("hgug4112a.db")
library("hgfocus.db")
library("huex10sttranscriptcluster.db")
library("hugene10sttranscriptcluster.db")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
......@@ -37,7 +39,7 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# We do all platforms iteratively.
for (i in seq_len(length(config$platforms))) {
# We get the first dataset using that platform.
# We get the datasets using that platform.
platform <- config$platforms[[i]]
message(paste0("[", Sys.time(), "][", platform$platform_name, "] Starting probe collection."))
relevant_datasets <- get_datasets_from_platform(config$datasets, platform$platform_name)
......@@ -50,7 +52,7 @@ for (i in seq_len(length(config$platforms))) {
# VSN has no impact on the probe list since it does not do any filtering.
# We have exactly the same probes whether we use VSN or not so we will just pick
# the first configuration item (will therefore worl in case there is only one).
# the first configuration item (will therefore work in case there is only one).
vsn <- config$variance_methods[[1]]
# We set the I/Os.
......
......@@ -4,9 +4,9 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --time=0-00:02:00
#SBATCH --time=0-00:03:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......
......@@ -6,7 +6,7 @@
#SBATCH -n 1
#SBATCH --time=0-00:01:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......
......@@ -4,9 +4,9 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --time=0-0:10:00
#SBATCH --time=0-0:15:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......
......@@ -26,7 +26,7 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# Function originally from R package rowr.
# Transfered here because rowr is not supported any longer on 3.6.0.
# Will be removed once it is supporteg again.
# Will be removed once it is supported again.
vert <- function(object) {
if (is.list(object)) {
object <- cbind(object)
......@@ -78,7 +78,8 @@ select_most_suitable_probes <- function(gene_probes, exp_data_all) {
# We treat the different array types independently.
for (array_type in names(gene_probes)) {
# We check if we need to do angene_probes_sel <- vector(length = length(gene_probes))ything.
# We check if we need to do anything.
# gene_probes_sel <- vector(length = length(gene_probes)) ??
all_probes <- unlist(stringr::str_split(gene_probes[array_type], "\\|"))
if (length(all_probes) == 1) {
......@@ -101,7 +102,7 @@ select_most_suitable_probes <- function(gene_probes, exp_data_all) {
local_inputs <- lapply(local_inputs, vert)
local_maxlens <- max(unlist(lapply(local_inputs, len)))
buffered_inputs <- lapply(local_inputs, buffer, length.out = local_maxlens,
fill = NA, preserveClass = FALSE)
fill = NA, preserve_class = FALSE)
exp_data <- Reduce(cbind.data.frame, buffered_inputs)
row.names(exp_data) <- exp_data_rn
rm(exp_data_rn)
......@@ -148,6 +149,7 @@ message(paste0("[", Sys.time(), "] Gene-probe matching loaded."))
# We collect the expression data.
sn_indx <- c()
da_indx <- c()
ip_indx <- c()
exp_data_all <- list()
exp_data_names <- list()
sn_exp_data_vsn <- list()
......@@ -158,6 +160,10 @@ da_exp_data_vsn <- list()
da_exp_data_novsn <- list()
da_exp_data_vsn_zsc <- list()
da_exp_data_novsn_zsc <- list()
ip_exp_data_vsn <- list()
ip_exp_data_novsn <- list()
ip_exp_data_vsn_zsc <- list()
ip_exp_data_novsn_zsc <- list()
for (i in seq_len(length(config$datasets))) {
# We get the dataset details.
......@@ -184,6 +190,7 @@ for (i in seq_len(length(config$datasets))) {
"_normalized_noVSN_zscores.tsv")
utils::write.table(exp_data_vsn_zsc, file = exp_data_vsn_zsc_fn, sep = "\t", quote = FALSE)
utils::write.table(exp_data_novsn_zsc, file = exp_data_novsn_zsc_fn, sep = "\t", quote = FALSE)
rm(exp_data_vsn_zsc_fn, exp_data_novsn_zsc_fn)
# We save the data in the correct structure depending on the tissue type.
exp_data_all[[i]] <- exp_data_vsn
......@@ -206,9 +213,18 @@ for (i in seq_len(length(config$datasets))) {
da_exp_data_novsn[[length(da_indx)]] <- exp_data_novsn
da_exp_data_vsn_zsc[[length(da_indx)]] <- exp_data_vsn_zsc
da_exp_data_novsn_zsc[[length(da_indx)]] <- exp_data_novsn_zsc
} else if (dataset$tissue == "iPSC-DA") {
# We save the index
ip_indx <- c(ip_indx, i)
# We save the data.
ip_exp_data_vsn[[length(ip_indx)]] <- exp_data_vsn
ip_exp_data_novsn[[length(ip_indx)]] <- exp_data_novsn
ip_exp_data_vsn_zsc[[length(ip_indx)]] <- exp_data_vsn_zsc
ip_exp_data_novsn_zsc[[length(ip_indx)]] <- exp_data_novsn_zsc
} else {
message(paste0("[Warning] The tissue of dataset ", dataset$dataset_name,
" is neither SN nor DA."))
" is neither SN, nor DA, nor iPSC-DA."))
}
message(paste0("[", Sys.time(), "][", dataset$dataset_name, "] Expression data loaded."))
rm(dataset, exp_data_vsn, exp_data_novsn, exp_data_vsn_zsc, exp_data_novsn_zsc)
......@@ -275,16 +291,16 @@ for (i in seq_len(length(config$biomarkers))) {
da_data_vsn_zsc <- vector()
da_data_novsn_zsc <- vector()
for (ii in seq_len(length(da_exp_data_vsn))) {
# We define the second index as to switch between the global list (full)
# and current list (selection of SN datasets).
jj <- da_indx[ii]
# We get the dataset details (name), and retrieve the correct probe id.
dataset <- config$datasets[[jj]]
dataset_arraytype <- str_replace(dataset$array_type, "-", ".")
probe_id <- bm_ps_data_sel[dataset_arraytype]
# We read the expression data.
# The dataset might not have a probe for that gene.
if (!is.null(probe_id) & !is.na(probe_id) & probe_id != "") {
......@@ -294,7 +310,7 @@ for (i in seq_len(length(config$biomarkers))) {
exp_data_novsn_zsc <- data.frame(Expression = da_exp_data_novsn_zsc[[ii]][probe_id, ])
colnames(exp_data_vsn) <- "Expression"
colnames(exp_data_novsn) <- "Expression"
# We update the global data-frame.
da_data_vsn <- rbind(da_data_vsn, exp_data_vsn)
da_data_novsn <- rbind(da_data_novsn, exp_data_novsn)
......@@ -306,6 +322,43 @@ for (i in seq_len(length(config$biomarkers))) {
}
rm(ii)
# Third, the iPSC-DA data.
ip_data_vsn <- vector()
ip_data_novsn <- vector()
ip_data_vsn_zsc <- vector()
ip_data_novsn_zsc <- vector()
for (ii in seq_len(length(ip_exp_data_vsn))) {
# We define the second index as to switch between the global list (full)
# and current list (selection of SN datasets).
jj <- ip_indx[ii]
# We get the dataset details (name), and retrieve the correct probe id.
dataset <- config$datasets[[jj]]
dataset_arraytype <- str_replace(dataset$array_type, "-", ".")
probe_id <- bm_ps_data_sel[dataset_arraytype]
# We read the expression data.
# The dataset might not have a probe for that gene.
if (!is.null(probe_id) & !is.na(probe_id) & probe_id != "") {
exp_data_vsn <- t(ip_exp_data_vsn[[ii]][probe_id, ])
exp_data_novsn <- t(ip_exp_data_novsn[[ii]][probe_id, ])
exp_data_vsn_zsc <- data.frame(Expression = ip_exp_data_vsn_zsc[[ii]][probe_id, ])
exp_data_novsn_zsc <- data.frame(Expression = ip_exp_data_novsn_zsc[[ii]][probe_id, ])
colnames(exp_data_vsn) <- "Expression"
colnames(exp_data_novsn) <- "Expression"
# We update the global data-frame.
ip_data_vsn <- rbind(ip_data_vsn, exp_data_vsn)
ip_data_novsn <- rbind(ip_data_novsn, exp_data_novsn)
ip_data_vsn_zsc <- rbind(ip_data_vsn_zsc, exp_data_vsn_zsc)
ip_data_novsn_zsc <- rbind(ip_data_novsn_zsc, exp_data_novsn_zsc)
rm(exp_data_vsn, exp_data_novsn, exp_data_vsn_zsc, exp_data_novsn_zsc)
}
rm(dataset, dataset_arraytype, probe_id, jj)
}
rm(ii)
# Final merge before plotting to get all in one.
plot_data <- rbind(
data.frame(sn_data_vsn) %>%
......@@ -331,12 +384,25 @@ for (i in seq_len(length(config$biomarkers))) {
dplyr::mutate(category = "VSN (ZSCORES)"),
data.frame(da_data_novsn_zsc) %>%
dplyr::mutate(tissue = "DA") %>%
dplyr::mutate(category = "NO-VSN (ZSCORES)"),
data.frame(ip_data_vsn) %>%
dplyr::mutate(tissue = "iPSC-DA") %>%
dplyr::mutate(category = "VSN (EXPRS)"),
data.frame(ip_data_novsn) %>%
dplyr::mutate(tissue = "iPSC-DA") %>%
dplyr::mutate(category = "NO-VSN (EXPRS)"),
data.frame(ip_data_vsn_zsc) %>%
dplyr::mutate(tissue = "iPSC-DA") %>%
dplyr::mutate(category = "VSN (ZSCORES)"),
data.frame(ip_data_novsn_zsc) %>%
dplyr::mutate(tissue = "iPSC-DA") %>%
dplyr::mutate(category = "NO-VSN (ZSCORES)"))
plot_data$category <- factor(plot_data$category, levels = c("VSN (EXPRS)", "NO-VSN (EXPRS)",
"VSN (ZSCORES)", "NO-VSN (ZSCORES)"))
rm(sn_data_vsn, sn_data_novsn, sn_data_vsn_zsc, sn_data_novsn_zsc)
rm(da_data_vsn, da_data_novsn, da_data_vsn_zsc, da_data_novsn_zsc)
rm(ip_data_vsn, ip_data_novsn, ip_data_vsn_zsc, ip_data_novsn_zsc)
# We plot the gene.
bm_plot_fn <- paste0(output_data_dir, "Biomarker_", str_replace_all(biomarker$tissue, " ", "_"),
"_", biomarker$name, ".png")
......@@ -364,9 +430,10 @@ for (i in seq_len(length(config$biomarkers))) {
rm(bm_plot_fn, myplot, plot_data)
rm(biomarker, bm_ps_data_sel)
} # End for each biomarker.
rm(i, gene_probe_matching, sn_indx, da_indx, exp_data_all)
rm(i, gene_probe_matching, sn_indx, da_indx, ip_indx, exp_data_all)
rm(sn_exp_data_vsn, sn_exp_data_novsn, sn_exp_data_vsn_zsc, sn_exp_data_novsn_zsc)
rm(da_exp_data_vsn, da_exp_data_novsn, da_exp_data_vsn_zsc, da_exp_data_novsn_zsc)
rm(ip_exp_data_vsn, ip_exp_data_novsn, ip_exp_data_vsn_zsc, ip_exp_data_novsn_zsc)
# We log the session details for reproducibility.
rm(config, input_data_dir, output_data_dir, raw_data_dir)
......
......@@ -6,7 +6,7 @@
#SBATCH -n 2
#SBATCH --time=0-00:10:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......
......@@ -78,7 +78,8 @@ select_most_suitable_probes <- function(gene_probes, exp_data_all) {
# We treat the different array types independently.
for (array_type in names(gene_probes)) {
# We check if we need to do angene_probes_sel <- vector(length = length(gene_probes))ything.
# We check if we need to do anything.
# gene_probes_sel <- vector(length = length(gene_probes)) ??
all_probes <- unlist(stringr::str_split(gene_probes[array_type], "\\|"))
if (length(all_probes) == 1) {
......@@ -101,7 +102,7 @@ select_most_suitable_probes <- function(gene_probes, exp_data_all) {
local_inputs <- lapply(local_inputs, vert)
local_maxlens <- max(unlist(lapply(local_inputs, len)))
buffered_inputs <- lapply(local_inputs, buffer, length.out = local_maxlens,
fill = NA, preserveClass = FALSE)
fill = NA, preserve_class = FALSE)
exp_data <- Reduce(cbind.data.frame, buffered_inputs)
row.names(exp_data) <- exp_data_rn
rm(exp_data_rn)
......@@ -148,6 +149,7 @@ message(paste0("[", Sys.time(), "] Gene-probe matching loaded."))
# We collect the expression data.
sn_indx <- c()
da_indx <- c()
ip_indx <- c()
exp_data_all <- list()
exp_data_names <- list()
sn_exp_data_vsn <- list()
......@@ -158,8 +160,13 @@ da_exp_data_vsn <- list()
da_exp_data_novsn <- list()
da_exp_data_vsn_zsc <- list()
da_exp_data_novsn_zsc <- list()
ip_exp_data_vsn <- list()
ip_exp_data_novsn <- list()
ip_exp_data_vsn_zsc <- list()
ip_exp_data_novsn_zsc <- list()
sn_clin_data <- list()
da_clin_data <- list()
ip_clin_data <- list()
for (i in seq_len(length(config$datasets))) {
# We get the dataset details.
......@@ -208,16 +215,26 @@ for (i in seq_len(length(config$datasets))) {
} else if (dataset$tissue == "DA") {
# We save the index
da_indx <- c(da_indx, i)
# We save the data.
da_exp_data_vsn[[length(da_indx)]] <- exp_data_vsn
da_exp_data_novsn[[length(da_indx)]] <- exp_data_novsn
da_exp_data_vsn_zsc[[length(da_indx)]] <- exp_data_vsn_zsc
da_exp_data_novsn_zsc[[length(da_indx)]] <- exp_data_novsn_zsc
da_clin_data[[length(da_indx)]] <- clin_data
} else if (dataset$tissue == "iPSC-DA") {
# We save the index
ip_indx <- c(ip_indx, i)
# We save the data.
ip_exp_data_vsn[[length(ip_indx)]] <- exp_data_vsn
ip_exp_data_novsn[[length(ip_indx)]] <- exp_data_novsn
ip_exp_data_vsn_zsc[[length(ip_indx)]] <- exp_data_vsn_zsc
ip_exp_data_novsn_zsc[[length(ip_indx)]] <- exp_data_novsn_zsc
ip_clin_data[[length(ip_indx)]] <- clin_data
} else {
message(paste0("[Warning] The tissue of dataset ", dataset$dataset_name,
" is neither SN nor DA."))
" is neither SN, nor DA, nor iPSC-DA."))
}
message(paste0("[", Sys.time(), "][", dataset$dataset_name,
"] Expression and clinical data loaded."))
......@@ -295,20 +312,20 @@ for (i in seq_len(length(config$biomarkers))) {
da_data_novsn_zsc <- vector()
da_clin <- vector()
for (ii in seq_len(length(da_exp_data_vsn))) {
# We define the second index as to switch between the global list (full)
# and current list (selection of SN datasets).
jj <- da_indx[ii]
# We get the dataset details (name), and retrieve the correct probe id.
dataset <- config$datasets[[jj]]
dataset_arraytype <- str_replace(dataset$array_type, "-", ".")
probe_id <- bm_ps_data_sel[dataset_arraytype]
# We read the expression data.
# The dataset might not have a probe for that gene.
if (!is.null(probe_id) & !is.na(probe_id) & probe_id != "") {
# First, the expression data.
exp_data_vsn <- t(da_exp_data_vsn[[ii]][probe_id, ])
exp_data_novsn <- t(da_exp_data_novsn[[ii]][probe_id, ])
......@@ -316,12 +333,12 @@ for (i in seq_len(length(config$biomarkers))) {
exp_data_novsn_zsc <- data.frame(Expression = da_exp_data_novsn_zsc[[ii]][probe_id, ])
colnames(exp_data_vsn) <- "Expression"
colnames(exp_data_novsn) <- "Expression"
# Second, the clinical data.
clin_data <- data.frame(Disease.status = da_clin_data[[ii]]$Disease.status,
row.names = rownames(da_clin_data[[ii]])) %>%
mutate(Dataset = dataset$dataset_name)
# We update the global data-frame.
da_data_vsn <- rbind(da_data_vsn, exp_data_vsn)
da_data_novsn <- rbind(da_data_novsn, exp_data_novsn)
......@@ -333,6 +350,52 @@ for (i in seq_len(length(config$biomarkers))) {
rm(dataset, dataset_arraytype, probe_id, jj)
}
rm(ii)
# Third, the iPSC-DA data.
ip_data_vsn <- vector()
ip_data_novsn <- vector()
ip_data_vsn_zsc <- vector()
ip_data_novsn_zsc <- vector()
ip_clin <- vector()
for (ii in seq_len(length(ip_exp_data_vsn))) {
# We define the second index as to switch between the global list (full)
# and current list (selection of SN datasets).
jj <- ip_indx[ii]
# We get the dataset details (name), and retrieve the correct probe id.
dataset <- config$datasets[[jj]]
dataset_arraytype <- str_replace(dataset$array_type, "-", ".")
probe_id <- bm_ps_data_sel[dataset_arraytype]
# We read the expression data.
# The dataset might not have a probe for that gene.
if (!is.null(probe_id) & !is.na(probe_id) & probe_id != "") {
# First, the expression data.
exp_data_vsn <- t(ip_exp_data_vsn[[ii]][probe_id, ])
exp_data_novsn <- t(ip_exp_data_novsn[[ii]][probe_id, ])
exp_data_vsn_zsc <- data.frame(Expression = ip_exp_data_vsn_zsc[[ii]][probe_id, ])
exp_data_novsn_zsc <- data.frame(Expression = ip_exp_data_novsn_zsc[[ii]][probe_id, ])
colnames(exp_data_vsn) <- "Expression"
colnames(exp_data_novsn) <- "Expression"
# Second, the clinical data.
clin_data <- data.frame(Disease.status = ip_clin_data[[ii]]$Disease.status,
row.names = rownames(ip_clin_data[[ii]])) %>%
mutate(Dataset = dataset$dataset_name)
# We update the global data-frame.
ip_data_vsn <- rbind(ip_data_vsn, exp_data_vsn)
ip_data_novsn <- rbind(ip_data_novsn, exp_data_novsn)
ip_data_vsn_zsc <- rbind(ip_data_vsn_zsc, exp_data_vsn_zsc)
ip_data_novsn_zsc <- rbind(ip_data_novsn_zsc, exp_data_novsn_zsc)
ip_clin <- rbind(ip_clin, clin_data)
rm(exp_data_vsn, exp_data_novsn, exp_data_vsn_zsc, exp_data_novsn_zsc, clin_data)
}
rm(dataset, dataset_arraytype, probe_id, jj)
}
rm(ii)
# We combine the expression and clinical data (row-wise).
sn_data_vsn <- cbind(sn_data_vsn, sn_clin)
......@@ -343,7 +406,11 @@ for (i in seq_len(length(config$biomarkers))) {
da_data_novsn <- cbind(da_data_novsn, da_clin)
da_data_vsn_zsc <- cbind(da_data_vsn_zsc, da_clin)
da_data_novsn_zsc <- cbind(da_data_novsn_zsc, da_clin)
rm(da_clin, sn_clin)
ip_data_vsn <- cbind(ip_data_vsn, ip_clin)
ip_data_novsn <- cbind(ip_data_novsn, ip_clin)
ip_data_vsn_zsc <- cbind(ip_data_vsn_zsc, ip_clin)
ip_data_novsn_zsc <- cbind(ip_data_novsn_zsc, ip_clin)
rm(da_clin, sn_clin, ip_clin)
# Final merge before plotting to get all in one.
plot_data <- rbind(
......@@ -370,12 +437,25 @@ for (i in seq_len(length(config$biomarkers))) {
dplyr::mutate(category = "VSN (ZSCORES)"),
data.frame(da_data_novsn_zsc) %>%
dplyr::mutate(tissue = "DA") %>%
dplyr::mutate(category = "NO-VSN (ZSCORES)"),
data.frame(ip_data_vsn) %>%
dplyr::mutate(tissue = "iPSC-DA") %>%
dplyr::mutate(category = "VSN (EXPRS)"),
data.frame(ip_data_novsn) %>%
dplyr::mutate(tissue = "iPSC-DA") %>%
dplyr::mutate(category = "NO-VSN (EXPRS)"),
data.frame(ip_data_vsn_zsc) %>%
dplyr::mutate(tissue = "iPSC-DA") %>%
<