Commit fa8ebdeb authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent

Adaptation of step 06 to the new configuration (2nd half).

parent 94852040
......@@ -34,11 +34,11 @@ message(paste0("[", Sys.time(), "] Data read."))
# We prepare the data for ggplot by adding extra columns to define classes.
D <- SCAN_log %>%
mutate(class_tag = ifelse(convergence < 1,
ifelse(prop_back < 0.85,
"Normal",
"High background (>85% probes)")
, "No convergence"))
dplyr::mutate(class_tag = ifelse(convergence < 1,
ifelse(prop_back < 0.85,
"Normal",
"High background (>85% probes)")
, "No convergence"))
rm(SCAN_log)
message(paste0("[", Sys.time(), "] Data filtered."))
......
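The nested `ifelse` above encodes a three-way classification (no convergence / high background / normal). For readability, the same logic can be expressed with `dplyr::case_when`; a minimal sketch, assuming `SCAN_log` carries numeric `convergence` and `prop_back` columns as in the hunk above (the toy values are invented):

```r
library(dplyr)

# Toy stand-in for the SCAN_log table read earlier (values are illustrative).
SCAN_log <- data.frame(convergence = c(0, 0, 1), prop_back = c(0.50, 0.90, 0.70))

D <- SCAN_log %>%
  dplyr::mutate(class_tag = dplyr::case_when(
    convergence >= 1  ~ "No convergence",
    prop_back >= 0.85 ~ "High background (>85% probes)",
    TRUE              ~ "Normal"
  ))
```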
......@@ -66,8 +66,8 @@ for (i in seq_len(length(config$datasets))) {
# We load the data (clinical).
pheno_data <- Biobase::pData(ArrayUtils::load_clinical_data(raw_data_subdir))
pheno_data <- pheno_data %>% mutate(cel_name = row.names(pheno_data)) %>%
mutate(Age = as.numeric(as.character(Age)))
pheno_data <- pheno_data %>% dplyr::mutate(cel_name = row.names(pheno_data)) %>%
dplyr::mutate(Age = as.numeric(as.character(Age)))
age_sd <- round(sd(pheno_data$Age, na.rm = TRUE), 1)
# If there is no patient age at all, we skip the dataset since we cannot make predictions.
......@@ -113,8 +113,8 @@ for (i in seq_len(length(config$datasets))) {
# (A) We keep only the full profiles (no real NA).
message(paste0("[", Sys.time(), "] [", dataset_name, "][", normalization_name,
"] [", bc_tag, "] Starting cross-validation."))
nomissing_samples <- pheno_data %>% filter(!is.na(Age)) %>%
select(cel_name, Age, Gender, Disease.status)
nomissing_samples <- pheno_data %>% dplyr::filter(!is.na(Age)) %>%
dplyr::select(cel_name, Age, Gender, Disease.status)
nomissing_samples$cel_name <- gsub(".txt", "", nomissing_samples$cel_name)
nomissing_ages <- nomissing_samples$Age
nomissing_genders <- nomissing_samples$Gender
......@@ -144,9 +144,9 @@ for (i in seq_len(length(config$datasets))) {
all_cors <- all_cors[!is.na(all_cors)]
best_feats <- all_cors[all_cors > feat_min_cor]
if (length(best_feats) > feat_max_nb) {
best_feats <- sort(all_cors) %>% tail(feat_max_nb)
best_feats <- sort(all_cors) %>% utils::tail(feat_max_nb)
} else if (length(best_feats) < feat_min_nb) {
best_feats <- sort(all_cors) %>% tail(feat_min_nb)
best_feats <- sort(all_cors) %>% utils::tail(feat_min_nb)
}
best_feats <- names(best_feats)
rm(other_samples, nomissing_exp_data_subset, nomissing_ages_subset, all_cors)
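The feature-selection rule applied here (keep the probes whose correlation with age exceeds `feat_min_cor`, but never fewer than `feat_min_nb` nor more than `feat_max_nb`) can be isolated into a small helper for testing. A sketch only; `select_age_features` and its default thresholds are illustrative and not part of the repository:

```r
# Illustrative helper mirroring the cross-validation hunk above.
# all_cors: named vector of per-probe correlations with age.
select_age_features <- function(all_cors, feat_min_cor = 0.3,
                                feat_min_nb = 10, feat_max_nb = 100) {
  all_cors <- all_cors[!is.na(all_cors)]
  best_feats <- all_cors[all_cors > feat_min_cor]
  if (length(best_feats) > feat_max_nb) {
    best_feats <- utils::tail(sort(all_cors), feat_max_nb)  # cap the number of features
  } else if (length(best_feats) < feat_min_nb) {
    best_feats <- utils::tail(sort(all_cors), feat_min_nb)  # guarantee a minimum number
  }
  names(best_feats)
}

select_age_features(c(p1 = 0.9, p2 = 0.1, p3 = NA, p4 = 0.5), feat_min_nb = 1)
```

The later prediction hunk sorts `best_feats` rather than `all_cors` in its fallback branches; this sketch mirrors the cross-validation variant shown above.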
......@@ -188,7 +188,7 @@ for (i in seq_len(length(config$datasets))) {
# We compute summary statistics.
df <- data.frame(Age = nomissing_ages, Pred = predictions)
loocv_all <- rbind(loocv_all, df %>% mutate(dataset = dataset_name))
loocv_all <- rbind(loocv_all, df %>% dplyr::mutate(dataset = dataset_name))
mue <- round(mean(abs(df$Age - df$Pred)), 1)
mse <- round(mean(abs(df$Age - df$Pred) ^ 2), 1)
message(paste0("[", Sys.time(), "][", dataset_name, "][", normalization_name, "][",
......@@ -200,7 +200,7 @@ for (i in seq_len(length(config$datasets))) {
# We only do this if it is necessary.
message(paste0("[", Sys.time(), "] [", dataset_name, "][", normalization_name,
"] [", bc_tag, "] Starting predictions."))
all_samples <- pheno_data %>% select(cel_name, Age, Gender, Disease.status)
all_samples <- pheno_data %>% dplyr::select(cel_name, Age, Gender, Disease.status)
if (sum(is.na(all_samples)) > 0) {
all_ages <- all_samples$Age
all_genders <- all_samples$Gender
......@@ -214,9 +214,9 @@ for (i in seq_len(length(config$datasets))) {
all_cors <- all_cors[!is.na(all_cors)]
best_feats <- all_cors[all_cors > feat_min_cor]
if (length(best_feats) > feat_max_nb) {
best_feats <- sort(best_feats) %>% tail(feat_max_nb)
best_feats <- sort(best_feats) %>% utils::tail(feat_max_nb)
} else if (length(best_feats) < feat_min_nb) {
best_feats <- sort(best_feats) %>% tail(feat_min_nb)
best_feats <- sort(best_feats) %>% utils::tail(feat_min_nb)
}
best_feats <- names(best_feats)
rm(all_cors, nomissing_ages, nomissing_exp_data)
......
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:02:00
#SBATCH --time=0-00:03:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
......
......@@ -43,7 +43,7 @@ for (i in seq_len(length(config$datasets))) {
# We prepare the clinical data for ggplot by adding an extra column (Gender * Disease.status).
pheno_data <- pheno_data %>%
mutate(gender_disease_status = factor(paste(Gender, Disease.status, sep = ".")))
dplyr::mutate(gender_disease_status = factor(paste(Gender, Disease.status, sep = ".")))
# We repeat the analysis for all VSN usages.
for (j in seq_len(length(config$variance_methods))) {
......
......@@ -107,7 +107,8 @@ for (i in seq_len(length(config$datasets))) {
# We prepare the data for ggplot by adding an extra column (Gender * Disease.status).
D <- pheno_data %>%
mutate(gender_disease_status = factor(paste(Gender, Disease.status, sep = ".")))
dplyr::mutate(gender_disease_status = factor(paste(Gender, Disease.status,
sep = ".")))
message(paste0("[", Sys.time(), "][", dataset_name, "] Data filtered."))
# Compute the AAD metrics (Average Age Difference) between the ages of
......
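The two hunks above add the same derived column, a Gender-by-Disease.status factor used for grouping in ggplot. For reference, `interaction()` builds an equivalent factor in one call (it keeps all level combinations rather than only the observed ones, which is harmless for plotting); a sketch with made-up phenotype rows:

```r
library(dplyr)

# Toy phenotype table with the two columns used above.
pheno_data <- data.frame(Gender = c("F", "M"), Disease.status = c("PD", "Control"))

D <- pheno_data %>%
  dplyr::mutate(gender_disease_status = interaction(Gender, Disease.status, sep = "."))
```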
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:01:00
#SBATCH --time=0-00:02:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
......@@ -22,8 +22,24 @@ CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/04-Prepare_datase
module load lang/R/3.6.0-foss-2018a-X11-20180131-bare
# Actual job.
# Load configuration
source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# Actual cleaning job.
Rscript --vanilla ${CODE_FOLDER}clean_datasets.R > ${OUTPUT_FOLDER}clean_log.out 2> ${OUTPUT_FOLDER}clean_log.err
# Counting the number of samples per category and per dataset.
rm -rf ${OUTPUT_FOLDER}clinical_categories_summarized.tsv
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
cut -f 2-3 ${OUTPUT_FOLDER}${datasetName}_clinical.tsv | grep -v Disease | sort | uniq -c | sed 's/^\s*//' | sed -r 's/\s/\t/g' | awk -v OFS="\t" -F"\t" '{print $3, $2, $1}' | sed -r 's/\t/./' | sed -r 's/^/'"${datasetName}"'\t/g' >> ${OUTPUT_FOLDER}clinical_categories_summarized.tsv
done
# Refine the counts.
Rscript --vanilla ${CODE_FOLDER}refine_counts.R > ${OUTPUT_FOLDER}counts_log.out 2> ${OUTPUT_FOLDER}counts_log.err
# Moving the slurm log file to the data folder.
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
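The `cut | grep | sort | uniq -c | awk | sed` pipeline above turns each `<dataset>_clinical.tsv` into three columns: dataset, Gender.Disease.status category, and sample count. The same counts can be produced in R with `dplyr::count`; a sketch on a toy clinical table (`GSE_example` and the patient rows are invented for illustration):

```r
library(dplyr)

# Toy stand-in for one <dataset>_clinical.tsv table.
clinical <- data.frame(Patient        = c("P1", "P2", "P3", "P4"),
                       Disease.status = c("PD", "PD", "Control", "PD"),
                       Gender         = c("F", "M", "F", "F"),
                       stringsAsFactors = FALSE)

counts <- clinical %>%
  dplyr::count(Gender, Disease.status, name = "Nb") %>%
  dplyr::mutate(Cond = paste(Gender, Disease.status, sep = "."),
                Dataset = "GSE_example") %>%
  dplyr::select(Dataset, Cond, Nb)
# Dataset      Cond       Nb
# GSE_example  F.Control  1
# GSE_example  F.PD       2
# GSE_example  M.PD       1
```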
......@@ -90,9 +90,9 @@ for (i in seq_len(length(config$platforms))) {
# Then, we make sure we deal with duplicates and reorder the df to match the list we have.
# We do not use EntrezGene and EnsEMBL ids.
gene_annots_raw %>%
group_by(PROBEID) %>%
summarize_at(.vars = c("SYMBOL"), .funs = collapser) %>%
ungroup -> gene_annots
dplyr::group_by(PROBEID) %>%
dplyr::summarize_at(.vars = c("SYMBOL"), .funs = collapser) %>%
dplyr::ungroup -> gene_annots
gene_annots <- gene_annots[order(match(gene_annots$PROBEID, unique_probe_ids)), ]
rm(gene_annots_raw)
......
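`collapser` is defined elsewhere in the repository; it merges the multiple gene symbols annotated to a single probe into one string, and the `match()` call then restores the original probe order. A self-contained sketch of the same pattern, with a naive `collapser` re-implemented here for illustration (the real one may differ):

```r
library(dplyr)

# Toy annotation table: probe 1_at maps to two symbols.
gene_annots_raw <- data.frame(PROBEID = c("1_at", "1_at", "2_at"),
                              SYMBOL  = c("ACT1A", "ACT2", "GBA"),
                              stringsAsFactors = FALSE)
unique_probe_ids <- c("2_at", "1_at")

# Naive stand-in for the repository's collapser: join duplicates with "|".
collapser <- function(x) paste(unique(x), collapse = "|")

gene_annots_raw %>%
  dplyr::group_by(PROBEID) %>%
  dplyr::summarize_at(.vars = c("SYMBOL"), .funs = collapser) %>%
  dplyr::ungroup() -> gene_annots

# Reorder the rows so that they follow unique_probe_ids.
gene_annots <- gene_annots[order(match(gene_annots$PROBEID, unique_probe_ids)), ]
```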
......@@ -157,21 +157,21 @@ for (i in seq_len(length(config$platforms))) {
# We remove the unnecessary data to prepare the matching.
deps_annotated <- deps %>%
select(PROBEID, SYMBOL) %>%
rename(probeset = PROBEID)
dplyr::select(PROBEID, SYMBOL) %>%
dplyr::rename(probeset = PROBEID)
# We count the number of genes and add all subsets as extra columns.
deps_annotated <- deps_annotated %>%
rowwise() %>%
mutate(nb_symbols = 1 + str_count(SYMBOL, fixed("|")))
dplyr::rowwise() %>%
dplyr::mutate(nb_symbols = 1 + str_count(SYMBOL, fixed("|")))
# We do not consider the probes with more than X genes in any case.
# X is defined in the local configuration file (defaults to 5).
deps_annotated <- deps_annotated %>%
mutate(symbol_short = ifelse(nb_symbols < config$max_gene_annots, SYMBOL, ""))
dplyr::mutate(symbol_short = ifelse(nb_symbols < config$max_gene_annots, SYMBOL, ""))
deps_annotated <- deps_annotated %>%
rowwise() %>%
mutate(subsets = get_all_subsets(array_as_str = symbol_short))
dplyr::rowwise() %>%
dplyr::mutate(subsets = get_all_subsets(array_as_str = symbol_short))
rm(deps)
message(paste0("[", Sys.time(), "][", platform$platform_name, "] Data ready."))
......@@ -182,8 +182,8 @@ for (i in seq_len(length(config$platforms))) {
# Ex: ACT1A|ACT2 --> 654321_s_at
genes_probe_matching <- vector()
tmp <- deps_annotated %>%
rowwise() %>%
do(tmp = update_map(.$probeset, .$symbol_short))
dplyr::rowwise() %>%
dplyr::do(tmp = update_map(.$probeset, .$symbol_short))
rm(tmp)
# We create the second map by looping over the rows once more.
......@@ -194,8 +194,8 @@ for (i in seq_len(length(config$platforms))) {
# Ex: ACT2 --> ACT1A|ACT2
geneset_probe_matching <- vector()
tmp <- deps_annotated %>%
rowwise() %>%
do(tmp = update_map_subsets(.$symbol_short, .$nb_symbols, .$subsets))
dplyr::rowwise() %>%
dplyr::do(tmp = update_map_subsets(.$symbol_short, .$nb_symbols, .$subsets))
rm(tmp)
message(paste0("[", Sys.time(), "][", platform$platform_name, "] Maps ready."))
......@@ -207,8 +207,8 @@ for (i in seq_len(length(config$platforms))) {
# measured independently via another probe or it is lost. Should 2_at not exist, we would
# keep 1_at and consider that a|b is one single entity (for now).
deps_annotated <- deps_annotated %>%
rowwise() %>%
mutate(conflicts = tag_probe(symbol_short, subsets))
dplyr::rowwise() %>%
dplyr::mutate(conflicts = tag_probe(symbol_short, subsets))
# We remove the probes that target a set of genes if there is a probe that targets a subset,
# possibly in addition to other genes.
......@@ -217,13 +217,14 @@ for (i in seq_len(length(config$platforms))) {
# Focus is on 1_at. We tag it for removal since it is not possible to disentangle
# the signal. Probe 2 is removed for the same reason (when focus is on 2_at).
deps_annotated <- deps_annotated %>%
rowwise() %>%
mutate(conflicts = tag_probe_cmplx(symbol_short, subsets, conflicts))
dplyr::rowwise() %>%
dplyr::mutate(conflicts = tag_probe_cmplx(symbol_short, subsets, conflicts))
message(paste0("[", Sys.time(), "][", platform$platform_name, "] Tagging done."))
# We tag the probes with conflicts with an extra column and save the data.
deps_annotated <- deps_annotated %>%
mutate(use_tag = ifelse(conflicts != "", FALSE, ifelse(symbol_short == "", FALSE, TRUE)))
dplyr::mutate(use_tag = ifelse(conflicts != "", FALSE,
ifelse(symbol_short == "", FALSE, TRUE)))
write.table(deps_annotated,
file = deps_matching_details_fn,
sep = "\t",
......@@ -253,7 +254,7 @@ for (i in seq_len(length(config$platforms))) {
# We replace the NA values.
deps_matching_loc[is.na(deps_matching_loc)] <- ""
deps_matching <- rbind(deps_matching, deps_matching_loc %>%
mutate(source = platform$platform_name))
dplyr::mutate(source = platform$platform_name))
rm(deps_matching_loc_fn, deps_matching_loc, platform)
} # End for each platform.
rm(i)
......@@ -266,8 +267,8 @@ message(paste0("[", Sys.time(), "][Combined] Data ready."))
# Ex: ACT1A|ACT2 --> 654321_s_at
genes_probe_matching <- vector()
tmp <- deps_matching %>%
rowwise() %>%
do(tmp = update_map(.$probeset, .$symbol_short))
dplyr::rowwise() %>%
dplyr::do(tmp = update_map(.$probeset, .$symbol_short))
rm(tmp)
# We create the second map by looping over the rows once more (now across datasets).
......@@ -278,8 +279,8 @@ rm(tmp)
# Ex: ACT2 --> ACT1A|ACT2
geneset_probe_matching <- vector()
tmp <- deps_matching %>%
rowwise() %>%
do(tmp = update_map_subsets(.$symbol_short, .$nb_symbols, .$subsets))
dplyr::rowwise() %>%
dplyr::do(tmp = update_map_subsets(.$symbol_short, .$nb_symbols, .$subsets))
rm(tmp, deps_matching)
message(paste0("[", Sys.time(), "][Combined] Maps ready."))
......@@ -316,8 +317,8 @@ for (i in seq_len(length(config$platforms))) {
# measured independently via another probe or it is lost. Should 2_at not exist, we would
# keep 1_at and consider that a|b is one single entity (for now).
deps_matching <- deps_matching %>%
rowwise() %>%
mutate(conflicts_multiple_ds = tag_probe(symbol_short, subsets))
dplyr::rowwise() %>%
dplyr::mutate(conflicts_multiple_ds = tag_probe(symbol_short, subsets))
# We remove the probes that target a set of genes if there is a probe that targets a subset,
# possibly in addition to other genes (from another array type).
......@@ -326,13 +327,14 @@ for (i in seq_len(length(config$platforms))) {
# Focus is on 1_at. We tag it for removal since it is not possible to disentangle
# the signal. Probe 2 is removed for the same reason (when focus is on 2_at).
deps_matching <- deps_matching %>%
rowwise() %>%
mutate(conflicts_multiple_ds = tag_probe_cmplx(symbol_short, subsets, conflicts_multiple_ds))
dplyr::rowwise() %>%
dplyr::mutate(conflicts_multiple_ds = tag_probe_cmplx(symbol_short, subsets,
conflicts_multiple_ds))
message(paste0("[", Sys.time(), "][", platform$platform_name, "] Tagging done."))
# We tag the probes with conflicts with an extra column and save the data.
deps_matching <- deps_matching %>%
mutate(use_tag_multiple_ds = ifelse(conflicts_multiple_ds != "", FALSE, TRUE))
dplyr::mutate(use_tag_multiple_ds = ifelse(conflicts_multiple_ds != "", FALSE, TRUE))
write.table(deps_matching,
file = deps_matching_details_fn,
sep = "\t",
......@@ -352,17 +354,17 @@ for (i in seq_len(length(config$platforms))) {
# We create in addition the "gene entity to probes" map for the current array type.
deps_matching <- deps_matching %>%
filter(use_tag == TRUE, use_tag_multiple_ds == TRUE) %>%
select(probeset, SYMBOL)
dplyr::filter(use_tag == TRUE, use_tag_multiple_ds == TRUE) %>%
dplyr::select(probeset, SYMBOL)
deps_matching <- aggregate(deps_matching$probeset,
by = list(genes = deps_matching$SYMBOL),
FUN = paste)
deps_matching <- deps_matching %>%
rename(probesets = x) %>%
rowwise() %>%
mutate(probesets = paste(unlist(strsplit(probesets, ", ")), collapse = "|"))
dplyr::rename(probesets = x) %>%
dplyr::rowwise() %>%
dplyr::mutate(probesets = paste(unlist(strsplit(probesets, ", ")), collapse = "|"))
global_matching_data[[i]] <- deps_matching %>%
rename(!!platform$platform_name := probesets)
dplyr::rename(!!platform$platform_name := probesets)
rm(deps_matching, matching_dataset, platform)
} # End for each platform.
rm(i, genes_probe_matching, geneset_probe_matching)
......
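The `aggregate()` / `rename` / `rowwise` / `paste` sequence in the previous hunk builds, for every retained gene entity, the `|`-separated list of probes measuring it. The same table can be obtained with a single grouped summarise; a sketch on toy data (probe and gene names invented):

```r
library(dplyr)

deps_matching <- data.frame(probeset = c("1_at", "2_at", "3_at"),
                            SYMBOL   = c("GBA", "GBA", "SNCA"),
                            stringsAsFactors = FALSE)

gene_to_probes <- deps_matching %>%
  dplyr::group_by(genes = SYMBOL) %>%
  dplyr::summarize(probesets = paste(probeset, collapse = "|")) %>%
  dplyr::ungroup()
# genes  probesets
# GBA    1_at|2_at
# SNCA   3_at
```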
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --time=0-0:20:00
#SBATCH --time=0-0:10:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
......
......@@ -265,22 +265,22 @@ for (i in seq_len(length(config$biomarkers))) {
# Final merge before plotting to get all in one.
plot_data <- rbind(
data.frame(SN_data_vsn) %>% mutate(tissue = "SN") %>%
mutate(category = "VSN (EXPRS)"),
data.frame(SN_data_novsn) %>% mutate(tissue = "SN") %>%
mutate(category = "NO-VSN (EXPRS)"),
data.frame(SN_data_vsn_zsc) %>% mutate(tissue = "SN") %>%
mutate(category = "VSN (ZSCORES)"),
data.frame(SN_data_novsn_zsc) %>% mutate(tissue = "SN") %>%
mutate(category = "NO-VSN (ZSCORES)"),
data.frame(DA_data_vsn) %>% mutate(tissue = "DA") %>%
mutate(category = "VSN (EXPRS)"),
data.frame(DA_data_novsn) %>% mutate(tissue = "DA") %>%
mutate(category = "NO-VSN (EXPRS)"),
data.frame(DA_data_vsn_zsc) %>% mutate(tissue = "DA") %>%
mutate(category = "VSN (ZSCORES)"),
data.frame(DA_data_novsn_zsc) %>% mutate(tissue = "DA") %>%
mutate(category = "NO-VSN (ZSCORES)"))
data.frame(SN_data_vsn) %>% dplyr::mutate(tissue = "SN") %>%
dplyr::mutate(category = "VSN (EXPRS)"),
data.frame(SN_data_novsn) %>% dplyr::mutate(tissue = "SN") %>%
dplyr::mutate(category = "NO-VSN (EXPRS)"),
data.frame(SN_data_vsn_zsc) %>% dplyr::mutate(tissue = "SN") %>%
dplyr::mutate(category = "VSN (ZSCORES)"),
data.frame(SN_data_novsn_zsc) %>% dplyr::mutate(tissue = "SN") %>%
dplyr::mutate(category = "NO-VSN (ZSCORES)"),
data.frame(DA_data_vsn) %>% dplyr::mutate(tissue = "DA") %>%
dplyr::mutate(category = "VSN (EXPRS)"),
data.frame(DA_data_novsn) %>% dplyr::mutate(tissue = "DA") %>%
dplyr::mutate(category = "NO-VSN (EXPRS)"),
data.frame(DA_data_vsn_zsc) %>% dplyr::mutate(tissue = "DA") %>%
dplyr::mutate(category = "VSN (ZSCORES)"),
data.frame(DA_data_novsn_zsc) %>% dplyr::mutate(tissue = "DA") %>%
dplyr::mutate(category = "NO-VSN (ZSCORES)"))
plot_data$category <- factor(plot_data$category, levels = c("VSN (EXPRS)", "NO-VSN (EXPRS)",
"VSN (ZSCORES)", "NO-VSN (ZSCORES)"))
rm(SN_data_vsn, SN_data_novsn, SN_data_vsn_zsc, SN_data_novsn_zsc)
......
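The eight `rbind`/`mutate` branches above differ only in the input data frame, the tissue, and the category label. If this grows further, a named list plus `purrr::imap_dfr` keeps the labels next to the data; a sketch with toy stand-ins for two of the eight blocks (the remaining ones would be added to the list in the same way):

```r
library(dplyr)
library(purrr)

# Toy stand-ins for two of the expression tables combined above.
SN_data_vsn   <- data.frame(value = c(1.2, 3.4))
DA_data_novsn <- data.frame(value = c(0.7, 2.1))

blocks <- list("SN|VSN (EXPRS)"    = SN_data_vsn,
               "DA|NO-VSN (EXPRS)" = DA_data_novsn)

plot_data <- purrr::imap_dfr(blocks, function(d, key) {
  parts <- strsplit(key, "|", fixed = TRUE)[[1]]
  dplyr::mutate(d, tissue = parts[1], category = parts[2])
})
```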
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:15:00
#SBATCH --time=0-00:01:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
......
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("tidyverse")
library("reshape2")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Main
# ================================================================================================
# We read the raw counts performed in bash.
nb_samples_fn <- paste0(output_data_dir, "clinical_categories_summarized.tsv")
nb_samples_raw <- read.table(nb_samples_fn,
header = FALSE,
sep = "\t",
row.names = NULL,
as.is = TRUE)
names(nb_samples_raw) <- c("Dataset", "Cond", "Nb")
# We go from the long to the wide format.
nb_samples <- dcast(nb_samples_raw, Dataset ~ Cond, value.var = "Nb")
nb_samples[is.na(nb_samples)] <- 0
# We add more columns based on the 4 initial categories.
# We first sum to get the total counts for more generic categories (i.e., F = F.PD + F.Control).
# We then also compute the minimum of two or more categories (to assess whether we can run a
# differential analysis or whether there are not enough samples).
nb_samples <- nb_samples %>%
dplyr::mutate(F = F.PD + F.Control, M = M.PD + M.Control, PD = M.PD + F.PD,
Control = M.Control + F.Control, All = PD + Control) %>%
dplyr::rowwise() %>%
dplyr::mutate(Gender = min(F, M), Disease_status = min(PD, Control)) %>%
dplyr::mutate(Disease_status_females = min(F.PD, F.Control),
Disease_status_males = min(M.PD, M.Control),
Gender_PD = min(M.PD, F.PD),
Gender_Control = min(M.Control, F.Control),
Gender_disease_status = min(F.PD, F.Control, M.PD, M.Control))
# We finalize the data-frame with row names and no missing values.
nb_samples[is.na(nb_samples)] <- 0
nb_samples <- data.frame(nb_samples)
row.names(nb_samples) <- str_replace_all(nb_samples$Dataset, "-", ".")
# We save it again (to the same file).
write.table(nb_samples,
file = nb_samples_fn,
sep = "\t",
quote = FALSE)
remove(nb_samples_raw, nb_samples, nb_samples_fn)
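For clarity, a toy version of the long-to-wide step performed by this script (the category names mirror those produced by the bash counting step; the counts are invented):

```r
library(reshape2)

nb_samples_raw <- data.frame(Dataset = c("GSE1", "GSE1", "GSE1", "GSE2"),
                             Cond    = c("F.PD", "F.Control", "M.PD", "M.PD"),
                             Nb      = c(5, 4, 10, 8))

nb_samples <- reshape2::dcast(nb_samples_raw, Dataset ~ Cond, value.var = "Nb")
nb_samples[is.na(nb_samples)] <- 0
# Dataset  F.Control  F.PD  M.PD
# GSE1             4     5    10
# GSE2             0     0     8
```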
......@@ -30,6 +30,17 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Main
# ================================================================================================
# For the integration, we need to know how many samples there are in each dataset
# and for each category (M.PD, F.PD, M.Control, F.Control).
nb_samples_fn <- paste0(input_data_dir, "clinical_categories_summarized.tsv")
nb_samples <- read.table(nb_samples_fn,
header = TRUE,
sep = "\t",
as.is = TRUE,
row.names = 1)
remove(nb_samples_fn)
# We do all datasets one by one.
for (i in seq_len(length(config$datasets))) {
......@@ -43,9 +54,9 @@ for (i in seq_len(length(config$datasets))) {
pheno_data_fn,
use_factors = FALSE))
pheno_data <- pheno_data %>%
mutate(gender_disease_status = paste(pheno_data$Gender,
pheno_data$Disease.status,
sep = "."))
dplyr::mutate(gender_disease_status = paste(pheno_data$Gender,
pheno_data$Disease.status,
sep = "."))
rm(pheno_data_fn)
# We get the batch data from the batch file if it exists.
......@@ -58,22 +69,6 @@ for (i in seq_len(length(config$datasets))) {
}
rm(batch_data_file)
# We count the number of samples per category to only perform the limma analyses
# we can perform (aka, we do not perform the factorial ones if we have only 1 sample
# in any of the four categories).
counts <- list()
counts$Gender <- min(table(pheno_data$Gender))
counts$Disease_status <- min(table(pheno_data$Disease.status))
counts$Disease_status_females <- min(table( (pheno_data %>%
filter(Gender == "F"))$Disease.status))
counts$Disease_status_males <- min(table( (pheno_data %>%
filter(Gender == "M"))$Disease.status))
counts$Gender_PD <- min(table( (pheno_data %>%
filter(Disease.status == "PD"))$Gender))
counts$Gender_Control <- min(table( (pheno_data %>%
filter(Disease.status == "Control"))$Gender))
counts$Gender_disease_status <- min(counts$Gender_PD, counts$Gender_Control)
# We repeat the analysis for all VSN usages.
for (j in seq_len(length(config$variance_methods))) {
......@@ -108,9 +103,10 @@ for (i in seq_len(length(config$datasets))) {
# Then, we make sure we deal with duplicates and reorder the df to match the eset.
gene_annots <- gene_annots_raw %>%
group_by(PROBEID) %>%
summarize_at(.vars = c("SYMBOL", "ENSEMBL", "ENTREZID"), .funs = collapser) %>%
ungroup
dplyr::group_by(PROBEID) %>%
dplyr::summarize_at(.vars = c("SYMBOL", "ENSEMBL", "ENTREZID"),
.funs = collapser) %>%
dplyr::ungroup
gene_annots <- gene_annots[order(match(gene_annots$PROBEID, rownames(exp_eset))), ]
rm(gene_annots_raw)
message(paste0("[", Sys.time(), "][", dataset_name, "][", vsn$name,
......@@ -150,7 +146,7 @@ for (i in seq_len(length(config$datasets))) {
# We only do the limma analysis when it is possible, that is when we have at least 2 samples
# for each of the categories we are comparing. For instance, when there are no female PD
# patients (GSE7307), we cannot do any comparison that involves female PD patients.
if (counts[[limma_parameters$clinical_factor]] < 2) {
if (nb_samples[dataset_name, limma_parameters$clinical_factor] < 2) {
next;
}
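With the per-dataset counts now read from `clinical_categories_summarized.tsv` (row names = dataset, one column per clinical factor), the feasibility check is a plain data-frame lookup instead of the per-dataset `counts` list. A toy illustration with invented numbers (in the script itself, the failing case triggers `next` to skip to the following limma configuration):

```r
# Toy stand-in for the summarized counts table read at the top of the script.
nb_samples <- data.frame(Gender = c(12, 14), Disease_status_females = c(9, 0),
                         row.names = c("GSE_example", "GSE7307"))

dataset_name <- "GSE7307"
limma_parameters <- list(clinical_factor = "Disease_status_females")

# Skip the analysis when the smallest category has fewer than 2 samples.
if (nb_samples[dataset_name, limma_parameters$clinical_factor] < 2) {
  message("Skipping ", limma_parameters$clinical_factor, " for ", dataset_name)
}
```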
......@@ -206,9 +202,9 @@ for (i in seq_len(length(config$datasets))) {
} # End for each Limma analysis.
rm(k, vsn, exp_eset)
} # End of each VSN configuration.
rm(j, dataset, dataset_name, counts, pheno_data)
rm(j, dataset, dataset_name, pheno_data)
} # End for each dataset.
rm(i, config, raw_data_dir, output_data_dir, input_data_dir)
rm(i, config, raw_data_dir, output_data_dir, input_data_dir, nb_samples)
# We log the session details for reproducibility.
sessionInfo()
......@@ -15,6 +15,7 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/05/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/05-Get_DEGs/
......@@ -25,5 +26,8 @@ module load lang/R/3.6.0-foss-2018a-X11-20180131-bare
# Actual job.
Rscript --vanilla ${CODE_FOLDER}get_DEGs.R > ${OUTPUT_FOLDER}run_log.out 2> ${OUTPUT_FOLDER}run_log.err
# We also copy the sample counts per category and per dataset to the output folder for the next step.
cp ${INPUT_FOLDER}clinical_categories_summarized.tsv ${OUTPUT_FOLDER}clinical_categories_summarized.tsv
# Moving the slurm log file to the data folder.
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
......@@ -5,12 +6,15 @@ clean:
@rm -rf *~
clean_outputs:
@rm -rf ${OUTPUT_FOLDER}*
@cp ${INPUT_FOLDER}Combined_probe_matching.tsv ${OUTPUT_FOLDER}Combined_probe_matching.tsv
summarize:
@sbatch ${CODE_FOLDER}summarize.sh
integrate:
@sbatch ${CODE_FOLDER}integrate.sh
analyse:
@sbatch ${CODE_FOLDER}analyse.sh
check:
@sbatch ${CODE_FOLDER}check.sh
doc:
@sbatch ${CODE_FOLDER}doc.sh
xp:
......
......@@ -9,21 +9,17 @@ make summarize
```