Commit fdf6b966 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Prepare the rankings for all integration scheme instead of only the main scheme.

parent af9a48ef
......@@ -62,86 +62,89 @@ compute_overlap <- function(row) {
# avoid inconsistencies, we compute the overlap as to be able to select the genes for which the
# male and female signals are consistent within most the datasets.
# We do only the SNage integration scheme.
i <- 3
integration <- config$integrations[[i]]
int_criteria <- str_split(integration$criteria, ",")[[1]]
# We perform the analysis only for the "VSN" case.
j <- 1
vsn <- config$variance_methods[[j]]
# We use the first gene-probe selection method (max-avg) only.
l <- 1
selection <- config$selections[[l]]
# A- we first handle the results of the female analysis. We will read the raw results
# and identify which datasets have been used for each gene (in females).
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
# We do all integration schemes one by one.
for (i in seq_len(length(config$integrations))) {
# We get the integration name for that scheme.
integration <- config$integrations[[i]]
int_criteria <- str_split(integration$criteria, ",")[[1]]
# We perform the analysis only for the "VSN" case.
j <- 1
vsn <- config$variance_methods[[j]]
# We use the first gene-probe selection method (max-avg) only.
l <- 1
selection <- config$selections[[l]]
# A- we first handle the results of the female analysis. We will read the raw results
# and identify which datasets have been used for each gene (in females).
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")
res_females <- read.delim(res_females_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
datasets <- colnames(res_females)[2:9]
datasets <- datasets[datasets != "GSE20164"]
res_females$used_datasets <- apply(res_females, 1, identify_datasets, datasets = datasets)
rm(k, limma_analysis)
rm(datasets, res_females_fn)
# B- we then handle the results of the male analysis. We will read the raw results again
# and identify which datasets have been used for each gene (in males).
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")
res_females <- read.delim(res_females_fn,
res_males <- read.delim(res_males_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
datasets <- colnames(res_females)[2:9]
datasets <- datasets[datasets != "GSE20164"]
res_females$used_datasets <- apply(res_females, 1, identify_datasets, datasets = datasets)
rm(k, limma_analysis)
rm(datasets, res_females_fn)
# B- we then handle the results of the male analysis. We will read the raw results again
# and identify which datasets have been used for each gene (in males).
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")
res_males <- read.delim(res_males_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
datasets <- colnames(res_males)[2:9]
datasets <- datasets[datasets != "Moreira"]
res_males$used_datasets <- apply(res_males, 1, identify_datasets, datasets = datasets)
rm(k, limma_analysis)
rm(datasets, res_males_fn)
# C- We compute the overlap between the used datasets in the female and male analyses.
# This is done per gene again. We additionally get the number of used datasets for female,
# male, common datasets. This overlap is also expressed as a percentage with respect to
# smallest of the female and male sets of datasets.
FM <- merge(x = res_females, y = res_males, by = "SYMBOL") %>%
mutate(female_used_datasets = used_datasets.x, male_used_datasets = used_datasets.y) %>%
select(SYMBOL, female_used_datasets, male_used_datasets)
x <- t(apply(FM, 1, compute_overlap))
FM$overlap_datasets <- x[,1]
FM$female_nb_used_datasets <- as.numeric(x[,2])
FM$male_nb_used_datasets <- as.numeric(x[,3])
FM$nb_overlap_datasets <- as.numeric(x[,4])
FM$overlap_percentage <- as.numeric(x[,5])
rm(x, res_females, res_males)
FM_ref <- FM %>% select(SYMBOL, overlap_percentage)
# We save the complete refining data as to keep track of things.
refining_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_ALL_",
selection$name, "_datasetoverlap_fulldata.tsv")
write.table(FM, refining_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(FM, refining_fn)
# We save the simplified data (to be used directly for merging).
refining_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_ALL_",
selection$name, "_datasetoverlap.tsv")
write.table(FM_ref, refining_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(FM_ref, refining_fn)
datasets <- colnames(res_males)[2:9]
datasets <- datasets[datasets != "Moreira"]
res_males$used_datasets <- apply(res_males, 1, identify_datasets, datasets = datasets)
rm(k, limma_analysis)
rm(datasets, res_males_fn)
# C- We compute the overlap between the used datasets in the female and male analyses.
# This is done per gene again. We additionally get the number of used datasets for female,
# male, common datasets. This overlap is also expressed as a percentage with respect to
# smallest of the female and male sets of datasets.
FM <- merge(x = res_females, y = res_males, by = "SYMBOL") %>%
mutate(female_used_datasets = used_datasets.x, male_used_datasets = used_datasets.y) %>%
select(SYMBOL, female_used_datasets, male_used_datasets)
x <- t(apply(FM, 1, compute_overlap))
FM$overlap_datasets <- x[,1]
FM$female_nb_used_datasets <- as.numeric(x[,2])
FM$male_nb_used_datasets <- as.numeric(x[,3])
FM$nb_overlap_datasets <- as.numeric(x[,4])
FM$overlap_percentage <- as.numeric(x[,5])
rm(x, res_females, res_males)
FM_ref <- FM %>% select(SYMBOL, overlap_percentage)
# We save the complete refining data as to keep track of things.
refining_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_ALL_",
selection$name, "_datasetoverlap_fulldata.tsv")
write.table(FM, refining_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(FM, refining_fn)
# We save the simplified data (to be used directly for merging).
refining_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_ALL_",
selection$name, "_datasetoverlap.tsv")
write.table(FM_ref, refining_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(FM_ref, refining_fn)
rm(l, selection, vsn, j, integration, int_criteria)
} # End for each integration scheme.
# We clean the workspace and log the session details for reproducibility.
rm(l, selection, vsn, j, integration, int_criteria, i)
rm(config, output_data_dir, input_data_dir, input_edata_dir, input_rsedata_dir)
rm(i, config, output_data_dir, input_data_dir, input_edata_dir, input_rsedata_dir)
sessionInfo()
......@@ -387,16 +387,18 @@ for (i in seq_len(length(config$integrations))) {
# We now need to compute the confidence score that is based on the number of supporting,
# conflicting and missing P values.
# At some point, we wanted to normalize the dataset weights so that they sum up to one,
# but it is no longer the case.
dw <- dataset_weights # / sum(dataset_weights)
da <- pvalues_values_csss
db <- pvalues_values_conf
da[!is.na(da)] <- 1
db[!is.na(db)] <- 1
sum_a <- apply(t(t(da) * dataset_weights), 1, sum, na.rm = TRUE)
sum_b <- apply(t(t(db) * dataset_weights), 1, sum, na.rm = TRUE)
sum_c <- sum(dataset_weights) - sum_a - sum_b
sum_a <- apply(t(t(da) * dw), 1, sum, na.rm = TRUE)
sum_b <- apply(t(t(db) * dw), 1, sum, na.rm = TRUE)
sum_c <- sum(dw) - sum_a - sum_b
gamma <- 1 / (max(sum_c) + 1)
foldchanges[["confidence_score"]] <- (sum_a - sum_b + gamma * sum_c) /
sum(dataset_weights)
foldchanges[["confidence_score"]] <- (sum_a - sum_b + gamma * sum_c) / sum(dw)
rm(da, db, sum_a, sum_b, sum_c, gamma)
# We then filter the data again based on the confidence.
......
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --time=0-00:05:00
#SBATCH --time=0-00:10:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment