Commit 69511fa6 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Refactoring step 06 to process all datasets at once (Part III).

parent d3a82871
......@@ -69,47 +69,45 @@ for (i in seq_len(length(config$integrations))) {
integration <- config$integrations[[i]]
int_criteria <- str_split(integration$criteria, ",")[[1]]
# We perform the analysis only for the "VSN" case.
j <- 1
vsn <- config$variance_methods[[j]]
# We use the first gene-probe selection method (max-avg) only.
l <- 1
selection <- config$selections[[l]]
# A- we first handle the results of the female analysis. We will read the raw results
# and identify which datasets have been used for each gene (in females).
k <- 5
# A- we first read the raw results of the female analysis.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")
res_females <- read.delim(res_females_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
female_datasets <- str_replace(colnames(res_females)[startsWith(colnames(res_females),
"P_")], "P_", "")
rm(k, limma_analysis, res_females_fn)
# B- we first read the raw results of the male analysis.
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
res_males_fn <- paste0(output_data_dir, integration$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")
res_females <- read.delim(res_females_fn,
res_males <- read.delim(res_males_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
datasets <- colnames(res_females)[2:9]
datasets <- datasets[datasets != "GSE20164"]
male_datasets <- str_replace(colnames(res_males)[startsWith(colnames(res_males),
"P_")], "P_", "")
rm(k, limma_analysis, res_males_fn)
# C- We then identify which datasets have been used for each gene (for both
# males and females).
datasets <- intersect(female_datasets, male_datasets)
res_females$used_datasets <- apply(res_females, 1, identify_datasets, datasets = datasets)
rm(k, limma_analysis)
rm(datasets, res_females_fn)
res_males$used_datasets <- apply(res_males, 1, identify_datasets, datasets = datasets)
rm(datasets, female_datasets, male_datasets)
# B- we then handle the results of the male analysis. We will read the raw results again
# and identify which datasets have been used for each gene (in males).
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
limma_analysis$name, "_", selection$name, "_integration_raw.tsv")
res_males <- read.delim(res_males_fn,
row.names = 1,
quote = "",
stringsAsFactors = FALSE)
datasets <- colnames(res_males)[2:9]
datasets <- datasets[datasets != "Moreira"]
res_males$used_datasets <- apply(res_males, 1, identify_datasets, datasets = datasets)
rm(k, limma_analysis)
rm(datasets, res_males_fn)
# C- We compute the overlap between the used datasets in the female and male analyses.
# D- We compute the overlap between the used datasets in the female and male analyses.
# This is done per gene again. We additionally get the number of used datasets for female,
# male, common datasets. This overlap is also expressed as a percentage with respect to
# smallest of the female and male sets of datasets.
......@@ -126,7 +124,7 @@ for (i in seq_len(length(config$integrations))) {
FM_ref <- FM %>% select(SYMBOL, overlap_percentage)
# We save the complete refining data as to keep track of things.
refining_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_ALL_",
refining_fn <- paste0(output_data_dir, integration$name, "_ALL_",
selection$name, "_datasetoverlap_fulldata.tsv")
write.table(FM, refining_fn,
quote = FALSE,
......@@ -135,14 +133,14 @@ for (i in seq_len(length(config$integrations))) {
rm(FM, refining_fn)
# We save the simplified data (to be used directly for merging).
refining_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_ALL_",
refining_fn <- paste0(output_data_dir, integration$name, "_ALL_",
selection$name, "_datasetoverlap.tsv")
write.table(FM_ref, refining_fn,
quote = FALSE,
sep = "\t",
row.names = FALSE)
rm(FM_ref, refining_fn)
rm(l, selection, vsn, j, integration, int_criteria)
rm(l, selection, integration, int_criteria)
} # End for each integration scheme.
# We clean the workspace and log the session details for reproducibility.
......
......@@ -175,13 +175,13 @@ for (i in seq_len(length(config$integrations))) {
# and the raw results contain more of these non expressed or non differentially
# expressed genes).
limmas <- config$limma_analyses
F_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
F_fn <- paste0(output_data_dir, integration$name, "_", limmas[[5]]$name,
"_max-avg_integration.tsv")
Fr_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
Fr_fn <- paste0(output_data_dir, integration$name, "_", limmas[[5]]$name,
"_max-avg_integration_raw.tsv")
M_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
M_fn <- paste0(output_data_dir, integration$name, "_", limmas[[6]]$name,
"_max-avg_integration.tsv")
Mr_fn <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
Mr_fn <- paste0(output_data_dir, integration$name, "_", limmas[[6]]$name,
"_max-avg_integration_raw.tsv")
F <- read.delim(F_fn, stringsAsFactors = FALSE, row.names = 1)
Fr <- read.delim(Fr_fn, stringsAsFactors = FALSE, row.names = 1)
......@@ -191,8 +191,8 @@ for (i in seq_len(length(config$integrations))) {
# We start by merging the male and female rankings. Most genes are present in both and the few
# that are not are removed (since we can not say whether they are gender specific or not).
## FM <- merge(x = F, y = Mr, by = "SYMBOL", all.x = TRUE)
## MF <- merge(x = M, y = Fr, by = "SYMBOL", all.x = TRUE)
## FM <- merge(x = F, y = Mr, by = "SYMBOL", all.x = TRUE)
## MF <- merge(x = M, y = Fr, by = "SYMBOL", all.x = TRUE)
FM <- merge(x = Fr, y = Mr, by = "SYMBOL")
MF <- merge(x = Mr, y = Fr, by = "SYMBOL")
rm(Fr, Mr, F, M)
......@@ -243,18 +243,18 @@ for (i in seq_len(length(config$integrations))) {
rm(FM, MF)
# We save the full data.
FM_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
FM_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[5]]$name,
"_max-avg_genderspecificityscore_fulldata.tsv")
MF_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
MF_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[6]]$name,
"_max-avg_genderspecificityscore_fulldata.tsv")
write.table(FM_enriched, file = FM_ofile, sep = "\t", quote = FALSE, col.names = NA)
write.table(MF_enriched, file = MF_ofile, sep = "\t", quote = FALSE, col.names = NA)
rm(FM_ofile, MF_ofile)
# We save the simplified data (to be merged later on).
FM_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
FM_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[5]]$name,
"_max-avg_genderspecificityscore.tsv")
MF_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
MF_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[6]]$name,
"_max-avg_genderspecificityscore.tsv")
FM_enriched_short <- FM_enriched %>% select(Gene, ref_logFC, ref_Pval, ref_adj_Pval, ctrl_logFC,
ctrl_Pval, ctrl_adj_Pval, gender_specific_score)
......@@ -280,9 +280,9 @@ for (i in seq_len(length(config$integrations))) {
# This display the effect of the correction (vertically shift = correction effect).
# The bottom line is the genes for which there is litle effect. Its position
# also reflects how many genes are corrected.
FM_deltaplot_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[5]]$name,
FM_deltaplot_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[5]]$name,
"_max-avg_specific_deltaplot.png")
MF_deltaplot_ofile <- paste0(output_data_dir, integration$name, "_VSN_", limmas[[6]]$name,
MF_deltaplot_ofile <- paste0(output_data_dir, integration$name, "_", limmas[[6]]$name,
"_max-avg_specific_deltaplot.png")
png(FM_deltaplot_ofile)
plot(FM_enriched$ref_pivalue_rank, rank(FM_enriched$Delta))
......
This diff is collapsed.
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --time=0-0:10:00
#SBATCH --time=0-0:12:00
#SBATCH -p batch
#SBATCH --qos=normal
......
......@@ -64,9 +64,7 @@ for (i in seq_len(length(config$integrations))) {
# We get the integration name for that scheme.
integration <- config$integrations[[i]]
# Default parameters for the VSN and probe selection method.
j <- 1
vsn <- config$variance_methods[[j]]
# Default parameters for the probe selection method.
l <- 1
selection <- config$selections[[l]]
......@@ -74,7 +72,7 @@ for (i in seq_len(length(config$integrations))) {
# A: reading the results of the previous steps.
#################################################################################################
# We first read the dataset overlap data (by definition, common to males and females).
data_over_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_ALL_",
data_over_fn <- paste0(output_data_dir, integration$name, "_ALL_",
selection$name, "_datasetoverlap.tsv")
data_over <- read.delim(data_over_fn,
quote = "",
......@@ -85,7 +83,7 @@ for (i in seq_len(length(config$integrations))) {
# We then read the results of the integration of the female analyses.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
res_females_fn <- paste0(output_data_dir, integration$name, "_",
limma_analysis$name, "_", selection$name, "_integration.tsv")
res_females <- read.delim(res_females_fn,
row.names = 1,
......@@ -98,7 +96,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn)
# We then read the gender-specificity scores for the female analysis.
spec_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
spec_females_fn <- paste0(output_data_dir, integration$name, "_",
limma_analysis$name, "_", selection$name, "_genderspecificityscore.tsv")
spec_females <- read.delim(spec_females_fn,
row.names = 1,
......@@ -110,7 +108,7 @@ for (i in seq_len(length(config$integrations))) {
# We repeat the same procedure for the results of the integration of the males analyses.
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
res_males_fn <- paste0(output_data_dir, integration$name, "_",
limma_analysis$name, "_", selection$name, "_integration.tsv")
res_males <- read.delim(res_males_fn,
row.names = 1,
......@@ -122,8 +120,8 @@ for (i in seq_len(length(config$integrations))) {
log_fold_change, P_value, adj_P_value)
rm(res_males_fn)
# And the gender-specificity scores for the female analysis.
spec_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
# And the gender-specificity scores for the male analysis.
spec_males_fn <- paste0(output_data_dir, integration$name, "_",
limma_analysis$name, "_", selection$name, "_genderspecificityscore.tsv")
spec_males <- read.delim(spec_males_fn,
row.names = 1,
......@@ -135,7 +133,7 @@ for (i in seq_len(length(config$integrations))) {
# We repeat the same procedure for the results of the integration of the full analyses (GDS).
k <- 7
limma_analysis <- config$limma_analyses[[k]]
res_gds_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_",
res_gds_fn <- paste0(output_data_dir, integration$name, "_",
limma_analysis$name, "_", selection$name, "_integration.tsv")
res_gds <- read.delim(res_gds_fn,
row.names = 1,
......@@ -239,7 +237,7 @@ for (i in seq_len(length(config$integrations))) {
# We save the files we have generated, starting with the female gender-specific genes.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
GS_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
GS_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_genderspecific_table.tsv")
write.table(GS_females, GS_females_fn,
quote = FALSE,
......@@ -250,7 +248,7 @@ for (i in seq_len(length(config$integrations))) {
# And the male gender-specific genes.
k <- 6
limma_analysis <- config$limma_analyses[[k]]
GS_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
GS_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_genderspecific_table.tsv")
write.table(GS_males, GS_males_fn,
quote = FALSE,
......@@ -261,7 +259,7 @@ for (i in seq_len(length(config$integrations))) {
# The gender-dimorphic genes.
k <- 7
limma_analysis <- config$limma_analyses[[k]]
GD_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
GD_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_genderdimorphic_table.tsv")
write.table(GD, GD_fn,
quote = FALSE,
......@@ -280,7 +278,7 @@ for (i in seq_len(length(config$integrations))) {
# We start with the rankings based on pi values, for all genes.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_all_pivalue_rankings.tsv")
write.table(res_females %>%
mutate(ranking_value = pi_value) %>%
......@@ -293,7 +291,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_all_pivalue_rankings.tsv")
write.table(res_males %>%
mutate(ranking_value = pi_value) %>%
......@@ -308,7 +306,7 @@ for (i in seq_len(length(config$integrations))) {
# We continue with the rankings based on gender-specificity scores, for all genes.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_all_gdrscore_rankings.tsv")
write.table(merge(res_females %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
spec_females %>% mutate(ranking_value = gender_specific_score) %>%
......@@ -321,7 +319,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_all_gdrscore_rankings.tsv")
write.table(merge(res_males %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
spec_males %>% mutate(ranking_value = gender_specific_score) %>%
......@@ -336,7 +334,7 @@ for (i in seq_len(length(config$integrations))) {
# We continue with the rankings based on both metrics, for all genes.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_all_combinedscores_rankings.tsv")
n <- dim(res_females)[1]
write.table(merge(res_females %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
......@@ -351,7 +349,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis, n)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_all_combinedscores_rankings.tsv")
n <- dim(res_males)[1]
write.table(merge(res_males %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
......@@ -368,7 +366,7 @@ for (i in seq_len(length(config$integrations))) {
# We continue with the rankings based on pi values, for the gender-specific genes only.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gs_pivalue_rankings.tsv")
write.table(GS_females %>%
mutate(log_fold_change = log_fold_change.F, P_value = P_value.F, adj_P_value = adj_P_value.F, pi_value = pi_value.F) %>%
......@@ -383,7 +381,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gs_pivalue_rankings.tsv")
write.table(GS_males %>%
mutate(log_fold_change = log_fold_change.M, P_value = P_value.M, adj_P_value = adj_P_value.M, pi_value = pi_value.M) %>%
......@@ -401,7 +399,7 @@ for (i in seq_len(length(config$integrations))) {
# genes only.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gs_gdrscore_rankings.tsv")
write.table(merge(GS_females %>%
mutate(log_fold_change = log_fold_change.F, P_value = P_value.F, adj_P_value = adj_P_value.F, pi_value = pi_value.F) %>%
......@@ -418,7 +416,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gs_gdrscore_rankings.tsv")
write.table(merge(GS_males %>%
mutate(log_fold_change = log_fold_change.M, P_value = P_value.M, adj_P_value = adj_P_value.M, pi_value = pi_value.M) %>%
......@@ -438,7 +436,7 @@ for (i in seq_len(length(config$integrations))) {
# genes only.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gs_combinedscores_rankings.tsv")
n <- dim(GS_females)[1]
write.table(merge(GS_females %>%
......@@ -456,7 +454,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis, n)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gs_combinedscores_rankings.tsv")
n <- dim(GS_males)[1]
write.table(merge(GS_males %>%
......@@ -476,7 +474,7 @@ for (i in seq_len(length(config$integrations))) {
# We continue with the rankings based on pi values, for the gender-dimorphic genes only.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gd_pivalue_rankings.tsv")
write.table(res_females %>%
select(Gene, log_fold_change, P_value, adj_P_value, pi_value) %>%
......@@ -490,7 +488,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gd_pivalue_rankings.tsv")
write.table(res_males %>%
select(Gene, log_fold_change, P_value, adj_P_value, pi_value) %>%
......@@ -507,7 +505,7 @@ for (i in seq_len(length(config$integrations))) {
# genes only.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gd_gdrscore_rankings.tsv")
write.table(merge(res_females %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
spec_females %>% mutate(ranking_value = gender_specific_score) %>%
......@@ -522,7 +520,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gd_gdrscore_rankings.tsv")
write.table(merge(res_males %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
spec_males %>% mutate(ranking_value = gender_specific_score) %>%
......@@ -540,7 +538,7 @@ for (i in seq_len(length(config$integrations))) {
# genes only.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gd_combinedscores_rankings.tsv")
n <- dim(res_females)[1]
write.table(merge(res_females %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
......@@ -556,7 +554,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis, n)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gd_combinedscores_rankings.tsv")
n <- dim(res_males)[1]
write.table(merge(res_males %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
......@@ -575,7 +573,7 @@ for (i in seq_len(length(config$integrations))) {
# gender-dimorphic genes only.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gsgd_pivalue_rankings.tsv")
write.table(res_females %>%
select(Gene, log_fold_change, P_value, adj_P_value, pi_value) %>%
......@@ -589,7 +587,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gsgd_pivalue_rankings.tsv")
write.table(res_males %>%
select(Gene, log_fold_change, P_value, adj_P_value, pi_value) %>%
......@@ -606,7 +604,7 @@ for (i in seq_len(length(config$integrations))) {
# gender-dimorphic genes only.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gsgd_gdrscore_rankings.tsv")
write.table(merge(res_females %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
spec_females %>% mutate(ranking_value = gender_specific_score) %>%
......@@ -621,7 +619,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gsgd_gdrscore_rankings.tsv")
write.table(merge(res_males %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
spec_males %>% mutate(ranking_value = gender_specific_score) %>%
......@@ -639,7 +637,7 @@ for (i in seq_len(length(config$integrations))) {
# gender-dimorphic genes only.
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gsgd_combinedscores_rankings.tsv")
n <- dim(res_females)[1]
write.table(merge(res_females %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
......@@ -655,7 +653,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis, n)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gsgd_combinedscores_rankings.tsv")
n <- dim(res_males)[1]
write.table(merge(res_males %>% select(Gene, log_fold_change, P_value, adj_P_value, pi_value),
......@@ -671,7 +669,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_males_fn, k, limma_analysis, n)
rm(GS_females, GS_males)
rm(potGD, res_females, res_males, spec_females, spec_males)
rm(l, selection, vsn, j, integration)
rm(l, selection, integration)
} # End for each integration scheme.
# We clean the workspace and log the session details for reproducibility.
......
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --time=0-00:15:00
#SBATCH --time=0-00:20:00
#SBATCH -p batch
#SBATCH --qos=normal
......@@ -31,7 +31,7 @@ Rscript --vanilla ${CODE_FOLDER}/merge_and_filter_rankings.R > ${OUTPUT_FOLDER}m
Rscript --vanilla ${CODE_FOLDER}/refine_GD_rankings.R > ${OUTPUT_FOLDER}refine_rankings_log.out 2> ${OUTPUT_FOLDER}refine_rankings_log.err
# Additional file that contains the genes that have different probe mappings between the male and female analyses.
diff <(sort -k2,2 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_females_matching_data.tsv | cut -f 2,12,15,18,21,24) <(sort -k2,2 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_males_matching_data.tsv | cut -f 2,12,15,21,24,27) | grep '>' | cut -f 1 | sed -r "s/> //g" > ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_females_vs_males_matching_data_problems.tsv
diff <(sort -k3,3 ${OUTPUT_FOLDER}SNage_PDVsControl_females_matching_data.tsv | cut -f 3,16,19,22,25,28) <(sort -k3,3 ${OUTPUT_FOLDER}SNage_PDVsControl_males_matching_data.tsv | cut -f 3,16,19,25,28,31) | grep '>' | cut -f 1 | sed -r "s/> //g" > ${OUTPUT_FOLDER}SNage_PDVsControl_females_vs_males_matching_data_problems.tsv
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
......@@ -35,9 +35,7 @@ for (i in seq_len(length(config$integrations))) {
# We get the integration name for that scheme.
integration <- config$integrations[[i]]
# Default parameters for the VSN and probe selection method.
j <- 1
vsn <- config$variance_methods[[j]]
# Default parameters for the probe selection method.
l <- 1
selection <- config$selections[[l]]
......@@ -46,7 +44,7 @@ for (i in seq_len(length(config$integrations))) {
#################################################################################################
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gd_pivalue_rankings.tsv")
GD_females <- read.table(res_females_fn,
sep = "\t",
......@@ -54,7 +52,7 @@ for (i in seq_len(length(config$integrations))) {
rm(res_females_fn, k, limma_analysis)
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gd_pivalue_rankings.tsv")
GD_males <- read.table(res_males_fn,
sep = "\t",
......@@ -69,7 +67,7 @@ for (i in seq_len(length(config$integrations))) {
arrange(desc(ranking_value))
k <- 5
limma_analysis <- config$limma_analyses[[k]]
res_females_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_females_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gd_pivalueref_rankings.tsv")
write.table(GD_FM, res_females_fn,
quote = FALSE,
......@@ -83,14 +81,14 @@ for (i in seq_len(length(config$integrations))) {
arrange(desc(ranking_value))
k <- 6
limma_analysis <- config$limma_analyses[[k]]
res_males_fn <- paste0(output_data_dir, integration$name, "_", vsn$name, "_", limma_analysis$name,
res_males_fn <- paste0(output_data_dir, integration$name, "_", limma_analysis$name,
"_", selection$name, "_gd_pivalueref_rankings.tsv")
write.table(GD_MF, res_males_fn,