Commit ec0926c4 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Step 06: continued with the refactoring of the code.

parent 76793668
......@@ -58,8 +58,8 @@ for (i in seq_len(length(config$datasets))) {
# Cleaning.
remove(raw_data_subdir, output_data_subdir)
}
}
} # End if dataset should be analysed.
} # End for each dataset.
# We log the session details for reproducibility.
sessionInfo()
......@@ -63,8 +63,8 @@ for (i in seq_len(length(config$datasets))) {
raw_data_subdir,
output_data_subdir,
phenotype_groups = dataset_groups)
}
}
} # End if dataset should be analysed.
} # End for each dataset.
# We log the session details for reproducibility.
sessionInfo()
......@@ -146,8 +146,8 @@ for (i in seq_len(length(config$datasets))) {
# Plot of the Y vs X signal.
plot_filename <- paste0(output_data_dir, dataset_name, "_plot_YvsX.png")
plot_expr_ratios(exp_data_xonly, exp_data_yonly, pheno_data, plot_filename)
}
}
} # End if dataset should be analysed.
} # End for each dataset.
# We log the session details for reproducibility.
sessionInfo()
......@@ -165,7 +165,7 @@ for (i in seq_len(length(config$datasets))) {
min_y = min_age_across_datasets,
max_y = max_age_across_datasets)
message(paste0("[", Sys.time(), "][", dataset_name, "] Plots created."))
}
} # End for each dataset.
# Summary plot of the Average Age Differences for all datasets and the two
# comparisons of interest.
......
......@@ -97,8 +97,8 @@ for (i in seq_len(length(config$datasets))) {
quote = FALSE,
col.names = NA)
message(paste0("[", Sys.time(), "][", dataset_name, "] Data saved."))
}
}
} # End if dataset should be analysed.
} # End for each dataset.
# We log the session details for reproducibility.
sessionInfo()
......@@ -135,9 +135,9 @@ for (i in seq_len(length(config$datasets))) {
}
message(paste0("[", Sys.time(), "][", dataset_name, "][", j, "] DEG extraction done."))
}
}
}
}
} # End for each Limma analysis.
} # End if dataset should be analysed.
} # End for each dataset.
# We log the session details for reproducibility.
sessionInfo()
......@@ -44,4 +44,4 @@ done
Rscript --vanilla ${CODE_FOLDER}/integrate_datasets.R > ${OUTPUT_FOLDER}/integrate_log.out 2> ${OUTPUT_FOLDER}/integrate_log.err
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
This diff is collapsed.
......@@ -167,7 +167,7 @@ for (i in seq_len(length(config$platforms))) {
deps_annotated <- deps_annotated %>%
rowwise() %>%
mutate(nb_symbols = 1 + str_count(SYMBOL, fixed("|")))
# We do not consider the probes with more than X genes in any case.
# X is defined in the local configuration file (default to 5).
deps_annotated <- deps_annotated %>%
......@@ -233,9 +233,7 @@ for (i in seq_len(length(config$platforms))) {
quote = FALSE,
col.names = NA)
remove(deps_matching_details_fn)
}
# TODO: The above produce strnage results (with NAs) for i == 5, G4112F.
} # End for each platform.
# We do an additional run of cleaning to remove the probes conflicting across datasets.
# The same rules apply but the different matches come from different datasets.
......@@ -253,9 +251,11 @@ for (i in seq_len(length(config$platforms))) {
header = TRUE,
as.is = TRUE,
row.names = 1)
# We replace the NA values.
deps_matching_loc[is.na(deps_matching_loc)] <- ""
deps_matching <- rbind(deps_matching, deps_matching_loc %>%
mutate(source = platform$platform_name))
}
} # End for each platform.
remove(deps_matching_loc_fn, deps_matching_loc)
message(paste0("[", Sys.time(), "][Combined] Data ready."))
......@@ -289,13 +289,12 @@ global_matching_data <- list()
# We clean again all array types one by one.
# This time by considering the cross-platform conflicts (based on the
# maps that were just built).
# We do all platforms.
for (i in seq_len(length(config$platforms))) {
# We get the first dataset using that platform.
platform <- config$platforms[[i]]
matching_dataset <- get_dataset_from_platform(config$datasets, platform$platform_name)
# We set the I/Os.
deps_matching_details_fn <- paste0(output_data_dir,
platform$platform_name,
......@@ -309,6 +308,9 @@ for (i in seq_len(length(config$platforms))) {
as.is = TRUE,
row.names = 1)
# We replace the NA values.
deps_matching[is.na(deps_matching)] <- ""
# We remove the probes that target a set of genes if there is a probe that targets
# one of its subset (possibly from another array type).
# Ex: [hgu133a] 1_at -> to a|b
......@@ -362,18 +364,23 @@ for (i in seq_len(length(config$platforms))) {
mutate(probesets = paste(unlist(strsplit(probesets, ", ")), collapse = "|"))
global_matching_data[[i]] <- deps_matching %>%
rename(!!platform$platform_name := probesets)
}
} # End for each platform.
# We merge the dataframes, create one global matching dataframe and save it.
deps_matching_fn <- paste0(output_data_dir, "Combined_probe_matching.tsv")
global_matching_data_merged <- merge(x = global_matching_data[[1]],
y = merge(x = global_matching_data[[2]],
y = global_matching_data[[3]],
by = "genes",
all = TRUE),
by = "genes", all = TRUE)
# TODO: The above statment has to be replaced by a loop over all platforms (or directly concatenate from the above loop using the same strategy than before).
global_matching_data_merged <- NULL
for (i in seq_len(length(config$platforms))) {
if (i == 1) {
global_matching_data_merged <- global_matching_data[[1]]
} else {
global_matching_data_merged <- merge(x = global_matching_data_merged,
y = global_matching_data[[i]],
by = "genes",
all = TRUE)
}
}
# We save the merged data-frame.
write.table(global_matching_data_merged,
file = deps_matching_fn,
sep = "\t",
......
This diff is collapsed.
integrations:
-
name: SN
criteria: tissue;SN
-
name: DA
criteria: tissue;DA
limma_analyses:
-
factor: Gender
coefficients: ["F - M"]
names: ["FemaleVsMale"]
factorial: FALSE
-
factor: Disease.status
coefficients: ["PD - Control"]
names: ["PDVsControl"]
factorial: FALSE
-
factor: gender_disease_status
coefficients: ["F.PD - F.Control", "M.PD - M.Control", "(F.PD - F.Control) - (M.PD - M.Control)"]
names: ["PDVsControl_females", "PDVsControl_males", "Gender_disease_status"]
factorial: TRUE
-
factor: gender_disease_status
coefficients: ["F.PD - M.PD", "F.Control - M.Control", "(F.PD - M.PD) - (F.Control - M.Control)"]
names: ["FemaleVsMale_PD", "FemaleVsMale_control", "Disease_status_gender"]
factorial: TRUE
global_raw_data_dir: !!str "/home/users/ltranchevent/Data/GeneDER/Original/"
global_data_dir: !!str "/home/users/ltranchevent/Data/GeneDER/Analysis/"
global_code_dir: !!str "/home/users/ltranchevent/Projects/GeneDER/Analysis/"
# Limma analyses and comparisons.
limma_analyses:
-
factor: Gender
coefficients: ["F - M"]
names: ["FemaleVsMale"]
factorial: FALSE
sample_groups: ["All"]
-
factor: Disease.status
coefficients: ["PD - Control"]
names: ["PDVsControl"]
factorial: FALSE
sample_groups: ["All"]
-
factor: gender_disease_status
coefficients: ["F.PD - F.Control", "M.PD - M.Control", "(F.PD - F.Control) - (M.PD - M.Control)"]
names: ["PDVsControl_females", "PDVsControl_males", "Gender_disease_status"]
factorial: TRUE
sample_groups: ["F", "M", "All"]
-
factor: gender_disease_status
coefficients: ["F.PD - M.PD", "F.Control - M.Control", "(F.PD - M.PD) - (F.Control - M.Control)"]
names: ["FemaleVsMale_PD", "FemaleVsMale_control", "Disease_status_gender"]
factorial: TRUE
sample_groups: ["PD", "Control", "All"]
# Integration schemes
integrations:
-
name: SN
criteria: tissue;SN
-
name: DA
criteria: tissue;DA
# Probe selection methods
selections:
-
short_name: avg
long_name: best_avg
-
short_name: pval
long_name: best_pval
......@@ -28,7 +28,7 @@ up_hpc:
@make up_hpc_data
install_lib:
# ii
mu
# mu
module load lang/R/3.6.0-foss-2018a-X11-20180131-bare
@Rscript --vanilla ${FOLDER}/libs/utils/install.R
up_annex:
......
......@@ -3,6 +3,7 @@
# Necessary libraries
library("tidyverse")
library("survcomp")
#' @title Collapse rows of a data-frame.
#'
......@@ -121,4 +122,78 @@ get_all_subsets <- function(array_as_str, sep0 = "|", sep1 = ",", sep2 = "|") {
} else {
return("")
}
}
\ No newline at end of file
}
#' @title Combination of several P values into a global P value.
#'
#' @description This functions integrates a set of P values to compute a single
#' global P value. The integration is performed using either a method implemented
#' in combine.test (Z-transform, Fisher) or the Marot-Mayer method suggested by EG.
#'
#' @param pvals The array of P values to combine. NA values will be ignored.
#' @param nb_samples The number of samples of the datasets for each of these P values.
#' @param method The name of the method to use for the integration. Default to
#' "marot.mayer".
#' @return The integrated P value.
integrate_pvals <- function(pvals, nb_samples, method = "marot.mayer") {
# We select only the P values to combine (not NA).
pvals_clean <- pvals[!is.na(pvals)]
nb_samples_clean <- nb_samples[!is.na(pvals)]
# Depending on the method, we use combine.test or not.
int_pval <- NULL
if (method == "marot.mayer") {
# We compute the Z scores of the P values.
zscores <- qnorm(pvals_clean, lower.tail = FALSE)
# We normalize the weights.
weights <- sqrt(nb_samples_clean / sum(nb_samples_clean))
# We compute the weighted sum and its pnorm (aka P value).
int_pval <- 2 * (pnorm(sum(weights * zscores), lower.tail = FALSE))
int_pval <- 2 * (pnorm(sum(weights * zscores), lower.tail = FALSE))
int_pval <- ifelse(int_pval > 1, 1, int_pval)
} else {
# Else, we fall back on combine.test.
int_pval <- 2 * combine.test(pvals_clean, weight = nb_samples_clean, method = method)
int_pval <- ifelse(int_pval > 1, 1, int_pval)
}
# We return the P value of the integration.
return(int_pval)
}
#' @title Identify strongest signal in an array of values.
#'
#' @description This function takes an array of values and determine whether most values
#' are positives or negatives in order to determine the strongest signal. It does that by
#' computing the median across all values and by using the sign of the median as the
#' strongest signal. Using the median nistead of the average or even the number of
#' positive/negative values circumvents the problem of extremes values (a single large
#' positive value with a majority of negative values, or a majority of small negative
#' values and a minority of positive values). The function also computes the average of
#' the strongest signal.
#'
#' It can be used for intance to compute the number of fold changes that are of the
#' same sign than the median fold change. In this case, it also returns the average
#' over relevant fold changes.
#'
#' @param x A list of values. Possibly with NA values.
#' @return A vector of two values corresponding to, first, the number of original values
#' that are of the same sign as the median of the values and, second, the average of
#' these values.
identify_strongest_signal <- function(x) {
med <- median(unlist(x), na.rm = TRUE)
nb_na <- sum(is.na(x))
nb_pos <- length(x[x >= 0]) - nb_na
avg_pos <- mean(x[x >= 0], na.rm = TRUE)
nb_neg <- length(x[x < 0]) - nb_na
avg_neg <- mean(x[x < 0], na.rm = TRUE)
if (med >= 0) {
return(c(nb_pos, avg_pos))
} else {
return(c(nb_neg, avg_neg))
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment