Commit 50bd863c authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Switched to Rlib (step 6).

parent 2ef0fd75
......@@ -87,7 +87,7 @@ for (i in seq_len(length(row.names(datasets_config)))) {
# We update the clinical information if necessary.
if (length(sample_update_list) > 0) {
for (j in 1:length(sample_update_list)) {
for (j in seq_len(length(sample_update_list))) {
update_info <- strsplit(sample_update_list[j], ";")[[1]]
pheno_data_clean[update_info[1], update_info[2]] <- as.factor(update_info[3])
}
......
......@@ -69,7 +69,8 @@ do
echo ' (Middle center) Status based analysis (PD patients vs controls) for females' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Middle right) Status based analysis (PD patients vs controls) for males' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom left) First factorial analysis (focus on disease status differences for the different genders).' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom right) Second factorial analysis (focus on gender differences for the different disease status).}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom right) Second factorial analysis (focus on gender differences for the different disease status).' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' Note: the last two plots should be exactly the same.}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Venn diagrams.
......@@ -77,8 +78,8 @@ do
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$i"'_venn_FemaleVsMale.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$i"'_venn_PDVsControl.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$i"'_venn_Gender_disease_status.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$i"'_venn_Disease_status_gender.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.35]{'"$OUTPUT_FOLDER"''"$i"'_venn_Gender_disease_status.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Venn diagrams of the significant genes for all Limma analyses and for dataset '"$i"'.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top left) Gender based analysis (Females vs males)' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Top right) Status based analysis (PD patients vs controls)' >> ${OUTPUT_FOLDER}/results_summary.tex
......
......@@ -4,15 +4,11 @@
# Libraries
# ================================================================================================
library("yaml")
library("zeallot")
library("Biobase")
#library("limma")
library("hgu133plus2.db")
library("hgu133a.db")
library("u133x3p.db")
#library("AnnotationDbi")
library("tidyverse")
#library("statmod")
library("ArrayUtils")
message(paste0("[", Sys.time(), "] Libraries loaded."))
......@@ -22,7 +18,6 @@ message(paste0("[", Sys.time(), "] Libraries loaded."))
options(bitmapType = "cairo")
global_config <- yaml::read_yaml("../global_config.yml")
local_config <- yaml::read_yaml("./local_config.yml")
raw_data_dir <- global_config$global_raw_data_dir
input_data_dir <- paste0(global_config$global_data_dir, local_config$local_input_data_dir)
output_data_dir <- paste0(global_config$global_data_dir, local_config$local_data_dir)
datasets_raw <- as.matrix(unlist(global_config$datasets))
......
......@@ -9,3 +9,5 @@ match:
@sbatch ${CODE_FOLDER}/match.sh
summarize:
@sbatch ${CODE_FOLDER}/summarize.sh
doc:
@/bin/bash ${CODE_FOLDER}/doc.sh
......@@ -8,7 +8,14 @@ make clean_outputs
make match
make summarize
```
The results are list of DEGs (instead of differentially expressed probes).
The results are list of DEGs (instead of differentially expressed probes) with NA for the genes taht are not present in some of the datasets.
TODO: do the real data integration.
A document that contains all figures can then be generated (locally, not on the HPC).
```
make doc
```
# Prerequisites
A prerequisite is to have the results of the limma analysis for all datasets / meta-datasets (Step 05).
# I/Os and parameters
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
ANNEX=/home/leon/Projects/GeneDER/Documents/WorkReport/Annexes
# Limma analyses
DATASETS=(regular meta)
ANALYSES=(FemaleVsMale FemaleVsMale_PD FemaleVsMale_control PDVsControl PDVsControl_females PDVsControl_males Disease_status_gender Gender_disease_status)
# Clean start
rm -rf ${OUTPUT_FOLDER}/results_summary.*
# Print header
echo '\documentclass[]{article}' > ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\usepackage{graphicx}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\title{GeneDER - step 06 - Data integration}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\author{Leon-Charles Tranchevent}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\begin{document}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\maketitle' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\textsl{}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\begin{abstract}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'This document summarizes the results of the step 06-Data\_integration. For each integration' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'scheme and for each limma analysis, the distributions of fold changes and p-values are plotted' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '(before data integration). TODO: add final plots after integration.\\' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'Note: this document is automatically generated.' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{abstract}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
for i in "${DATASETS[@]}"
do
for j in "${ANALYSES[@]}"
do
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.34]{'"$OUTPUT_FOLDER"''"$i"'_'"$j"'_fc_dist.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.34]{'"$OUTPUT_FOLDER"''"$i"'_'"$j"'_pval_dist.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Distributions of fold changes (left) and P values (right) for integration scheme \textit{'"$i"'}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' and Limma analysis \textit{'"$j"'}. One plot per dataset / meta-dataset.}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
done
done
# Print footer
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{document}' >> ${OUTPUT_FOLDER}/results_summary.tex
# Compilation
pdflatex -synctex=1 -interaction=nonstopmode ${OUTPUT_FOLDER}/results_summary.tex
cp results_summary.pdf ${ANNEX}/06_summary_results.pdf
mv results_summary.pdf ${OUTPUT_FOLDER}/
rm results_summary*
......@@ -13,12 +13,11 @@ dataset_groups:
- 'meta'
- ['HG-U133A', 'HG-U133_Plus_2', 'E-MEXP-1416']
analyses:
- ['gender', FALSE]
- ['gender_controls', TRUE]
- ['gender_patients', TRUE]
- ['gender_disease_status_factorial', TRUE]
- ['disease_status', FALSE]
- ['disease_status_males', TRUE]
- ['disease_status_females', TRUE]
- ['disease_status_gender_factorial', TRUE]
- ['FemaleVsMale', FALSE]
- ['FemaleVsMale_control', TRUE]
- ['FemaleVsMale_PD', TRUE]
- ['Gender_disease_status', TRUE]
- ['PDVsControl', FALSE]
- ['PDVsControl_females', TRUE]
- ['PDVsControl_males', TRUE]
- ['Disease_status_gender', TRUE]
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --time=0-1:00:00
#SBATCH --time=0-0:20:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
......@@ -24,3 +24,6 @@ module load lang/R/3.4.4-intel-2018a-X11-20180131-bare
# Actual job
Rscript --vanilla ${CODE_FOLDER}/match_probes.R > ${OUTPUT_FOLDER}/match_log.out 2> ${OUTPUT_FOLDER}/match_log.err
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
......@@ -15,6 +15,17 @@ global_config <- yaml::read_yaml("../global_config.yml")
local_config <- yaml::read_yaml("./local_config.yml")
input_data_dir <- paste0(global_config$global_data_dir, local_config$local_input_data_dir)
output_data_dir <- paste0(global_config$global_data_dir, local_config$local_data_dir)
datasets_raw <- as.matrix(unlist(global_config$datasets))
dim(datasets_raw) <- c(length(global_config$datasets[[1]]), length(global_config$datasets))
datasets_config <- data.frame(t(datasets_raw), row.names = datasets_raw[1, ]) %>%
select(X2, X3, X4, X5, X6, X7) %>%
rename(array_type = X2,
is_compressed = X3,
clinical_descriptors = X4,
is_meta_dataset = X5,
has_female_PD = X6,
has_paired_samples = X7)
remove(datasets_raw)
message(paste0("[", Sys.time(), "] Configuration read."))
# ================================================================================================
......@@ -32,7 +43,7 @@ subsets_size_n <- function(array, n, current = vector()) {
return(current)
} else {
results <- vector()
for (i in 1:length(array)) {
for (i in seq_len(length(array))) {
next_i <- i + 1
new_array <- vector()
if (next_i <= length(array)) {
......@@ -59,9 +70,9 @@ get_all_subsets <- function(array_as_str, sep0 = "|", sep1 = ",", sep2 = "|") {
if (grepl(sep0, array_as_str, fixed = TRUE)[1]) {
array <- unlist(strsplit(array_as_str, sep0, fixed = TRUE))
all_subsets <- vector()
for (i in 1:(length(array) - 1)) {
for (i in seq_len((length(array) - 1))) {
res <- subsets_size_n(array, i, vector())
for (j in 1:dim(res)[2]) {
for (j in seq_len(dim(res)[2])) {
all_subsets <- c(all_subsets, paste(res[1:i, j], sep = "", collapse = sep2))
}
}
......@@ -162,14 +173,15 @@ tag_probe_cmplx <- function(genes, subsets_as_string, current_conflict, sep = ",
# Main
# ================================================================================================
# We do all array types one by one.
for (i in 1:length(local_config$matching)) {
for (i in seq_len(length(local_config$matching))) {
# We get the dataset name for that matching analysis (only one).
matching_dataset <- local_config$matching[[i]]
# We set the I/Os.
# We read gender data but it does not matter since we only look at the probe and genes.
deps_gender_fn <- paste0(input_data_dir, matching_dataset[1], "_toptable_gender.tsv")
# We read gender data but it does not matter since we only look at the probe and genes,
# and not at the expression levels.
deps_gender_fn <- paste0(input_data_dir, matching_dataset[1], "_toptable_FemaleVsMale.tsv")
deps_matching_fn <- paste0(output_data_dir, matching_dataset[2], "_probe_matching.tsv")
deps_matching_details_fn <- paste0(output_data_dir, matching_dataset[2],
"_probe_matching_details.tsv")
......@@ -287,7 +299,7 @@ global_matching_data <- list()
# We clean again all array types one by one.
# This time by considering the cross-platform conflicts.
for (i in 1:length(local_config$matching)) {
for (i in seq_len(length(local_config$matching))) {
# We get the dataset name for that matching analysis (only one).
matching_dataset <- local_config$matching[[i]]
......
......@@ -23,3 +23,7 @@ module load lang/R/3.4.4-intel-2018a-X11-20180131-bare
# Actual job
Rscript --vanilla ${CODE_FOLDER}/summarize_gene_level.R > ${OUTPUT_FOLDER}/summarize_log.out 2> ${OUTPUT_FOLDER}/summarize_log.err
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
......@@ -7,8 +7,6 @@ library("yaml")
library("tidyverse")
library("ggplot2")
library("reshape2")
library("doParallel")
doParallel::registerDoParallel(3)
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
......@@ -73,23 +71,21 @@ select_best <- function(probes_lists, array_types) {
# ================================================================================================
# We do all analyses one by one.
# One analysis is, in part, defined by the datasets that need to be integrated.
for (i in 1:length(local_config$dataset_groups)) {
#foreach (i = 1:length(local_config$dataset_groups)) %dopar% {
for (i in seq_len(length(local_config$dataset_groups))) {
# We get the dataset names for that matching analysis.
integration_name <- local_config$dataset_groups[[i]][[1]]
datasets <- local_config$dataset_groups[[i]][[2]]
datasets <- local_config$dataset_groups[[i]][[2]]
# We have different limma analyses and we loop over these.
for (j in 1:length(local_config$analyses)) {
#foreach (j = 1:length(local_config$analyses)) %dopar% {
for (j in seq_len(length(local_config$analyses))) {
# We define the current focus.
analysis <- unlist(local_config$analyses[j])
datasets_data <- list()
# We read the limma data of all datasets.
for (k in 1:length(datasets)) {
for (k in seq_len(length(datasets))) {
# If the current analysis needs female.PD patients and the current dataset has not,
# we skip the dataset.
......@@ -108,14 +104,17 @@ for (i in 1:length(local_config$dataset_groups)) {
row.names = NULL,
as.is = TRUE)
rownames(datasets_data[[k]]) <- datasets_data[[k]]$X
datasets_data[[k]] <- datasets_data[[k]] %>% mutate(dataset = datasets[k], array_type = datasets_config[datasets[k], "array_type"])
message(paste0("[", Sys.time(), "][", integration_name, "][", analysis[1], "][", datasets[k], "] Limma data read."))
datasets_data[[k]] <- datasets_data[[k]] %>%
mutate(dataset = datasets[k], array_type = datasets_config[datasets[k], "array_type"])
message(paste0("[", Sys.time(), "][", integration_name, "][", analysis[1],
"][", datasets[k], "] Limma data read."))
} # End for each dataset.
# We create one large dataframe that contains all data.
datasets_data_df <- do.call(rbind, c(datasets_data, make.row.names = FALSE))
remove(datasets_data)
message(paste0("[", Sys.time(), "][", integration_name, "][", analysis[1], "] Expression data read."))
message(paste0("[", Sys.time(), "][", integration_name, "][", analysis[1],
"] Expression data read."))
# We plot the distributions of p-values and fold changes across all datasets.
fc_dist_before_fn <- paste0(output_data_dir, integration_name, "_", analysis[1], "_fc_dist.png")
......@@ -175,7 +174,7 @@ for (i in 1:length(local_config$dataset_groups)) {
remove(matched_expr_best_avg_hgu133a, matched_expr_best_avg_hgu133plus2, matched_expr_best_avg_u133x3p)
matched_expr_best_avg_wide <- dcast(matched_expr_best_avg, SYMBOL ~ dataset, value.var="P.Value")
# For the case where se select the probe with the best P value.
# For the case where we select the probe with the best P value.
matched_expr_best_pval_hgu133a <- merge(x =datasets_data_df, y = matching_data, by.x = "X", by.y = "HGU133A_best_pval") %>%
filter(array_type == "HGU133A") %>% select(SYMBOL, P.Value, dataset)
matched_expr_best_pval_hgu133plus2 <- merge(x =datasets_data_df, y = matching_data, by.x = "X", by.y = "HGU133Plus2_best_pval") %>%
......
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("lintr")
# ================================================================================================
# Main
# ================================================================================================
# Step 01
lint("../01-Quality_control/quality_control.R", linters = with_defaults(line_length_linter(100)))
# Step 02
lint("../02-Preprocessing/preprocess.R", linters = with_defaults(line_length_linter(100)))
lint("../02-Preprocessing/plot_scan_log.R", linters = with_defaults(line_length_linter(100)))
# Step 03
lint("../03-Predict_gender/predict_gender.R", linters = with_defaults(line_length_linter(100)))
# Step 04
lint("../04-Clean_datasets/clean_datasets.R", linters = with_defaults(line_length_linter(100)))
lint("../04-Clean_datasets/check_clinical_balance.R",
linters = with_defaults(line_length_linter(100)))
# Step 05
lint("../05-Get_DEGs/get_DEGs.R", linters = with_defaults(line_length_linter(100)))
# Self lintr
lint("./lint_r_scripts.R", linters = with_defaults(line_length_linter(100)))
......@@ -3,17 +3,17 @@ FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/
clean:
@rm -rf *~
up_loc_code:
@rsync -avzu iris:/home/users/ltranchevent/Projects/GeneDER/Analysis/ /home/leon/Projects/GeneDER/Analysis/
@rsync -vazur --delete iris:/home/users/ltranchevent/Projects/GeneDER/Analysis/ /home/leon/Projects/GeneDER/Analysis/
up_hpc_code:
@rsync -avzu /home/leon/Projects/GeneDER/Analysis/ iris:/home/users/ltranchevent/Projects/GeneDER/Analysis/
@rsync -vazur --delete /home/leon/Projects/GeneDER/Analysis/ iris:/home/users/ltranchevent/Projects/GeneDER/Analysis/
up_loc_lib:
@rsync -avzu iris:/home/users/ltranchevent/Projects/Rlibs/ /home/leon/Projects/Rlibs/
@rsync -vazur --delete iris:/home/users/ltranchevent/Projects/Rlibs/ /home/leon/Projects/Rlibs/
up_hpc_lib:
@rsync -avzu /home/leon/Projects/Rlibs/ iris:/home/users/ltranchevent/Projects/Rlibs/
@rsync -vazur --delete /home/leon/Projects/Rlibs/ iris:/home/users/ltranchevent/Projects/Rlibs/
up_loc_data:
@rsync -avzu iris:/home/users/ltranchevent/Data/GeneDER/ /home/leon/Data/GeneDER/
@rsync -vazur --delete iris:/home/users/ltranchevent/Data/GeneDER/ /home/leon/Data/GeneDER/
up_hpc_data:
@rsync -avzu /home/leon/Data/GeneDER/ iris:/home/users/ltranchevent/Data/GeneDER/
@rsync -vazur --delete /home/leon/Data/GeneDER/ iris:/home/users/ltranchevent/Data/GeneDER/
up_loc:
@make up_loc_code
@make up_loc_lib
......@@ -23,4 +23,6 @@ up_hpc:
@make up_hpc_lib
@make up_hpc_data
install_lib:
# ii
# module load lang/R/3.4.4-intel-2018a-X11-20180131-bare
@Rscript --vanilla ${FOLDER}/utils/install.R
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment