Commit 76793668 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Step 06: started to adapt to new configuration and new datasets.

parent 5489a1ec
......@@ -8,6 +8,7 @@ library("readr")
library("tidyverse")
library("ArrayUtils")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
......@@ -66,56 +67,6 @@ plot_age_dist <- function(D, split_by, output_figure_file,
dev.off()
}
#' @title Reduces a set to a given size (only to make it smaller).
#'
#' @description This function accepts a set of numeric values and a given desired size.
#' It then creates a new set of the desired size that can be considered as a reduction
#' of the original set (it will try to keep the same properties (min, max, mean)).
#' The function will either select values from the original array or create new values from
#' the original values by combining them (linear interpolation between the two neighbors).
#'
#' Example: large_set <- c(10, 20, 30, 40, 50, 60)
#'          desired_size <- 4
#'          large_new_indexes <- c(1, 2.67, 4.33, 6)
#'          large_set_selection <- c(10, 26.67, 43.33, 60)
#'          26.67 is 20 + (30 - 20) * 0.67
#' @param large_set A vector of numeric values to be reduced (at least one value, unless
#' desired_size is 0).
#' @param desired_size The length of the array after reduction (a single non-negative number).
#' A value of 0 returns an empty numeric vector; a value of 1 returns the first value
#' of large_set.
#' @return The reduced array of the desired size with numeric values.
reduce_set <- function(large_set, desired_size) {
  stopifnot(is.numeric(desired_size),
            length(desired_size) == 1L,
            !is.na(desired_size),
            desired_size >= 0)

  # Nothing to select: return an empty numeric vector (and avoid the seq(1, 0) pitfall).
  if (desired_size == 0) {
    return(numeric(0))
  }

  set_size <- length(large_set)
  if (set_size == 0) {
    stop("reduce_set: 'large_set' must contain at least one value.", call. = FALSE)
  }

  # These are the raw scaled indexes. These can be integers as well as
  # floats and are refined below. A single requested value cannot be scaled
  # (the scaling factor would be a division by zero), so we simply take the first index.
  if (desired_size == 1) {
    large_newindexes <- 1
  } else {
    mult_factor <- (set_size - 1) / (desired_size - 1)
    large_newindexes <- 1 + (seq_len(desired_size) - 1) * mult_factor
  }

  # We now refine the new indexes. The integer part points to the left neighbor and the
  # decimal part is the weight given to the right neighbor. If the index is an integer,
  # the weight is 0 and we pick the exact value from the original set.
  lower_indexes <- trunc(large_newindexes)
  weights <- large_newindexes - lower_indexes

  # The last value has no right neighbor, in which case it is its own neighbor.
  upper_indexes <- pmin(lower_indexes + 1, set_size)

  first_values <- large_set[lower_indexes]
  second_values <- large_set[upper_indexes]
  return(first_values + (second_values - first_values) * weights)
}
# ================================================================================================
# Main
# ================================================================================================
......
......@@ -26,10 +26,6 @@ if (length(args) > 0) {
}
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
# ================================================================================================
# Main
# ================================================================================================
......
......@@ -20,7 +20,7 @@ CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/05-Get_DEGs/
# Load configuration
source ../libs/conf/confSH.sh
create_variables ../Confs/dataset
create_variables ../Confs/datasets_config.yml
# Clean start
rm -rf ${OUTPUT_FOLDER}/results_summary.*
......
......@@ -14,6 +14,7 @@ library("hgfocus.db")
library("ArrayUtils")
library("tidyverse")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
......@@ -33,20 +34,6 @@ if (length(args) > 0) {
}
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' @title Collapses a set of values into a single pipe-separated string.
#'
#' @description Utility to collapse duplicate rows of a data-frame by
#' keeping a single instance of a key column and concatenating the
#' unique values of the other columns with a pipe.
#' The values are de-duplicated and sorted before being concatenated.
#' It only relies on base R, so that it does not require the magrittr pipe to be attached.
#' @param x A vector of values to collapse.
#' @return A single string with the sorted unique values of x separated by "|"
#' (NA values are dropped by the sorting step, an empty input gives an empty string).
collapser <- function(x) {
  unique_sorted_values <- sort(unique(x))
  return(paste(unique_sorted_values, collapse = "|"))
}
# ================================================================================================
# Main
# ================================================================================================
......@@ -83,7 +70,7 @@ for (i in seq_len(length(config$datasets))) {
gene_annots_raw <- ArrayUtils::get_gene_annots_from_package(platform_config$library_name,
rownames(exp_eset))
} else {
# TODO: here read instead the sorted GPL file for special case.
# Here, we read instead the sorted GPL file for special cases with no Biomart library.
gpl_annot_folder <- paste0(raw_data_dir, "Platforms/")
gpl_annot_filename <- paste0(platform_config$geo_name, "_gene_annots.tsv")
gene_annots_raw <- ArrayUtils::get_gene_annots_from_file(gpl_annot_folder,
......
......@@ -14,5 +14,5 @@ integrate:
analyse:
@sbatch ${CODE_FOLDER}/analyse.sh
doc:
@/bin/bash ${CODE_FOLDER}/doc_a.sh
@/bin/bash ${CODE_FOLDER}/doc_b.sh
@sbatch ${CODE_FOLDER}/doc_a.sh
@sbatch ${CODE_FOLDER}/doc_b.sh
......@@ -2,13 +2,13 @@
The objective of this step is to integrate the results of the differential expression analysis across several datasets in order to identify similarities and overlaps and to identify robust DEGs.
# Details and instructions
The datasets / meta-datasets are first summarized at the gene level (limma analyses are performed at the probe level). Conflicts and non unique mappings are handled to create a unique list of DEGS.
The datasets are first summarized at the gene level (limma analyses are performed at the probe level). Conflicts and non-unique mappings are handled to create a unique list of DEGs.
```
make clean_outputs
make match
make summarize
```
The results are list of DEGs (instead of differentially expressed probes) with NA for the genes taht are not present in some of the datasets.
The results are lists of DEGs (instead of differentially expressed probes) with NA for the genes that are not present in some of the datasets.
The integration itself is then computed and results are analyzed.
```
......@@ -16,10 +16,10 @@ make integrate
make analyse
```
A document that contains all figures can then be generated (locally, not on the HPC).
A document that contains all figures can then be generated.
```
make doc
```
# Prerequisites
A prerequisite is to have the results of the limma analysis for all datasets / meta-datasets (Step 05).
A prerequisite is to have the results of the limma analysis for all datasets (Step 05).
......@@ -15,13 +15,12 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
# Loading modules.
module load lang/R/3.4.4-intel-2018a-X11-20180131-bare
mu
module load lang/R/3.6.0-foss-2018a-X11-20180131-bare
# Actual jobs
Rscript --vanilla ${CODE_FOLDER}/analyse_integration_results.R > ${OUTPUT_FOLDER}/analyse_log.out 2> ${OUTPUT_FOLDER}/analyse_log.err
......
This diff is collapsed.
#!/bin/bash -l
#SBATCH -J geneder:06:docb
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --time=0-00:01:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# I/Os and parameters
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
ANNEX=/home/leon/Projects/GeneDER/Documents/WorkReport/Annexes
# Limma analyses
SCHEMES=(regular meta)
ANALYSES=(FemaleVsMale FemaleVsMale_PD FemaleVsMale_control PDVsControl PDVsControl_females PDVsControl_males Disease_status_gender Gender_disease_status)
DATASETS=(GSE20163 GSE20164 GSE20292 GSE8397 Simunovic GSE20141 GSE7307 GSE7621 E.MEXP.1416 HG.U133A HG.U133_Plus_2)
# Load configuration
source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# Clean start
rm -rf ${OUTPUT_FOLDER}/results_summary.*
......@@ -53,100 +67,148 @@ echo ' \end{center}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{table}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
for i in "${SCHEMES[@]}"
# For each integration scheme.
nbSchemes=${#integrations__name[@]}
for (( i=0; i<=$nbSchemes; i++ ))
do
for j in "${ANALYSES[@]}"
do
# Overlap table.
echo '\begin{table}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \begin{center}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \setlength{\tabcolsep}{0.5\tabcolsep}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \resizebox{\linewidth}{!}{\pgfplotstabletypeset[' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' col sep=tab,' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' every head row/.style={' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' before row={\toprule},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' after row={\midrule}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' },' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' every last row/.style={after row=\bottomrule},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/0/.style={string type,column type={l}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/1/.style={string type,column type={r}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/2/.style={string type,column type={r}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/3/.style={string type,column type={r}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/4/.style={string type,column type={r}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/5/.style={string type,column type={r}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/6/.style={string type,column type={r}}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' ]{'"$OUTPUT_FOLDER"''"$i"'_'"$j"'_overlap.tsv}}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Significant gene overlap between the different configurations for integration scheme \textit{'"$i"'}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' and Limma analysis \textit{'"$j"'}. (Upper triangle) Raw counts.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Lower triangle) Percentage of the overlap with respect to the smallest set of the two.' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' NOTE: Row headers contain the set sizes.}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \end{center}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{table}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
done
integrationName=${integrations__name[$i]}
if [ "${integrationName}" != "" ]
then
# For each Limma analysis.
nbAnalyses=${#limma_analyses__names[@]}
for (( j=0; j<=$nbAnalyses; j++ ))
do
analysisNames=${limma_analyses__names[$j]}
if [ "${analysisNames}" != "" ]
then
analysisNamesClean=$(echo $analysisNames | sed -r 's/[\",\[]+//g' | sed -r 's/\]//g')
allComparisons=(${analysisNamesClean})
# For each Limma comparison.
nbComparisons=${#allComparisons[@]}
for (( k=0; k<=$nbComparisons; k++ ))
do
comparisonName=${allComparisons[$k]}
if [ "${comparisonName}" != "" ]
then
# Overlap table.
echo '\begin{table}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \begin{center}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \setlength{\tabcolsep}{0.5\tabcolsep}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \resizebox{\linewidth}{!}{\pgfplotstabletypeset[' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' col sep=tab,' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' every head row/.style={' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' before row={\toprule},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' after row={\midrule}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' },' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' every last row/.style={after row=\bottomrule},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/0/.style={string type,column type={l}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/1/.style={string type,column type={r}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/2/.style={string type,column type={r}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/3/.style={string type,column type={r}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/4/.style={string type,column type={r}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/5/.style={string type,column type={r}},' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' display columns/6/.style={string type,column type={r}}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' ]{'"$OUTPUT_FOLDER"''"${integrationName}"'_'"${comparisonName}"'_overlap.tsv}}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Significant gene overlap between the different configurations for integration scheme \textit{'"${integrationName}"'}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' and Limma analysis \textit{'"${comparisonName}"'}. (Upper triangle) Raw counts.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Lower triangle) Percentage of the overlap with respect to the smallest set of the two.' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' NOTE: Row headers contain the set sizes.}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \end{center}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{table}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
fi
done
fi
done
fi
done
echo '\clearpage' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Use counts and top hits.
for i in "${SCHEMES[@]}"
# For each integration scheme.
nbSchemes=${#integrations__name[@]}
for (( i=0; i<=$nbSchemes; i++ ))
do
for j in "${ANALYSES[@]}"
do
# Use counts.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.42]{'"$OUTPUT_FOLDER"''"$i"'_'"$j"'_avg_use_counts.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.42]{'"$OUTPUT_FOLDER"''"$i"'_'"$j"'_pval_use_counts.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Dataset use counts for integration scheme \textit{'"$i"'}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' and Limma analysis \textit{'"$j"'}. Each bar represents the number of times each dataset was used' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' for integration (only when the corresponding fold change is of the same sign than the median fold' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' change across datasets). (Top) Gene probes selected based on the highest average expression (AVG).' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom) Gene probes selected based on the best P value (PVAL).' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Left) All genes considered.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Right) Only significant genes.}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Up vs Down bias (via ECDFs).
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.42]{'"$OUTPUT_FOLDER"''"$i"'_'"$j"'_eCDFs.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Empirical cumulative distribution functions (ECDF) of P values for integration scheme \textit{'"$i"'}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' and Limma analysis \textit{'"$j"'}. Only the avg Marot-Mayer P values are plotted.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' Genes are split according to whether they are up- or down- regulated.}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Top 10 hits for this configuration (if done - otherwise nothing).
LL=(${OUTPUT_FOLDER}/${i}_${j}_geneplot_*)
for k in "${LL[@]}"
integrationName=${integrations__name[$i]}
if [ "${integrationName}" != "" ]
then
# For each Limma analysis.
nbAnalyses=${#limma_analyses__names[@]}
for (( j=0; j<=$nbAnalyses; j++ ))
do
if [[ $k =~ "png" ]]
analysisNames=${limma_analyses__names[$j]}
if [ "${analysisNames}" != "" ]
then
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.42]{'"$k"'}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{One of the top hit for integration scheme \textit{'"$i"'}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' and Limma analysis \textit{'"$j"'}. Expression values are plotted for each dataset (subplots)' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' and for each relevant clinical descriptor (colors).' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' NOTE: Gene probes selected based on the highest average expression (AVG).}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
fi
analysisNamesClean=$(echo $analysisNames | sed -r 's/[\",\[]+//g' | sed -r 's/\]//g')
allComparisons=(${analysisNamesClean})
# For each Limma comparison.
nbComparisons=${#allComparisons[@]}
for (( k=0; k<=$nbComparisons; k++ ))
do
comparisonName=${allComparisons[$k]}
if [ "${comparisonName}" != "" ]
then
# Use counts.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.42]{'"$OUTPUT_FOLDER"''"${integrationName}"'_'"${comparisonName}"'_avg_use_counts.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.42]{'"$OUTPUT_FOLDER"''"${integrationName}"'_'"${comparisonName}"'_pval_use_counts.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Dataset use counts for integration scheme \textit{'"${integrationName}"'}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' and Limma analysis \textit{'"${comparisonName}"'}. Each bar represents the number of times each dataset was used' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' for integration (only when the corresponding fold change is of the same sign than the median fold' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' change across datasets). (Top) Gene probes selected based on the highest average expression (AVG).' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom) Gene probes selected based on the best P value (PVAL).' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Left) All genes considered.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Right) Only significant genes.}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Up vs Down bias (via ECDFs).
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.42]{'"$OUTPUT_FOLDER"''"${integrationName}"'_'"${comparisonName}"'_eCDFs.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Empirical cumulative distribution functions (ECDF) of P values for integration scheme \textit{'"${integrationName}"'}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' and Limma analysis \textit{'"${comparisonName}"'}. Only the avg Marot-Mayer P values are plotted.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' Genes are split according to whether they are up- or down- regulated.}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Top 10 hits for this configuration (if done - otherwise nothing).
LL=(${OUTPUT_FOLDER}/${integrationName}_${comparisonName}_geneplot_*)
for l in "${LL[@]}"
do
if [[ $l =~ "png" ]]
then
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.42]{'"$l"'}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{One of the top hit for integration scheme \textit{'"${integrationName}"'}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' and Limma analysis \textit{'"${comparisonName}"'}. Expression values are plotted for each dataset (subplots)' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' and for each relevant clinical descriptor (colors).' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' NOTE: Gene probes selected based on the highest average expression (AVG).}' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
fi
done
echo '\clearpage' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
fi
done
fi
done
echo '\clearpage' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
done
fi
done
# Print footer
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{document}' >> ${OUTPUT_FOLDER}/results_summary.tex
# # Compilation
# Compilation
pdflatex -synctex=1 -interaction=nonstopmode ${OUTPUT_FOLDER}/results_summary.tex
cp results_summary.pdf ${ANNEX}/06b_summary_results.pdf
mv results_summary.pdf ${OUTPUT_FOLDER}/results_summary_p2.pdf
mv results_summary.pdf ${OUTPUT_FOLDER}/results_summary_b.pdf
rm results_summary*
rm ${OUTPUT_FOLDER}/results_summary.tex
\ No newline at end of file
rm ${OUTPUT_FOLDER}/results_summary.tex
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
......@@ -19,22 +19,29 @@ INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
DATASETS=(GSE20163 GSE20164 GSE20292 GSE8397 Simunovic GSE20141 GSE7307 GSE7621 E-MEXP-1416 HG-U133A HG-U133_Plus_2)
# Loading modules.
module load lang/R/3.4.4-intel-2018a-X11-20180131-bare
mu
module load lang/R/3.6.0-foss-2018a-X11-20180131-bare
# Load configuration
source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# Preparing the job.
rm -rf ${OUTPUT_FOLDER}/clinical_categories_summarized.tsv
for i in "${DATASETS[@]}"
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<=$nbDatasets; i++ ))
do
cut -f 2 ${INPUT_FOLDER}${i}_clinical_clean.tsv | grep -v Disease | sort | uniq -c | sed 's/^\s*//' | sed -r 's/\s/\t/g' | awk -v OFS="\t" -F"\t" '{print $2, $1}' | sed -r 's/^/'"$i"'\t/g' >> ${OUTPUT_FOLDER}/clinical_categories_summarized.tsv
cut -f 3 ${INPUT_FOLDER}${i}_clinical_clean.tsv | grep -v Gender | sort | uniq -c | sed 's/^\s*//' | sed -r 's/\s/\t/g' | awk -v OFS="\t" -F"\t" '{print $2, $1}' | sed -r 's/^/'"$i"'\t/g' >> ${OUTPUT_FOLDER}/clinical_categories_summarized.tsv
datasetName=${datasets__dataset_name[$i]}
if [ "${datasetName}" != "" ]
then
cut -f 2 ${INPUT_FOLDER}${datasetName}_clinical_clean.tsv | grep -v Disease | sort | uniq -c | sed 's/^\s*//' | sed -r 's/\s/\t/g' | awk -v OFS="\t" -F"\t" '{print $2, $1}' | sed -r 's/^/'"${datasetName}"'\t/g' >> ${OUTPUT_FOLDER}/clinical_categories_summarized.tsv
cut -f 3 ${INPUT_FOLDER}${datasetName}_clinical_clean.tsv | grep -v Gender | sort | uniq -c | sed 's/^\s*//' | sed -r 's/\s/\t/g' | awk -v OFS="\t" -F"\t" '{print $2, $1}' | sed -r 's/^/'"${datasetName}"'\t/g' >> ${OUTPUT_FOLDER}/clinical_categories_summarized.tsv
fi
done
# Actual job
Rscript --vanilla ${CODE_FOLDER}/integrate_datasets.R > ${OUTPUT_FOLDER}/integrate_log.out 2> ${OUTPUT_FOLDER}/integrate_log.err
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
......@@ -50,7 +50,7 @@ count_relevant_fc <- function(x) {
}
}
# Function that perform the integration according to the paper suggested by EG.
# Function that performs the integration according to the paper suggested by EG.
integrate_pvals <- function(pvals, nb_samples, method = "marot.mayer") {
# We select only the P values to combine (not NA).
......
......@@ -2,23 +2,24 @@ local_input_data_dir: !!str '05/'
local_data_dir: !!str '06/'
local_code_dir: !!str '06-Data_integration/'
nb_min_pval: 3
matching:
- ['GSE20141', 'HGU133Plus2']
- ['GSE20163', 'HGU133A']
- ['E-MEXP-1416', 'U133_X3P']
dataset_groups:
-
- 'regular'
- ['GSE20141', 'GSE20163', 'GSE20164', 'GSE20292', 'GSE7307', 'GSE7621', 'GSE8397', 'Simunovic', 'E-MEXP-1416']
-
- 'meta'
- ['HG-U133A', 'HG-U133_Plus_2', 'E-MEXP-1416']
analyses:
- ['FemaleVsMale', FALSE, 'All']
- ['FemaleVsMale_control', TRUE, 'Control']
- ['FemaleVsMale_PD', TRUE, 'PD']
- ['Gender_disease_status', TRUE, 'All']
- ['PDVsControl', FALSE, 'All']
- ['PDVsControl_females', TRUE, 'F']
- ['PDVsControl_males', TRUE, 'M']
- ['Disease_status_gender', TRUE, 'All']
max_gene_annots: 5
# matching:
# - ['GSE20141', 'HGU133Plus2']
# - ['GSE20163', 'HGU133A']
# - ['E-MEXP-1416', 'U133_X3P']
# dataset_groups:
# -
# - 'regular'
# - ['GSE20141', 'GSE20163', 'GSE20164', 'GSE20292', 'GSE7307', 'GSE7621', 'GSE8397', 'Simunovic', 'E-MEXP-1416']
# -
# - 'meta'
# - ['HG-U133A', 'HG-U133_Plus_2', 'E-MEXP-1416']
# analyses:
# - ['FemaleVsMale', FALSE, 'All']
# - ['FemaleVsMale_control', TRUE, 'Control']
# - ['FemaleVsMale_PD', TRUE, 'PD']
# - ['Gender_disease_status', TRUE, 'All']
# - ['PDVsControl', FALSE, 'All']
# - ['PDVsControl_females', TRUE, 'F']
# - ['PDVsControl_males', TRUE, 'M']
# - ['Disease_status_gender', TRUE, 'All']
......@@ -19,7 +19,8 @@ OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
# Loading modules.
module load lang/R/3.4.4-intel-2018a-X11-20180131-bare
mu
module load lang/R/3.6.0-foss-2018a-X11-20180131-bare
# Actual job
Rscript --vanilla ${CODE_FOLDER}/match_probes.R > ${OUTPUT_FOLDER}/match_log.out 2> ${OUTPUT_FOLDER}/match_log.err
......
This diff is collapsed.
......@@ -19,7 +19,8 @@ OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
# Loading modules.
module load lang/R/3.4.4-intel-2018a-X11-20180131-bare
mu
module load lang/R/3.6.0-foss-2018a-X11-20180131-bare
# Actual job
Rscript --vanilla ${CODE_FOLDER}/summarize_gene_level.R > ${OUTPUT_FOLDER}/summarize_log.out 2> ${OUTPUT_FOLDER}/summarize_log.err
......
dataset_name platform array_type clinical_descriptors has_paired_samples has_batches cleaning has_age suitable_for_factorial_analysis
GSE20141 Affymetrix HGU133Plus2 D FALSE TRUE FALSE TRUE TRUE
GSE20163 Affymetrix HGU133A D FALSE FALSE FALSE TRUE TRUE
GSE20164 Affymetrix HGU133A DG FALSE FALSE FALSE TRUE TRUE
GSE20292 Affymetrix HGU133A DG FALSE FALSE FALSE TRUE TRUE
GSE7307 Affymetrix HGU133Plus2 DG FALSE TRUE FALSE FALSE FALSE
GSE7621 Affymetrix HGU133Plus2 DG FALSE FALSE FALSE FALSE TRUE
GSE8397 Affymetrix HGU133A DG TRUE FALSE FALSE TRUE TRUE
Simunovic Affymetrix HGU133A DG FALSE TRUE FALSE TRUE TRUE
GSE24378 Affymetrix U133_X3P D FALSE TRUE FALSE TRUE TRUE
GSE20333 Affymetrix HG-Focus DG FALSE FALSE FALSE TRUE TRUE
Moreira Agilent G4112F DG FALSE FALSE FALSE TRUE TRUE
GSE20159 Illumina HumanV3 DG FALSE FALSE FALSE TRUE TRUE
GSE26927 Illumina HumanV2 DGA FALSE FALSE TRUE TRUE TRUE
GSE49036 Affymetrix HGU133Plus2 D FALSE FALSE FALSE TRUE TRUE
\ No newline at end of file
dataset_name platform array_type clinical_descriptors has_paired_samples has_batches cleaning has_age suitable_for_factorial_analysis tissue
GSE20141 Affymetrix HGU133Plus2 D FALSE TRUE FALSE TRUE TRUE DA
GSE20163 Affymetrix HGU133A D FALSE FALSE FALSE TRUE TRUE SN
GSE20164 Affymetrix HGU133A DG FALSE FALSE FALSE TRUE TRUE SN
GSE20292 Affymetrix HGU133A DG FALSE FALSE FALSE TRUE TRUE SN
GSE7307 Affymetrix HGU133Plus2 DG FALSE TRUE FALSE FALSE FALSE SN
GSE7621 Affymetrix HGU133Plus2 DG FALSE FALSE FALSE FALSE TRUE SN
GSE8397 Affymetrix HGU133A DG TRUE FALSE FALSE TRUE TRUE SN
Simunovic Affymetrix HGU133A DG FALSE TRUE FALSE TRUE TRUE DA
GSE24378 Affymetrix U133_X3P D FALSE TRUE FALSE TRUE TRUE DA
GSE20333 Affymetrix HG-Focus DG FALSE FALSE FALSE TRUE TRUE SN
Moreira Agilent G4112F DG FALSE FALSE FALSE TRUE TRUE SN
GSE20159 Illumina HumanV3 DG FALSE FALSE FALSE TRUE TRUE SN
GSE26927 Illumina HumanV2 DGA FALSE FALSE TRUE TRUE TRUE SN
GSE49036 Affymetrix HGU133Plus2 D FALSE FALSE FALSE TRUE TRUE SN
\ No newline at end of file
......@@ -9,6 +9,7 @@ datasets:
cleaning: 'FALSE'