Commit 94852040 authored by Leon-Charles Tranchevent
Browse files

Code refactoring (VSN configuration / optimization / cleaning).

parent 4415e972
......@@ -4,6 +4,6 @@ CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/01-Quality_contro
# Housekeeping and job-submission targets.
# NOTE(review): some recipe lines below come in pairs that differ only by a
# '/' after the folder variable; this looks like the removed and added lines
# of a diff shown together -- confirm which one of each pair is live.
# clean: remove entries whose names end in '~' from the directory make runs in.
clean:
@rm -rf *~
# clean_outputs: remove what is stored under OUTPUT_FOLDER.
# The '${OUTPUT_FOLDER}*' form only stays inside the folder when the variable
# ends with '/'; otherwise it also matches siblings sharing the same prefix.
# If the variable is empty, the two globs become '/*' and '*'.
clean_outputs:
@rm -rf ${OUTPUT_FOLDER}/*
@rm -rf ${OUTPUT_FOLDER}*
# run_qc: submit run_qc.sh (located in CODE_FOLDER) with sbatch.
run_qc:
@sbatch ${CODE_FOLDER}/run_qc.sh
@sbatch ${CODE_FOLDER}run_qc.sh
......@@ -29,18 +29,15 @@ create_variables ../Confs/datasets_config.yml
# Actual jobs
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<=$nbDatasets; i++ ))
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
if [ "${datasetName}" != "" ]
then
echo "== Job $i started (${datasetName}) =="
rm -rf ${OUTPUT_FOLDER}/${datasetName}/
mkdir ${OUTPUT_FOLDER}/${datasetName}/
Rscript --vanilla ${CODE_FOLDER}/quality_control.R ${datasetName} > ${OUTPUT_FOLDER}/${datasetName}/quality_control_log.out 2> ${OUTPUT_FOLDER}/${datasetName}/quality_control_log.err
echo "== Job $i ended (${datasetName}) =="
fi
echo "== Job $i started (${datasetName}) =="
rm -rf ${OUTPUT_FOLDER}${datasetName}/
mkdir ${OUTPUT_FOLDER}${datasetName}/
Rscript --vanilla ${CODE_FOLDER}quality_control.R ${datasetName} > ${OUTPUT_FOLDER}${datasetName}/quality_control_log.out 2> ${OUTPUT_FOLDER}${datasetName}/quality_control_log.err
echo "== Job $i ended (${datasetName}) =="
done
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
......@@ -4,14 +4,14 @@ CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/02-Preprocessing/
# Housekeeping and job-submission targets.
# NOTE(review): most recipe lines below come in pairs that differ only by a
# '/' after the folder variable; this looks like the removed and added lines
# of a diff shown together -- confirm which one of each pair is live.
# clean: remove entries whose names end in '~' from the directory make runs in.
clean:
@rm -rf *~
# clean_outputs: remove what is stored under OUTPUT_FOLDER.
# The '${OUTPUT_FOLDER}*' form only stays inside the folder when the variable
# ends with '/'; otherwise it also matches siblings sharing the same prefix.
# If the variable is empty, the two globs become '/*' and '*'.
clean_outputs:
@rm -rf ${OUTPUT_FOLDER}/*
@rm -rf ${OUTPUT_FOLDER}*
# preprocess: submit preprocess.sh (located in CODE_FOLDER) with sbatch.
preprocess:
@sbatch ${CODE_FOLDER}/preprocess.sh
@sbatch ${CODE_FOLDER}preprocess.sh
# vsn: submit stabilize_variance.sh with sbatch.
vsn:
@sbatch ${CODE_FOLDER}/stabilize_variance.sh
@sbatch ${CODE_FOLDER}stabilize_variance.sh
# vsn_raw: submit stabilize_variance_raw.sh with sbatch.
vsn_raw:
@sbatch ${CODE_FOLDER}/stabilize_variance_raw.sh
@sbatch ${CODE_FOLDER}stabilize_variance_raw.sh
# get_log: submit get_log.sh with sbatch.
get_log:
@sbatch ${CODE_FOLDER}/get_log.sh
@sbatch ${CODE_FOLDER}get_log.sh
# doc: submit doc.sh with sbatch.
doc:
@sbatch ${CODE_FOLDER}/doc.sh
@sbatch ${CODE_FOLDER}doc.sh
......@@ -24,124 +24,118 @@ create_variables ../Confs/datasets_config.yml
create_variables ../Confs/project_config.yml
# Clean start
rm -rf ${OUTPUT_FOLDER}/results_summary.*
rm -rf ${OUTPUT_FOLDER}results_summary.*
# Print header
echo '\documentclass[]{article}' > ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\usepackage{graphicx}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\title{GeneDER - step 02 - Pre-processing}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\author{Leon-Charles Tranchevent}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\begin{document}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\maketitle' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\textsl{}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\begin{abstract}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'This document summarizes the results of the step 02-Preprocessing, in which raw data are pre-processed.' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'In particular, this document focuses on the necessity (or not) to stabilize the variance (using vsn). \\' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'Note: this document is automatically generated.' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{abstract}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\documentclass[]{article}' > ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\usepackage{graphicx}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\title{GeneDER - step 02 - Pre-processing}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\author{Leon-Charles Tranchevent}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\begin{document}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\maketitle' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\textsl{}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\begin{abstract}' >> ${OUTPUT_FOLDER}results_summary.tex
echo 'This document summarizes the results of the step 02-Preprocessing, in which raw data are pre-processed.' >> ${OUTPUT_FOLDER}results_summary.tex
echo 'In particular, this document focuses on the necessity (or not) to stabilize the variance (using vsn). \\' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo 'Note: this document is automatically generated.' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{abstract}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
# Mean vs SD plots for all datasets.
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<=$nbDatasets; i++ ))
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
if [ "${datasetName}" != "" ]
# If possible, we display the results on the raw data.
if [ -f ${OUTPUT_FOLDER}${datasetName}/${datasetName}_raw_meansd_ranks.png ]
then
# If possible, we display the results on the raw data.
if [ -f ${OUTPUT_FOLDER}${datasetName}/${datasetName}_raw_meansd_ranks.png ]
then
# Only mean vs SD plots here.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${datasetName}"'_raw_meansd_ranks.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${datasetName}"'_raw_meansd_vals.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${datasetName}"'_raw_meansd_ranks_vsn.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${datasetName}"'_raw_meansd_vals_vsn.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \caption{Mean vs sd plots for the '"$datasetName"' dataset (on raw data). (Top) Before applying variance stabilization.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' (Bottom) After applying variance stabilization. (Left) Ranked based plots. (Right) Intensity based plots.}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
fi
# Only mean vs SD plots here.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${datasetName}"'_raw_meansd_ranks.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${datasetName}"'_raw_meansd_vals.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${datasetName}"'_raw_meansd_ranks_vsn.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${datasetName}"'_raw_meansd_vals_vsn.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Mean vs sd plots for the '"$datasetName"' dataset (on raw data). (Top) Before applying variance stabilization.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom) After applying variance stabilization. (Left) Ranked based plots. (Right) Intensity based plots.}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# For each normalization method.
nbNorms=${#normalizations__name[@]}
for (( j=0; j<$nbNorms; j++ ))
do
normName=${normalizations__name[$j]}
# By default, first no batch correction.
# Start with the mean vs SD plots.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_nobatchcorrection_meansd_ranks.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_nobatchcorrection_meansd_vals.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_nobatchcorrection_meansd_ranks_vsn.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_nobatchcorrection_meansd_vals_vsn.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \caption{Mean vs sd plots for the '"$datasetName"' dataset ('"${normName}"', no batch correction). (Top) Before applying variance stabilization.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' (Bottom) After applying variance stabilization. (Left) Ranked based plots. (Right) Intensity based plots.}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
# Then the scatterplot.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.25]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_nobatchcorrection_data_vs_vsn.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \caption{Scatter plot between the original data (pre-processed) and the same data after variance stabilization for' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' the '"$datasetName"' dataset ('"${normName}"', no batch correction).}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
# echo '\clearpage' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
# If possible, second with batch correction.
if [ -f ${OUTPUT_FOLDER}${datasetName}/${normName}/${datasetName}_normalized_${normName}_batchcorrection_meansd_ranks.png ]
then
# Start with the mean vs SD plots.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_batchcorrection_meansd_ranks.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_batchcorrection_meansd_vals.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_batchcorrection_meansd_ranks_vsn.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_batchcorrection_meansd_vals_vsn.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \caption{Mean vs sd plots for the '"$datasetName"' dataset ('"${normName}"', batch correction). (Top) Before applying variance stabilization.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' (Bottom) After applying variance stabilization. (Left) Ranked based plots. (Right) Intensity based plots.}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
# Then the scatterplot.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.25]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_batchcorrection_data_vs_vsn.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \caption{Scatter plot between the original data (pre-processed) and the same data after variance stabilization for' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' the '"$datasetName"' dataset ('"${normName}"', batch correction).}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\clearpage' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
fi
# For each normalization method.
nbNorms=${#normalizations__name[@]}
for (( j=0; j<=$nbNorms; j++ ))
do
normName=${normalizations__name[$j]}
if [ "${normName}" != "" ]
then
# By default, first no batch correction.
# Start with the mean vs SD plots.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_nobatchcorrection_meansd_ranks.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_nobatchcorrection_meansd_vals.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_nobatchcorrection_meansd_ranks_vsn.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_nobatchcorrection_meansd_vals_vsn.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Mean vs sd plots for the '"$datasetName"' dataset ('"${normName}"', no batch correction). (Top) Before applying variance stabilization.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom) After applying variance stabilization. (Left) Ranked based plots. (Right) Intensity based plots.}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Then the scatterplot.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.25]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_nobatchcorrection_data_vs_vsn.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Scatter plot between the original data (pre-processed) and the same data after variance stabilization for' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' the '"$datasetName"' dataset ('"${normName}"', no batch correction).}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# echo '\clearpage' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# If possible, second with batch correction.
if [ -f ${OUTPUT_FOLDER}${datasetName}/${normName}/${datasetName}_normalized_${normName}_batchcorrection_meansd_ranks.png ]
then
# Start with the mean vs SD plots.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_batchcorrection_meansd_ranks.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_batchcorrection_meansd_vals.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_batchcorrection_meansd_ranks_vsn.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.28]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_batchcorrection_meansd_vals_vsn.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Mean vs sd plots for the '"$datasetName"' dataset ('"${normName}"', batch correction). (Top) Before applying variance stabilization.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' (Bottom) After applying variance stabilization. (Left) Ranked based plots. (Right) Intensity based plots.}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
# Then the scatterplot.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.25]{'"$OUTPUT_FOLDER"''"${datasetName}"'/'"${normName}"'/'"${datasetName}"'_normalized_'"${normName}"'_nobatchcorrection_data_vs_vsn.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Scatter plot between the original data (pre-processed) and the same data after variance stabilization for' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' the '"$datasetName"' dataset ('"${normName}"', no batch correction).}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\clearpage' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
fi
fi
done
fi
done
done
# Print footer
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{document}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{document}' >> ${OUTPUT_FOLDER}results_summary.tex
# Compilation
pdflatex -synctex=1 -interaction=nonstopmode ${OUTPUT_FOLDER}/results_summary.tex
mv results_summary.pdf ${OUTPUT_FOLDER}/
pdflatex -synctex=1 -interaction=nonstopmode ${OUTPUT_FOLDER}results_summary.tex
mv results_summary.pdf ${OUTPUT_FOLDER}
rm results_summary*
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
......@@ -30,17 +30,17 @@ create_variables ../Confs/datasets_config.yml
# extract_data_from_log DATASET
# Condenses the preprocessing logs of one dataset into a single TSV file.
#   $1  dataset name; also the sub-folder of OUTPUT_FOLDER that holds its logs.
# Reads  preprocessing_log.out and preprocessing_log.err in that sub-folder.
# Writes scanlog.tsv in the same sub-folder; the preprocessing_log_short*.tsv
# files are intermediates and are removed at the end.
# OUTPUT_FOLDER is not set in this function; it comes from the calling script.
# NOTE(review): every command appears twice, with and without a '/' after
# ${OUTPUT_FOLDER}; this looks like the removed and added lines of a diff shown
# together -- confirm which one of each pair is live.
function extract_data_from_log() {
# Step 1: array names. Keep the 'Reading' lines, take the 10th '/'-separated
# field, prefix it with the dataset name and a tab, and keep only entries
# matching 'cel' (case-insensitive).
# NOTE(review): field 10 presumably is the file name; this depends on the
# depth of the paths printed in the log -- confirm.
grep -i 'Reading' ${OUTPUT_FOLDER}/$1/preprocessing_log.out | cut -f 10 -d '/' | sed -r 's/^/'"$1"'\t/g' | grep -i cel > ${OUTPUT_FOLDER}/$1/preprocessing_log_short0.tsv
grep -i 'Reading' ${OUTPUT_FOLDER}$1/preprocessing_log.out | cut -f 10 -d '/' | sed -r 's/^/'"$1"'\t/g' | grep -i cel > ${OUTPUT_FOLDER}$1/preprocessing_log_short0.tsv
# Step 2: final model (number of iterations and proportion of background
# probes). Keep space-separated fields 3 and 9 of the 'iterations' lines,
# strip 'limit' plus the three characters after it, and turn spaces into tabs.
grep -i 'iterations' ${OUTPUT_FOLDER}/$1/preprocessing_log.err | cut -f 3,9 -d ' ' | sed -r 's/limit...//g' | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}/$1/preprocessing_log_short1.tsv
grep -i 'iterations' ${OUTPUT_FOLDER}$1/preprocessing_log.err | cut -f 3,9 -d ' ' | sed -r 's/limit...//g' | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}$1/preprocessing_log_short1.tsv
# Step 3: convergence value of the last iteration (c). Look at the line just
# before each 'iterations' line, keep it if it contains 'Attempting', take
# space-separated fields 4 and 7, drop commas, and turn spaces into tabs.
grep -B 1 -i 'iterations' ${OUTPUT_FOLDER}/$1/preprocessing_log.err | grep Attempting | cut -f 4,7 -d ' ' | sed -r 's/,//g' | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}/$1/preprocessing_log_short2.tsv
grep -B 1 -i 'iterations' ${OUTPUT_FOLDER}$1/preprocessing_log.err | grep Attempting | cut -f 4,7 -d ' ' | sed -r 's/,//g' | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}$1/preprocessing_log_short2.tsv
# Step 4: combine everything into one file per dataset and clean up the
# temporary files. paste joins the files column-wise; the glob expands in
# sorted order, so the columns come from short0, short1, short2 in that order.
paste ${OUTPUT_FOLDER}/$1/preprocessing_log_short*.tsv > ${OUTPUT_FOLDER}/$1/scanlog.tsv
rm -rf ${OUTPUT_FOLDER}/$1/preprocessing_log_short*.tsv
paste ${OUTPUT_FOLDER}$1/preprocessing_log_short*.tsv > ${OUTPUT_FOLDER}$1/scanlog.tsv
rm -rf ${OUTPUT_FOLDER}$1/preprocessing_log_short*.tsv
}
# We extract data for all datasets.
......@@ -53,11 +53,11 @@ do
done
# We concatenate across datasets.
echo 'dataset sample nb_iterations prop_back last_iteration convergence' > ${OUTPUT_FOLDER}/SCANlog_combined.tsv
cat ${OUTPUT_FOLDER}/*/scanlog.tsv >> ${OUTPUT_FOLDER}/SCANlog_combined.tsv
echo 'dataset sample nb_iterations prop_back last_iteration convergence' > ${OUTPUT_FOLDER}SCANlog_combined.tsv
cat ${OUTPUT_FOLDER}*/scanlog.tsv >> ${OUTPUT_FOLDER}SCANlog_combined.tsv
# We create a plot based on the collected log.
Rscript --vanilla ${CODE_FOLDER}/plot_scan_log.R > ${OUTPUT_FOLDER}/plot_log.out 2> ${OUTPUT_FOLDER}/plot_log.err
Rscript --vanilla ${CODE_FOLDER}plot_scan_log.R > ${OUTPUT_FOLDER}plot_log.out 2> ${OUTPUT_FOLDER}plot_log.err
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
......@@ -33,17 +33,14 @@ create_variables ../Confs/datasets_config.yml
# II- General workflow with all processing methods.
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<=$nbDatasets; i++ ))
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
if [ "${datasetName}" != "" ]
then
echo "== Job $i started (${datasetName}) =="
rm -rf ${OUTPUT_FOLDER}${datasetName}/
mkdir ${OUTPUT_FOLDER}${datasetName}/
Rscript --vanilla ${CODE_FOLDER}preprocess.R ${datasetName} > ${OUTPUT_FOLDER}${datasetName}/preprocessing_log.out 2> ${OUTPUT_FOLDER}${datasetName}/preprocessing_log.err
echo "== Job $i ended (${datasetName}) =="
fi
echo "== Job $i started (${datasetName}) =="
rm -rf ${OUTPUT_FOLDER}${datasetName}/
mkdir ${OUTPUT_FOLDER}${datasetName}/
Rscript --vanilla ${CODE_FOLDER}preprocess.R ${datasetName} > ${OUTPUT_FOLDER}${datasetName}/preprocessing_log.out 2> ${OUTPUT_FOLDER}${datasetName}/preprocessing_log.err
echo "== Job $i ended (${datasetName}) =="
done
# Moving the slurm log file to data
......
......@@ -2,7 +2,6 @@
# I/Os and parameters
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/02/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/02-Preprocessing/
# Load configuration
source ../libs/conf/confSH.sh
......@@ -11,49 +10,42 @@ create_variables ../Confs/project_config.yml
create_variables ../Confs/platforms_config.yml
# We set up the results folder that will contain all files for all datasets.
rm -rf ${OUTPUT_FOLDER}/apt_gcrma
mkdir ${OUTPUT_FOLDER}/apt_gcrma
rm -rf ${OUTPUT_FOLDER}apt_gcrma
mkdir ${OUTPUT_FOLDER}apt_gcrma
# We run the APT-GCRMA for all datasets (or at least all the Affymetrix ones).
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<=$nbDatasets; i++ ))
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
if [ "${datasetName}" != "" ]
platformName=${datasets__platform[$i]}
if [ "${platformName}" == "Affymetrix" ]
then
platformName=${datasets__platform[$i]}
if [ "${platformName}" == "Affymetrix" ]
then
# We set up the temporary folder for that run.
rm -rf ${OUTPUT_FOLDER}/apt_gcrma_temp
mkdir ${OUTPUT_FOLDER}/apt_gcrma_temp
# We get the correct CDF file based on the dataset / platform configurations.
datasetArrayType=${datasets__array_type[$i]}
cdfName="NA"
nbPlatforms=${#platforms__platform_name[@]}
for (( j=0; j<=$nbPlatforms; j++ ))
do
platformName=${platforms__platform_name[$j]}
if [ "${platformName}" == "${datasetArrayType}" ]
then
cdfName=${platforms__cdf_name[$j]}
fi
done
# We prepare and run the APT command.
apt_cmd="${apt_script} -a ${apt_script_method} -d ${global_raw_data_dir}Platforms/${cdfName} -o ${OUTPUT_FOLDER}apt_gcrma_temp ${global_raw_data_dir}${datasetName}/RAW/*"
eval "$apt_cmd"
# We rename and copy the result file to the real apt folder.
mv ${OUTPUT_FOLDER}apt_gcrma_temp/gc-correction.scale-intensities.rma-bg.quant-norm.pm-only.med-polish.summary.txt ${OUTPUT_FOLDER}apt_gcrma/${datasetName}.tsv
# We clean up the temporary folder for that run.
rm -rf ${OUTPUT_FOLDER}/apt_gcrma_temp
fi
# We set up the temporary folder for that run.
rm -rf ${OUTPUT_FOLDER}apt_gcrma_temp
mkdir ${OUTPUT_FOLDER}apt_gcrma_temp
# We get the correct CDF file based on the dataset / platform configurations.
datasetArrayType=${datasets__array_type[$i]}
cdfName="NA"
nbPlatforms=${#platforms__platform_name[@]}
for (( j=0; j<$nbPlatforms; j++ ))
do
platformName=${platforms__platform_name[$j]}
if [ "${platformName}" == "${datasetArrayType}" ]
then
cdfName=${platforms__cdf_name[$j]}
fi
done
# We prepare and run the APT command.
apt_cmd="${apt_script} -a ${apt_script_method} -d ${global_raw_data_dir}Platforms/${cdfName} -o ${OUTPUT_FOLDER}apt_gcrma_temp ${global_raw_data_dir}${datasetName}/RAW/*"
eval "$apt_cmd"
# We rename and copy the result file to the real apt folder.
mv ${OUTPUT_FOLDER}apt_gcrma_temp/gc-correction.scale-intensities.rma-bg.quant-norm.pm-only.med-polish.summary.txt ${OUTPUT_FOLDER}apt_gcrma/${datasetName}.tsv
# We clean up the temporary folder for that run.
rm -rf ${OUTPUT_FOLDER}apt_gcrma_temp
fi
done
done
\ No newline at end of file
......@@ -28,16 +28,13 @@ create_variables ../Confs/datasets_config.yml
# Actual jobs
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<=$nbDatasets; i++ ))
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
if [ "${datasetName}" != "" ]
then
echo "== Job $i started (${datasetName}) =="
Rscript --vanilla ${CODE_FOLDER}/stabilize_variance.R ${datasetName} > ${OUTPUT_FOLDER}/${datasetName}/vsn_log.out 2> ${OUTPUT_FOLDER}/${datasetName}/vsn_log.err
echo "== Job $i ended (${datasetName}) =="
fi
echo "== Job $i started (${datasetName}) =="
Rscript --vanilla ${CODE_FOLDER}stabilize_variance.R ${datasetName} > ${OUTPUT_FOLDER}${datasetName}/vsn_log.out 2> ${OUTPUT_FOLDER}${datasetName}/vsn_log.err
echo "== Job $i ended (${datasetName}) =="
done
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
......@@ -28,16 +28,13 @@ create_variables ../Confs/datasets_config.yml
# Actual jobs
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<=$nbDatasets; i++ ))
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
if [ "${datasetName}" != "" ]
then
echo "== Job $i started (${datasetName}) =="
Rscript --vanilla ${CODE_FOLDER}/stabilize_variance_raw.R ${datasetName} > ${OUTPUT_FOLDER}/${datasetName}/vsn_raw_log.out 2> ${OUTPUT_FOLDER}/${datasetName}/vsn_raw_log.err
echo "== Job $i ended (${datasetName}) =="
fi
echo "== Job $i started (${datasetName}) =="
Rscript --vanilla ${CODE_FOLDER}stabilize_variance_raw.R ${datasetName} > ${OUTPUT_FOLDER}${datasetName}/vsn_raw_log.out 2> ${OUTPUT_FOLDER}${datasetName}/vsn_raw_log.err
echo "== Job $i ended (${datasetName}) =="
done
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
......@@ -4,9 +4,9 @@ CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/03-Predict_missin
# Housekeeping and job-submission targets.
# NOTE(review): most recipe lines below come in pairs that differ only by a
# '/' after the folder variable; this looks like the removed and added lines
# of a diff shown together -- confirm which one of each pair is live.
# clean: remove entries whose names end in '~' from the directory make runs in.
clean:
@rm -rf *~
# clean_outputs: remove what is stored under OUTPUT_FOLDER.
# The '${OUTPUT_FOLDER}*' form only stays inside the folder when the variable
# ends with '/'; otherwise it also matches siblings sharing the same prefix.
# If the variable is empty, the two globs become '/*' and '*'.
clean_outputs:
@rm -rf ${OUTPUT_FOLDER}/*
@rm -rf ${OUTPUT_FOLDER}*
# predict: submit predict_gender.sh and predict_age.sh with sbatch.
predict:
@sbatch ${CODE_FOLDER}/predict_gender.sh
@sbatch ${CODE_FOLDER}/predict_age.sh
@sbatch ${CODE_FOLDER}predict_gender.sh
@sbatch ${CODE_FOLDER}predict_age.sh
# doc: submit doc.sh with sbatch.
doc:
@sbatch ${CODE_FOLDER}/doc.sh
@sbatch ${CODE_FOLDER}doc.sh
......@@ -24,94 +24,88 @@ create_variables ../Confs/datasets_config.yml
create_variables ../Confs/project_config.yml
# Clean start
rm -rf ${OUTPUT_FOLDER}/results_summary.*
rm -rf ${OUTPUT_FOLDER}results_summary.*
# Print header
echo '\documentclass[]{article}' > ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\usepackage{graphicx}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\title{GeneDER - step 03 - Gender prediction}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\author{Leon-Charles Tranchevent}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\begin{document}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\maketitle' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\textsl{}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\begin{abstract}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'This document summarizes the results of the step 03-Predict\_missing, in which gender and age missing values are predicted.' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'For age, the scatter plot of the real versus predicted values is displayed. For gender, heatmaps of the expression' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'signal of Y-chromosome probes are displayed for all datasets. In addition, clustering results are displayed on top of' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'the heatmaps to see if samples with shared gender are clustered together. Last, signal from' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'Y-chromosome probes versus signal from X-chromosome probes is are also plotted. Blue samples are' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'females, green are males, grey are unknown (missing data - to be predicted). \\' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo 'Note: this document is automatically generated.' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{abstract}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\documentclass[]{article}' > ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\usepackage{graphicx}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\title{GeneDER - step 03 - Gender prediction}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\author{Leon-Charles Tranchevent}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\begin{document}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\maketitle' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\textsl{}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\begin{abstract}' >> ${OUTPUT_FOLDER}results_summary.tex
echo 'This document summarizes the results of the step 03-Predict\_missing, in which gender and age missing values are predicted.' >> ${OUTPUT_FOLDER}results_summary.tex
echo 'For age, the scatter plot of the real versus predicted values is displayed. For gender, heatmaps of the expression' >> ${OUTPUT_FOLDER}results_summary.tex
echo 'signal of Y-chromosome probes are displayed for all datasets. In addition, clustering results are displayed on top of' >> ${OUTPUT_FOLDER}results_summary.tex
echo 'the heatmaps to see if samples with shared gender are clustered together. Last, signal from' >> ${OUTPUT_FOLDER}results_summary.tex
echo 'Y-chromosome probes versus signal from X-chromosome probes is are also plotted. Blue samples are' >> ${OUTPUT_FOLDER}results_summary.tex
echo 'females, green are males, grey are unknown (missing data - to be predicted). \\' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
echo 'Note: this document is automatically generated.' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{abstract}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
# Age prediction.
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \includegraphics[scale=0.32]{'"$OUTPUT_FOLDER"'global_age_predictions.png}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' \caption{Scatter plots (per dataset) of the known age versus the predicted age.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}/results_summary.tex
echo ' Lines indicate the perfect match (solid line) or the 5- and 10-year intervals (dashed lines).}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '' >> ${OUTPUT_FOLDER}/results_summary.tex
echo '\begin{figure}[ht]' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \centering' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \includegraphics[scale=0.32]{'"$OUTPUT_FOLDER"'global_age_predictions.png}' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' \caption{Scatter plots (per dataset) of the known age versus the predicted age.' | sed -r 's/_/\\_/g' >> ${OUTPUT_FOLDER}results_summary.tex
echo ' Lines indicate the perfect match (solid line) or the 5- and 10-year intervals (dashed lines).}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '\end{figure}' >> ${OUTPUT_FOLDER}results_summary.tex
echo '' >> ${OUTPUT_FOLDER}results_summary.tex
# Gender prediction.
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<=$nbDatasets; i++ ))
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
if [ "${datasetName}" != "" ]
then
# For each normalization method.
nbNorms=${#normalizations__name[@]}
for (( j=0; j<=$nbNorms; j++ ))
do