Commit fe85f14c authored by Leon-Charles Tranchevent
Browse files

Refactoring step 06 to process all datasets at once.

parent 96c62eee
......@@ -9,10 +9,6 @@ data:
@sbatch ${CODE_FOLDER}clean_datasets.sh
check:
@sbatch ${CODE_FOLDER}check.sh
names:
@sbatch ${CODE_FOLDER}get_names.sh
ps:
@sbatch ${CODE_FOLDER}create_probelists.sh
match:
@sbatch ${CODE_FOLDER}match_probes.sh
doc:
......
#!/bin/bash -l
#SBATCH -J geneder:04:probelists
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --time=0-00:05:00
#SBATCH -p batch
#SBATCH --qos=normal
#
# SLURM batch job: runs create_probelists.R, stores its stdout/stderr in
# OUTPUT_FOLDER, then moves the SLURM log file(s) found in CODE_FOLDER there.
# Exit status: the status of the R script if it failed, otherwise the status
# of the final log move.
# NOTE: the #SBATCH directives above must stay ahead of the first command.
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters (both folders end with a slash on purpose:
# file names are appended to them directly).
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/04-Prepare_datasets/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
# The redirections below fail if the output folder does not exist yet.
mkdir -p -- "${OUTPUT_FOLDER}"
# Actual job. The exit status is kept so that a failed R run is reported to
# SLURM instead of being masked by the final mv.
Rscript --vanilla "${CODE_FOLDER}create_probelists.R" \
  > "${OUTPUT_FOLDER}create_ps_log.out" \
  2> "${OUTPUT_FOLDER}create_ps_log.err"
status=$?
if (( status != 0 )); then
  echo "== create_probelists.R failed with status ${status}" >&2
fi
# Moving the slurm log file to data (the glob is left unquoted on purpose,
# only the folder prefix is quoted).
mv -- "${CODE_FOLDER}"slurm-*out "${OUTPUT_FOLDER}"
mv_status=$?
if (( status == 0 )); then
  status=${mv_status}
fi
exit "${status}"
#!/bin/bash -l
#SBATCH -J geneder:04:getnames
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:10:00
#SBATCH -p batch
#SBATCH --qos=normal
#
# SLURM batch job: writes one <platform>_genenames.tsv file per configured
# platform into OUTPUT_FOLDER. The mapping comes from Ensembl BioMart when the
# platform has a BioMart name, otherwise from a *_gene_official.tsv file in
# GEO_PLAFORM_FOLDER (named after the GEO name if set, else the platform name).
# Exit status: non-zero if the configuration could not be loaded, if any
# platform could not be processed, or if the final log move failed.
# NOTE: the #SBATCH directives above must stay ahead of the first command.
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters (folders end with a slash on purpose: file names
# are appended to them directly).
GEO_PLAFORM_FOLDER=/home/users/ltranchevent/Data/GeneDER/Original/Platforms/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/04-Prepare_datasets/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
# Load configuration. The paths are relative, so they resolve against the
# directory the job runs from. Without the helper no platform is known and the
# loop below would silently do nothing, hence the hard stop.
if ! source ../libs/conf/confSH.sh; then
  echo "== Cannot load ../libs/conf/confSH.sh" >&2
  exit 1
fi
# create_variables is not defined in this script; presumably it is provided by
# confSH.sh and exposes the YAML entries as the platforms__* arrays read below.
create_variables ../Confs/datasets_config.yml
create_variables ../Confs/platforms_config.yml
# The redirections below fail if the output folder does not exist yet.
mkdir -p -- "${OUTPUT_FOLDER}"
status=0
# Get the biomart data.
nbPlatforms=${#platforms__platform_name[@]}
for (( i = 0; i < nbPlatforms; i++ )); do
  platformName=${platforms__platform_name[$i]}
  platformBiomartName=${platforms__biomart_name[$i]}
  platformGEOName=${platforms__geo_name[$i]}
  genenamesFile="${OUTPUT_FOLDER}${platformName}_genenames.tsv"
  if [[ "${platformBiomartName}" != "NA" ]]; then
    # Get the official gene names.
    rawFile="${OUTPUT_FOLDER}${platformName}_genenames_raw.tsv"
    if wget -O "${rawFile}" 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6"><Dataset name = "hsapiens_gene_ensembl" interface = "default"><Attribute name = "'"${platformBiomartName}"'"/><Attribute name = "external_gene_name"/></Dataset></Query>'; then
      # Drop rows without an identifier in column 1, then collapse the rows
      # sharing the same first field into one row whose second fields are
      # joined with "|", and finally sort/deduplicate.
      awk 'BEGIN{FS=OFS="\t"}{if ($1 != "") {print $0}}' "${rawFile}" \
        | awk '{if (t[$1]) {t[$1]=t[$1]"|"$2} else {t[$1]=$2}} END{for (i in t) {if (i != "") {print i"\t"t[i]}}}' \
        | sort -u > "${genenamesFile}"
    else
      # A failed download must not be turned into an empty mapping file.
      echo "== Download failed for ${platformName}; ${genenamesFile} not written" >&2
      status=1
    fi
    rm -f -- "${rawFile}"
    # Short pause between two consecutive requests to the remote service.
    sleep 2s
  else
    if [[ "${platformGEOName}" != "NA" ]]; then
      # We use the GEO data
      officialFile="${GEO_PLAFORM_FOLDER}${platformGEOName}_gene_official.tsv"
    else
      # We use manually curated data.
      officialFile="${GEO_PLAFORM_FOLDER}${platformName}_gene_official.tsv"
    fi
    if [[ -r "${officialFile}" ]]; then
      # Keep the first two columns, drop the lines containing OFFICIAL
      # (presumably the header line), then sort/deduplicate.
      cut -f -2 "${officialFile}" | grep -v OFFICIAL | sort -u > "${genenamesFile}"
    else
      echo "== Cannot read ${officialFile}; ${genenamesFile} not written" >&2
      status=1
    fi
  fi
done
# Moving the slurm log file to data (the glob is left unquoted on purpose,
# only the folder prefix is quoted).
mv -- "${CODE_FOLDER}"slurm-*out "${OUTPUT_FOLDER}"
mv_status=$?
if (( status == 0 )); then
  status=${mv_status}
fi
exit "${status}"
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --time=0-0:20:00
#SBATCH --time=0-0:35:00
#SBATCH -p batch
#SBATCH --qos=normal
......@@ -15,13 +15,46 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
GEO_PLAFORM_FOLDER=/home/users/ltranchevent/Data/GeneDER/Original/Platforms/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/04-Prepare_datasets/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
# Actual job
# Load configuration
# NOTE(review): the three paths below are relative, so they resolve against the
# directory the job runs from — confirm this is always the step folder.
source ../libs/conf/confSH.sh
# create_variables is not defined in this script; presumably it is provided by
# confSH.sh and exposes the YAML entries as the platforms__* arrays read
# below — TODO confirm.
create_variables ../Confs/datasets_config.yml
create_variables ../Confs/platforms_config.yml
# Get the biomart data.
# One <platform>_genenames.tsv file is written to OUTPUT_FOLDER per entry of
# the platforms__platform_name array.
nbPlatforms=${#platforms__platform_name[@]}
for (( i=0; i<$nbPlatforms; i++ ))
do
platformName=${platforms__platform_name[$i]}
platformBiomartName=${platforms__biomart_name[$i]}
platformGEOName=${platforms__geo_name[$i]}
# "NA" marks a platform that has no BioMart attribute name.
if [ "${platformBiomartName}" != "NA" ]
then
# Get the official gene names.
# The BioMart XML query asks for two attributes: the platform attribute and
# external_gene_name, as TSV without header and with unique rows.
# NOTE(review): the exit status of wget is not checked, so a failed download
# still goes through the awk/sort step below.
wget -O ${OUTPUT_FOLDER}${platformName}_genenames_raw.tsv 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6"><Dataset name = "hsapiens_gene_ensembl" interface = "default"><Attribute name = "'${platformBiomartName}'"/><Attribute name = "external_gene_name"/></Dataset></Query>'
# First awk: drop the rows whose first tab-separated column is empty.
# Second awk: group the rows by first field and join their second fields with
# "|" (it uses the default whitespace field splitting, not tabs).
# sort -u: sort and deduplicate the resulting lines.
awk 'BEGIN{FS=OFS="\t"}{if ($1 != "") {print $0}}' ${OUTPUT_FOLDER}${platformName}_genenames_raw.tsv | awk '{if (t[$1]) {t[$1]=t[$1]"|"$2} else {t[$1]=$2}} END{for (i in t) {if (i != "") {print i"\t"t[i]}}}' | sort -u > ${OUTPUT_FOLDER}${platformName}_genenames.tsv
# The raw download is only an intermediate file.
rm ${OUTPUT_FOLDER}${platformName}_genenames_raw.tsv
# Pause between two consecutive downloads.
sleep 2s
else
# No BioMart name: fall back to a local *_gene_official.tsv file, named after
# the GEO name when there is one, else after the platform name.
if [ "${platformGEOName}" != "NA" ]
then
# We use the GEO data
# Keep the first two columns, drop the lines containing OFFICIAL (presumably
# the header line — TODO confirm), then sort and deduplicate.
cut -f -2 ${GEO_PLAFORM_FOLDER}${platformGEOName}_gene_official.tsv | grep -v OFFICIAL | sort -u > ${OUTPUT_FOLDER}${platformName}_genenames.tsv
else
# We use manually curated data.
# Same processing as for the GEO data, on the file named after the platform.
cut -f -2 ${GEO_PLAFORM_FOLDER}${platformName}_gene_official.tsv | grep -v OFFICIAL | sort -u > ${OUTPUT_FOLDER}${platformName}_genenames.tsv
fi
fi
done
# Refining the biomart matchings.
# The two R scripts run one after the other; each has its stdout and stderr
# redirected to its own pair of log files in OUTPUT_FOLDER.
# NOTE(review): match_probes.R runs even if create_probelists.R failed, since
# no exit status is checked here.
Rscript --vanilla ${CODE_FOLDER}create_probelists.R > ${OUTPUT_FOLDER}create_ps_log.out 2> ${OUTPUT_FOLDER}create_ps_log.err
Rscript --vanilla ${CODE_FOLDER}match_probes.R > ${OUTPUT_FOLDER}match_log.out 2> ${OUTPUT_FOLDER}match_log.err
# Moving the slurm log file to data
......
#!/bin/bash -l
#SBATCH -J geneder:04:plotbm
#SBATCH -J geneder:05:plotbm
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
......@@ -15,8 +15,8 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/04-Prepare_datasets/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/05/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/05-Get_DEGs/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
......
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
RS_INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/15/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/16-Data_integration_all/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
SHELL=/bin/bash
clean:
@rm -rf *~
clean_outputs:
@rm -rf ${OUTPUT_FOLDER}*
@paste <(cut -f -2 ${RS_INPUT_FOLDER}NBB_matchingdata_formatted.tsv | head -n 1) <(head -n 1 ${INPUT_FOLDER}Combined_probe_matching.tsv) <(echo "RNAseq_EG") | cut -f 2,5- > ${OUTPUT_FOLDER}Combined_probe_matching.tsv
@grep -wv genes ${INPUT_FOLDER}Combined_probe_matching.tsv | sort -t $$'\t' -k2,2 > ${OUTPUT_FOLDER}MA_combined_probe_matching.tsv
@cut -f -2 ${RS_INPUT_FOLDER}NBB_matchingdata_formatted.tsv | grep -wv SYMBOL | awk '{if (t[$$2]) {t[$$2]=t[$$2]"|"$$1} else {t[$$2]=$$1}} END{for (i in t) {if (i != "") {print i"\t"t[i]}}}' | sort -t $$'\t' -k1,1 > ${OUTPUT_FOLDER}RS_combined_probe_matching.tsv
@join -a 1 -a 2 -e "NA" -o auto -1 2 -2 1 -t $$'\t' ${OUTPUT_FOLDER}MA_combined_probe_matching.tsv ${OUTPUT_FOLDER}RS_combined_probe_matching.tsv | cut -f 1,3- >> ${OUTPUT_FOLDER}Combined_probe_matching.tsv
@cp -rf ${INPUT_FOLDER}Combined_probe_matching.tsv ${OUTPUT_FOLDER}Combined_probe_matching.tsv
summarize:
@sbatch ${CODE_FOLDER}summarize.sh
integrate:
......
#!/bin/bash -l
#SBATCH -J geneder:16:analyse
#SBATCH -J geneder:06:analyse
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --time=0-2:30:00
#SBATCH --time=0-2:45:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......@@ -15,8 +15,8 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/16-Data_integration_all/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
......
#!/bin/bash -l
#SBATCH -J geneder:16:check
#SBATCH -J geneder:06:check
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --time=0-0:03:00
#SBATCH --time=0-0:05:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......@@ -15,8 +15,8 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/16-Data_integration_all/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
......
#!/bin/bash -l
#SBATCH -J geneder:16:gexpr
#SBATCH -J geneder:06:gexpr
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 5
#SBATCH --time=0-00:25:00
#SBATCH --time=0-00:35:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......@@ -15,8 +15,8 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/16-Data_integration_all/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
......
#!/bin/bash -l
#SBATCH -J geneder:16:heatmaps
#SBATCH -J geneder:06:heatmaps
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --time=0-0:05:00
#SBATCH --time=0-0:10:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
......@@ -15,8 +15,8 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/16/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/16-Data_integration_all/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment