Commit 7e1bb0ce authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Refactored the code for regulatory network analysis according to the latest GeneGo results.

parent e8f7adc4
......@@ -17,11 +17,15 @@ getdb:
@echo "*************************************************************************"
GRNdot:
@sbatch ${CODE_FOLDER}extract_GRN_dorothea.sh
# From now on, we need GeneGo results.
enrich:
@sbatch ${CODE_FOLDER}run_dorothea_enrichment.sh
# From now on, we need GeneGo results.
preprefine:
@sbatch ${CODE_FOLDER}prep_refine.sh
@echo "********************************************************************************"
@echo "* PLEASE PROCEED WITH THE GENEGO PROCESS OF THE genego_extended_newTFs.tsv *"
@echo "* FILE LIKE DONE BEFORE FOR THE OTHER MAPPING (OFFICIAL ONLY). *"
@echo "********************************************************************************"
refineGRN1:
@sbatch ${CODE_FOLDER}refine_GeneGo_mappings.sh
@echo "********************************************************************************"
......
......@@ -19,7 +19,7 @@ INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
# Actual job in bash. This is a simple join between the TF-targets link database and the experimental data (top DEGs based on pi values).1
# Actual job in bash. This is a simple join between the TF-targets link database and the experimental data (top DEGs based on pi values).
# We update the database based on the DoRothEA_unmapped_mapped_clean.tsv file.
cp ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv ${OUTPUT_FOLDER}Dorothea_temp.tsv
......@@ -35,34 +35,34 @@ done < "$input"
mv ${OUTPUT_FOLDER}Dorothea_temp.tsv ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv
# First the female data.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}Female_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}Female_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}Female_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}Female_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}Female_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Female_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}Female_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}Female_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}Female_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Female_dorothea_GRN_internal.tsv
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_internal.tsv
# Second the male data.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}Male_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}Male_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}Male_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}Male_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}Male_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Male_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}Male_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}Male_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}Male_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Male_dorothea_GRN_internal.tsv
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_internal.tsv
# Third the GDS data.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}Gender_disease_status_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}Gender_disease_status_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_internal.tsv
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_internal.tsv
# Fourth, the combinaison of all files as one.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}All_lists_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}All_lists_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}All_lists_dorothea_GRN_internal.tsv
# Fourth, the combination of all files as one.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_internal.tsv
# Fifth the PDvsControl data (gender not taken into account, as a baseline).
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}PDvsControl_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}PDvsControl_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_internal.tsv
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_internal.tsv
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
......@@ -28,7 +28,8 @@ curl -o ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv "http://omnipathdb.org/interac
# We remove the unnecessary fields. So far, we do not filter the TF that are very frequent.
# The most frequent one (with evidence A, B and C) has ~1k targets.
# This can be changed if we also change the evidence levels to include D as well.
cut -f 3,4,6,7,13 ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv > ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv
head -n 1 ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv | cut -f 3,4,6,7,13 > ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv
grep -v "source_genesymbol" ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv | cut -f 3,4,6,7,13 | sort -u >> ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv
# We process the files further to extract the full list of gene ids used by DoRothEA.
cat <(cut -f 1 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) <(cut -f 2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) | grep -v "source_genesymbol" | grep -v "target_genesymbol" | sort -u > ${OUTPUT_FOLDER}Dorothea_geneids.tsv
......
......@@ -15,20 +15,23 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
GG_DATA_FOLDER=/home/users/ltranchevent/Data/GeneDER/Original/Else/GeneGo/
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
# Actual job in bash.
# Copy the GeneGo data (must have been run manually before).
cp -rf ${GG_DATA_FOLDER}* ${OUTPUT_FOLDER}
# We derive the list of TF present in the extended networks and not the internal networks.
# We will need this list to go through GeneGo as well to get the official gene names and thus potentially
# the geneder ids for these genes.
cut -f 1 ${OUTPUT_FOLDER}*genego*internal.tsv | sort -u > ${OUTPUT_FOLDER}F1
cut -f 1 ${OUTPUT_FOLDER}*genego*extended.tsv | sort -u > ${OUTPUT_FOLDER}F2
cut -f 2 ${OUTPUT_FOLDER}*genego*internal.tsv | grep -v "Network Object " | grep -v "^$" | sort -u > ${OUTPUT_FOLDER}F1
cut -f 2 ${OUTPUT_FOLDER}*genego*extended.tsv | grep -v "Network Object " | grep -v "^$" | sort -u > ${OUTPUT_FOLDER}F2
comm -13 ${OUTPUT_FOLDER}F1 ${OUTPUT_FOLDER}F2 > ${OUTPUT_FOLDER}genego_extended_newTFs.tsv
rm ${OUTPUT_FOLDER}F1 ${OUTPUT_FOLDER}F2
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
......@@ -37,12 +37,12 @@ cut -f 1,2,4 ${OUTPUT_FOLDER}Omnipath_network.tsv | awk -v FS=$'\t' -v OFS=$'\t'
# We don't do any additional mapping since we now have uniprot ids.
# Warning for user.
echo "*************************************************************************"
echo "* YOU MAY WANT TO MANUALLY REFINE THE MAPPING IN THE FILE"
echo "* ${OUTPUT_FOLDER}Omnipath_geneids_unmapped_mapped_clean.tsv"
echo "*************************************************************************"
#echo "*************************************************************************"
#echo "* YOU MAY WANT TO MANUALLY REFINE THE MAPPING IN THE FILE"
#echo "* ${OUTPUT_FOLDER}Omnipath_geneids_unmapped_mapped_clean.tsv"
#echo "*************************************************************************"
# Part II - we now take care of the PROGENy linear model *useful to create the CARNIVAL weight matrix).
# Part II - we now take care of the PROGENy linear model (useful to create the CARNIVAL weight matrix).
# We obtain the latest file from the CARNIVAL R package.
Rscript --vanilla ${CODE_FOLDER}extract_progeny_model.R > ${OUTPUT_FOLDER}extract_progeny_model_log.out 2> ${OUTPUT_FOLDER}extract_progeny_model_log.err
......
......@@ -21,14 +21,13 @@ CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetw
# Actual job in bash.
# For each comparison of interest, we select the top 500 genes max, based on the pi-values.
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}Female_rankings.tsv | sort -k9,9gr | head -n 500 | cut -f 2 | sort -u > ${OUTPUT_FOLDER}Female_PI_top_genes.tsv
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}Male_rankings.tsv | sort -k9,9gr | head -n 500 | cut -f 2 | sort -u > ${OUTPUT_FOLDER}Male_PI_top_genes.tsv
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}Gender_disease_status_rankings.tsv | sort -k9,9gr | head -n 500 | cut -f 2 | sort -u > ${OUTPUT_FOLDER}Gender_disease_status_PI_top_genes.tsv
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}PDvsControl_rankings.tsv | sort -k9,9gr | head -n 500 | cut -f 2 | sort -u > ${OUTPUT_FOLDER}PDvsControl_PI_top_genes.tsv
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_PDVsControl_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_PDVsControl_rankings_PI_top_genes.tsv
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_Gender_disease_status_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv
# We also concatenate all gender relevant files to create the integrated list.
# We do NOT include the PDvscontrol file as this one represents more or less the baseline.
cat ${OUTPUT_FOLDER}Female_PI_top_genes.tsv ${OUTPUT_FOLDER}Male_PI_top_genes.tsv ${OUTPUT_FOLDER}Gender_disease_status_PI_top_genes.tsv | sort -u > ${OUTPUT_FOLDER}All_lists_PI_top_genes.tsv
cat ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv | sort -u > ${OUTPUT_FOLDER}SNage_all_lists_PI_top_genes.tsv
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
......@@ -29,18 +29,18 @@ do
# First, we map the DoRothEA data, we add female diff data and then male diff data.
# [Female] We start by mapping the second field, which then moves first.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${f}) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Female_rankings.tsv) | cut -f 1-5,7,9 > ${OUTPUT_FOLDER}File_temp.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${f}) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-5,7,9 > ${OUTPUT_FOLDER}File_temp.tsv
# [Female] Then, we map the first field, which is now located second and is moved back to first place.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Female_rankings.tsv) | cut -f 1-7,9,11 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-7,9,11 > ${OUTPUT_FOLDER}File_temp2.tsv
mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
# [Male] We continue by mapping the second field again (now for male), which then moves first.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Male_rankings.tsv) | cut -f 1-9,11,13 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-9,11,13 > ${OUTPUT_FOLDER}File_temp2.tsv
mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
# [Male] Then, we map the first field, which is again located second and is moved back to first place.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Male_rankings.tsv) | cut -f 1-11,13,15 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-11,13,15 > ${OUTPUT_FOLDER}File_temp2.tsv
paste <(head -n 1 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) <(echo $'Female.TF.logFC\tFemale.TF.adjPval\tFemale.target.logFC\tFemale.target.adjPval\tMale.TF.logFC\tMale.TF.adjPval\tMale.target.logFC\tMale.target.adjPval') > ${f2}
paste <(cut -f 1-5,8-9 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 6-7,12-13 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 10-11 ${OUTPUT_FOLDER}File_temp2.tsv) >> ${f2}
rm ${OUTPUT_FOLDER}File_temp*.tsv
......@@ -48,46 +48,46 @@ done
# Now, we take care of GeneGo.
# We first create the ALL_lists mapping by simple concatenation (was not done before since we were waiting for the manual curation to take place).
cat ${OUTPUT_FOLDER}Female_mapping_refined.tsv ${OUTPUT_FOLDER}Male_mapping_refined.tsv ${OUTPUT_FOLDER}Gender_disease_status_mapping_refined.tsv | sort -u > ${OUTPUT_FOLDER}All_lists_mapping_refined.tsv
cat ${OUTPUT_FOLDER}All_lists_mapping_refined.tsv ${OUTPUT_FOLDER}PDvsControl_mapping_refined.tsv ${OUTPUT_FOLDER}extendedTF_mapping_refined.tsv | sort -u > ${OUTPUT_FOLDER}Genego_mapping_refined.tsv
cat ${OUTPUT_FOLDER}SNage_PDVsControl_females_genego_mapping_refined.tsv ${OUTPUT_FOLDER}SNage_PDVsControl_males_genego_mapping_refined.tsv ${OUTPUT_FOLDER}SNage_Gender_disease_status_genego_mapping_refined.tsv | sort -u > ${OUTPUT_FOLDER}SNage_all_lists_genego_mapping_refined.tsv
cat ${OUTPUT_FOLDER}SNage_all_lists_genego_mapping_refined.tsv ${OUTPUT_FOLDER}SNage_PDVsControl_genego_mapping_refined.tsv ${OUTPUT_FOLDER}genego_extended_newTFs_genego_mapping_refined.tsv | sort -u > ${OUTPUT_FOLDER}genego_mapping_refined.tsv
# Second, GeneGO files, we need to loop over relevant GRN files again.
# Note, we also do the extended networks despite the fact that we did not have the geneGo ids for all the new genes that GeneGo includes.
# These were obtained via an additional GeneGo network query.
# These were obtained via an additional GeneGo network query.
rm -rf ${OUTPUT_FOLDER}*_genego_GRN_*_refined.tsv
FILES=${OUTPUT_FOLDER}*_genego_GRN_*
for f in $FILES
do
f2=${f/\.tsv/_refined.tsv}
# Contray to the DoRothEA case, we first need to map the GeneGO ids to GeneDER ids, then we will be able to
# Contrary to the DoRothEA case, we first need to map the GeneGO ids to GeneDER ids, then we will be able to
# proceed with actually getting the differential expression data.
# We first map the target id (on position 2, will move to position 1).
join -t $'\t' -o auto -1 2 <(sort -t $'\t' -k2,2 ${f}) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}Genego_mapping_refined.tsv) > ${OUTPUT_FOLDER}File_temp.tsv
# We first map the target id (on position 4, will move to position 1).
join -t $'\t' -o auto -1 4 <(sort -t $'\t' -k4,4 ${f} | grep -v "Network Object") -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_mapping_refined.tsv) > ${OUTPUT_FOLDER}File_temp.tsv
# Then, we map the first field (TF), which is now located second and is moved back to first place.
join -t $'\t' -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}Genego_mapping_refined.tsv) > ${OUTPUT_FOLDER}File_temp2.tsv
paste <(cut -f 7 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 6 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 1-5 ${OUTPUT_FOLDER}File_temp2.tsv) > ${OUTPUT_FOLDER}File_temp.tsv
# Then, we map the TF, which is now located third and is moved to first place.
join -t $'\t' -o auto -1 3 <(sort -t $'\t' -k3,3 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_mapping_refined.tsv) > ${OUTPUT_FOLDER}File_temp2.tsv
paste <(cut -f 12 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 11 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 1-10 ${OUTPUT_FOLDER}File_temp2.tsv) > ${OUTPUT_FOLDER}File_temp.tsv
rm ${OUTPUT_FOLDER}File_temp2.tsv
# We then add female diff data and then male diff data.
# [Female] We start by mapping the second field, which then moves first.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Female_rankings.tsv) | cut -f 1-7,9,11 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-12,14,16 > ${OUTPUT_FOLDER}File_temp2.tsv
mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
# [Female] Then, we map the first field, which is now located second and is moved back to first place.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Female_rankings.tsv) | cut -f 1-9,11,13 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-14,16,18 > ${OUTPUT_FOLDER}File_temp2.tsv
mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
# [Male] We continue by mapping the second field again (now for male), which then moves first.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Male_rankings.tsv) | cut -f 1-11,13,15 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-16,18,20 > ${OUTPUT_FOLDER}File_temp2.tsv
mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
# [Male] Then, we map the first field, which is again located second and is moved back to first place.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Male_rankings.tsv) | cut -f 1-13,15,17 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-18,20,22 > ${OUTPUT_FOLDER}File_temp2.tsv
echo $'Source\tTarget\tSource.ggid\tTarget.ggid\tRegulation\tCategory\tInformation\tFemale.TF.logFC\tFemale.TF.adjPval\tFemale.target.logFC\tFemale.target.adjPval\tMale.TF.logFC\tMale.TF.adjPval\tMale.target.logFC\tMale.target.adjPval' > ${f2}
paste <(cut -f 1-7,10-11 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 8-9,14-15 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 12-13 ${OUTPUT_FOLDER}File_temp2.tsv) >> ${f2}
echo $'Source\tTarget\tSource.ggid\tTarget.ggid\tggid\tSource.type\tTarget.type\tRegulation\tCategory\tSpecies\tInformation\tReferences\tFemale.TF.logFC\tFemale.TF.adjPval\tFemale.target.logFC\tFemale.target.adjPval\tMale.TF.logFC\tMale.TF.adjPval\tMale.target.logFC\tMale.target.adjPval' > ${f2}
paste <(cut -f 1-12,15-16 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 13-14,19-20 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 17-18 ${OUTPUT_FOLDER}File_temp2.tsv) >> ${f2}
rm ${OUTPUT_FOLDER}File_temp*.tsv
done
......
......@@ -24,32 +24,31 @@ CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetw
# We need to loop over all mapping files.
rm -rf ${OUTPUT_FOLDER}*_mapping_refined.tsv
rm -rf ${OUTPUT_FOLDER}*_mapping_help.txt
for tag in "Female" "Male" "Gender_disease_status" "PDvsControl"
for tag in "PDVsControl_females" "PDVsControl_males" "Gender_disease_status" "PDVsControl"
do
# We join back the top PI genes and the raw GeneGo mappings to keep only the real GeneDER genes since GeneGo
# is adding some irrelevant stuff from somewhere. This also allows us to have the real inputs ids that are
# not returned by GeneGo (despite the GUI saying otherwise).
join -t $'\t' -a 1 -j 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_PI_top_genes.tsv) <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_mapping.tsv) > ${OUTPUT_FOLDER}${tag}_mapping_refined.tsv
# We print additional information to help the manual check that should be done after the join ot fill in the missing ids.
echo "PI genes not mapped in GeneGo:" >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
join -t $'\t' -v 1 -j 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_PI_top_genes.tsv) <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_mapping.tsv) >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
echo "" >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
echo "GeneGo ids not found in Geneder data:" >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
join -t $'\t' -v 2 -j 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_PI_top_genes.tsv) <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_mapping.tsv) >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
join -t $'\t' -a 1 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_${tag}_rankings_PI_top_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") > ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_refined.tsv
# We print additional information to help the manual check that should be done after the join to fill in the missing ids.
echo "PI genes not mapped in GeneGo:" >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
join -t $'\t' -v 1 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_${tag}_rankings_PI_top_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
echo "" >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
echo "GeneGo ids not found in Geneder data:" >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
join -t $'\t' -v 2 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_${tag}_rankings_PI_top_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
done
# The extended_TF is a special case since we do not have relevant top genes based on PI.
# We instead use all genes reported in the rankings (with pi values).
tag="extendedTF"
tag="extended_newTFs"
# We join back the geneder genes (all) and the raw GeneGo mappings to keep only the real GeneDER genes since GeneGo
# is adding some irrelevant stuff from somewhere.
join -t $'\t' -j 1 <(cut -f 2 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_mapping.tsv) > ${OUTPUT_FOLDER}${tag}_mapping_refined.tsv
echo "GeneGo ids not found in Geneder data:" >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
join -t $'\t' -v 2 -j 1 <(cut -f 2 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_mapping.tsv) >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
join -t $'\t' -1 1 <(cut -f 2 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_${tag}_genego_mapping.tsv | grep -v "Network Object") > ${OUTPUT_FOLDER}genego_${tag}_genego_mapping_refined.tsv
echo "GeneGo ids not found in Geneder data:" >> ${OUTPUT_FOLDER}genego_${tag}_genego_mapping_help.txt
join -t $'\t' -v 2 -1 1 <(cut -f 2 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_${tag}_genego_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}genego_${tag}_genego_mapping_help.txt
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
......@@ -21,15 +21,17 @@ CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetw
# Actual job in bash.
# We first join all enrichment results across Female, Male and GDS to get a complete picture.
join -t $'\t' -a 1 -a 2 -e "NA" -o auto -1 1 <(join -t $'\t' -a 1 -a 2 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}Female_TFactivities.tsv | grep -v Regulon) -2 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}Male_TFactivities.tsv | grep -v Regulon) | sort -t $'\t' -k1,1) -2 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}Gender_disease_status_TFactivities.tsv | grep -v Regulon) > ${OUTPUT_FOLDER}TFactivities_merged.tsv
join -t $'\t' -a 1 -a 2 -e "NA" -o auto -1 1 <(join -t $'\t' -a 1 -a 2 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_TFactivities.tsv | grep -v Regulon) -2 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_TFactivities.tsv | grep -v Regulon) | sort -t $'\t' -k1,1) -2 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_TFactivities.tsv | grep -v Regulon) > ${OUTPUT_FOLDER}TFactivities_merged.tsv
# Then, we map the female diff exp data (for the TF).
join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Female_rankings.tsv) | cut -f 1-16,18,20 > ${OUTPUT_FOLDER}File_temp.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-16,18,20 > ${OUTPUT_FOLDER}File_temp.tsv
mv ${OUTPUT_FOLDER}File_temp.tsv ${OUTPUT_FOLDER}TFactivities_merged.tsv
# Then, we map the male diff exp data (for the TF).
join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Male_rankings.tsv) | cut -f 1-18,20,22 > ${OUTPUT_FOLDER}File_temp.tsv
mv ${OUTPUT_FOLDER}File_temp.tsv ${OUTPUT_FOLDER}TFactivities_merged.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-18,20,22 > ${OUTPUT_FOLDER}File_temp.tsv
echo $'Regulon\tF.evdc_lvl\tF.Size\tF.NES\tF.p.value\tF.FDR\tM.evdc_lvl\tM.Size\tM.NES\tM.p.value\tM.FDR\tGDS.evdc_lvl\tGDS.Size\tGDS.NES\tGDS.p.value\tGDS.FDR\tF.logFC\tF.ajdP\tM.logFC\tM.ajdP' > ${OUTPUT_FOLDER}TFactivities_merged.tsv
cat ${OUTPUT_FOLDER}File_temp.tsv >> ${OUTPUT_FOLDER}TFactivities_merged.tsv
rm ${OUTPUT_FOLDER}File_temp.tsv
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
......@@ -75,72 +75,92 @@ for (ii in seq_len(dim(update_info)[1])) {
names(viper_regulon) <- paste(vp_names, vp_fullnames[2, ], sep = "_")
rm(update_info, vp_fullnames, vp_names, ii)
# We loop over the different files to analyse.
for (i in seq_len(length(config$ranking_files))) {
# We do all integration schemes one by one.
for (i in seq_len(length(config$integrations))) {
# We only do the first three cases because then we do not have P values so
# we can not really create the "mySignature" object.
if (i > 4) {
# We get the integration name for that scheme.
integration <- config$integrations[[i]]
# We only do the enrichment if it is necessary.
if (integration$use_for_network == "FALSE") {
next
}
# We define the I/Os.
ranking_file <- paste0(input_data_dir, config$ranking_files[[i]])
file_prefix <- strsplit(config$ranking_files[[i]], split = "rankings.tsv")[[1]]
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
select(Gene, ref_logFC, ranking_value)
rm(ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
ranking_clean <- ranking_clean %>% arrange(desc(ranking_value))
rm(tmp_res, ranking)
# Estimate Z-score values for the GES (As in DoRothEA manual, cheeck VIPER manual for details).
# In our case, we compute the pseudo P values based on the pi values with 10 ^ -pi.
# In addition, we correct through the median logFC (since pi = -log(p) * logFC).
myStatistics <- matrix(ranking_clean$LFC,
dimnames = list(ranking_clean$Gene, "logFC") )
mypPvalue <- matrix(10 ^ (-ranking_clean$ranking_value * median(abs(ranking_clean$LFC))),
dimnames = list(ranking_clean$Gene, "pP.Value") )
mySignature <- (qnorm(mypPvalue / 2, lower.tail = FALSE) * sign(myStatistics))[, 1]
mySignature <- mySignature[order(mySignature, decreasing = T)]
rm(ranking_clean, mypPvalue, myStatistics)
# Estimate TF activities via Viper.
mrs <- msviper(ges = mySignature, regulon = viper_regulon, minsize = 4, ges.filter = F)
# Format the results.
tf_name <- sapply(strsplit(names(mrs$es$nes), split = "_"), head, 1)
evidence_level <- sapply(strsplit(names(mrs$es$nes), split = "_"), tail, 1)
TF_activities <- data.frame(Regulon = tf_name,
evdc_lvl = evidence_level,
Size = mrs$es$size[ names(mrs$es$nes) ],
NES = mrs$es$nes,
p.value = mrs$es$p.value)
TF_activities <- TF_activities[ order(TF_activities$p.value), ] %>%
filter(evdc_lvl %in% c("A", "B", "C", "D")) %>%
mutate(FDR = p.adjust(p.value, method = "fdr"))
rm(mrs, mySignature, tf_name, evidence_level)
# Save results
TF_act_fn <- paste0(output_data_dir, file_prefix, "TFactivities.tsv")
write.table(TF_activities, file = TF_act_fn, sep = "\t", quote = FALSE, row.names = FALSE)
rm(TF_act_fn, TF_activities, file_prefix)
} # End for each file to analyse.
# We do all limma comparisons one by one.
for (j in seq_len(length(config$limma_analyses))) {
# We extract the Limma parameters.
limma_parameters <- config$limma_analyses[[j]]
# We only do the enrichment if it is necessary.
if (limma_parameters$use_for_network == "FALSE") {
next
}
# We define the I/Os.
analysis_prefix <- paste0(integration$name, "_", limma_parameters$name)
ranking_file <- paste0(input_data_dir, analysis_prefix, "_rankings.tsv")
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
select(Gene, ref_logFC, ranking_value)
rm(ranking_file)
# We clean genes (no duplicate entries with pipe separated gene symbols).
if (sum(grepl("\\|", ranking$Gene)) > 0) {
tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
ranking_clean <- ranking_clean %>% arrange(desc(ranking_value))
rm(tmp_res)
} else {
ranking_clean <- ranking
ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
ranking_clean$LFC <- as.numeric(ranking_clean$ref_logFC)
ranking_clean <- ranking_clean %>% select(Gene, LFC, ranking_value) %>% arrange(desc(ranking_value))
}
rm(ranking)
# Estimate Z-score values for the GES (As in DoRothEA manual, cheeck VIPER manual for details).
# In our case, we compute the pseudo P values based on the pi values with 10 ^ -pi.
# In addition, we correct through the median logFC (since pi = -log(p) * logFC).
myStatistics <- matrix(ranking_clean$LFC,
dimnames = list(ranking_clean$Gene, "logFC") )
mypPvalue <- matrix(10 ^ (-ranking_clean$ranking_value * median(abs(ranking_clean$LFC))),
dimnames = list(ranking_clean$Gene, "pP.Value") )
mySignature <- (qnorm(mypPvalue / 2, lower.tail = FALSE) * sign(myStatistics))[, 1]
mySignature <- mySignature[order(mySignature, decreasing = T)]
rm(ranking_clean, mypPvalue, myStatistics)
# Estimate TF activities via Viper.
mrs <- msviper(ges = mySignature, regulon = viper_regulon, minsize = 4, ges.filter = F)
# Format the results.
tf_name <- sapply(strsplit(names(mrs$es$nes), split = "_"), head, 1)
evidence_level <- sapply(strsplit(names(mrs$es$nes), split = "_"), tail, 1)
TF_activities <- data.frame(Regulon = tf_name,
evdc_lvl = evidence_level,
Size = mrs$es$size[ names(mrs$es$nes) ],
NES = mrs$es$nes,
p.value = mrs$es$p.value)
TF_activities <- TF_activities[ order(TF_activities$p.value), ] %>%
filter(evdc_lvl %in% c("A", "B", "C", "D")) %>%
mutate(FDR = p.adjust(p.value, method = "fdr"))
rm(mrs, mySignature, tf_name, evidence_level)
# Save results
TF_act_fn <- paste0(output_data_dir, analysis_prefix, "_TFactivities.tsv")
write.table(TF_activities, file = TF_act_fn, sep = "\t", quote = FALSE, row.names = FALSE)
rm(TF_act_fn, TF_activities, analysis_prefix, limma_parameters)
} # End for each limma comparison.
rm(j, integration)
} # End for each integration.
rm(i, viper_regulon)
# We log the session details for reproducibility.
rm(config, output_data_dir, input_data_dir)
sessionInfo()
......@@ -18,14 +18,10 @@ echo ""
module load lang/R/3.6.0-foss-2019a-bare
# Defining global parameters.
GG_DATA_FOLDER=/home/users/ltranchevent/Data/GeneDER/Original/Else/GeneGo/
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
# Copy the GeneGo data (must have been run manually before).
cp -rf ${GG_DATA_FOLDER}* ${OUTPUT_FOLDER}
# Actual job in R.
Rscript --vanilla ${CODE_FOLDER}run_dorothea_enrichment.R > ${OUTPUT_FOLDER}dorothea_enrichment_log.out 2> ${OUTPUT_FOLDER}dorothea_enrichment_log.err
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment