Commit 7e1bb0ce authored by Leon-Charles Tranchevent

Refactored the code for regulatory network analysis according to the latest GeneGo results.

parent e8f7adc4
@@ -17,11 +17,15 @@ getdb:
 @echo "*************************************************************************"
 GRNdot:
 @sbatch ${CODE_FOLDER}extract_GRN_dorothea.sh
-# From now on, we need GeneGo results.
 enrich:
 @sbatch ${CODE_FOLDER}run_dorothea_enrichment.sh
+# From now on, we need GeneGo results.
 preprefine:
 @sbatch ${CODE_FOLDER}prep_refine.sh
+@echo "********************************************************************************"
+@echo "* PLEASE PROCEED WITH THE GENEGO PROCESS OF THE genego_extended_newTFs.tsv *"
+@echo "* FILE LIKE DONE BEFORE FOR THE OTHER MAPPING (OFFICIAL ONLY). *"
+@echo "********************************************************************************"
 refineGRN1:
 @sbatch ${CODE_FOLDER}refine_GeneGo_mappings.sh
 @echo "********************************************************************************"
...
@@ -19,7 +19,7 @@ INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
 OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
 CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
-# Actual job in bash. This is a simple join between the TF-targets link database and the experimental data (top DEGs based on pi values).1
+# Actual job in bash. This is a simple join between the TF-targets link database and the experimental data (top DEGs based on pi values).
 # We update the database based on the DoRothEA_unmapped_mapped_clean.tsv file.
 cp ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv ${OUTPUT_FOLDER}Dorothea_temp.tsv
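The comment above calls the core operation a simple join between the TF-target table and the top DEGs. As a minimal sketch of how that join behaves (hypothetical gene and TF names, not part of this commit):

# Editor's toy example with made-up identifiers: keep only regulon rows whose
# target (field 2 of the regulon table) appears in the top-DEG list.
join -1 1 -2 2 <(printf 'GENE1\nGENE2\n' | sort -k1,1) <(printf 'TF1\tGENE1\tA\nTF2\tGENE3\tB\n' | sort -k2,2)
# -> GENE1 TF1 A   (the joined target ends up first, hence the reordering step used later in the script)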
@@ -35,34 +35,34 @@ done < "$input"
 mv ${OUTPUT_FOLDER}Dorothea_temp.tsv ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv
 # First the female data.
-join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}Female_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}Female_dorothea_GRN_extended_raw.tsv
+join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv
-paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}Female_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}Female_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}Female_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Female_dorothea_GRN_extended.tsv
+paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended.tsv
-rm ${OUTPUT_FOLDER}Female_dorothea_GRN_extended_raw.tsv
+rm ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv
-join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}Female_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}Female_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Female_dorothea_GRN_internal.tsv
+join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_internal.tsv
 # Second the male data.
-join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}Male_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}Male_dorothea_GRN_extended_raw.tsv
+join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv
-paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}Male_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}Male_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}Male_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Male_dorothea_GRN_extended.tsv
+paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended.tsv
-rm ${OUTPUT_FOLDER}Male_dorothea_GRN_extended_raw.tsv
+rm ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv
-join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}Male_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}Male_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Male_dorothea_GRN_internal.tsv
+join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_internal.tsv
 # Third the GDS data.
-join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}Gender_disease_status_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended_raw.tsv
+join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv
-paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended.tsv
+paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended.tsv
-rm ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended_raw.tsv
+rm ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv
-join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}Gender_disease_status_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Gender_disease_status_dorothea_GRN_internal.tsv
+join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_internal.tsv
-# Fourth, the combinaison of all files as one.
+# Fourth, the combination of all files as one.
-join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}All_lists_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended_raw.tsv
+join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv
-paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended.tsv
+paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended.tsv
-rm ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended_raw.tsv
+rm ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv
-join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}All_lists_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}All_lists_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}All_lists_dorothea_GRN_internal.tsv
+join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_internal.tsv
 # Fifth the PDvsControl data (gender not taken into account, as a baseline).
-join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}PDvsControl_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended_raw.tsv
+join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv
-paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended.tsv
+paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended.tsv
-rm ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended_raw.tsv
+rm ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv
-join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}PDvsControl_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}PDvsControl_dorothea_GRN_internal.tsv
+join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_internal.tsv
 # Moving the slurm log file to data
 mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
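For readers following the paste/cut/sed pattern above: it swaps the first two space-separated fields produced by join, keeps the remaining fields, and converts the spaces to tabs. A toy run with a made-up line (not part of the commit):

line='GENE1 TF1 A 1'
paste <(echo "$line" | cut -f 2 -d ' ') <(echo "$line" | cut -f 1 -d ' ') <(echo "$line" | cut -f 3- -d ' ') | sed -r 's/ /\t/g'
# -> TF1<TAB>GENE1<TAB>A<TAB>1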
@@ -28,7 +28,8 @@ curl -o ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv "http://omnipathdb.org/interac
 # We remove the unnecessary fields. So far, we do not filter the TF that are very frequent.
 # The most frequent one (with evidence A, B and C) has ~1k targets.
 # This can be changed if we also change the evidence levels to include D as well.
-cut -f 3,4,6,7,13 ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv > ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv
+head -n 1 ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv | cut -f 3,4,6,7,13 > ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv
+grep -v "source_genesymbol" ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv | cut -f 3,4,6,7,13 | sort -u >> ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv
 # We process the files further to extract the full list of gene ids used by DoRothEA.
 cat <(cut -f 1 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) <(cut -f 2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) | grep -v "source_genesymbol" | grep -v "target_genesymbol" | sort -u > ${OUTPUT_FOLDER}Dorothea_geneids.tsv
...
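The comment about the most frequent TF having ~1k targets can be spot-checked with a one-liner like the following (an editor's suggestion, not part of the commit; it assumes the first column of the cleaned table is the TF symbol):

grep -v "source_genesymbol" ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv | cut -f 1 | sort | uniq -c | sort -rn | head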
@@ -15,20 +15,23 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
 echo ""
 # Defining global parameters.
+GG_DATA_FOLDER=/home/users/ltranchevent/Data/GeneDER/Original/Else/GeneGo/
 INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
 OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
 CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
 # Actual job in bash.
+# Copy the GeneGo data (must have been run manually before).
+cp -rf ${GG_DATA_FOLDER}* ${OUTPUT_FOLDER}
 # We derive the list of TF present in the extended networks and not the internal networks.
 # We will need this list to go through GeneGo as well to get the official gene names and thus potentially
 # the geneder ids for these genes.
-cut -f 1 ${OUTPUT_FOLDER}*genego*internal.tsv | sort -u > ${OUTPUT_FOLDER}F1
-cut -f 1 ${OUTPUT_FOLDER}*genego*extended.tsv | sort -u > ${OUTPUT_FOLDER}F2
+cut -f 2 ${OUTPUT_FOLDER}*genego*internal.tsv | grep -v "Network Object " | grep -v "^$" | sort -u > ${OUTPUT_FOLDER}F1
+cut -f 2 ${OUTPUT_FOLDER}*genego*extended.tsv | grep -v "Network Object " | grep -v "^$" | sort -u > ${OUTPUT_FOLDER}F2
 comm -13 ${OUTPUT_FOLDER}F1 ${OUTPUT_FOLDER}F2 > ${OUTPUT_FOLDER}genego_extended_newTFs.tsv
 rm ${OUTPUT_FOLDER}F1 ${OUTPUT_FOLDER}F2
 # Moving the slurm log file to data
 mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
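As a reminder of what the comm step above produces: with sorted inputs, comm -13 keeps the lines unique to the second file, i.e. TFs present in the extended networks but not in the internal ones. Toy input, not from the commit:

comm -13 <(printf 'TF1\nTF2\n') <(printf 'TF2\nTF3\n')
# -> TF3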
@@ -37,12 +37,12 @@ cut -f 1,2,4 ${OUTPUT_FOLDER}Omnipath_network.tsv | awk -v FS=$'\t' -v OFS=$'\t'
 # We don't do any additional mapping since we now have uniprot ids.
 # Warning for user.
-echo "*************************************************************************"
-echo "* YOU MAY WANT TO MANUALLY REFINE THE MAPPING IN THE FILE"
-echo "* ${OUTPUT_FOLDER}Omnipath_geneids_unmapped_mapped_clean.tsv"
-echo "*************************************************************************"
+#echo "*************************************************************************"
+#echo "* YOU MAY WANT TO MANUALLY REFINE THE MAPPING IN THE FILE"
+#echo "* ${OUTPUT_FOLDER}Omnipath_geneids_unmapped_mapped_clean.tsv"
+#echo "*************************************************************************"
-# Part II - we now take care of the PROGENy linear model *useful to create the CARNIVAL weight matrix).
+# Part II - we now take care of the PROGENy linear model (useful to create the CARNIVAL weight matrix).
 # We obtain the latest file from the CARNIVAL R package.
 Rscript --vanilla ${CODE_FOLDER}extract_progeny_model.R > ${OUTPUT_FOLDER}extract_progeny_model_log.out 2> ${OUTPUT_FOLDER}extract_progeny_model_log.err
...
@@ -21,14 +21,13 @@ CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetw
 # Actual job in bash.
 # For each comparison of interest, we select the top 500 genes max, based on the pi-values.
-awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}Female_rankings.tsv | sort -k9,9gr | head -n 500 | cut -f 2 | sort -u > ${OUTPUT_FOLDER}Female_PI_top_genes.tsv
-awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}Male_rankings.tsv | sort -k9,9gr | head -n 500 | cut -f 2 | sort -u > ${OUTPUT_FOLDER}Male_PI_top_genes.tsv
-awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}Gender_disease_status_rankings.tsv | sort -k9,9gr | head -n 500 | cut -f 2 | sort -u > ${OUTPUT_FOLDER}Gender_disease_status_PI_top_genes.tsv
-awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}PDvsControl_rankings.tsv | sort -k9,9gr | head -n 500 | cut -f 2 | sort -u > ${OUTPUT_FOLDER}PDvsControl_PI_top_genes.tsv
+awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_PDVsControl_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_PDVsControl_rankings_PI_top_genes.tsv
+awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv
+awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv
+awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_Gender_disease_status_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv
 # We also concatenate all gender relevant files to create the integrated list.
 # We do NOT include the PDvscontrol file as this one represents more or less the baseline.
-cat ${OUTPUT_FOLDER}Female_PI_top_genes.tsv ${OUTPUT_FOLDER}Male_PI_top_genes.tsv ${OUTPUT_FOLDER}Gender_disease_status_PI_top_genes.tsv | sort -u > ${OUTPUT_FOLDER}All_lists_PI_top_genes.tsv
+cat ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv | sort -u > ${OUTPUT_FOLDER}SNage_all_lists_PI_top_genes.tsv
 # Moving the slurm log file to data
 mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
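The awk filter above keeps genes whose pi-value (column 9) is at least 1.5 times -log10(0.05). A quick way to see the numeric cutoff (editor's sketch, same expression as in the scripts):

awk 'BEGIN { print -log(0.05)/log(10)*1.5 }'
# -> 1.95155 (about 1.95)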
@@ -29,18 +29,18 @@ do
 # First, we map the DoRothEA data, we add female diff data and then male diff data.
 # [Female] We start by mapping the second field, which then moves first.
-join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${f}) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Female_rankings.tsv) | cut -f 1-5,7,9 > ${OUTPUT_FOLDER}File_temp.tsv
+join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${f}) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-5,7,9 > ${OUTPUT_FOLDER}File_temp.tsv
 # [Female] Then, we map the first field, which is now located second and is moved back to first place.
-join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Female_rankings.tsv) | cut -f 1-7,9,11 > ${OUTPUT_FOLDER}File_temp2.tsv
+join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-7,9,11 > ${OUTPUT_FOLDER}File_temp2.tsv
 mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
 # [Male] We continue by mapping the second field again (now for male), which then moves first.
-join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Male_rankings.tsv) | cut -f 1-9,11,13 > ${OUTPUT_FOLDER}File_temp2.tsv
+join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-9,11,13 > ${OUTPUT_FOLDER}File_temp2.tsv
 mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
 # [Male] Then, we map the first field, which is again located second and is moved back to first place.
-join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Male_rankings.tsv) | cut -f 1-11,13,15 > ${OUTPUT_FOLDER}File_temp2.tsv
+join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-11,13,15 > ${OUTPUT_FOLDER}File_temp2.tsv
 paste <(head -n 1 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) <(echo $'Female.TF.logFC\tFemale.TF.adjPval\tFemale.target.logFC\tFemale.target.adjPval\tMale.TF.logFC\tMale.TF.adjPval\tMale.target.logFC\tMale.target.adjPval') > ${f2}
 paste <(cut -f 1-5,8-9 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 6-7,12-13 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 10-11 ${OUTPUT_FOLDER}File_temp2.tsv) >> ${f2}
 rm ${OUTPUT_FOLDER}File_temp*.tsv
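A compact illustration of the left-join idiom used throughout this loop: with -a 1, -e "NA" and -o auto, unmatched edges are kept and the missing statistics are filled with NA. Hypothetical data, not from the commit:

join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(printf 'TF1\tGENE1\nTF2\tGENE9\n' | sort -t $'\t' -k2,2) -2 1 <(printf 'GENE1\t0.8\n' | sort -t $'\t' -k1,1)
# -> GENE1  TF1  0.8
# -> GENE9  TF2  NA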
@@ -48,46 +48,46 @@ done
 # Now, we take care of GeneGo.
 # We first create the ALL_lists mapping by simple concatenation (was not done before since we were waiting for the manual curation to take place).
-cat ${OUTPUT_FOLDER}Female_mapping_refined.tsv ${OUTPUT_FOLDER}Male_mapping_refined.tsv ${OUTPUT_FOLDER}Gender_disease_status_mapping_refined.tsv | sort -u > ${OUTPUT_FOLDER}All_lists_mapping_refined.tsv
+cat ${OUTPUT_FOLDER}SNage_PDVsControl_females_genego_mapping_refined.tsv ${OUTPUT_FOLDER}SNage_PDVsControl_males_genego_mapping_refined.tsv ${OUTPUT_FOLDER}SNage_Gender_disease_status_genego_mapping_refined.tsv | sort -u > ${OUTPUT_FOLDER}SNage_all_lists_genego_mapping_refined.tsv
-cat ${OUTPUT_FOLDER}All_lists_mapping_refined.tsv ${OUTPUT_FOLDER}PDvsControl_mapping_refined.tsv ${OUTPUT_FOLDER}extendedTF_mapping_refined.tsv | sort -u > ${OUTPUT_FOLDER}Genego_mapping_refined.tsv
+cat ${OUTPUT_FOLDER}SNage_all_lists_genego_mapping_refined.tsv ${OUTPUT_FOLDER}SNage_PDVsControl_genego_mapping_refined.tsv ${OUTPUT_FOLDER}genego_extended_newTFs_genego_mapping_refined.tsv | sort -u > ${OUTPUT_FOLDER}genego_mapping_refined.tsv
 # Second, GeneGO files, we need to loop over relevant GRN files again.
 # Note, we also do the extended networks despite the fact that we did not have the geneGo ids for all the new genes that GeneGo includes.
 # These were obtained via an additional GeneGo network query.
 rm -rf ${OUTPUT_FOLDER}*_genego_GRN_*_refined.tsv
 FILES=${OUTPUT_FOLDER}*_genego_GRN_*
 for f in $FILES
 do
 f2=${f/\.tsv/_refined.tsv}
-# Contray to the DoRothEA case, we first need to map the GeneGO ids to GeneDER ids, then we will be able to
+# Contrary to the DoRothEA case, we first need to map the GeneGO ids to GeneDER ids, then we will be able to
 # proceed with actually getting the differential expression data.
-# We first map the target id (on position 2, will move to position 1).
+# We first map the target id (on position 4, will move to position 1).
-join -t $'\t' -o auto -1 2 <(sort -t $'\t' -k2,2 ${f}) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}Genego_mapping_refined.tsv) > ${OUTPUT_FOLDER}File_temp.tsv
+join -t $'\t' -o auto -1 4 <(sort -t $'\t' -k4,4 ${f} | grep -v "Network Object") -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_mapping_refined.tsv) > ${OUTPUT_FOLDER}File_temp.tsv
-# Then, we map the first field (TF), which is now located second and is moved back to first place.
+# Then, we map the TF, which is now located third and is moved to first place.
-join -t $'\t' -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}Genego_mapping_refined.tsv) > ${OUTPUT_FOLDER}File_temp2.tsv
+join -t $'\t' -o auto -1 3 <(sort -t $'\t' -k3,3 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_mapping_refined.tsv) > ${OUTPUT_FOLDER}File_temp2.tsv
-paste <(cut -f 7 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 6 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 1-5 ${OUTPUT_FOLDER}File_temp2.tsv) > ${OUTPUT_FOLDER}File_temp.tsv
+paste <(cut -f 12 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 11 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 1-10 ${OUTPUT_FOLDER}File_temp2.tsv) > ${OUTPUT_FOLDER}File_temp.tsv
 rm ${OUTPUT_FOLDER}File_temp2.tsv
 # We then add female diff data and then male diff data.
 # [Female] We start by mapping the second field, which then moves first.
-join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Female_rankings.tsv) | cut -f 1-7,9,11 > ${OUTPUT_FOLDER}File_temp2.tsv
+join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-12,14,16 > ${OUTPUT_FOLDER}File_temp2.tsv
 mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
 # [Female] Then, we map the first field, which is now located second and is moved back to first place.
-join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Female_rankings.tsv) | cut -f 1-9,11,13 > ${OUTPUT_FOLDER}File_temp2.tsv
+join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-14,16,18 > ${OUTPUT_FOLDER}File_temp2.tsv
 mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
 # [Male] We continue by mapping the second field again (now for male), which then moves first.
-join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Male_rankings.tsv) | cut -f 1-11,13,15 > ${OUTPUT_FOLDER}File_temp2.tsv
+join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-16,18,20 > ${OUTPUT_FOLDER}File_temp2.tsv
 mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
 # [Male] Then, we map the first field, which is again located second and is moved back to first place.
-join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Male_rankings.tsv) | cut -f 1-13,15,17 > ${OUTPUT_FOLDER}File_temp2.tsv
+join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-18,20,22 > ${OUTPUT_FOLDER}File_temp2.tsv
-echo $'Source\tTarget\tSource.ggid\tTarget.ggid\tRegulation\tCategory\tInformation\tFemale.TF.logFC\tFemale.TF.adjPval\tFemale.target.logFC\tFemale.target.adjPval\tMale.TF.logFC\tMale.TF.adjPval\tMale.target.logFC\tMale.target.adjPval' > ${f2}
+echo $'Source\tTarget\tSource.ggid\tTarget.ggid\tggid\tSource.type\tTarget.type\tRegulation\tCategory\tSpecies\tInformation\tReferences\tFemale.TF.logFC\tFemale.TF.adjPval\tFemale.target.logFC\tFemale.target.adjPval\tMale.TF.logFC\tMale.TF.adjPval\tMale.target.logFC\tMale.target.adjPval' > ${f2}
-paste <(cut -f 1-7,10-11 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 8-9,14-15 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 12-13 ${OUTPUT_FOLDER}File_temp2.tsv) >> ${f2}
+paste <(cut -f 1-12,15-16 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 13-14,19-20 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 17-18 ${OUTPUT_FOLDER}File_temp2.tsv) >> ${f2}
 rm ${OUTPUT_FOLDER}File_temp*.tsv
 done
...
@@ -24,32 +24,31 @@ CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetw
 # We need to loop over all mapping files.
 rm -rf ${OUTPUT_FOLDER}*_mapping_refined.tsv
 rm -rf ${OUTPUT_FOLDER}*_mapping_help.txt
-for tag in "Female" "Male" "Gender_disease_status" "PDvsControl"
+for tag in "PDVsControl_females" "PDVsControl_males" "Gender_disease_status" "PDVsControl"
 do
 # We join back the top PI genes and the raw GeneGo mappings to keep only the real GeneDER genes since GeneGo
 # is adding some irrelevant stuff from somewhere. This also allows us to have the real inputs ids that are
 # not returned by GeneGo (despite the GUI saying otherwise).
-join -t $'\t' -a 1 -j 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_PI_top_genes.tsv) <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_mapping.tsv) > ${OUTPUT_FOLDER}${tag}_mapping_refined.tsv
+join -t $'\t' -a 1 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_${tag}_rankings_PI_top_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") > ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_refined.tsv
-# We print additional information to help the manual check that should be done after the join ot fill in the missing ids.
+# We print additional information to help the manual check that should be done after the join to fill in the missing ids.
-echo "PI genes not mapped in GeneGo:" >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
+echo "PI genes not mapped in GeneGo:" >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
-join -t $'\t' -v 1 -j 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_PI_top_genes.tsv) <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_mapping.tsv) >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
+join -t $'\t' -v 1 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_${tag}_rankings_PI_top_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
-echo "" >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
+echo "" >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
-echo "GeneGo ids not found in Geneder data:" >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
+echo "GeneGo ids not found in Geneder data:" >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
-join -t $'\t' -v 2 -j 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_PI_top_genes.tsv) <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_mapping.tsv) >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
+join -t $'\t' -v 2 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_${tag}_rankings_PI_top_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
 done
 # The extended_TF is a special case since we do not have relevant top genes based on PI.
 # We instead use all genes reported in the rankings (with pi values).
-tag="extendedTF"
+tag="extended_newTFs"
 # We join back the geneder genes (all) and the raw GeneGo mappings to keep only the real GeneDER genes since GeneGo
 # is adding some irrelevant stuff from somewhere.
-join -t $'\t' -j 1 <(cut -f 2 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_mapping.tsv) > ${OUTPUT_FOLDER}${tag}_mapping_refined.tsv
+join -t $'\t' -1 1 <(cut -f 2 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_${tag}_genego_mapping.tsv | grep -v "Network Object") > ${OUTPUT_FOLDER}genego_${tag}_genego_mapping_refined.tsv
-echo "GeneGo ids not found in Geneder data:" >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
+echo "GeneGo ids not found in Geneder data:" >> ${OUTPUT_FOLDER}genego_${tag}_genego_mapping_help.txt
-join -t $'\t' -v 2 -j 1 <(cut -f 2 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}${tag}_mapping.tsv) >> ${OUTPUT_FOLDER}${tag}_mapping_help.txt
+join -t $'\t' -v 2 -1 1 <(cut -f 2 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_${tag}_genego_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}genego_${tag}_genego_mapping_help.txt
 # Moving the slurm log file to data
 mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
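For the help files above, -v selects the unpairable lines: -v 1 prints entries of the first input with no match (PI genes not mapped in GeneGo) and -v 2 the reverse (GeneGo ids not found in the GeneDER data). Toy input, not from the commit:

join -v 1 <(printf 'A\nB\n') <(printf 'B\nC\n')   # -> A
join -v 2 <(printf 'A\nB\n') <(printf 'B\nC\n')   # -> C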
@@ -21,15 +21,17 @@ CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetw
 # Actual job in bash.
 # We first join all enrichment results across Female, Male and GDS to get a complete picture.
-join -t $'\t' -a 1 -a 2 -e "NA" -o auto -1 1 <(join -t $'\t' -a 1 -a 2 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}Female_TFactivities.tsv | grep -v Regulon) -2 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}Male_TFactivities.tsv | grep -v Regulon) | sort -t $'\t' -k1,1) -2 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}Gender_disease_status_TFactivities.tsv | grep -v Regulon) > ${OUTPUT_FOLDER}TFactivities_merged.tsv
+join -t $'\t' -a 1 -a 2 -e "NA" -o auto -1 1 <(join -t $'\t' -a 1 -a 2 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_TFactivities.tsv | grep -v Regulon) -2 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_TFactivities.tsv | grep -v Regulon) | sort -t $'\t' -k1,1) -2 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_TFactivities.tsv | grep -v Regulon) > ${OUTPUT_FOLDER}TFactivities_merged.tsv
 # Then, we map the female diff exp data (for the TF).
-join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Female_rankings.tsv) | cut -f 1-16,18,20 > ${OUTPUT_FOLDER}File_temp.tsv
+join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-16,18,20 > ${OUTPUT_FOLDER}File_temp.tsv
 mv ${OUTPUT_FOLDER}File_temp.tsv ${OUTPUT_FOLDER}TFactivities_merged.tsv
 # Then, we map the male diff exp data (for the TF).
-join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}Male_rankings.tsv) | cut -f 1-18,20,22 > ${OUTPUT_FOLDER}File_temp.tsv
+join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-18,20,22 > ${OUTPUT_FOLDER}File_temp.tsv
-mv ${OUTPUT_FOLDER}File_temp.tsv ${OUTPUT_FOLDER}TFactivities_merged.tsv
+echo $'Regulon\tF.evdc_lvl\tF.Size\tF.NES\tF.p.value\tF.FDR\tM.evdc_lvl\tM.Size\tM.NES\tM.p.value\tM.FDR\tGDS.evdc_lvl\tGDS.Size\tGDS.NES\tGDS.p.value\tGDS.FDR\tF.logFC\tF.ajdP\tM.logFC\tM.ajdP' > ${OUTPUT_FOLDER}TFactivities_merged.tsv
+cat ${OUTPUT_FOLDER}File_temp.tsv >> ${OUTPUT_FOLDER}TFactivities_merged.tsv
+rm ${OUTPUT_FOLDER}File_temp.tsv
 # Moving the slurm log file to data
 mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
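The nested join above behaves as a full outer join because both -a 1 and -a 2 are given together with -e "NA", so a regulon missing from one TF-activity table still appears in the merged output. A minimal sketch with made-up regulons (not part of the commit):

join -t $'\t' -a 1 -a 2 -e "NA" -o auto <(printf 'TF1\t1.2\n') <(printf 'TF2\t0.4\n')
# -> TF1  1.2  NA
# -> TF2  NA   0.4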
@@ -75,72 +75,92 @@ for (ii in seq_len(dim(update_info)[1])) {
 names(viper_regulon) <- paste(vp_names, vp_fullnames[2, ], sep = "_")
 rm(update_info, vp_fullnames, vp_names, ii)
-# We loop over the different files to analyse.
-for (i in seq_len(length(config$ranking_files))) {
-  # We only do the first three cases because then we do not have P values so
-  # we can not really create the "mySignature" object.
-  if (i > 4) {
+# We do all integration schemes one by one.
+for (i in seq_len(length(config$integrations))) {
+  # We get the integration name for that scheme.
+  integration <- config$integrations[[i]]
+  # We only do the enrichment if it is necessary.
+  if (integration$use_for_network == "FALSE") {
     next
   }
-  # We define the I/Os.
-  ranking_file <- paste0(input_data_dir, config$ranking_files[[i]])
-  file_prefix <- strsplit(config$ranking_files[[i]], split = "rankings.tsv")[[1]]
-  # We read the prepared ranking and keep only the value used to rank.
-  ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
-    select(Gene, ref_logFC, ranking_value)
-  rm(ranking_file)
-  # We clean genes (no duplicate entries with pipe separated gene symbols).
-  tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
-  ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
-                              row.names = NULL,
-                              stringsAsFactors = FALSE)
-  ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
-  ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
-  ranking_clean <- ranking_clean %>% arrange(desc(ranking_value))
-  rm(tmp_res, ranking)
-  # Estimate Z-score values for the GES (As in DoRothEA manual, cheeck VIPER manual for details).
-  # In our case, we compute the pseudo P values based on the pi values with 10 ^ -pi.
-  # In addition, we correct through the median logFC (since pi = -log(p) * logFC).
-  myStatistics <- matrix(ranking_clean$LFC,
-                         dimnames = list(ranking_clean$Gene, "logFC") )
-  mypPvalue <- matrix(10 ^ (-ranking_clean$ranking_value * median(abs(ranking_clean$LFC))),
-                      dimnames = list(ranking_clean$Gene, "pP.Value") )
-  mySignature <- (qnorm(mypPvalue / 2, lower.tail = FALSE) * sign(myStatistics))[, 1]
-  mySignature <- mySignature[order(mySignature, decreasing = T)]
-  rm(ranking_clean, mypPvalue, myStatistics)
-  # Estimate TF activities via Viper.
-  mrs <- msviper(ges = mySignature, regulon = viper_regulon, minsize = 4, ges.filter = F)
-  # Format the results.
-  tf_name <- sapply(strsplit(names(mrs$es$nes), split = "_"), head, 1)
-  evidence_level <- sapply(strsplit(names(mrs$es$nes), split = "_"), tail, 1)
-  TF_activities <- data.frame(Regulon = tf_name,
-                              evdc_lvl = evidence_level,
-                              Size = mrs$es$size[ names(mrs$es$nes) ],
-                              NES = mrs$es$nes,
-                              p.value = mrs$es$p.value)
-  TF_activities <- TF_activities[ order(TF_activities$p.value), ] %>%
-    filter(evdc_lvl %in% c("A", "B", "C", "D")) %>%
-    mutate(FDR = p.adjust(p.value, method = "fdr"))
-  rm(mrs, mySignature, tf_name, evidence_level)
-  # Save results
-  TF_act_fn <- paste0(output_data_dir, file_prefix, "TFactivities.tsv")
+  # We do all limma comparisons one by one.
+  for (j in seq_len(length(config$limma_analyses))) {
+    # We extract the Limma parameters.
+    limma_parameters <- config$limma_analyses[[j]]
+    # We only do the enrichment if it is necessary.
+    if (limma_parameters$use_for_network == "FALSE") {
+      next
+    }
+    # We define the I/Os.
+    analysis_prefix <- paste0(integration$name, "_", limma_parameters$name)
+    ranking_file <- paste0(input_data_dir, analysis_prefix, "_rankings.tsv")
+    # We read the prepared ranking and keep only the value used to rank.
+    ranking <- read.delim(ranking_file, stringsAsFactors = FALSE, row.names = 1) %>%
+      select(Gene, ref_logFC, ranking_value)
+    rm(ranking_file)
+    # We clean genes (no duplicate entries with pipe separated gene symbols).
+    if (sum(grepl("\\|", ranking$Gene)) > 0) {
+      tmp_res <- t(apply(ranking, 1, duplicate_row_first_cell))
+      ranking_clean <- data.frame(do.call(rbind, c(tmp_res)),
+                                  row.names = NULL,
+                                  stringsAsFactors = FALSE)
+      ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
+      ranking_clean$LFC <- as.numeric(ranking_clean$LFC)
+      ranking_clean <- ranking_clean %>% arrange(desc(ranking_value))
+      rm(tmp_res)
+    } else {
+      ranking_clean <- ranking
+      ranking_clean$ranking_value <- as.numeric(ranking_clean$ranking_value)
+      ranking_clean$LFC <- as.numeric(ranking_clean$ref_logFC)
+      ranking_clean <- ranking_clean %>% select(Gene, LFC, ranking_value) %>% arrange(desc(ranking_value))
+    }
+    rm(ranking)
+    # Estimate Z-score values for the GES (As in DoRothEA manual, cheeck VIPER manual for details).
+    # In our case, we compute the pseudo P values based on the pi values with 10 ^ -pi.
+    # In addition, we correct through the median logFC (since pi = -log(p) * logFC).
+    myStatistics <- matrix(ranking_clean$LFC,
+                           dimnames = list(ranking_clean$Gene, "logFC") )
+    mypPvalue <- matrix(10 ^ (-ranking_clean$ranking_value * median(abs(ranking_clean$LFC))),
+                        dimnames = list(ranking_clean$Gene, "pP.Value") )
+    mySignature <- (qnorm(mypPvalue / 2, lower.tail = FALSE) * sign(myStatistics))[, 1]
+    mySignature <- mySignature[order(mySignature, decreasing = T)]
+    rm(ranking_clean, mypPvalue, myStatistics)
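As an editor's restatement of the signature construction in the new code above (a paraphrase of the R lines, not part of the commit), with pi-value \pi_g and log fold change \mathrm{logFC}_g for gene g:

\tilde{p}_g = 10^{-\pi_g \cdot \operatorname{median}_h \lvert \mathrm{logFC}_h \rvert},
\qquad
z_g = \operatorname{sign}(\mathrm{logFC}_g)\, \Phi^{-1}\!\left(1 - \tilde{p}_g / 2\right)

Genes are then ranked by decreasing z_g before the msviper enrichment.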