Commit ae497b09 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Adapting the code to the latest changes in the previous steps (pi-values / GD...

Adapting the code to the latest changes in the previous steps (pi-values / GD rankings) and refactoring file names (outputs).
parent 243303b3
......@@ -19,7 +19,7 @@ INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
# Actual job in bash. This is a simple join between the TF-targets link database and the experimental data (top DEGs based on pi values).
# Actual job in bash. This is a simple join between the TF-targets link database and the experimental data (differentially expressed genes).
# We update the database based on the DoRothEA_unmapped_mapped_clean.tsv file.
cp ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv ${OUTPUT_FOLDER}Dorothea_temp.tsv
......@@ -35,34 +35,22 @@ done < "$input"
mv ${OUTPUT_FOLDER}Dorothea_temp.tsv ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv
# First the female data.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_internal.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_females_dorothea_GRN_internal.tsv
# Second the male data.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_internal.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_males_dorothea_GRN_internal.tsv
# Third the GDS data.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_Gender_disease_status_dorothea_GRN_internal.tsv
# Fourth, the combination of all files as one.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv
# Third, the combination of all files as one.
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_all_max-avg_all_lists_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_internal.tsv
# Fifth the PDvsControl data (gender not taken into account, as a baseline).
join -1 1 -2 2 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_rankings_PI_top_genes.tsv) <(sort -k2,2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) > ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv
paste <(cut -f 2 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv) <(cut -f 1 -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv) <(cut -f 3- -d ' ' ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended.tsv
rm ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended_raw.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_rankings_PI_top_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_PDVsControl_dorothea_GRN_internal.tsv
join -1 1 -2 1 <(sort -k1,1 ${OUTPUT_FOLDER}SNage_VSN_all_max-avg_all_lists_genes.tsv) <(sort -k1,1 ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_extended.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}SNage_all_lists_dorothea_GRN_internal.tsv
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
......@@ -34,7 +34,7 @@ grep -v "source_genesymbol" ${OUTPUT_FOLDER}Dorothea_regulons_raw.tsv | cut -f 3
# We process the files further to extract the full list of gene ids used by DoRothEA.
cat <(cut -f 1 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) <(cut -f 2 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) | grep -v "source_genesymbol" | grep -v "target_genesymbol" | sort -u > ${OUTPUT_FOLDER}Dorothea_geneids.tsv
# We also extract all gene ids used by GeneDer.
# We also extract all gene ids used in GeneDER.
cut -f 1 ${ALT_INPUT_FOLDER}Combined_probe_matching.tsv | grep -v "genes" | sort -u > ${OUTPUT_FOLDER}Geneder_geneids.tsv
# We compute the overlap, and more interestingly the genes from DoRothEA that do not match the GeneDER genes.
......@@ -71,9 +71,8 @@ cut -f 2 ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_mapped.tsv | sort -u | wc -l
wc -l ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_missing.tsv
wc -l ${OUTPUT_FOLDER}Dorothea_geneids_unmapped.tsv
# Build the final clean map (that would still need to be check manually).
# Build the final clean map (that would still need to be checked manually).
join -1 2 -2 1 ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_mapped.tsv <(comm -23 <(cut -f 2 ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_mapped.tsv | sort | uniq -c | sort -g | grep '\s1\s' | sed -r 's/1 /1\t/g' | cut -f 2) ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_missing.tsv) | sed -r 's/ /\t/g' > ${OUTPUT_FOLDER}Dorothea_geneids_unmapped_mapped_clean.tsv
# Translate every DoRothEA gene id through the clean map: ids found in the map are replaced by the
# map's second column, ids absent from it are kept unchanged (-a 1 emits them as a single field,
# which `cut -f 2` passes through as-is because the line contains no tab).
# The separator must be a real tab ($'\t'); the former $'t' made join split lines on the letter "t".
# NOTE(review): join expects both inputs sorted on the join field; the clean map is not re-sorted here - confirm.
join -t $'\t' -a 1 -1 1 -2 1 "${OUTPUT_FOLDER}Dorothea_geneids.tsv" "${OUTPUT_FOLDER}Dorothea_geneids_unmapped_mapped_clean.tsv" | cut -f 2 > "${OUTPUT_FOLDER}Dorothea_geneids_automatic.tsv"
# Warning for user.
......
......@@ -20,14 +20,20 @@ OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
# Actual job in bash.
# For each comparison of interest, we select the top 500 genes max, based on the pi-values.
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_PDVsControl_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_PDVsControl_rankings_PI_top_genes.tsv
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv
awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_Gender_disease_status_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv
# For each comparison of interest, we select the genes based on the pi-values.
awk '{if ($4 <= 0.05) print $0}' ${INPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings.tsv | sort -k4,4gr | cut -f 1 | sort -u > ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings_genes.tsv
awk '{if ($4 <= 0.05) print $0}' ${INPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings.tsv | sort -k4,4gr | cut -f 1 | sort -u > ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings_genes.tsv
# Previously, the selection was based on the PI values.
#awk '{if ($5 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings.tsv | sort -k5,5gr | cut -f 1 | sort -u > ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings_PI_top_genes.tsv
#awk '{if ($5 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings.tsv | sort -k5,5gr | cut -f 1 | sort -u > ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings_PI_top_genes.tsv
##awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_PDVsControl_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_PDVsControl_rankings_PI_top_genes.tsv
##awk '{if ($9 >= -log(0.05)/log(10)*1.5) print $0}' ${INPUT_FOLDER}SNage_Gender_disease_status_rankings.tsv | sort -k9,9gr | cut -f 2 | sort -u > ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv
# We also concatenate all gender relevant files to create the integrated list.
# We do NOT include the PDvscontrol file as this one represents more or less the baseline.
cat ${OUTPUT_FOLDER}SNage_PDVsControl_females_rankings_PI_top_genes.tsv ${OUTPUT_FOLDER}SNage_PDVsControl_males_rankings_PI_top_genes.tsv ${OUTPUT_FOLDER}SNage_Gender_disease_status_rankings_PI_top_genes.tsv | sort -u > ${OUTPUT_FOLDER}SNage_all_lists_PI_top_genes.tsv
cat ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings_genes.tsv ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings_genes.tsv | sort -u > ${OUTPUT_FOLDER}SNage_VSN_all_max-avg_all_lists_genes.tsv
# Previously, the integrated list was based on the PI values.
#cat ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings_PI_top_genes.tsv ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings_PI_top_genes.tsv | sort -u > ${OUTPUT_FOLDER}SNage_VSN_all_lists_PI_top_genes.tsv
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
......@@ -29,18 +29,18 @@ do
# First, we map the DoRothEA data, we add female diff data and then male diff data.
# [Female] We start by mapping the second field, which then moves first.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${f}) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-5,7,9 > ${OUTPUT_FOLDER}File_temp.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${f}) -2 1 <(sort -t $'\t' -k1,1 ${INPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings.tsv) | cut -f 1-5,6,8 > ${OUTPUT_FOLDER}File_temp.tsv
# [Female] Then, we map the first field, which is now located second and is moved back to first place.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-7,9,11 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 1 <(sort -t $'\t' -k1,1 ${INPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings.tsv) | cut -f 1-7,8,10 > ${OUTPUT_FOLDER}File_temp2.tsv
mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
# [Male] We continue by mapping the second field again (now for male), which then moves first.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-9,11,13 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 1 <(sort -t $'\t' -k1,1 ${INPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings.tsv) | cut -f 1-9,10,12 > ${OUTPUT_FOLDER}File_temp2.tsv
mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
# [Male] Then, we map the first field, which is again located second and is moved back to first place.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-11,13,15 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 1 <(sort -t $'\t' -k1,1 ${INPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings.tsv) | cut -f 1-11,12,14 > ${OUTPUT_FOLDER}File_temp2.tsv
paste <(head -n 1 ${OUTPUT_FOLDER}Dorothea_regulons_clean.tsv) <(echo $'Female.TF.logFC\tFemale.TF.adjPval\tFemale.target.logFC\tFemale.target.adjPval\tMale.TF.logFC\tMale.TF.adjPval\tMale.target.logFC\tMale.target.adjPval') > ${f2}
paste <(cut -f 1-5,8-9 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 6-7,12-13 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 10-11 ${OUTPUT_FOLDER}File_temp2.tsv) >> ${f2}
rm ${OUTPUT_FOLDER}File_temp*.tsv
......@@ -48,8 +48,7 @@ done
# Now, we take care of GeneGo.
# We first create the ALL_lists mapping by simple concatenation (was not done before since we were waiting for the manual curation to take place).
cat ${OUTPUT_FOLDER}SNage_PDVsControl_females_genego_mapping_refined.tsv ${OUTPUT_FOLDER}SNage_PDVsControl_males_genego_mapping_refined.tsv ${OUTPUT_FOLDER}SNage_Gender_disease_status_genego_mapping_refined.tsv | sort -u > ${OUTPUT_FOLDER}SNage_all_lists_genego_mapping_refined.tsv
cat ${OUTPUT_FOLDER}SNage_all_lists_genego_mapping_refined.tsv ${OUTPUT_FOLDER}SNage_PDVsControl_genego_mapping_refined.tsv ${OUTPUT_FOLDER}genego_extended_newTFs_genego_mapping_refined.tsv | sort -u > ${OUTPUT_FOLDER}genego_mapping_refined.tsv
cat ${OUTPUT_FOLDER}SNage_PDVsControl_females_genego_mapping_refined.tsv ${OUTPUT_FOLDER}SNage_PDVsControl_males_genego_mapping_refined.tsv ${OUTPUT_FOLDER}SNage_PDVsControl_all_lists_genego_mapping_refined.tsv ${OUTPUT_FOLDER}genego_extended_newTFs_mapping_refined.tsv | sort -u > ${OUTPUT_FOLDER}genego_mapping_refined.tsv
# Second, for the GeneGo files, we need to loop over the relevant GRN files again.
# Note, we also process the extended networks despite the fact that we did not have the GeneGo ids for all the new genes that GeneGo includes.
......@@ -72,19 +71,19 @@ do
# We then add female diff data and then male diff data.
# [Female] We start by mapping the second field, which then moves first.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-12,14,16 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 1 <(sort -t $'\t' -k1,1 ${INPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings.tsv) | cut -f 1-12,13,15 > ${OUTPUT_FOLDER}File_temp2.tsv
mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
# [Female] Then, we map the first field, which is now located second and is moved back to first place.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-14,16,18 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 1 <(sort -t $'\t' -k1,1 ${INPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings.tsv) | cut -f 1-14,15,17 > ${OUTPUT_FOLDER}File_temp2.tsv
mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
# [Male] We continue by mapping the second field again (now for male), which then moves first.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-16,18,20 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 1 <(sort -t $'\t' -k1,1 ${INPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings.tsv) | cut -f 1-16,17,19 > ${OUTPUT_FOLDER}File_temp2.tsv
mv ${OUTPUT_FOLDER}File_temp2.tsv ${OUTPUT_FOLDER}File_temp.tsv
# [Male] Then, we map the first field, which is again located second and is moved back to first place.
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-18,20,22 > ${OUTPUT_FOLDER}File_temp2.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}File_temp.tsv) -2 1 <(sort -t $'\t' -k1,1 ${INPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings.tsv) | cut -f 1-18,19,21 > ${OUTPUT_FOLDER}File_temp2.tsv
echo $'Source\tTarget\tSource.ggid\tTarget.ggid\tggid\tSource.type\tTarget.type\tRegulation\tCategory\tSpecies\tInformation\tReferences\tFemale.TF.logFC\tFemale.TF.adjPval\tFemale.target.logFC\tFemale.target.adjPval\tMale.TF.logFC\tMale.TF.adjPval\tMale.target.logFC\tMale.target.adjPval' > ${f2}
paste <(cut -f 1-12,15-16 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 13-14,19-20 ${OUTPUT_FOLDER}File_temp2.tsv) <(cut -f 17-18 ${OUTPUT_FOLDER}File_temp2.tsv) >> ${f2}
......
......@@ -15,29 +15,34 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
GG_DATA_FOLDER=/home/users/ltranchevent/Data/GeneDER/Original/Else/GeneGo/
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/17/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
# Actual job in bash.
# Copy the GeneGo data (must have been run manually before).
cp -rf ${GG_DATA_FOLDER}* ${OUTPUT_FOLDER}
cp ${OUTPUT_FOLDER}SNage_VSN_all_max-avg_all_lists_genes.tsv ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_all_lists_max-avg_all_pivalue_rankings_genes.tsv
# We need to loop over all mapping files.
rm -rf ${OUTPUT_FOLDER}*_mapping_refined.tsv
rm -rf ${OUTPUT_FOLDER}*_mapping_help.txt
for tag in "PDVsControl_females" "PDVsControl_males" "Gender_disease_status" "PDVsControl"
for tag in "PDVsControl_females" "PDVsControl_males" "PDVsControl_all_lists"
do
# We join back the top PI genes and the raw GeneGo mappings to keep only the real GeneDER genes since GeneGo
# is adding some irrelevant stuff from somewhere. This also allows us to have the real inputs ids that are
# not returned by GeneGo (despite the GUI saying otherwise).
join -t $'\t' -a 1 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_${tag}_rankings_PI_top_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") > ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_refined.tsv
join -t $'\t' -a 1 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_VSN_${tag}_max-avg_all_pivalue_rankings_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") > ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_refined.tsv
# We print additional information to help the manual check that should be done after the join to fill in the missing ids.
echo "PI genes not mapped in GeneGo:" >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
join -t $'\t' -v 1 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_${tag}_rankings_PI_top_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
join -t $'\t' -v 1 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_VSN_${tag}_max-avg_all_pivalue_rankings_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
echo "" >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
echo "GeneGo ids not found in Geneder data:" >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
join -t $'\t' -v 2 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_${tag}_rankings_PI_top_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
join -t $'\t' -v 2 -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_VSN_${tag}_max-avg_all_pivalue_rankings_genes.tsv) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}SNage_${tag}_genego_mapping_help.txt
done
# The extended_TF is a special case since we do not have relevant top genes based on PI.
......@@ -46,9 +51,9 @@ tag="extended_newTFs"
# We join back the geneder genes (all) and the raw GeneGo mappings to keep only the real GeneDER genes since GeneGo
# is adding some irrelevant stuff from somewhere.
join -t $'\t' -1 1 <(cut -f 2 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_${tag}_genego_mapping.tsv | grep -v "Network Object") > ${OUTPUT_FOLDER}genego_${tag}_genego_mapping_refined.tsv
echo "GeneGo ids not found in Geneder data:" >> ${OUTPUT_FOLDER}genego_${tag}_genego_mapping_help.txt
join -t $'\t' -v 2 -1 1 <(cut -f 2 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_${tag}_genego_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}genego_${tag}_genego_mapping_help.txt
join -t $'\t' -1 1 <(cut -f 1 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_${tag}_mapping.tsv | grep -v "Network Object") > ${OUTPUT_FOLDER}genego_${tag}_mapping_refined.tsv
echo "GeneGo ids not found in Geneder data:" >> ${OUTPUT_FOLDER}genego_${tag}_mapping_help.txt
join -t $'\t' -v 2 -1 1 <(cut -f 1 ${INPUT_FOLDER}*rankings.tsv | grep -v Gene | sort -u -k1,1) -2 2 <(sort -t $'\t' -k2,2 ${OUTPUT_FOLDER}genego_${tag}_mapping.tsv | grep -v "Network Object") >> ${OUTPUT_FOLDER}genego_${tag}_mapping_help.txt
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
\ No newline at end of file
......@@ -20,16 +20,16 @@ OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/18/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/18-RegulatoryNetwork_all/
# Actual job in bash.
# We first join all enrichment results across Female, Male and GDS to get a complete picture.
join -t $'\t' -a 1 -a 2 -e "NA" -o auto -1 1 <(join -t $'\t' -a 1 -a 2 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_females_TFactivities.tsv | grep -v Regulon) -2 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_PDVsControl_males_TFactivities.tsv | grep -v Regulon) | sort -t $'\t' -k1,1) -2 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}SNage_Gender_disease_status_TFactivities.tsv | grep -v Regulon) > ${OUTPUT_FOLDER}TFactivities_merged.tsv
# We first join all enrichment results across the female and male analyses to get a complete picture.
join -t $'\t' -a 1 -a 2 -e "NA" -o auto -1 5 <(sort -t $'\t' -k5,5 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_TFactivities.tsv | grep -v Regulon) -2 5 <(sort -t $'\t' -k5,5 ${OUTPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_TFactivities.tsv | grep -v Regulon) | sort -t $'\t' -k1,1 > ${OUTPUT_FOLDER}TFactivities_merged.tsv
# Then, we map the female diff exp data (for the TF).
join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_females_rankings.tsv) | cut -f 1-16,18,20 > ${OUTPUT_FOLDER}File_temp.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 1 <(sort -t $'\t' -k1,1 ${INPUT_FOLDER}SNage_VSN_PDVsControl_females_max-avg_all_pivalue_rankings.tsv) | cut -f 1-19,20,22 > ${OUTPUT_FOLDER}File_temp.tsv
mv ${OUTPUT_FOLDER}File_temp.tsv ${OUTPUT_FOLDER}TFactivities_merged.tsv
# Then, we map the male diff exp data (for the TF).
join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 2 <(sort -t $'\t' -k2,2 ${INPUT_FOLDER}SNage_PDVsControl_males_rankings.tsv) | cut -f 1-18,20,22 > ${OUTPUT_FOLDER}File_temp.tsv
echo $'Regulon\tF.evdc_lvl\tF.Size\tF.NES\tF.p.value\tF.FDR\tM.evdc_lvl\tM.Size\tM.NES\tM.p.value\tM.FDR\tGDS.evdc_lvl\tGDS.Size\tGDS.NES\tGDS.p.value\tGDS.FDR\tF.logFC\tF.ajdP\tM.logFC\tM.ajdP' > ${OUTPUT_FOLDER}TFactivities_merged.tsv
join -t $'\t' -a 1 -e "NA" -o auto -1 1 <(sort -t $'\t' -k1,1 ${OUTPUT_FOLDER}TFactivities_merged.tsv) -2 1 <(sort -t $'\t' -k1,1 ${INPUT_FOLDER}SNage_VSN_PDVsControl_males_max-avg_all_pivalue_rankings.tsv) | cut -f 1-21,22,24 > ${OUTPUT_FOLDER}File_temp.tsv
echo $'Regulon\tF.nb_pos\tF.nb_neg\tF.pc_pos\tF.pc_neg\tF.evdc_lvl\tF.Size\tF.NES\tF.p.value\tF.FDR\tM.nb_pos\tM.nb_neg\tM.pc_pos\tM.pc_neg\tM.evdc_lvl\tM.Size\tM.NES\tM.p.value\tM.FDR\tF.logFC\tF.ajdP\tM.logFC\tM.ajdP' > ${OUTPUT_FOLDER}TFactivities_merged.tsv
cat ${OUTPUT_FOLDER}File_temp.tsv >> ${OUTPUT_FOLDER}TFactivities_merged.tsv
rm ${OUTPUT_FOLDER}File_temp.tsv
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment