Commit f2e78969 authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Refactoring step 04 to process all datasets at once (First part - until match)

parent 9b1ebb03
......@@ -9,6 +9,8 @@ data:
@sbatch ${CODE_FOLDER}clean_datasets.sh
check:
@sbatch ${CODE_FOLDER}check.sh
names:
@sbatch ${CODE_FOLDER}get_names.sh
ps:
@sbatch ${CODE_FOLDER}create_probelists.sh
match:
......
......@@ -71,24 +71,15 @@ for (i in seq_len(length(config$platforms))) {
} # End for each dataset.
rm(dataset_name, relevant_datasets)
# We enrich the probe identifiers with the corresponding gene names using either
# the BioConductor package or the filtered GPL file (as in 06/get_DEGs.R).
# First, from a bioconductor package (usual case).
# We enrich the probe identifiers with the corresponding gene names using
# the prepared file (based on BioMart).
unique_probe_ids <- sort(unique(probe_ids))
gene_annots_raw <- NULL
if (platform$library_name != "NA") {
gene_annots_raw <- ArrayUtils::get_gene_annots_from_package(platform$library_name,
unique_probe_ids)
} else {
# Here, we read instead the sorted GPL file for special cases with no Bioconductor library.
gpl_annot_folder <- paste0(raw_data_dir, "Platforms/")
gpl_annot_filename <- paste0(platform$geo_name, "_gene_annots.tsv")
gene_annots_raw <- ArrayUtils::get_gene_annots_from_file(gpl_annot_folder,
gpl_annot_filename,
unique_probe_ids)
rm(gpl_annot_folder, gpl_annot_filename)
}
gene_annots_filename <- paste0(platform$platform_name, "_genenames.tsv")
gene_annots_raw <- ArrayUtils::get_gene_annots_from_file(output_data_dir,
gene_annots_filename,
unique_probe_ids)
rm(gene_annots_filename)
# Then, we make sure we deal with duplicates and reorder the df to match the list we have.
# We do not use EntrezGene and EnsEMBL ids.
gene_annots <- gene_annots_raw %>%
......
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --time=0-00:03:00
#SBATCH --time=0-00:05:00
#SBATCH -p batch
#SBATCH --qos=normal
......
#!/bin/bash -l
#SBATCH -J geneder:04:getnames
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:10:00
#SBATCH -p batch
#SBATCH --qos=normal

# Purpose: for every configured platform, build a two-column TSV mapping
# probe identifiers to official gene names ("<probe_id>\t<name1|name2|...>").
# Source priority: Ensembl BioMart (usual case), then the GEO annotation
# file, then a manually curated annotation file.

echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""

# Defining global parameters.
# NOTE(review): renamed the internal variable to fix the "PLAFORM" typo;
# it is only used inside this script, so the rename is safe.
GEO_PLATFORM_FOLDER=/home/users/ltranchevent/Data/GeneDER/Original/Platforms/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/04-Prepare_datasets/

# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare

# Load configuration (defines the platforms__* bash arrays used below).
source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
create_variables ../Confs/platforms_config.yml

# Get the BioMart data, one platform at a time.
nbPlatforms=${#platforms__platform_name[@]}
for (( i = 0; i < nbPlatforms; i++ ))
do
  platformName="${platforms__platform_name[$i]}"
  platformBiomartName="${platforms__biomart_name[$i]}"
  platformGEOName="${platforms__geo_name[$i]}"
  if [ "${platformBiomartName}" != "NA" ]
  then
    # Usual case: query Ensembl BioMart for probe-id / gene-name pairs
    # (TSV, no header, unique rows). The platform-specific probe attribute
    # is spliced into the XML query.
    wget -O "${OUTPUT_FOLDER}${platformName}_genenames_raw.tsv" 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6"><Dataset name = "hsapiens_gene_ensembl" interface = "default"><Attribute name = "'"${platformBiomartName}"'"/><Attribute name = "external_gene_name"/></Dataset></Query>'
    # Drop rows with an empty probe id, then collapse the multiple gene
    # names of a given probe into one pipe-separated field.
    awk 'BEGIN{FS=OFS="\t"}{if ($1 != "") {print $0}}' "${OUTPUT_FOLDER}${platformName}_genenames_raw.tsv" | awk '{if (t[$1]) {t[$1]=t[$1]"|"$2} else {t[$1]=$2}} END{for (i in t) {if (i != "") {print i"\t"t[i]}}}' | sort -u > "${OUTPUT_FOLDER}${platformName}_genenames.tsv"
    rm "${OUTPUT_FOLDER}${platformName}_genenames_raw.tsv"
    # Short pause to avoid hammering the BioMart server between queries.
    sleep 2s
  else
    if [ "${platformGEOName}" != "NA" ]
    then
      # Fallback: use the GEO annotation file (first two columns; the
      # grep drops the header row containing "OFFICIAL").
      cut -f -2 "${GEO_PLATFORM_FOLDER}${platformGEOName}_gene_official.tsv" | grep -v OFFICIAL | sort -u > "${OUTPUT_FOLDER}${platformName}_genenames.tsv"
    else
      # Last resort: use the manually curated annotation file.
      cut -f -2 "${GEO_PLATFORM_FOLDER}${platformName}_gene_official.tsv" | grep -v OFFICIAL | sort -u > "${OUTPUT_FOLDER}${platformName}_genenames.tsv"
    fi
  fi
done

# Moving the slurm log file to data (the glob must stay unquoted to expand).
mv "${CODE_FOLDER}"slurm-*out "${OUTPUT_FOLDER}"
......@@ -130,6 +130,7 @@ tag_probe_cmplx <- function(genes, subsets_as_string, current_conflict, sep = ",
# The work is performed by looking first at each platform individually (to solve the conflicts of
# each platform). In a second step, the conflicts across different platforms are solved. Last, the
# individual clean matching are saved as well as the global matching.
# NOTE: we do not do the second step anymore. Conflicts between platforms are kept as is.
# We do all platforms iteratively.
for (i in seq_len(length(config$platforms))) {
......@@ -237,39 +238,41 @@ rm(i)
# We do an additional run of cleaning to remove the probes conflicting across datasets.
# The same rules apply but the different matches come from different datasets.
# We start by reading the data again (per type now).
deps_matching <- list()
# We loop over platforms to read again the data.
for (i in seq_len(length(config$platforms))) {
platform <- config$platforms[[i]]
deps_matching_loc_fn <- paste0(output_data_dir,
platform$platform_name,
"_probe_matching_details.tsv")
deps_matching_loc <- read.table(file = deps_matching_loc_fn,
sep = "\t",
header = TRUE,
as.is = TRUE,
row.names = 1)
# We replace the NA values.
deps_matching_loc[is.na(deps_matching_loc)] <- ""
deps_matching <- rbind(deps_matching, deps_matching_loc %>%
dplyr::mutate(source = platform$platform_name))
rm(deps_matching_loc_fn, deps_matching_loc, platform)
} # End for each platform.
rm(i)
message(paste0("[", Sys.time(), "][Combined] Data ready."))
# NOTE: we do not do the second step anymore. Conflicts between platforms are kept as is.
# # We start by reading the data again (per type now).
# deps_matching <- list()
# # We loop over platforms to read again the data.
# for (i in seq_len(length(config$platforms))) {
# platform <- config$platforms[[i]]
# deps_matching_loc_fn <- paste0(output_data_dir,
# platform$platform_name,
# "_probe_matching_details.tsv")
# deps_matching_loc <- read.table(file = deps_matching_loc_fn,
# sep = "\t",
# header = TRUE,
# as.is = TRUE,
# row.names = 1)
# # We replace the NA values.
# deps_matching_loc[is.na(deps_matching_loc)] <- ""
# deps_matching <- rbind(deps_matching, deps_matching_loc %>%
# dplyr::mutate(source = platform$platform_name))
# rm(deps_matching_loc_fn, deps_matching_loc, platform)
# } # End for each platform.
# rm(i)
# message(paste0("[", Sys.time(), "][Combined] Data ready."))
# We create the first map by looping over the rows once (now across datasets).
# The idea is to keep track of all genes (as geneset) and the associated probes.
# original geneset --> list of pipe separated probes
# Ex: ACT1A|ACT1B --> 123456_at|456789_at
# Ex: ACT1A|ACT2 --> 654321_s_at
genes_probe_matching <- vector()
tmp <- deps_matching %>%
dplyr::rowwise() %>%
dplyr::do(tmp = update_map(.$probeset, .$symbol_short))
rm(tmp)
# NOTE: We do not do anymore the second filter (across platforms)
# genes_probe_matching <- vector()
# tmp <- deps_matching %>%
# dplyr::rowwise() %>%
# dplyr::do(tmp = update_map(.$probeset, .$symbol_short))
# rm(tmp)
# We create the second map by looping over the rows once more (now across datasets).
# The idea is now to register all occurrences of all subsets.
......@@ -277,12 +280,13 @@ rm(tmp)
# Ex: ACT1A --> ACT1A|ACT1B,ACT1A|ACT2
# Ex: ACT1B --> ACT1A|ACT1B
# Ex: ACT2 --> ACT1A|ACT2
geneset_probe_matching <- vector()
tmp <- deps_matching %>%
dplyr::rowwise() %>%
dplyr::do(tmp = update_map_subsets(.$symbol_short, .$nb_symbols, .$subsets))
rm(tmp, deps_matching)
message(paste0("[", Sys.time(), "][Combined] Maps ready."))
# NOTE: We do not do anymore the second filter (across platforms)
# geneset_probe_matching <- vector()
# tmp <- deps_matching %>%
# dplyr::rowwise() %>%
# dplyr::do(tmp = update_map_subsets(.$symbol_short, .$nb_symbols, .$subsets))
# rm(tmp, deps_matching)
# message(paste0("[", Sys.time(), "][Combined] Maps ready."))
# This list will contain the global clean genes-to-probe matching data.
global_matching_data <- list()
......@@ -316,9 +320,10 @@ for (i in seq_len(length(config$platforms))) {
# We tag 1_at for removal since 2_at is better to measure a, we assume that then b must be
# measured independently via another probe or it is lost. Should 2_at not exist, we would
# keep 1_at and consider that a|b is one single entity (for now).
deps_matching <- deps_matching %>%
dplyr::rowwise() %>%
dplyr::mutate(conflicts_multiple_ds = tag_probe(symbol_short, subsets))
# NOTE: We do not do anymore the second filter (across platforms)
# deps_matching <- deps_matching %>%
# dplyr::rowwise() %>%
# dplyr::mutate(conflicts_multiple_ds = tag_probe(symbol_short, subsets))
# We remove the probes that target a set of genes if there is a probe that targets a subset,
# possibly in addition to other genes (from another array type).
......@@ -326,15 +331,17 @@ for (i in seq_len(length(config$platforms))) {
# {a|c} -> {2_at}
# Focus is on 1_at. We tag it for removal since it is not possible to disentangle
# the signal. Probe 2 is removed for the same reason (when focus is on 2_at).
deps_matching <- deps_matching %>%
dplyr::rowwise() %>%
dplyr::mutate(conflicts_multiple_ds = tag_probe_cmplx(symbol_short, subsets,
conflicts_multiple_ds))
message(paste0("[", Sys.time(), "][", platform$platform_name, "] Tagging done."))
# NOTE: We do not do anymore the second filter (across platforms)
# deps_matching <- deps_matching %>%
# dplyr::rowwise() %>%
# dplyr::mutate(conflicts_multiple_ds = tag_probe_cmplx(symbol_short, subsets,
# conflicts_multiple_ds))
# message(paste0("[", Sys.time(), "][", platform$platform_name, "] Tagging done."))
# We tag the probes with conflicts with an extra column and save the data.
deps_matching <- deps_matching %>%
dplyr::mutate(use_tag_multiple_ds = ifelse(conflicts_multiple_ds != "", FALSE, TRUE))
# NOTE: We do not do anymore the second filter (across platforms)
# deps_matching <- deps_matching %>%
# dplyr::mutate(use_tag_multiple_ds = ifelse(conflicts_multiple_ds != "", FALSE, TRUE))
write.table(deps_matching,
file = deps_matching_details_fn,
sep = "\t",
......@@ -343,10 +350,12 @@ for (i in seq_len(length(config$platforms))) {
rm(deps_matching_details_fn)
# We log some generic information about the matching.
nb_trues <- length(deps_matching$use_tag[deps_matching$use_tag == TRUE &
deps_matching$use_tag_multiple_ds == TRUE])
nb_falses <- length(deps_matching$use_tag[deps_matching$use_tag == FALSE |
deps_matching$use_tag_multiple_ds == FALSE])
# nb_trues <- length(deps_matching$use_tag[deps_matching$use_tag == TRUE &
# deps_matching$use_tag_multiple_ds == TRUE])
# nb_falses <- length(deps_matching$use_tag[deps_matching$use_tag == FALSE |
# deps_matching$use_tag_multiple_ds == FALSE])
nb_trues <- length(deps_matching$use_tag[deps_matching$use_tag == TRUE])
nb_falses <- length(deps_matching$use_tag[deps_matching$use_tag == FALSE])
perc_trues <- nb_trues / (nb_trues + nb_falses)
message(paste0("[", Sys.time(), "][", platform$platform_name, "] Keep ", nb_trues, " / ",
(nb_trues + nb_falses), " probesets (", round(perc_trues, 4) * 100, "%)."))
......@@ -354,7 +363,8 @@ for (i in seq_len(length(config$platforms))) {
# We create in addition the "gene entity to probes" map for the current array type.
deps_matching <- deps_matching %>%
dplyr::filter(use_tag == TRUE, use_tag_multiple_ds == TRUE) %>%
# dplyr::filter(use_tag == TRUE, use_tag_multiple_ds == TRUE) %>%
dplyr::filter(use_tag == TRUE) %>%
dplyr::select(probeset, SYMBOL)
deps_matching <- aggregate(deps_matching$probeset,
by = list(genes = deps_matching$SYMBOL),
......
......@@ -4,7 +4,7 @@
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 4
#SBATCH --time=0-0:15:00
#SBATCH --time=0-0:20:00
#SBATCH -p batch
#SBATCH --qos=normal
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment