Commit f2e78969 authored Nov 10, 2020 by Leon-Charles Tranchevent
Refactoring step 04 to process all datasets at once (First part - until match)
parent 9b1ebb03
Changes 6
04-Prepare_datasets/Makefile
@@ -9,6 +9,8 @@ data:
 	@sbatch ${CODE_FOLDER}clean_datasets.sh
 check:
 	@sbatch ${CODE_FOLDER}check.sh
+names:
+	@sbatch ${CODE_FOLDER}get_names.sh
 ps:
 	@sbatch ${CODE_FOLDER}create_probelists.sh
 match:
04-Prepare_datasets/create_probelists.R
@@ -71,24 +71,15 @@ for (i in seq_len(length(config$platforms))) {
 } # End for each dataset.
 rm(dataset_name, relevant_datasets)
 
-# We enrich the probe identifiers with the corresponding gene names using either
-# the BioConductor package or the filtered GPL file (as in 06/get_DEGs.R).
-# First, from a bioconductor package (usual case).
+# We enrich the probe identifiers with the corresponding gene names using
+# the prepared file (based on BioMart).
 unique_probe_ids <- sort(unique(probe_ids))
-gene_annots_raw <- NULL
-if (platform$library_name != "NA") {
-  gene_annots_raw <- ArrayUtils::get_gene_annots_from_package(platform$library_name,
-                                                              unique_probe_ids)
-} else {
-  # Here, we read instead the sorted GPL file for special cases with no Bioconductor library.
-  gpl_annot_folder <- paste0(raw_data_dir, "Platforms/")
-  gpl_annot_filename <- paste0(platform$geo_name, "_gene_annots.tsv")
-  gene_annots_raw <- ArrayUtils::get_gene_annots_from_file(gpl_annot_folder,
-                                                           gpl_annot_filename,
-                                                           unique_probe_ids)
-  rm(gpl_annot_folder, gpl_annot_filename)
-}
+gene_annots_filename <- paste0(platform$platform_name, "_genenames.tsv")
+gene_annots_raw <- ArrayUtils::get_gene_annots_from_file(output_data_dir,
+                                                         gene_annots_filename,
+                                                         unique_probe_ids)
+rm(gene_annots_filename)
 
 # Then, we make sure we deal with duplicates and reorder the df to match the list we have.
 # We do not use EntrezGene and EnsEMBL ids.
 gene_annots <- gene_annots_raw %>%
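Review note: the prepared file read here is the <platform>_genenames.tsv table produced by get_names.sh (added below in this commit): one probe identifier per row, with all matching gene symbols collapsed into a single pipe-separated field. A minimal R sketch of that shape, using made-up probe identifiers and column labels; this is not the repository's ArrayUtils::get_gene_annots_from_file implementation:

# Hypothetical illustration of the "<platform>_genenames.tsv" layout:
# one probe per row, gene symbols joined by "|".
gene_annots_demo <- data.frame(
  PROBE  = c("1007_s_at", "1053_at", "121_at"),
  SYMBOL = c("DDR1|MIR4640", "RFC2", "PAX8"),
  stringsAsFactors = FALSE
)
# Restricting the annotation to the probes present in the expression data,
# similar in spirit to passing unique_probe_ids to the helper above.
unique_probe_ids_demo <- c("1007_s_at", "121_at")
gene_annots_demo[gene_annots_demo$PROBE %in% unique_probe_ids_demo, ]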
04-Prepare_datasets/create_probelists.sh
@@ -4,7 +4,7 @@
 #SBATCH --mail-user=leon-charles.tranchevent@uni.lu
 #SBATCH -N 1
 #SBATCH -n 4
-#SBATCH --time=0-00:03:00
+#SBATCH --time=0-00:05:00
 #SBATCH -p batch
 #SBATCH --qos=normal
04-Prepare_datasets/get_names.sh 0 → 100755 (new file)
#!/bin/bash -l
#SBATCH -J geneder:04:getnames
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH -n 2
#SBATCH --time=0-00:10:00
#SBATCH -p batch
#SBATCH --qos=normal

echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""

# Defining global parameters.
GEO_PLAFORM_FOLDER=/home/users/ltranchevent/Data/GeneDER/Original/Platforms/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/04-Prepare_datasets/

# Loading modules.
module load lang/R/3.6.0-foss-2019a-bare

# Load configuration
source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
create_variables ../Confs/platforms_config.yml

# Get the biomart data.
nbPlatforms=${#platforms__platform_name[@]}
for ((i=0; i<$nbPlatforms; i++))
do
  platformName=${platforms__platform_name[$i]}
  platformBiomartName=${platforms__biomart_name[$i]}
  platformGEOName=${platforms__geo_name[$i]}
  if [ "${platformBiomartName}" != "NA" ]
  then
    # Get the official gene names.
    wget -O ${OUTPUT_FOLDER}${platformName}_genenames_raw.tsv 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6"><Dataset name = "hsapiens_gene_ensembl" interface = "default"><Attribute name = "'${platformBiomartName}'"/><Attribute name = "external_gene_name"/></Dataset></Query>'
    awk 'BEGIN{FS=OFS="\t"}{if ($1 != "") {print $0}}' ${OUTPUT_FOLDER}${platformName}_genenames_raw.tsv | awk '{if (t[$1]) {t[$1]=t[$1]"|"$2} else {t[$1]=$2}} END{for (i in t) {if (i != "") {print i"\t"t[i]}}}' | sort -u > ${OUTPUT_FOLDER}${platformName}_genenames.tsv
    rm ${OUTPUT_FOLDER}${platformName}_genenames_raw.tsv
    sleep 2s
  else
    if [ "${platformGEOName}" != "NA" ]
    then
      # We use the GEO data
      cut -f -2 ${GEO_PLAFORM_FOLDER}${platformGEOName}_gene_official.tsv | grep -v OFFICIAL | sort -u > ${OUTPUT_FOLDER}${platformName}_genenames.tsv
    else
      # We use manually curated data.
      cut -f -2 ${GEO_PLAFORM_FOLDER}${platformName}_gene_official.tsv | grep -v OFFICIAL | sort -u > ${OUTPUT_FOLDER}${platformName}_genenames.tsv
    fi
  fi
done

# Moving the slurm log file to data
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
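Review note: the second awk one-liner above aggregates the BioMart output per probe, so that a probe returned with several gene names ends up on a single line with the names joined by "|" (row uniqueness is already requested via uniqueRows = "1" in the query). A rough R equivalent on made-up identifiers, purely to illustrate the transformation; this is not code from the repository:

# Hypothetical BioMart-style output: one (probe, symbol) pair per row.
biomart_raw <- data.frame(
  probe  = c("1007_s_at", "1007_s_at", "1053_at"),
  symbol = c("DDR1", "MIR4640", "RFC2"),
  stringsAsFactors = FALSE
)
# Collapse to one row per probe with pipe-separated symbols,
# mirroring what the awk dictionary t[$1] accumulates.
aggregate(symbol ~ probe, data = biomart_raw,
          FUN = function(x) paste(x, collapse = "|"))
# -> 1007_s_at  DDR1|MIR4640
# -> 1053_at    RFC2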
04-Prepare_datasets/match_probes.R
@@ -130,6 +130,7 @@ tag_probe_cmplx <- function(genes, subsets_as_string, current_conflict, sep = ",
 # The work is performed by looking first at each platform individually (to solve the conflicts of
 # each platform). In a second step, the conflicts across different platforms are solved. Last, the
 # individual clean matching are saved as well as the global matching.
+# NOTE: we do not do the second step anymore. Conflicts between platforms are kept as is.
 # We do all platforms iteratively.
 for (i in seq_len(length(config$platforms))) {
@@ -237,39 +238,41 @@ rm(i)
 # We do an additional run of cleaning to remove the probes conflicting across datasets.
 # The same rules apply but the different matches come from different datasets.
-# We start by reading the data again (per type now).
-deps_matching <- list()
-# We loop over platforms to read again the data.
-for (i in seq_len(length(config$platforms))) {
-  platform <- config$platforms[[i]]
-  deps_matching_loc_fn <- paste0(output_data_dir,
-                                 platform$platform_name,
-                                 "_probe_matching_details.tsv")
-  deps_matching_loc <- read.table(file = deps_matching_loc_fn,
-                                  sep = "\t",
-                                  header = TRUE,
-                                  as.is = TRUE,
-                                  row.names = 1)
-  # We replace the NA values.
-  deps_matching_loc[is.na(deps_matching_loc)] <- ""
-  deps_matching <- rbind(deps_matching, deps_matching_loc %>%
-                           dplyr::mutate(source = platform$platform_name))
-  rm(deps_matching_loc_fn, deps_matching_loc, platform)
-} # End for each platform.
-rm(i)
-message(paste0("[", Sys.time(), "][Combined] Data ready."))
+# NOTE: we do not do the second step anymore. Conflicts between platforms are kept as is.
+# # We start by reading the data again (per type now).
+# deps_matching <- list()
+# # We loop over platforms to read again the data.
+# for (i in seq_len(length(config$platforms))) {
+#   platform <- config$platforms[[i]]
+#   deps_matching_loc_fn <- paste0(output_data_dir,
+#                                  platform$platform_name,
+#                                  "_probe_matching_details.tsv")
+#   deps_matching_loc <- read.table(file = deps_matching_loc_fn,
+#                                   sep = "\t",
+#                                   header = TRUE,
+#                                   as.is = TRUE,
+#                                   row.names = 1)
+#   # We replace the NA values.
+#   deps_matching_loc[is.na(deps_matching_loc)] <- ""
+#   deps_matching <- rbind(deps_matching, deps_matching_loc %>%
+#                            dplyr::mutate(source = platform$platform_name))
+#   rm(deps_matching_loc_fn, deps_matching_loc, platform)
+# } # End for each platform.
+# rm(i)
+# message(paste0("[", Sys.time(), "][Combined] Data ready."))
 
 # We create the first map by looping over the rows once (now accross datasets).
 # The idea is to keep track of all genes (as geneset) and the associated probes.
 # original geneset --> list of pipe separated probes
 # Ex: ACT1A|ACT1B --> 123456_at|456789_at
 # Ex: ACT1A|ACT2 --> 654321_s_at
-genes_probe_matching <- vector()
-tmp <- deps_matching %>%
-  dplyr::rowwise() %>%
-  dplyr::do(tmp = update_map(.$probeset, .$symbol_short))
-rm(tmp)
+# NOTE: We do not do anymore the second filter (across platforms)
+# genes_probe_matching <- vector()
+# tmp <- deps_matching %>%
+#   dplyr::rowwise() %>%
+#   dplyr::do(tmp = update_map(.$probeset, .$symbol_short))
+# rm(tmp)
 # We create the second map by looping over the rows once more (now across datasets).
 # The idea is now to register all occurences of all subsets.
@@ -277,12 +280,13 @@ rm(tmp)
 # Ex: ACT1A --> ACT1A|ACT1B,ACT1A|ACT2
 # Ex: ACT1B --> ACT1A|ACT1B
 # Ex: ACT2 --> ACT1A|ACT2
-geneset_probe_matching <- vector()
-tmp <- deps_matching %>%
-  dplyr::rowwise() %>%
-  dplyr::do(tmp = update_map_subsets(.$symbol_short, .$nb_symbols, .$subsets))
-rm(tmp, deps_matching)
-message(paste0("[", Sys.time(), "][Combined] Maps ready."))
+# NOTE: We do not do anymore the second filter (across platforms)
+# geneset_probe_matching <- vector()
+# tmp <- deps_matching %>%
+#   dplyr::rowwise() %>%
+#   dplyr::do(tmp = update_map_subsets(.$symbol_short, .$nb_symbols, .$subsets))
+# rm(tmp, deps_matching)
+# message(paste0("[", Sys.time(), "][Combined] Maps ready."))
 
 # This list will contain the global clean genes-to-probe matching data.
 global_matching_data <- list()
@@ -316,9 +320,10 @@ for (i in seq_len(length(config$platforms))) {
   # We tag 1_at for removal since 2_at is better to measure a, we assume that then b must be
   # measured independently via another probe or it is lost. Should 2_at not exist, we would
   # keep 1_at and consider that a|b is one single entity (for now).
-  deps_matching <- deps_matching %>%
-    dplyr::rowwise() %>%
-    dplyr::mutate(conflicts_multiple_ds = tag_probe(symbol_short, subsets))
+  # NOTE: We do not do anymore the second filter (across platforms)
+  # deps_matching <- deps_matching %>%
+  #   dplyr::rowwise() %>%
+  #   dplyr::mutate(conflicts_multiple_ds = tag_probe(symbol_short, subsets))
 
   # We remove the probes that target a set of genes if there is a probe that targets a subset,
   # possibly in addition to other genes (from another array type).
@@ -326,15 +331,17 @@ for (i in seq_len(length(config$platforms))) {
   # {a|c} -> {2_at}
   # Focus is on 1_at. We tag it for removal since it is not possible to disentangle
   # the signal. Probe 2 is removed for the same reason (when focus is on 2_at).
-  deps_matching <- deps_matching %>%
-    dplyr::rowwise() %>%
-    dplyr::mutate(conflicts_multiple_ds = tag_probe_cmplx(symbol_short, subsets,
-                                                          conflicts_multiple_ds))
-  message(paste0("[", Sys.time(), "][", platform$platform_name, "] Tagging done."))
+  # NOTE: We do not do anymore the second filter (across platforms)
+  # deps_matching <- deps_matching %>%
+  #   dplyr::rowwise() %>%
+  #   dplyr::mutate(conflicts_multiple_ds = tag_probe_cmplx(symbol_short, subsets,
+  #                                                         conflicts_multiple_ds))
+  # message(paste0("[", Sys.time(), "][", platform$platform_name, "] Tagging done."))
 
   # We tag the probes with conflicts with an extra column and save the data.
-  deps_matching <- deps_matching %>%
-    dplyr::mutate(use_tag_multiple_ds = ifelse(conflicts_multiple_ds != "", FALSE, TRUE))
+  # NOTE: We do not do anymore the second filter (across platforms)
+  # deps_matching <- deps_matching %>%
+  #   dplyr::mutate(use_tag_multiple_ds = ifelse(conflicts_multiple_ds != "", FALSE, TRUE))
   write.table(deps_matching,
               file = deps_matching_details_fn,
               sep = "\t",
@@ -343,10 +350,12 @@ for (i in seq_len(length(config$platforms))) {
   rm(deps_matching_details_fn)
 
   # We log some generic informations about the matching.
-  nb_trues <- length(deps_matching$use_tag[deps_matching$use_tag == TRUE &
-                                             deps_matching$use_tag_multiple_ds == TRUE])
-  nb_falses <- length(deps_matching$use_tag[deps_matching$use_tag == FALSE |
-                                              deps_matching$use_tag_multiple_ds == FALSE])
+  # nb_trues <- length(deps_matching$use_tag[deps_matching$use_tag == TRUE &
+  #                                            deps_matching$use_tag_multiple_ds == TRUE])
+  # nb_falses <- length(deps_matching$use_tag[deps_matching$use_tag == FALSE |
+  #                                             deps_matching$use_tag_multiple_ds == FALSE])
+  nb_trues <- length(deps_matching$use_tag[deps_matching$use_tag == TRUE])
+  nb_falses <- length(deps_matching$use_tag[deps_matching$use_tag == FALSE])
   perc_trues <- nb_trues / (nb_trues + nb_falses)
   message(paste0("[", Sys.time(), "][", platform$platform_name, "] Keep ", nb_trues, " / ",
                  (nb_trues + nb_falses), " probesets (", round(perc_trues, 4) * 100, "%)."))
@@ -354,7 +363,8 @@ for (i in seq_len(length(config$platforms))) {
 
   # We create in addition the "gene entity to probes" map for the current array type.
   deps_matching <- deps_matching %>%
-    dplyr::filter(use_tag == TRUE, use_tag_multiple_ds == TRUE) %>%
+    # dplyr::filter(use_tag == TRUE, use_tag_multiple_ds == TRUE) %>%
+    dplyr::filter(use_tag == TRUE) %>%
    dplyr::select(probeset, SYMBOL)
   deps_matching <- aggregate(deps_matching$probeset,
                              by = list(genes = deps_matching$SYMBOL),
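Review note: the per-platform tagging that remains active follows the subset rule spelled out in the comments above: a probe measuring {a|b} is tagged for removal when another probe measures the subset {a} more specifically. A toy R sketch of that rule with made-up probe names, only to illustrate the idea; it is not the repository's tag_probe() / tag_probe_cmplx() implementation:

# Hypothetical probe-to-genes map.
probes <- list("1_at" = c("a", "b"),
               "2_at" = c("a"))
# A probe is tagged for removal if some other probe targets a strict subset of its genes.
tag_for_removal <- function(probes) {
  sapply(names(probes), function(p) {
    others <- probes[setdiff(names(probes), p)]
    any(vapply(others,
               function(g) all(g %in% probes[[p]]) && length(g) < length(probes[[p]]),
               logical(1)))
  })
}
tag_for_removal(probes)
# -> 1_at TRUE (removed, since 2_at measures gene a specifically), 2_at FALSE (kept)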
04-Prepare_datasets/match_probes.sh
@@ -4,7 +4,7 @@
 #SBATCH --mail-user=leon-charles.tranchevent@uni.lu
 #SBATCH -N 1
 #SBATCH -n 4
-#SBATCH --time=0-0:15:00
+#SBATCH --time=0-0:20:00
 #SBATCH -p batch
 #SBATCH --qos=normal