Commit e97aa7ac authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Added scripts to match the array probes to genes.

parent 2844b9fb
......@@ -120,6 +120,7 @@ get_scan_data <- function(dataset_name, data_dir, verbose = TRUE) {
collapser <- function(x){
x %>%
unique %>%
sort %>%
paste(collapse = "|")
}
......
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
clean:
@rm -rf *~
clean_outputs:
@rm -rf ${OUTPUT_FOLDER}/*
run:
@Rscript --vanilla ${CODE_FOLDER}/match_probes.R > ${OUTPUT_FOLDER}/run_log.out 2> ${OUTPUT_FOLDER}/run_log.err
# Objectives
The objectives of this step is to integrate the results of the differential expression analysis across several datasets in order to identify similarities and overlaps and to identify robust DEGs.
# Details and instructions
The datasets / meta-datasets are first summarized at the gene level (limma analyses are performed at the probe level). Conflicts and non unique mappings are handled to create a unique list of DEGS.
```
make clean_outputs
make summarize
```
The results are list of DEGs (instead of differentially expressed probes).
# Prerequisites
A prerequisite is to have the results of the limma analysis for all datasets / meta-datasets (Step 05).
local_input_data_dir: !!str '05/'
local_data_dir: !!str '06/'
local_code_dir: !!str '06-Data_integration/'
matching:
- ['GSE20141', 'HGU133Plus2']
- ['GSE20163', 'HGU133A']
- ['E-MEXP-1416', 'U133_X3P']
analyses:
- ['GSE20141', 'GSE20163', 'GSE20164', 'GSE20292', 'GSE7307', 'GSE7621', 'GSE8397', 'Simunovic', 'E-MEXP-1416']
- ['HG-U133A', 'HG-U133_Plus_2', 'E-MEXP-1416']
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("tidyverse")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
global_config <- yaml::read_yaml("../global_config.yml")
local_config <- yaml::read_yaml("./local_config.yml")
input_data_dir <- paste0(global_config$global_data_dir, local_config$local_input_data_dir)
output_data_dir <- paste0(global_config$global_data_dir, local_config$local_data_dir)
message(paste0("[", Sys.time(), "] Configuration read."))
# ================================================================================================
# Functions
# ================================================================================================
#' Function to get all the subsets of size n from an array.
#'
#' @param array The array for which the subsets are needed.
#' @param n The size of the subsets.
#' @param current An array used for internal computation (for recurrence).
#' @return A two-dimensional table that contains the subsets.
subsets_size_n <- function (array, n, current = vector()) {
if (n == 0) {
return(current)
} else {
results <- vector()
for (i in 1:length(array)) {
next_i <- i + 1
new_array <- vector()
if (next_i <= length(array)) {
new_array <- array[next_i:length(array)]
}
if (length(new_array) >= n - 1) {
results <- cbind(results, subsets_size_n(new_array, n - 1, c(current, array[i])))
}
}
return(results)
}
}
#' A function that returns all the subsets of an array of any size (actually 1 to n-1).
#' The subsets themselves are represented as strings (concatenation of subset elements).
#' The set of subsets is returned as a string as well.
#'
#' @param array_as_str The array to use to build the subsets, provided as a string.
#' @param sep0 The separator used to concatenate the input array.
#' @param sep1 The separator to use to concatenate the subsets.
#' @param sep2 The separator to use to concatenate the subset elements.
#' @return a list of subsets with each subset represented as a string.
get_all_subsets <- function (array_as_str, sep0 = "|", sep1 = ",", sep2 = "|") {
if (grepl(sep0, array_as_str, fixed = TRUE)[1]) {
array <- unlist(strsplit(array_as_str, sep0, fixed = TRUE))
all_subsets <- vector()
for (i in 1:(length(array) - 1)) {
res <- subsets_size_n(array, i, vector())
for (j in 1:dim(res)[2]) {
all_subsets <- c(all_subsets, paste(res[1:i, j], sep = "", collapse = sep2))
}
}
return(paste(all_subsets, sep = "", collapse = sep1))
} else {
return("")
}
}
#' Function to update the map that associates the genes to the probes that
#' target them. The map is global, has been created before and is only updated here.
#'
#' @param probe A string representing the current probe to add to the system.
#' @param genes A string representing the gene or set of genes the given probe targets.
create_map <- function (probe, genes) {
# We don't use probes without gene symbol.
if (!is.na(genes)) {
# We update the global map of genes to probes.
if (is.na(genes_probe_matching[genes])) {
# First instance for that gene.
genes_probe_matching[genes] <<- probe
} else {
# We have already seen that gene.
genes_probe_matching[genes] <<- paste(genes_probe_matching[genes], probe, sep = "|")
}
}
}
#' Function to update the map that associates a set of genes to its subsets.
#' The map is global, has been created before and is only updated here.
#'
#' @param symbol A string representing the set of genes for which the map needs to be updated.
#' @param nb_symbol The number of genes in the set.
#' @param subsets The subsets of the genes indicated as a string.
#' @param sepi The separator used to concatenate the subsets (input).
#' @param sepo The separator to use to concatenate the subsets in the map (output).
create_map_subsets <- function (symbol, nb_symbols, subsets_as_string, sepi = ",", sepo = ",") {
if (nb_symbols > 1) {
gene_subsets <- unlist(strsplit(subsets_as_string, sepi, fixed = TRUE))
for (gene_subset in gene_subsets) {
if (is.na(geneset_probe_matching[gene_subset])) {
# First instance for that geneset.
geneset_probe_matching[gene_subset] <<- symbol
} else {
# We have already seen that geneset.
geneset_probe_matching[gene_subset] <<- paste(geneset_probe_matching[gene_subset],
symbol, sep = sepo)
}
}
}
}
#' Function to create a use tag of a given probe given its gene subsets.
#' It will tag as FALSE any probe for which a subset is also targetted by another probe.
#' EX: we have ps1 -> g1|g2 and ps2 -> g1, the use tag of ps1 is set to FALSE.
#'
#' @param genes The genes of the current probe.
#' @param subsets_as_string The subsets of the above genes, provided as as string.
#' @param sep The separator used to concatenate the above subsets.
tag_probe <- function (genes, subsets_as_string, sep = ",") {
subsets <- unlist(strsplit(subsets_as_string, sep, fixed = TRUE))
current_conflicts <- vector()
for (subset in subsets) {
if (!is.na(genes_probe_matching[subset])) {
# One of the subset exists.
current_conflicts <- c(current_conflicts, paste0("[I]", subset))
}
}
return(paste0(current_conflicts, sep = "", collapse = " "))
}
#' Function to create a use tag of a given probe given its gene subsets.
#' It will tag as FALSE any probe for which a subset is also targetted by another
#' probe that potentially targets other genes in addition.
#' EX: we have ps1 -> g1|g2 and ps2 -> g1|g3, the use tag of ps1 is set to FALSE.
#' The use tag of ps2 will later be set to FALSE for the same reason.
#'
#' @param genes The genes of the current probe.
#' @param subsets_as_string The subsets of the above genes, provided as as string.
#' @param current_conflicts The string that contains the previous information about potential
#' conflicts since we will only update it (not erase it).
#' @param sep The separator used to concatenate the above subsets.
tag_probe_cmplx <- function (genes, subsets_as_string, current_conflict, sep = ",") {
subsets <- unlist(strsplit(subsets_as_string, sep, fixed = TRUE))
current_conflicts <- c(current_conflict)
for (subset in subsets) {
all_keys <- geneset_probe_matching[subset]
nb_unique_keys <- length(unique(unlist(strsplit(all_keys, sep, fixed = TRUE))))
# This subset is shared among several original keys but these keys
# are not stricty the same, they only overlap.
if (nb_unique_keys > 1) {
current_conflicts <- c(current_conflicts, paste0("[II]", subset))
}
}
return(paste0(current_conflicts, sep = "", collapse = " "))
}
# ================================================================================================
# Main
# ================================================================================================
# We do all array types one by one.
for (i in 1:length(local_config$matching)) {
# We get the dataset name for that matching analysis (only one).
matching_dataset <- local_config$matching[[i]]
# We set the I/Os.
# We read gender data but it does not matter since we onlyh look at the probe and genes.
deps_gender_fn <- paste0(input_data_dir, matching_dataset[1], "_toptable_gender.tsv")
deps_matching_fn <- paste0(output_data_dir, matching_dataset[2], "_probe_matching.tsv")
deps_matching_details_fn <- paste0(output_data_dir, matching_dataset[2],
"_probe_matching_details.tsv")
# We read the limma data.
deps_gender <- read.table(deps_gender_fn,
header = TRUE,
sep = "\t",
row.names = NULL,
as.is = TRUE)
rownames(deps_gender) <- deps_gender$X
message(paste0("[", Sys.time(), "][", matching_dataset[2], "] Limma data read."))
# We remove the unnecessary data to prepare the matching.
deps_annotated <- deps_gender %>% select(X, SYMBOL) %>% rename(probeset = X)
# We count the number of genes and add all subsets as extra columns.
deps_annotated <- deps_annotated %>% rowwise() %>%
mutate(nb_symbols = 1 + str_count(SYMBOL, fixed("|")))
# We do not consider the probes with more than 5 genes in any case.
# TODO: this could be fine tuned.
deps_annotated <- deps_annotated %>% mutate(symbol_short = ifelse(nb_symbols < 5, SYMBOL, ""))
deps_annotated <- deps_annotated %>% rowwise() %>%
mutate(subsets = get_all_subsets(array_as_str = symbol_short))
remove(deps_gender)
message(paste0("[", Sys.time(), "][", matching_dataset[2], "] Data ready."))
# We create the first map by looping over the row once.
# The idea is to keep track of all genes (as geneset) and the associated probes.
# original geneset --> list of pipe separated probes
# Ex: ACT1A|ACT1B --> 123456_at|456789_at
# Ex: ACT1A|ACT2 --> 654321_s_at
genes_probe_matching <- vector()
tmp <- deps_annotated %>% rowwise() %>% do(tmp = create_map(.$probeset, .$symbol_short))
# We create the second map by looping over the row once more.
# The idea is now to register all occurences of all subsets.
# subset --> list of comma separated original genesets
# Ex: ACT1A --> ACT1A|ACT1B,ACT1A|ACT2
# Ex: ACT1B --> ACT1A|ACT1B
# Ex: ACT2 --> ACT1A|ACT2
geneset_probe_matching <- vector()
tmp <- deps_annotated %>% rowwise() %>%
do(tmp = create_map_subsets(.$symbol_short, .$nb_symbols, .$subsets))
remove(tmp)
message(paste0("[", Sys.time(), "][", matching_dataset[2], "] Maps ready."))
# We remove the probes that target a set of genes if there is a probe that targets
# one of its subset.
# Ex: 1_at -> to a|b
# 2_at -> to a
# We tag 1_at for removal since 2_at is better to measure a, we assume that then b must be
# measured independently via another probe or it is lost. Should 2_at not exist, we would
# keep 1_at and consider that a|b is one single entity (for now).
deps_annotated <- deps_annotated %>% rowwise() %>%
mutate(conflicts = tag_probe(symbol_short, subsets))
# We remove the probes that target a set of genes if there is a probe that targets a subset,
# possibly in addition to other genes.
# Ex: {a|b} -> {1_at}
# {a|c} -> {2_at}
# Focus is on 1_at. We tag it for removal since it is not possible to disentangle
# the signal. Probe 2 is removed for the same reason (when focus is on 2_at).
deps_annotated <- deps_annotated %>% rowwise() %>%
mutate(conflicts = tag_probe_cmplx(symbol_short, subsets, conflicts))
message(paste0("[", Sys.time(), "][", matching_dataset[2], "] Tagging done."))
# We tag the probes with conflicts with an extra column and save the data.
deps_annotated <- deps_annotated %>%
mutate(use_tag = ifelse(conflicts != "", FALSE, ifelse(symbol_short == "", FALSE, TRUE)))
write.table(deps_annotated, file = deps_matching_details_fn,
sep = "\t", quote = FALSE, col.names = NA)
# We log some generic informations about the matching.
nb_true <- length(deps_annotated$use_tag[deps_annotated$use_tag == TRUE])
nb_false <- length(deps_annotated$use_tag[deps_annotated$use_tag == FALSE])
perc_true <- nb_true / (nb_true + nb_false)
message(paste0("[", Sys.time(), "][", matching_dataset[2], "] Keep ", nb_true, " / ",
(nb_true + nb_false), " probesets (", round(perc_true, 4) * 100, "%)."))
# We create in addition the gene entity to probes map and save it.
deps_annotated <- deps_annotated %>% filter(use_tag == TRUE) %>% select(probeset, SYMBOL)
deps_annotated <- aggregate(deps_annotated$probeset, by = list(genes = deps_annotated$SYMBOL),
FUN = paste) %>%
rename(probesets = x)
deps_annotated <- deps_annotated %>% rowwise() %>%
mutate(probesets = paste(unlist(strsplit(probesets, ", ")), collapse = "|"))
write.table(deps_annotated, file = deps_matching_fn, sep = "\t", quote = FALSE, col.names = NA)
message(paste0("[", Sys.time(), "][", matching_dataset[2], "] Saving done."))
}
# We log the session details for reproducibility.
sessionInfo()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment