Commit 77f8c71f authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Minor modifications in step 06 to optimize the code (duplicate_rows).

parent fe85f14c
......@@ -24,7 +24,6 @@ module load lang/R/3.6.0-foss-2019a-bare
# Actual jobs
Rscript --vanilla ${CODE_FOLDER}analyse_integration_results.R > ${OUTPUT_FOLDER}analyse_log.out 2> ${OUTPUT_FOLDER}analyse_log.err
Rscript --vanilla ${CODE_FOLDER}analyse_integration_results_figures.R > ${OUTPUT_FOLDER}analyse_fig_log.out 2> ${OUTPUT_FOLDER}analyse_fig_log.err
Rscript --vanilla ${CODE_FOLDER}compare_integrations.R > ${OUTPUT_FOLDER}compare_integrations_log.out 2> ${OUTPUT_FOLDER}compare_integrations_log.err
# Moving the slurm log file to data
mv ${CODE_FOLDER}slurm-*out ${OUTPUT_FOLDER}
......
......@@ -25,6 +25,26 @@ message(paste0("[", Sys.time(), "] Configuration done."))
# Functions
# ================================================================================================
#' Function to duplicate data-frame rows based on one key field.
#' This field is used to do the duplication based on a string split. The other fields are
#' simply copied as is.
duplicate_row_multiple_gene_names <- function(row, ikey = 3, sep = "|") {
keys <- strsplit(as.character(row[ikey]), sep, fixed = TRUE)[[1]]
if (length(keys) == 0) {
keys <- ""
}
# We duplicate all fields.
new_data <- c()
for (n in seq_len(length(row))) {
new_data <- cbind(new_data, rep(row[n], length(keys)))
}
# We adjust the keys and the column names.
new_data[, ikey] <- keys
return(new_data)
}
#' @title Plot the differential expression of top hits.
#'
#' @description This function create one plot for each top hit (gene) of a given limma
......@@ -55,7 +75,7 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
nb_hits = 10) {
# We set the I/Os.
file_prefix_short <- paste0(output_data_dir, integration_name, "_", vsn$name, "_",
file_prefix_short <- paste0(output_data_dir, integration_name, "_",
limma_analysis$name, "_")
# Read the matching data.
......@@ -65,7 +85,14 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
sep = "\t",
row.names = NULL,
as.is = TRUE)
rm(matching_data_fn)
tmp_res <- t(apply(matching_data, 1,
duplicate_row_multiple_gene_names))
matching_data_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
names(matching_data_clean) <- names(matching_data)
matching_data <- matching_data_clean
rm(matching_data_fn, tmp_res, matching_data_clean)
message(paste0("[", Sys.time(), "] Matching data read."))
# We read the results of the integration.
......@@ -103,8 +130,9 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
# We get the relevant probe identifiers.
gene <- top_hits[p, ]
matching_data_gene <- matching_data %>%
dplyr::filter(SYMBOL == gene$SYMBOL) %>%
dplyr::filter(genes == gene$SYMBOL) %>%
dplyr::select(ends_with(str_replace(selection_name, "-", ".")))
# We do all datasets one by one and collect the clinical and expression data.
......@@ -122,8 +150,16 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
dataset_arraytype <- stringr::str_replace(dataset$array_type, "-", ".")
probeset_tag <- paste0(dataset_arraytype, "_",
stringr::str_replace(selection_name, "-", "."))
probe_id <- matching_data_gene[[probeset_tag]]
rm(dataset, probeset_tag, dataset_arraytype, jj)
probe_ids <- matching_data_gene[[probeset_tag]]
# We here rely on the fact that even in case of multiple gene mapping only one probe
# should be present here. We still log a warning in case it is not the case
probe_id <- probe_ids[!is.na(probe_ids)]
if (length(probe_id) > 1) {
message(paste0("[", Sys.time(), "] Gene ", gene$SYMBOL, " has multiple probes for dataset ",
dataset_name, " (", dataset_arraytype, "): ", paste0(probe_ids, collapse = "-")))
}
rm(dataset, probeset_tag, dataset_arraytype, jj, probe_ids)
# We read the clinical data.
clin_data <- clin_data_all[[ii]] %>%
......@@ -135,8 +171,10 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
exp_data <- t(exp_data_all[[ii]][1, ])
exp_data[, 1] <- NA
if (!is.null(probe_id)) {
if (!is.na(probe_id)) {
exp_data <- t(exp_data_all[[ii]][probe_id, ])
if (length(probe_id) > 0) {
if (!is.na(probe_id)) {
exp_data <- t(exp_data_all[[ii]][probe_id, ])
}
}
}
colnames(exp_data) <- "Expression"
......@@ -146,8 +184,10 @@ plot_top_hits <- function(output_data_dir, integration_name, limma_analysis, sel
zexp_data <- t(zexp_data_all[[ii]][1, ])
zexp_data[, 1] <- NA
if (!is.null(probe_id)) {
if (!is.na(probe_id)) {
zexp_data <- t(zexp_data_all[[ii]][probe_id, ])
if (length(probe_id) > 0) {
if (!is.na(probe_id)) {
zexp_data <- t(zexp_data_all[[ii]][probe_id, ])
}
}
}
colnames(zexp_data) <- "Expression"
......@@ -243,8 +283,7 @@ ctrl_plt <- c("#7da7de", "#c2e0ae")
# (so far regardless of the significance). Data are plotted per dataset (each one in an individual
# subplot) as well as all together (when the data are first transformed into z-scores).
# We create once for all the list of all datasets (microarray and RNA-seq).
datasets <- c(config$datasets, config$rs_datasets)
datasets <- config$datasets
# We do all integration schemes one by one.
for (i in seq_len(length(config$integrations))) {
......@@ -253,115 +292,104 @@ for (i in seq_len(length(config$integrations))) {
integration <- config$integrations[[i]]
int_criteria <- str_split(integration$criteria, ",")[[1]]
# We repeat the analysis for all VSN usages.
for (j in seq_len(length(config$variance_methods))) {
# We get the current VSN configuration.
vsn <- config$variance_methods[[j]]
# We loop over the different limma analyses.
for (k in seq_len(length(config$limma_analyses))) {
# We define the current limma analysis.
limma_analysis <- config$limma_analyses[[k]]
# We define the color palette accordingly.
current_plt <- gds_plt
if (k == 1 | k == 3) {
current_plt <- diz_plt
} else if (k == 4) {
current_plt <- ctrl_plt
} else if (k == 6) {
current_plt <- male_plt
}
print(paste0(k, " set to ", str(current_plt)))
print(" ")
# We loop over the different gene-probe selection methods.
for (l in seq_len(length(config$selections))) {
# We define the current gene-probe selection method.
selection <- config$selections[[l]]
# We loop over all datasets to read only the relevant datasets
exp_data_all <- list()
zexp_data_all <- list()
clin_data_all <- list()
m_indx <- 1
m_indx_map <- list()
for (m in seq_len(length(datasets))) {
# We extract the configuration of the current dataset.
dataset <- datasets[[m]]
dataset_name <- dataset$dataset_name
dataset_arraytype <- dataset$array_type
# We only read the dataset if it corresponds to the integration criteria.
satisfy_all <- TRUE
for (c in seq_len(length(int_criteria))) {
int_criterium <- str_split(int_criteria[c], ";")[[1]]
if (dataset[int_criterium[1]] != int_criterium[2]) {
satisfy_all <- FALSE
}
rm(int_criterium)
}
rm(c)
if (satisfy_all != TRUE) {
rm(satisfy_all, dataset, dataset_name, dataset_arraytype)
next
# We loop over the different limma analyses.
for (k in seq_len(length(config$limma_analyses))) {
# We define the current limma analysis.
limma_analysis <- config$limma_analyses[[k]]
# We define the color palette accordingly.
current_plt <- gds_plt
if (k == 1 | k == 3) {
current_plt <- diz_plt
} else if (k == 4) {
current_plt <- ctrl_plt
} else if (k == 6) {
current_plt <- male_plt
}
print(paste0(k, " set to ", str(current_plt)))
print(" ")
# We loop over the different gene-probe selection methods.
for (l in seq_len(length(config$selections))) {
# We define the current gene-probe selection method.
selection <- config$selections[[l]]
# We loop over all datasets to read only the relevant datasets
exp_data_all <- list()
zexp_data_all <- list()
clin_data_all <- list()
m_indx <- 1
m_indx_map <- list()
for (m in seq_len(length(datasets))) {
# We extract the configuration of the current dataset.
dataset <- datasets[[m]]
dataset_name <- dataset$dataset_name
dataset_arraytype <- dataset$array_type
# We only read the dataset if it corresponds to the integration criteria.
satisfy_all <- TRUE
for (c in seq_len(length(int_criteria))) {
int_criterium <- str_split(int_criteria[c], ";")[[1]]
if (dataset[int_criterium[1]] != int_criterium[2]) {
satisfy_all <- FALSE
}
rm(int_criterium)
}
rm(c)
if (satisfy_all != TRUE) {
rm(satisfy_all, dataset, dataset_name, dataset_arraytype)
next
}
# We define the dataset specific I/Os.
local_input_edata_dir <- input_edata_dir
file_suffix <- paste0("_", vsn$name, ".tsv")
file_long_suffix <- paste0("_", vsn$name, "_zscores.tsv")
if (dataset_arraytype == "RNAseq_EG") {
local_input_edata_dir <- input_rsedata_dir
file_suffix <- paste0("_cpm.tsv")
file_long_suffix <- paste0("_cpm_zscores.tsv")
}
exp_data_fn <- paste0(local_input_edata_dir, dataset_name, "_normalized", file_suffix)
zexp_data_fn <- paste0(local_input_edata_dir, dataset_name, "_normalized", file_long_suffix)
clin_data_fn <- paste0(local_input_edata_dir, dataset_name, "_clinical.tsv")
rm(local_input_edata_dir, file_suffix, file_long_suffix)
# We read the data.
clin_data_all[[m_indx]] <- read.table(clin_data_fn,
header = TRUE,
sep = "\t",
row.names = NULL,
as.is = TRUE)
exp_data_all[[m_indx]] <- read.table(exp_data_fn,
header = TRUE,
sep = "\t",
row.names = 1,
as.is = TRUE)
zexp_data_all[[m_indx]] <- read.table(zexp_data_fn,
header = TRUE,
sep = "\t",
row.names = 1,
as.is = TRUE)
m_indx_map[[m_indx]] <- m
m_indx <- m_indx + 1
rm(dataset, dataset_name, dataset_arraytype, exp_data_fn, zexp_data_fn, clin_data_fn)
} # End for each dataset.
rm(m)
# Actual plotting of the top hits (as we ll as the genes we want to plot anyway from
# the local configuration file).
plot_top_hits(output_data_dir = output_data_dir,
integration_name = integration$name,
limma_analysis = limma_analysis,
selection_name = selection$name,
palette = current_plt,
clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config)
rm(selection, exp_data_all, zexp_data_all, clin_data_all, m_indx, m_indx_map)
} # End for each gene-probe selection method.
rm(l, limma_analysis, current_plt)
} # End for each limma analysis.
rm(k, vsn)
} # End for each variance configuration.
rm(j, integration, int_criteria)
# We define the dataset specific I/Os.
local_input_edata_dir <- input_edata_dir
local_input_data_dir <- input_data_dir
vsn_name <- config$modifications[[m]]$best_variance
file_suffix <- paste0("_", vsn_name, ".tsv")
file_long_suffix <- paste0("_", vsn_name, "_zscores.tsv")
exp_data_fn <- paste0(local_input_edata_dir, dataset_name, "_normalized", file_suffix)
zexp_data_fn <- paste0(local_input_data_dir, dataset_name, "_normalized", file_long_suffix)
clin_data_fn <- paste0(local_input_edata_dir, dataset_name, "_clinical.tsv")
rm(local_input_edata_dir, local_input_data_dir, vsn_name, file_suffix, file_long_suffix)
# We read the data.
clin_data_all[[m_indx]] <- read.table(clin_data_fn,
header = TRUE,
sep = "\t",
row.names = NULL,
as.is = TRUE)
exp_data_all[[m_indx]] <- read.table(exp_data_fn,
header = TRUE,
sep = "\t",
row.names = 1,
as.is = TRUE)
zexp_data_all[[m_indx]] <- read.table(zexp_data_fn,
header = TRUE,
sep = "\t",
row.names = 1,
as.is = TRUE)
m_indx_map[[m_indx]] <- m
m_indx <- m_indx + 1
rm(dataset, dataset_name, dataset_arraytype, exp_data_fn, zexp_data_fn, clin_data_fn)
} # End for each dataset.
rm(m)
# Actual plotting of the top hits (as we ll as the genes we want to plot anyway from
# the local configuration file).
plot_top_hits(output_data_dir = output_data_dir,
integration_name = integration$name,
limma_analysis = limma_analysis,
selection_name = selection$name,
palette = current_plt,
clin_data_all, exp_data_all, zexp_data_all, m_indx_map, config)
rm(selection, exp_data_all, zexp_data_all, clin_data_all, m_indx, m_indx_map)
} # End for each gene-probe selection method.
rm(l, limma_analysis, current_plt)
} # End for each limma analysis.
rm(k, integration, int_criteria)
} # End for each data integration scheme.
rm(i, datasets, ctrl_plt, diz_plt, gds_plt, male_plt)
......
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("ggplot2")
library("tidyverse")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' @title Boxplot of P values from common / unique genes from two analyses.
#'
#' @descrtiption Function to plot the -log10( P values) of the genes found in common between the SN and
#' SNage analyses and the genes unique to each analysis.
#'
#' @param exp_tag The string tag corresponding to the limma analysis under study.
#' @param nb_genes The number of top genes to consider when comparing the analyses.
plotit <- function(exp_tag = "Gender_disease_status", nb_genes = 100) {
# Read the data from the current integration (MA+RS) and the previous one using only
# microarray data (MA).
F06_fn <- paste0(output_data_dir, "SN_VSN_", exp_tag, "_max-avg_integration.tsv")
F06f <- read.delim(F06_fn, header = TRUE,
stringsAsFactors = FALSE) %>%
mutate(pvalue = int_csss_pval_marotmayer) %>%
select(SYMBOL, pvalue) %>%
arrange(pvalue)
F16_fn <- paste0(output_data_dir, "SNage_VSN_", exp_tag, "_max-avg_integration.tsv")
F16f <- read.delim(F16_fn, header = TRUE,
stringsAsFactors = FALSE) %>%
mutate(pvalue = int_csss_pval_marotmayer) %>%
select(SYMBOL, pvalue) %>%
arrange(pvalue)
F06 <- head(F06f, nb_genes)
F16 <- head(F16f, nb_genes)
rm(F06_fn, F16_fn)
# We compute the overall rank correlation.
Ff <- merge(x = F06f, y = F16f, by = "SYMBOL")
res <- cor.test(Ff$pvalue.x, Ff$pvalue.y, method = "spearman")
print(paste0("[", exp_tag, "] rho = ", round(res$estimate, 3)))
rm(F06f, F16f, Ff, res)
# We merge the two resulting dataframe as to get the common and unique genes.
F <- merge(x = F06, y = F16, by = "SYMBOL")
F06uniq <- setdiff(F06, data.frame(SYMBOL = F$SYMBOL,
pvalue = F$pvalue.x))
F16uniq <- setdiff(F16, data.frame(SYMBOL = F$SYMBOL,
pvalue = F$pvalue.y))
all <- rbind(
data.frame(log_pvalue = -log10(F$pvalue.x)) %>% mutate(Category = "SN - common genes"),
data.frame(log_pvalue = -log10(F$pvalue.y)) %>% mutate(Category = "SNage - common genes"),
data.frame(log_pvalue = -log10(F06uniq$pvalue)) %>% mutate(Category = "SN - unique genes"),
data.frame(log_pvalue = -log10(F16uniq$pvalue)) %>% mutate(Category = "SNage - unique genes")
)
perc <- dim(F)[1] / dim(F06)[1] * 100
rm(F06, F16, F, F06uniq, F16uniq)
# We build the boxplots.
bx_fn <- paste0(output_data_dir, "SN_VSN_", exp_tag, "_comparison_SN_SNage.png")
png(bx_fn)
myplot <- ggplot(all, aes(x = Category, y = log_pvalue, colour = Category)) +
geom_boxplot(lwd = 1.2, outlier.shape = NA) +
geom_jitter(size = 3, width = 0.2, height = 0) +
theme(legend.position = "bottom",
axis.title.x = element_blank(),
plot.title = element_text(size = 15,
hjust = 0.5),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
panel.grid.major = element_line(colour = "lightgrey"),
strip.background = element_rect(fill = "black"),
strip.text = element_text(colour = "white")) +
scale_color_brewer(palette = "Dark2") +
ggtitle(paste0("[", exp_tag, "] Analysis of top ", nb_genes, " genes (common = ", perc, "%)."))
print(myplot)
dev.off()
rm(all, myplot, bx_fn)
}
# ================================================================================================
# Main
# ================================================================================================
plotit(exp_tag = "Gender_disease_status", nb_genes = 100)
plotit(exp_tag = "PDVsControl", nb_genes = 100)
plotit(exp_tag = "PDVsControl_females", nb_genes = 100)
plotit(exp_tag = "PDVsControl_males", nb_genes = 100)
# We log the session details for reproducibility.
rm(config, output_data_dir, input_data_dir)
sessionInfo()
......@@ -15,8 +15,7 @@ echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/04/
CINPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/05/
INPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/05/
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/06/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/06-Data_integration/
......@@ -28,22 +27,7 @@ source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# Preparing the job.
rm -rf ${OUTPUT_FOLDER}clinical_categories_summarized.tsv
nbDatasets=${#datasets__dataset_name[@]}
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${datasets__dataset_name[$i]}
cut -f 2-3 ${MA_INPUT_FOLDER}${datasetName}_clinical.tsv | grep -v Disease | sort | uniq -c | sed 's/^\s*//' | sed -r 's/\s/\t/g' | awk -v OFS="\t" -F"\t" '{print $3, $2, $1}' | sed -r 's/\t/./' | sed -r 's/^/'"${datasetName}"'\t/g' >> ${OUTPUT_FOLDER}clinical_categories_summarized.tsv
done
nbDatasets=${#rs_datasets__dataset_name[@]}
for (( i=0; i<$nbDatasets; i++ ))
do
datasetName=${rs_datasets__dataset_name[$i]}
cut -f 2-3 ${RS_INPUT_FOLDER}${datasetName}_clinical.tsv | grep -v Disease | sort | uniq -c | sed 's/^\s*//' | sed -r 's/\s/\t/g' | awk -v OFS="\t" -F"\t" '{print $3, $2, $1}' | sed -r 's/\t/./' | sed -r 's/^/'"${datasetName}"'\t/g' >> ${OUTPUT_FOLDER}clinical_categories_summarized.tsv
done
head -n 1 ${MA_CINPUT_FOLDER}clinical_categories_summarized.tsv > ${OUTPUT_FOLDER}clinical_categories_summarized_wide.tsv
cat ${MA_CINPUT_FOLDER}clinical_categories_summarized.tsv | grep -v Dataset >> ${OUTPUT_FOLDER}clinical_categories_summarized_wide.tsv
cat ${RS_CINPUT_FOLDER}clinical_categories_summarized.tsv | grep -v Dataset >> ${OUTPUT_FOLDER}clinical_categories_summarized_wide.tsv
cp -rf ${INPUT_FOLDER}clinical_categories_summarized.tsv ${OUTPUT_FOLDER}clinical_categories_summarized.tsv
# Actual job
Rscript --vanilla ${CODE_FOLDER}integrate_datasets.R > ${OUTPUT_FOLDER}integrate_log.out 2> ${OUTPUT_FOLDER}integrate_log.err
......
This diff is collapsed.
......@@ -103,6 +103,54 @@ select_best <- function(row, array_indexes) {
return(to_return)
}
# duplicate_row_multiple_gene_names <- function(row, sep = "|") {
# keys <- strsplit(as.character(row[2]), sep, fixed = TRUE)[[1]]
# if (length(keys) == 0) {
# keys <- ""
# }
# val_1 <- rep(row[1], length(keys))
# val_3 <- rep(row[3], length(keys))
# val_4 <- rep(row[4], length(keys))
# val_5 <- rep(row[5], length(keys))
# val_6 <- rep(row[6], length(keys))
# val_7 <- rep(row[7], length(keys))
# val_8 <- rep(row[8], length(keys))
# val_9 <- rep(row[9], length(keys))
# val_10 <- rep(row[10], length(keys))
# to_ret <- cbind(val_1,
# keys,
# val_3,
# val_4,
# val_5,
# val_6,
# val_7,
# val_8,
# val_9,
# val_10)
# names(to_ret) <- names(row)
# return(to_ret)
# }
#' Function to duplicate data-frame rows based on one key field.
#' This field is used to do the duplication based on a string split. The other fields are
#' simply copied as is.
duplicate_row_multiple_gene_names <- function(row, ikey = 2, sep = "|") {
keys <- strsplit(as.character(row[ikey]), sep, fixed = TRUE)[[1]]
if (length(keys) == 0) {
keys <- ""
}
# We duplicate all fields.
new_data <- c()
for (n in seq_len(length(row))) {
new_data <- cbind(new_data, rep(row[n], length(keys)))
}
# We adjust the keys and the column names.
new_data[, ikey] <- keys
return(new_data)
}
# ================================================================================================
# Main
# ================================================================================================
......@@ -185,17 +233,33 @@ for (i in seq_len(length(config$integrations))) {
# Now, we are sure the current dataset is relevant and should be integrated.
# We read the results of the limma analysis.
local_input_data_dir <- input_data_dir
file_suffix <- paste0("_", config$modifications[[l]]$best_variance, ".tsv")
deps_analysis_fn <- paste0(local_input_data_dir, dataset_name, "_toptable_",
limma_analysis$name, file_suffix)
datasets_data[[i_ds]] <- read.table(deps_analysis_fn,
local_input_data_dir <- input_data_dir
file_suffix <- paste0("_", config$modifications[[l]]$best_variance, ".tsv")
deps_analysis_fn <- paste0(local_input_data_dir, dataset_name, "_toptable_",
limma_analysis$name, file_suffix)
local_exp_mat <- read.table(deps_analysis_fn,
header = TRUE,
sep = "\t",
row.names = NULL,
as.is = TRUE)
rm(deps_analysis_fn, local_input_data_dir)
rownames(datasets_data[[i_ds]]) <- datasets_data[[i_ds]]$X
local_exp_mat_clean <- local_exp_mat
if (sum(grepl("\\|", local_exp_mat$SYMBOL)) > 0) {
tmp_res <- t(apply(local_exp_mat, 1,
duplicate_row_multiple_gene_names))
local_exp_mat_clean <- data.frame(do.call(rbind, c(tmp_res)),
row.names = NULL,
stringsAsFactors = FALSE)
names(local_exp_mat_clean) <- names(local_exp_mat)
local_exp_mat_clean$logFC <- as.numeric(local_exp_mat_clean$logFC)