Commit f293ccca authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Updated scRNA-seq analysis pipeline.

parent fd93d4c9
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("stringr")
library("org.Hs.eg.db")
library("tidyverse")
library("ReactomePA")
library("DOSE")
library("clusterProfiler")
library("msigdbr")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
raw_data_dir <- config$global_raw_data_dir
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
# Optional command line parameter to process only one dataset.
args <- commandArgs(trailingOnly = TRUE)
selected_dataset_name <- ""
if (length(args) > 0) {
selected_dataset_name <- args[1]
}
rm(args)
# Configure MSigDB
msigdb <- msigdbr(species = "Homo sapiens") %>% dplyr::select(gs_name, entrez_gene)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' @title Perform a regular enrichment analysis using Fisher's over-representation.
#'
#' @description This functions relies on clusterProfiler and the associated packages
#' to run the enrichment for a given ontology among DO, GO (all three subtypes), KEGG and
#' REACTOME. It saves the results in the provided folder and return the enrichResult object
#' for further plotting.
#'
#' @param genes The list of genes to analyse. The gene identifiers have to match what is
#' expected - be it entrezgene identifiers or gene symbols.
#' @param background The list of all genes to be used as a background for correct
#' estimation of the statistics. The genes identifiers also have to match what is expected.
#' @param type The name of the ontology to use among "DO", "GOMF", "GOBP", "GOCC",
#' "KEGG" and "REACTOME". The GO analyses relies on gene symbols, the others use entrezgene ids.
#' Default to "DO".
#' @param file_dir The folder in which the tables will be saved. Default to "".
#' @param file_prefix The string that can be used as a prefix to name the file. Default to "".
#' @param symplify A boolean indicating whether the results should be simplified (by removing
#' redundant terms). Default to TRUE.
#' @return The enrichResult object that contains the enriched terms.
perform_enrichment <- function(genes,
background,
type = "DO",
file_dir = "",
file_prefix = "",
simplify = TRUE) {
# Default case, we just set the result to NULL.
enrich <- NULL
switch(type,
DO = {
enrich <- DOSE::enrichDO(gene = genes,
universe = background,
ont = "DO",
minGSSize = config$min_gs_size,
maxGSSize = config$max_gs_size,
pvalueCutoff = 1,
qvalueCutoff = 1,
pAdjustMethod = "BH",
readable = TRUE)
},
GOMF = {
enrich <- clusterProfiler::enrichGO(gene = genes,
universe = background,
keyType = "SYMBOL",
OrgDb = org.Hs.eg.db, #nolint
ont = "MF",
minGSSize = 10,
maxGSSize = config$max_gs_size,
pvalueCutoff = 1,
qvalueCutoff = 1,
pAdjustMethod = "BH",
readable = FALSE)
},
GOBP = {
enrich <- clusterProfiler::enrichGO(gene = genes,
universe = background,
keyType = "SYMBOL",
OrgDb = org.Hs.eg.db, #nolint
ont = "BP",
minGSSize = config$min_gs_size,
maxGSSize = config$max_gs_size,
pvalueCutoff = 1,
qvalueCutoff = 1,
pAdjustMethod = "BH",
readable = FALSE)
},
GOCC = {
enrich <- clusterProfiler::enrichGO(gene = genes,
universe = background,
keyType = "SYMBOL",
OrgDb = org.Hs.eg.db, #nolint
ont = "CC",
minGSSize = config$min_gs_size,
maxGSSize = config$max_gs_size,
pvalueCutoff = 1,
qvalueCutoff = 1,
pAdjustMethod = "BH",
readable = FALSE)
},
KEGG = {
enrich <- clusterProfiler::enrichKEGG(gene = genes,
universe = background,
organism = "hsa",
minGSSize = config$min_gs_size,
maxGSSize = config$max_gs_size,
pvalueCutoff = 1,
qvalueCutoff = 1,
pAdjustMethod = "BH",
use_internal_data = FALSE)
},
REACTOME = {
enrich <- ReactomePA::enrichPathway(gene = genes,
universe = background,
organism = "human",
minGSSize = config$min_gs_size,
maxGSSize = config$max_gs_size,
pvalueCutoff = 1,
qvalueCutoff = 1,
pAdjustMethod = "BH",
readable = TRUE)
}
)
# We simplify the enrichment results by removing the potentially redundant terms.
if (simplify == TRUE) {
enrich <- clusterProfiler::simplify(enrich,
cutoff = 0.75,
by = "p.adjust",
select_fun = min)
}
# We save the table as TSV file.
enrich_fn <- paste0(file_dir, file_prefix, "enrich_", type, ".tsv")
write.table(enrich, file = enrich_fn, sep = "\t", quote = FALSE, col.names = NA)
# We return the enrichResult object.
return(enrich)
}
# ================================================================================================
# Main
# ================================================================================================
# We do all datasets one by one.
for (i in seq_len(length(config$scdatasets))) {
# We get the dataset details.
dataset <- config$scdatasets[[i]]
dataset_name <- dataset$dataset_name
# We run the quality control of the current dataset (if necessary).
if (selected_dataset_name == "" || selected_dataset_name == dataset_name) {
# We define the I/Os
raw_data_subdir <- paste0(raw_data_dir, dataset_name, "/data/")
output_data_subdir <- paste0(output_data_dir, dataset_name, "_seurat/")
# We are loading the background
background_fn <- paste0(raw_data_subdir, "genes.tsv")
background <- read.table(background_fn,
header = FALSE,
sep = "\t",
row.names = NULL,
as.is = TRUE)$V2
rm(background_fn)
# We map gene symbols to entrezgene ids for background.
background_map <- bitr(background,
fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db) #nolint
rm(background)
# We perform the enrichment on all rankings files.
ranking_files <- list.files(path = output_data_subdir, pattern = "*rankings.tsv")
# We do all integration schemes one by one.
for (ranking_file in ranking_files) {
# We define the I/Os.
analysis_prefix <- strsplit(ranking_file, ".tsv")[[1]]
ofile_prefix <- paste0("CP_", analysis_prefix, "_")
input_filename <- paste0(output_data_subdir, ranking_file)
# We read the prepared ranking and keep only the value used to rank.
ranking <- read.delim(input_filename, stringsAsFactors = FALSE) %>%
select(Gene, adj_P_value)
rm(input_filename)
ranking_clean <- ranking
ranking_clean$adj_P_value <- as.numeric(ranking_clean$adj_P_value)
rm(ranking)
# We map the gene symbols to the EntrezGene identifiers.
mapped_egenes <- bitr(ranking_clean$Gene,
fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db) #nolint
ranking_final <- merge(x = ranking_clean,
y = mapped_egenes,
by.x = "Gene",
by.y = "SYMBOL",
all.x = TRUE) %>%
mutate(EGene = ENTREZID) %>%
select(Gene, EGene, adj_P_value)
ranking_final$adj_P_value <- as.numeric(ranking_final$adj_P_value)
ranking_final <- ranking_final %>% arrange(adj_P_value)
ranking_final_eg <- ranking_final %>% filter(!is.na(EGene))
rm(mapped_egenes, ranking_clean)
# We then prepare the gene lists for Fisher-based enrichment. These are just gene lists based on
# the adjusted P values.
selection_sy <- ranking_final %>% filter(adj_P_value < 0.05)
selection_eg <- selection_sy %>% filter(!is.na(EGene))
rm(ranking_final, ranking_final_eg)
# We loop over the functional sources.
for (k in seq_len(length(config$functional_sources))) {
# We define the current source.
func_source <- config$functional_sources[[k]]
if (is.null(func_source$cp_name) || func_source$cp_name == "") {
next
}
# We do not do the MSIGDB enrichment (was only for GSEA).
if (func_source$cp_name == "MSIGDB") {
next;
}
# We define the reference genes (depending on the source).
selection <- NULL
background <- NULL
if (func_source$cp_gene_id_type == "entrezgene") {
selection <- selection_eg$EGene
background <- background_map$ENTREZID
} else if (func_source$cp_gene_id_type == "symbol") {
selection <- selection_sy$Gene
background <- background_map$SYMBOL
}
# We run the Fisher analysis.
fisher_adjP <- perform_enrichment(selection,
background,
type = func_source$cp_name,
file_dir = output_data_subdir,
file_prefix = ofile_prefix,
simplify = FALSE)
message(paste0("[", Sys.time(), "][", analysis_prefix, "] CP enrichment (",
func_source$cp_name, ") done."))
rm(func_source, fisher_adjP, selection, background)
} # End for each functional source.
rm(k, selection_sy, selection_eg)
rm(analysis_prefix, ofile_prefix)
} # End for each integration.
rm(ranking_file, ranking_files, output_data_subdir, raw_data_subdir, background_map)
} # End if dataset should be run.
rm(dataset, dataset_name)
} # End for each dataset
rm(i)
# We log the session details for reproducibility.
rm(config, input_data_dir, output_data_dir, msigdb)
sessionInfo()
#!/bin/bash -l
#SBATCH -J geneder:09:cp_enrich
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH --time=0-08:00:00
#SBATCH --mem=10GB
#SBATCH -p batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/09/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/09-Single-cell_analysis/
REF=${1}
# Loading modules.
module load lang/R/3.6.2-foss-2019b-bare
# Load configuration
source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# For all scRNA datasets
nbScDatasets=${#scdatasets__dataset_name[@]}
for (( i=0; i<$nbScDatasets; i++ ))
do
datasetName=${scdatasets__dataset_name[$i]}
if [ "${datasetName}" == "${REF}" ]
then
# We run the functional enrichment.
echo "== Job $i started (${datasetName}) =="
Rscript --vanilla ${CODE_FOLDER}/CP_enrich_seurat.R ${datasetName} > ${OUTPUT_FOLDER}/${datasetName}_seurat/CP_enrich_log.out 2> ${OUTPUT_FOLDER}/${datasetName}_seurat/CP_enrich_log.err
echo "== Job $i ended (${datasetName}) =="
fi
done
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
......@@ -6,5 +6,28 @@ clean:
@rm -rf *~
clean_outputs:
@rm -rf ${OUTPUT_FOLDER}/*
analyse:
@sbatch ${CODE_FOLDER}analyse.sh
@/bin/bash ${CODE_FOLDER}getdata.sh
s_ana:
@sbatch --time=0-02:30:00 --mem=64GB ${CODE_FOLDER}analyse_seurat.sh GSE157783
@sbatch --time=0-20:00:00 --mem=196GB -p bigmem ${CODE_FOLDER}analyse_seurat.sh GSE178265
#s_ana_stest:
# @sbatch --time=0-00:20:00 --mem=8GB ${CODE_FOLDER}sanalyse_seurat.sh GSE157783s
# @sbatch --time=0-00:20:00 --mem=8GB ${CODE_FOLDER}sanalyse_seurat.sh GSE178265s
#s_ana_mtest:
# @sbatch --time=0-02:00:00 --mem=64GB ${CODE_FOLDER}analyse_seurat.sh GSE157783m
# @sbatch --time=0-02:00:00 --mem=64GB ${CODE_FOLDER}analyse_seurat.sh GSE178265m
l_ana:
@sbatch --time=0-17:00:00 --mem=64GB ${CODE_FOLDER}analyse_liger.sh GSE157783
@sbatch --time=2-00:00:00 --mem=400GB -p bigmem ${CODE_FOLDER}analyse_liger.sh GSE178265
#l_ana_stest:
# @sbatch --time=0-00:20:00 --mem=6GB ${CODE_FOLDER}sanalyse_liger.sh GSE157783s
# @sbatch --time=0-00:20:00 --mem=6GB ${CODE_FOLDER}sanalyse_liger.sh GSE178265s
#l_ana_mtest:
# @sbatch --time=0-02:00:00 --mem=64GB ${CODE_FOLDER}analyse_liger.sh GSE157783m
# @sbatch --time=0-02:00:00 --mem=64GB ${CODE_FOLDER}analyse_liger.sh GSE178265m
prep:
@sbatch ${CODE_FOLDER}prepare_rankings_seurat.sh GSE157783
#@sbatch ${CODE_FOLDER}prepare_rankings_seurat.sh GSE178265
enrich:
@sbatch ${CODE_FOLDER}CP_enrich_seurat.sh GSE157783
#@sbatch ${CODE_FOLDER}CP_enrich_seurat.sh GSE178265
......@@ -2,6 +2,7 @@
The objectives of this step is to analyse the single-cell data (PD versus control) for male and female samples.
# Details and instructions
Beware, we use a version of SEurat that needs spatstat v1.64 not v2.x.
# Prerequisites
A prerequisite is to have the scRNA data ready in the project data folder.
\ No newline at end of file
A prerequisite is to have the scRNA data ready in the project data folder.
This diff is collapsed.
#!/bin/bash -l
#SBATCH -J geneder:09:scRNA
#SBATCH -J geneder:09:liger
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH --ntasks-per-socket=14
#SBATCH --ntasks-per-node=28
#SBATCH -c 1
#SBATCH --mem=32GB
#SBATCH --time=0-02:00:00
#SBATCH --mem=64GB
#SBATCH --time=0-04:00:00
#SBATCH -p batch
#SBATCH --qos=normal
......@@ -20,6 +20,7 @@ echo ""
# Defining global parameters.
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/09/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/09-Single-cell_analysis/
REF=${1}
# Loading modules.
module load lang/R/3.6.2-foss-2019b-bare
......@@ -28,22 +29,20 @@ module load lang/R/3.6.2-foss-2019b-bare
source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# We get the Ensembl and HGNC data
wget -O ${OUTPUT_FOLDER}mitochondrial_genes.tsv 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default" ><Filter name = "chromosome_name" value = "MT"/><Attribute name = "ensembl_gene_id" /></Dataset></Query>'
wget -O ${OUTPUT_FOLDER}ribosomalrna_genes.tsv 'https://www.genenames.org/cgi-bin/genegroup/download?id=848&type=branch'
# For all scRNA datasets
#nbScDatasets=${#scdatasets__dataset_name[@]}
#for (( i=0; i<$nbScDatasets; i++ ))
#do
i=1
nbScDatasets=${#scdatasets__dataset_name[@]}
for (( i=0; i<$nbScDatasets; i++ ))
do
datasetName=${scdatasets__dataset_name[$i]}
echo "== Job $i started (${datasetName}) =="
rm -rf ${OUTPUT_FOLDER}${datasetName}/
mkdir ${OUTPUT_FOLDER}${datasetName}/
Rscript --vanilla ${CODE_FOLDER}analyse.R ${datasetName} > ${OUTPUT_FOLDER}${datasetName}/analysis_log.out 2> ${OUTPUT_FOLDER}${datasetName}/analysis_log.err
echo "== Job $i ended (${datasetName}) =="
#done
if [ "${datasetName}" == "${REF}" ]
then
echo "== Job $i started (${datasetName}) =="
rm -rf ${OUTPUT_FOLDER}${datasetName}_liger/
mkdir ${OUTPUT_FOLDER}${datasetName}_liger/
Rscript --vanilla ${CODE_FOLDER}analyse_liger.R ${datasetName} > ${OUTPUT_FOLDER}${datasetName}_liger/analysis_log.out 2> ${OUTPUT_FOLDER}${datasetName}_liger/analysis_log.err
echo "== Job $i ended (${datasetName}) =="
fi
done
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
#!/bin/bash -l
#SBATCH -J geneder:09:seurat
#SBATCH --mail-type=all
#SBATCH --mail-user=leon-charles.tranchevent@uni.lu
#SBATCH -N 1
#SBATCH --ntasks-per-socket=14
#SBATCH --ntasks-per-node=28
#SBATCH -c 1
#SBATCH --mem=64GB
#SBATCH --time=0-04:00:00
#SBATCH -p batch
#SBATCH --qos=normal
echo "== Starting run at $(date)"
echo "== Job ID: ${SLURM_JOBID}"
echo "== Node list: ${SLURM_NODELIST}"
echo "== Submit dir. : ${SLURM_SUBMIT_DIR}"
echo ""
# Defining global parameters.
OUTPUT_FOLDER=/home/users/ltranchevent/Data/GeneDER/Analysis/09/
CODE_FOLDER=/home/users/ltranchevent/Projects/GeneDER/Analysis/09-Single-cell_analysis/
REF=${1}
# Loading modules.
module load lang/R/3.6.2-foss-2019b-bare
# Load configuration
source ../libs/conf/confSH.sh
create_variables ../Confs/datasets_config.yml
# For all scRNA datasets
nbScDatasets=${#scdatasets__dataset_name[@]}
for (( i=0; i<$nbScDatasets; i++ ))
do
datasetName=${scdatasets__dataset_name[$i]}
if [ "${datasetName}" == "${REF}" ]
then
echo "== Job $i started (${datasetName}) =="
rm -rf ${OUTPUT_FOLDER}${datasetName}_seurat/
mkdir ${OUTPUT_FOLDER}${datasetName}_seurat/
Rscript --vanilla ${CODE_FOLDER}analyse_seurat.R ${datasetName} > ${OUTPUT_FOLDER}${datasetName}_seurat/analysis_log.out 2> ${OUTPUT_FOLDER}${datasetName}_seurat/analysis_log.err
echo "== Job $i ended (${datasetName}) =="
fi
done
# Moving the slurm log file to data
mv ${CODE_FOLDER}/slurm-*out ${OUTPUT_FOLDER}/
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("tidyverse")
library("ggplot2")
source("../libs/conf/confR.R")
source("../libs/utils/utils.R")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
config <- read_config(config_dirs = c("../Confs/", "./"))
output_data_dir <- paste0(config$global_data_dir, config$local_data_dir)
input_data_dir <- paste0(config$global_data_dir, config$local_input_data_dir)
# Optional command line parameter to process only one dataset.
args <- commandArgs(trailingOnly = TRUE)
selected_dataset_name <- ""
if (length(args) > 0) {
selected_dataset_name <- args[1]
}
rm(args)
message(paste0("[", Sys.time(), "] Configuration done."))
# ================================================================================================
# Functions
# ================================================================================================
#' @title Function to compute the gender-specificity score based on the Pi value rankings.
#' @description This function takes a dataframe with the log fold changes and P values of two
#' comparisons termed "ref" and "ctrl". The idea is to compute a score representing their
#' specificity for "ref" using "ctrl" to correct the non specific DEGs.
#' The specificity is based on the Pi values so the actual regulation (up/down) does not
#' matter and two genes with strong but opposite signals will not be considered specific.
#' @param FM The data-frame that contains the experimental data. The expected columns are
#' ref_logFC, ref_Pval, ref_adj_Pval, ctrl_logFC, ctrl_Pval, and ctrl_adj_Pval.
#' @param has_ctrl A boolean indicating whether a "ctrl" comparison is present. If not then
#' only the Pi value computation and the associated rankings are performed for "ref", and
#' no delta values are returned.
#' @return An enriched data-frame that contains additional columns, in particular one that
#' contains the gender-specificity scores, representing the specificity to the "ref" comparison
#' (unless has_ctrl is set to FALSE).
compute_genderspecificity_scores <- function(FM, has_ctrl = TRUE) {
# First, we compute the Pi values for both comparisons ("ref" and "control"). For this, we use
# the nominal P values and the log fold changes. Using the corrected P values would be more
# difficult since most values would be 1. We anyway intend to use the gender specific rankings
# produced here for GSEA, and only the ranking matters, not so much the actual values.
#
# We will add two fields:
# ref_pivalue:
# The Pi values for the "ref" comparison. Values are ~[0, 100].
# Small values correspond to non differentially expressed genes, high values
# correspond to differentially expressed genes. These values are the ones used to
# create the ranking for GSEA when we analyse gender DEGs (and not gender specific
# DEGs).
# ctrl_pivalue:
# The Pi values for the "ctrl" comparison. Values are ~[0, 100].
# Small values correspond to non differentially expressed genes, high values
# correspond to differentially expressed genes. These values are the ones used to
# create the ranking for GSEA when we analyse gender DEGs (and not gender specific
# DEGs).
#
# We start by the most obvious cases for which all values are present.
# The Pi value is simply the absolute log fold change * the P value (on a log10 scale).
FM$ref_pivalue <- -log10(FM$ref_Pval) * abs(FM$ref_logFC)
if (has_ctrl) {
FM$ctrl_pivalue <- -log10(FM$ctrl_Pval) * abs(FM$ctrl_logFC)
}
# We continue with special cases, that is when P value == 1.
# These are the genes that are definitely not differentially expressed.
# Because P value == 1, we have Pi value == 0.
# We use a small workaround to substract a small epsilon to the original P value.
# This way, all genes with P value == 1 are still ranked (therefore based on their log
# fold change only). This is because we want to avoid too many ties.
pseudo_pval_ref <- 0.1 * (9 + max(FM$ref_Pval[FM$ref_Pval < 1]))
pseudo_pi_ref <- -log10(pseudo_pval_ref) * abs(FM$ref_logFC[FM$ref_pivalue == 0])
FM$ref_pivalue[FM$ref_pivalue == 0] <- pseudo_pi_ref
if (has_ctrl) {
pseudo_pval_ctrl <- 0.1 * (9 + max(FM$ctrl_Pval[FM$ctrl_Pval < 1]))
pseudo_pi_ctrl <- -log10(pseudo_pval_ctrl) * abs(FM$ctrl_logFC[FM$ctrl_pivalue == 0])
FM$ctrl_pivalue[FM$ctrl_pivalue == 0] <- pseudo_pi_ctrl
rm(pseudo_pval_ctrl, pseudo_pi_ctrl)
}
rm(pseudo_pval_ref, pseudo_pi_ref)
# We then aim at creating a delta that represents the difference between the differential
# expression in "ref" and in "ctrl". The comparison is done based on the rankings, which
# are themseleves based on the Pi values. We can not use the Pi values directly since they