################################################## ## Helper Functions of over-represetation analysis ## Date: 13.04.2021 ## Author: Marek Ostaszewski, Dezso Modos ################################################## source("https://git-r3lab.uni.lu/covid/models/-/raw/master/Integration/MINERVA_access/minerva_access_functions.R") ### A function retrieving element identifiers from MINERVA get_mnv_refs <- function(fcomponents, id_type) { ### Get MINERVA HGNCs as a named list message("Retrieving MINERVA HGNCs collection") mnv_refs <- sapply(fcomponents$map_elements, function(x) get_annotation(x$references, "HGNC_SYMBOL")) names(mnv_refs) <- fcomponents$models$name return(mnv_refs) } ### For handling Reactome pathways ### Get HGNCs of UniProts translate_up_hgnc <- function(upid, hard_filter = T) { resp <- ask_GET(paste0("https://www.ebi.ac.uk/proteins/api/genecentric/", upid)) if(is.null(resp)) { return("") } res <- fromJSON(resp) if(is.null(res$relatedGene)) { return("") } ### This is a patchy reference; the correct way would be to get HGNC ID (numeric) and retrieve the symbol based on this ### but I'd like to avoid yet another API call hgncs <- with(res$relatedGene, geneName[accession == upid & geneNameType == "Gene name"]) ### Gene symbols retrieved this way may not be HGNCs; we hard filter out non-uppercased or too short ids if(hard_filter) { hgncs <- hgncs[(nchar(hgncs) > 2) & (toupper(hgncs) == hgncs)] } return(hgncs) } ### Get participants of a pathway and translate them into HGNCs get_reactome_pathway_contents <- function(reactome_path_id) { res <- fromJSON(ask_GET(paste0("https://reactome.org/ContentService/data/participants/", reactome_path_id))) ids <- sapply(res$refEntities, "[[", "identifier") symbols <- sapply(unique(unlist(ids)), translate_up_hgnc) return(unlist(unique(symbols))) } ### A function retrieving element identifiers from WikiPathway diagrams get_wp_refs <- function() { message("Retrieving WP collection") ### This file contains all the pathways in the WP COVID-19 repo wpids <- readLines("https://raw.githubusercontent.com/wikipathways/SARS-CoV-2-WikiPathways/master/pathways.txt") require(rWikiPathways) message("Retrieving WP HGNCs") wp_hgncs <- sapply(wpids, getXrefList, "H") names(wp_hgncs) <- wpids return(wp_hgncs) } ### A function retrieving element identifiers from Reactome diagrams get_rct_refs <- function() { message("Retrieving Reactome collection") rct_cov_hgncs <- sapply(c("R-HSA-9678110", "R-HSA-9679504", "R-HSA-9679514", "R-HSA-9683701", "R-HSA-9679509"), get_reactome_pathway_contents) rct_cov2_hgncs <- sapply(c("R-HSA-9694614", "R-HSA-9694676", "R-HSA-9694682", "R-HSA-9694635", "R-HSA-9694322"), get_reactome_pathway_contents) return(c(rct_cov_hgncs, rct_cov2_hgncs)) } get_hypergeometric_pvalue <- function(signature, genesets, background, verbose = F) { ### This function is a modified version of the hypergeometric function from the R package hyperR ### (https://github.com/montilab/hypeR/blob/master/R/hyper_enrichment.R). ### The used background in this case is a vector of gene IDs. It checks whether certain genes are in the background. ### Without such check thehypergeomteric tests can understimate the enrichmnet. if (!is(signature, "vector")) stop("Expected signature to be a vector of symbols\n") if (!is(genesets, "list")) stop("Expected genesets to be a list of genesets\n") if (!is(background, "vector")) stop("Expected background to be a vector of gene symbols\n") if (length(intersect(signature, background))==0) stop ("The singiture is not in the background. Some mapping misstake.\n") signature <- unique(signature) genesets <- lapply(genesets, unique) background <- unique(background) signature.found <- signature[signature %in% unique(unlist(genesets))] n.hits <- sapply(genesets, function(x, y) length(intersect(x, y)), signature.found) n.drawn <- length(signature) n.genesets <- sapply(genesets, function(x, y) length(intersect(x, y)), background) n.left <- length(background)-n.genesets print (list(n.hits, n.drawn, n.genesets, n.left)) pvals <- suppressWarnings(stats::phyper(q=n.hits-1, m=n.genesets, n=n.left, k=n.drawn, lower.tail=FALSE)) if(verbose) { message(pvals)} data <- data.frame(label=names(genesets), pval=signif(pvals, 2), fdr=signif(stats::p.adjust(pvals, method="fdr"), 2), signature=length(signature), geneset=n.genesets, overlap=n.hits, left=n.left, hits=sapply(genesets, function(x, y) paste(intersect(x, y), collapse=','), signature.found), stringsAsFactors=FALSE) return(data) } group_refs <- function(refs, groups) { ### Using by-name groupings provided, group unique HGNCs and return grefs <- sapply(groups, function(x) setdiff(unique(unlist(refs[x])), c("", NA))) names(grefs) <- names(groups) return(grefs) } get_degs <- function(data, ident1a, ident1b, ident2a, ident2b){ #'Get diferretailly expressed genes comapring two idnetifiers in the data. #'The identifiers are either at the beginning (ident1a ident2a) or at the end #'of the cell tyep (ident2a, ident2b) #'data is asureat object #'ident1a, ident1b, ident2a, ident2b are strigns. # Empty table fo DEG results all_degs <- data.frame() # Calculate DEGs between colon 24 and mock for each cell type for (i in unique(data$CellTypes)){ print(i) identity1 <- paste0(ident1a, i, ident1b) identity2 <- paste0(ident2a, i, ident2b) identifyers <- c(Idents(data)) selection1 <- Idents(data) == identity1 selection2 <- Idents(data) == identity2 if (length(identifyers[selection1])>2 & length(identifyers[selection2])>2){ # Get DEGs for all cell degs <- FindMarkers(data, ident.1 = identity1, ident.2 = identity2, assay='RNA',test.use = "MAST") # Put gene names as column not row names degs$gene <- rownames(degs) # Add comparison as column degs <- degs %>% mutate(comparison = paste0(paste0(identity1, '_v_',identity2))) # Append to previous results all_degs <- rbind(all_degs, degs) } } return(all_degs) } # Filter for p adj <= 0.05 and lfc >= 0.5 filtering_for_degs <- function (degsf, fdr=0.05, fc=0.5){ degsf <- degsf %>% dplyr::filter((p_val_adj <= fdr) & (abs(avg_log2FC>=fc))) return(degsf) } plot_group_intersect <- function(overrepresentation, map_grefs, title, file_name) { edge_table <- data.frame("Source" = character(0),"Target" = character(0), "Weight" = integer(0), "Overlap percetage" = numeric(0)) node_table <- data.frame("Function" = character(0), "fdr" = numeric(0), "Overlap percentage" = numeric(0), "Size" = integer(0)) #DEGs hits in the over-representation all_deg_hits <- unique(unlist(strsplit(overrepresentation$hits, split = ","))) #Creating the graph tables for (i in 1:length(map_grefs)) { for (j in 1:length(map_grefs)) { if (i != j & i > j ) { groups_isect <- intersect(map_grefs[[i]], map_grefs[[j]]) my.row <- data.frame("Source" = names(map_grefs)[i], "Target" = names(map_grefs)[j], "Weight" = length(groups_isect), "Overlap percetage" = length(intersect(groups_isect,all_deg_hits))/length(groups_isect)) edge_table <- rbind(edge_table, my.row) } } function_size <- length(all_deg_hits) if(function_size == 0) { function_size <- 1 } node_table_row <- data.frame("Function" = names(map_grefs)[i], "fdr" = overrepresentation[names(map_grefs)[i], "fdr"], "Overlap percentage" = length(intersect(map_grefs[[i]], all_deg_hits))/function_size, "Size" = function_size) node_table <- rbind(node_table, node_table_row) } g <- graph_from_data_frame(edge_table, directed=FALSE, vertices=node_table) png(file_name, 3000, 3000) g$layout <- layout_in_circle #The palette is optimized to overlap percentage. If you want it to FDR you need to use the second palette. g_palette = multiRamp(rbind(c(0,0.9),c(0.9,1)),list(c("blue","red"),c("red", "red"))) V(g)$color <- g_palette(V(g)$Overlap.percentage) E(g)$color <- g_palette(edge.attributes(g)$Overlap.percetage) plot.igraph(g, main = title, vertex.label = c("Virus replication cycle","ER stress and \nunfolded protein response", "Autophagy and \nprotein degradation","Apoptosis\n pathway","Renin-angiotensin system", "Coagulopathy\n pathway","PAMP signalling", "Induction of interferons\n and the cytokine storm", "Altered host\n metabolism", "Non-mechanistic diagrams"), edge.width= 0.4*edge_attr(g,"Weight")+3, vertex.size = 60, vertex.size2 = 15, vertex.label.color ="white", edge.curved = FALSE, vertex.label.cex = 4, edge.arrow.size = 6, vertex.shape = "rectangle") dev.off() } creating_overrepresetation_graphs_per_condtions <- function(data, map_grefs, root_dir = "") { results <- list() filterd_condtions <- filtering_for_degs(data) background <- unique(data$gene) cell_types <- unique(filterd_condtions["comparison"])$comparison for (comparison in cell_types) { print(comparison) #Doing the enrichment filterd_condtions <- filtering_for_degs(data, fdr=0.1, fc=0.3) degs <- filterd_condtions[filterd_condtions["comparison"]==comparison, "gene"] result <- get_hypergeometric_pvalue(degs, map_grefs, background) results[[comparison]] <- result plot_group_intersect(result, map_grefs, title = comparison, file_name = paste0(root_dir, comparison,"plot_05_05.png")) write.table(result, paste0(root_dir, comparison,".txt"), quote=FALSE, sep ="\t") } names(results) <- cell_types return(results) }