overrepresentation_analysis_helper_functions.R 10.7 KB
Newer Older
Marek Ostaszewski's avatar
Marek Ostaszewski committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
##################################################
## Helper Functions of over-represetation analysis
## Date: 13.04.2021
## Author: Marek Ostaszewski, Dezso Modos
##################################################

source("https://git-r3lab.uni.lu/covid/models/-/raw/master/Integration/MINERVA_access/minerva_access_functions.R")

### A function retrieving element identifiers from MINERVA
get_mnv_refs <- function(fcomponents, id_type) {
    ### Get MINERVA HGNCs as a named list
    message("Retrieving MINERVA HGNCs collection")
    mnv_refs <- sapply(fcomponents$map_elements, function(x) get_annotation(x$references, "HGNC_SYMBOL"))
    names(mnv_refs) <- fcomponents$models$name
    return(mnv_refs)
}

### For handling Reactome pathways
### Get HGNCs of UniProts
translate_up_hgnc <- function(upid, hard_filter = T) {
    resp <- ask_GET(paste0("https://www.ebi.ac.uk/proteins/api/genecentric/", upid))
    if(is.null(resp)) { return("") }
    res <- fromJSON(resp)
    if(is.null(res$relatedGene)) { return("") }
    ### This is a patchy reference; the correct way would be to get HGNC ID (numeric) and retrieve the symbol based on this
    ### but I'd like to avoid yet another API call
    hgncs <- with(res$relatedGene, geneName[accession == upid & geneNameType == "Gene name"])
    ### Gene symbols retrieved this way may not be HGNCs; we hard filter out non-uppercased or too short ids
    if(hard_filter) {
        hgncs <- hgncs[(nchar(hgncs) > 2) & (toupper(hgncs) == hgncs)]
    }
    return(hgncs)
}

### Get participants of a pathway and translate them into HGNCs
get_reactome_pathway_contents <- function(reactome_path_id) {
    res <- fromJSON(ask_GET(paste0("https://reactome.org/ContentService/data/participants/", reactome_path_id)))
    ids <- sapply(res$refEntities, "[[", "identifier")
    symbols <- sapply(unique(unlist(ids)), translate_up_hgnc)
    return(unlist(unique(symbols)))
}

### A function retrieving element identifiers from WikiPathway diagrams 
get_wp_refs <- function() {
    message("Retrieving WP collection")
    ### This file contains all the pathways in the WP COVID-19 repo
    wpids <- readLines("https://raw.githubusercontent.com/wikipathways/SARS-CoV-2-WikiPathways/master/pathways.txt")
    require(rWikiPathways)
    message("Retrieving WP HGNCs")
    wp_hgncs <- sapply(wpids, getXrefList, "H")
    names(wp_hgncs) <- wpids
    return(wp_hgncs)
}

### A function retrieving element identifiers from Reactome diagrams 
get_rct_refs <- function() {
    message("Retrieving Reactome collection")
    rct_cov_hgncs <- sapply(c("R-HSA-9678110", "R-HSA-9679504", "R-HSA-9679514", "R-HSA-9683701", "R-HSA-9679509"), 
                            get_reactome_pathway_contents)
    rct_cov2_hgncs <- sapply(c("R-HSA-9694614", "R-HSA-9694676", "R-HSA-9694682", "R-HSA-9694635", "R-HSA-9694322"), 
                             get_reactome_pathway_contents)
    return(c(rct_cov_hgncs, rct_cov2_hgncs))
}
get_hypergeometric_pvalue <- function(signature, genesets, background,
                                      verbose = F) {
    ### This function is a modified version of the hypergeometric function from the R package hyperR
    ### (https://github.com/montilab/hypeR/blob/master/R/hyper_enrichment.R). 
    ### The used background in this case is a vector of gene IDs. It checks whether certain genes are in the background. 
    ### Without such check thehypergeomteric tests can understimate the enrichmnet.
    
    if (!is(signature, "vector")) stop("Expected signature to be a vector of symbols\n")
    if (!is(genesets, "list")) stop("Expected genesets to be a list of genesets\n")
    if (!is(background, "vector")) stop("Expected background to be a vector of gene symbols\n")
    if (length(intersect(signature, background))==0) stop ("The singiture is not in the background. Some mapping misstake.\n")
    
    
    signature <- unique(signature)
    genesets <- lapply(genesets, unique)
    background <- unique(background)
    
    signature.found <- signature[signature %in% unique(unlist(genesets))]
    
    n.hits <- sapply(genesets, function(x, y) length(intersect(x, y)), signature.found)
    n.drawn <- length(signature)
    n.genesets <- sapply(genesets, function(x, y) length(intersect(x, y)), background) 
    n.left <- length(background)-n.genesets
    
    print (list(n.hits, n.drawn, n.genesets, n.left))
    
    pvals <- suppressWarnings(stats::phyper(q=n.hits-1,
                                            m=n.genesets,
                                            n=n.left,
                                            k=n.drawn,
                                            lower.tail=FALSE))
    if(verbose) { message(pvals)}
    
    
    data <- data.frame(label=names(genesets),
                       pval=signif(pvals, 2),
                       fdr=signif(stats::p.adjust(pvals, method="fdr"), 2),
                       signature=length(signature),
                       geneset=n.genesets,
                       overlap=n.hits,
                       left=n.left,
                       hits=sapply(genesets, function(x, y) paste(intersect(x, y), collapse=','), signature.found),
                       stringsAsFactors=FALSE)
    return(data)
}

group_refs <- function(refs, groups) {
    ### Using by-name groupings provided, group unique HGNCs and return
    grefs <- sapply(groups, function(x) setdiff(unique(unlist(refs[x])), c("", NA)))
    names(grefs) <- names(groups)
    return(grefs)
}


get_degs <- function(data, ident1a, ident1b, ident2a, ident2b){
    #'Get diferretailly expressed genes comapring two idnetifiers in the data.
    #'The identifiers are either at the beginning (ident1a  ident2a) or at the end 
    #'of the cell tyep (ident2a,  ident2b)
    #'data is asureat object
    #'ident1a, ident1b, ident2a, ident2b are strigns. 
    
    # Empty table fo DEG results
    all_degs <- data.frame()
    
    # Calculate DEGs between colon 24 and mock for each cell type
    for (i in unique(data$CellTypes)){
        print(i)
        
        identity1 <- paste0(ident1a, i, ident1b)
        identity2 <- paste0(ident2a, i, ident2b)
        
        identifyers <- c(Idents(data))
        selection1 <- Idents(data) == identity1
        selection2 <- Idents(data) == identity2
        if (length(identifyers[selection1])>2 & length(identifyers[selection2])>2){
            # Get DEGs for all cell 
            degs <- FindMarkers(data, ident.1 = identity1, ident.2 = identity2, assay='RNA',test.use = "MAST")
            # Put gene names as column not row names
            degs$gene <- rownames(degs)
            # Add comparison as column
            degs <- degs %>% mutate(comparison = paste0(paste0(identity1, '_v_',identity2)))
            
            # Append to previous results
            all_degs <- rbind(all_degs, degs)
        }
    }
    return(all_degs)
}

# Filter for p adj <= 0.05 and lfc >= 0.5
filtering_for_degs <- function (degsf, fdr=0.05, fc=0.5){
    degsf <- degsf %>% dplyr::filter((p_val_adj <= fdr) & (abs(avg_log2FC>=fc)))
    return(degsf)
}

plot_group_intersect <- function(overrepresentation, map_grefs, title, file_name) {
    edge_table <- data.frame("Source" = character(0),"Target" = character(0), 
                             "Weight" = integer(0), "Overlap percetage" = numeric(0))
    node_table <- data.frame("Function" = character(0), "fdr" = numeric(0), 
                             "Overlap percentage" = numeric(0), "Size" = integer(0))
    #DEGs hits in the over-representation
    all_deg_hits <- unique(unlist(strsplit(overrepresentation$hits, split = ",")))
    #Creating the graph tables
    for (i in 1:length(map_grefs)) {
        for (j in 1:length(map_grefs)) {
            if (i != j & i > j ) {
                groups_isect <- intersect(map_grefs[[i]], map_grefs[[j]])
                my.row <- data.frame("Source" = names(map_grefs)[i], "Target" = names(map_grefs)[j], "Weight" = length(groups_isect), 
                                     "Overlap percetage" = length(intersect(groups_isect,all_deg_hits))/length(groups_isect)) 
                edge_table <- rbind(edge_table, my.row) 
            }
        }
        function_size <- length(all_deg_hits)
        if(function_size == 0) { function_size <- 1 }
        node_table_row <- data.frame("Function" = names(map_grefs)[i], 
                                     "fdr" = overrepresentation[names(map_grefs)[i], "fdr"],
                                     "Overlap percentage" = length(intersect(map_grefs[[i]], all_deg_hits))/function_size,
                                     "Size" = function_size)
        node_table <- rbind(node_table, node_table_row)
    }
    
    g <- graph_from_data_frame(edge_table, directed=FALSE, vertices=node_table)
    
    png(file_name, 3000, 3000)
    g$layout <- layout_in_circle
    #The palette is optimized to overlap percentage. If you want it to FDR you need to use the second palette.
    g_palette = multiRamp(rbind(c(0,0.9),c(0.9,1)),list(c("blue","red"),c("red", "red")))
    V(g)$color <- g_palette(V(g)$Overlap.percentage)
    E(g)$color <- g_palette(edge.attributes(g)$Overlap.percetage)
    plot.igraph(g,
                main = title,
                vertex.label = c("Virus replication cycle","ER stress and \nunfolded protein response",
                                 "Autophagy and \nprotein degradation","Apoptosis\n pathway","Renin-angiotensin system",
                                 "Coagulopathy\n pathway","PAMP signalling", 
                                 "Induction of interferons\n and the cytokine storm",
                                 "Altered host\n metabolism", "Non-mechanistic diagrams"),
                edge.width= 0.4*edge_attr(g,"Weight")+3, 
                vertex.size = 60, vertex.size2 = 15,
                vertex.label.color ="white",
                edge.curved = FALSE,
                vertex.label.cex = 4,
                edge.arrow.size = 6,
                vertex.shape = "rectangle")
    dev.off()
}

creating_overrepresetation_graphs_per_condtions <- function(data, map_grefs, root_dir = "") {
    results <- list()
    filterd_condtions <- filtering_for_degs(data)
    background <- unique(data$gene)
    cell_types <- unique(filterd_condtions["comparison"])$comparison
    for (comparison in cell_types) {
        print(comparison)
        #Doing the enrichment
        filterd_condtions <- filtering_for_degs(data, fdr=0.1, fc=0.3)
        degs <- filterd_condtions[filterd_condtions["comparison"]==comparison, "gene"]
        result <- get_hypergeometric_pvalue(degs, map_grefs, background)
        results[[comparison]] <- result
        plot_group_intersect(result, map_grefs, title = comparison, file_name = paste0(root_dir, comparison,"plot_05_05.png"))
        write.table(result, paste0(root_dir, comparison,".txt"), quote=FALSE, sep ="\t")
    }
    names(results) <- cell_types
    return(results)
}