Commit 2bb2e901 authored by Marek Ostaszewski's avatar Marek Ostaszewski
Browse files

Merge branch 'repo_update' into 'master'

Repo update

See merge request !305
parents 8e876051 f11162e7
##########################################################
### Normalization of GSE152075 SARS-CoV-2 dataset #####
########################################################
library(pacman)
#### Normalize Data #####
### Load necessary packages
pacman::p_load("here", "rentrez","reutils","hipathia", "biomaRt", "utils",
"stringr", "AnnotationDbi", "org.Hs.eg.db","dplyr","tidyr",
"openxlsx", "preprocessCore", "edgeR")
### Download the file if it doesn't exist
if(!file.exists("GSE152075_raw_counts_GEO.txt.gz")) {
download.file(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE152075&format=file&file=GSE152075%5Fraw%5Fcounts%5FGEO%2Etxt%2Egz",
destfile = "GSE152075_raw_counts_GEO.txt.gz")
}
### Load and clean RNA-seq expression set from raw counts file.
expreset_raw <- read.delim(file = gzfile("GSE152075_raw_counts_GEO.txt.gz"), header = T, sep = " ")
identifiers <- rownames(expreset_raw)
identifiers_df <- data.frame(id = identifiers,
entrez = mapIds(org.Hs.eg.db, keys = identifiers, column = "ENTREZID", keytype = "SYMBOL"),
stringsAsFactors = F) %>% .[!is.na(.$entrez),] ## identify weird codes which do not have entrez ID
expreset_raw <- expreset_raw[rownames(expreset_raw) %in% identifiers_df$id, ] ## here we clean the dataset from pseudogenes and rows with none entrez ids which will not be used by Hipathia
print("read and clean from non entrez codes ...done")
# Explore how our data is organized
hist(as.numeric(expreset_raw[2,]),breaks=100)
var(as.numeric(expreset_raw[2,]))
getVari <- apply(expreset_raw, 1, var)
hist(getVari,100)
# Normalise by TMM with "edgeR" package
dge <- DGEList(counts=expreset_raw)
print("dge...done")
tmm <- calcNormFactors(dge, method="TMM")
print("tmm...done")
logcpm <- cpm(tmm, prior.count=3, log=TRUE)
print("logcpm...done")
### Normalise by Hipathia
gExp = logcpm
gExp = normalize.quantiles(gExp)
rownames(gExp)<- rownames(logcpm)
colnames(gExp) <- colnames(logcpm)
### Construct the metadata file with the info from GEO GSE152075 (Virus vs Control)
metadata <- data.frame(fileName = colnames(gExp),
type = c(rep("Virus",430), rep("Control",54)))
### Translate by Hipathia
trans_data <- translate_data(gExp, "hsa")
#### Export results table for the Cov-Hipathia Disease Maps differential signaling example.
trans_data_export <- tibble::rownames_to_column(as.data.frame(trans_data), var ="X")
write.table(trans_data_export, file = "GSE152075_input_data.tsv", col.names = T, row.names = F, quote = F, sep = "\t")
write.table(metadata, file = "GSE152075_design_matrix.tsv", row.names = F, col.names = F, quote = F, sep = "\t" )
...@@ -7,7 +7,7 @@ output: ...@@ -7,7 +7,7 @@ output:
df_print: paged df_print: paged
--- ---
```{r setup, cache=TRUE} ```{r setup, cache=FALSE}
source("overrepresentation_analysis_helper_functions.R") source("overrepresentation_analysis_helper_functions.R")
library(dplyr) library(dplyr)
library(Seurat) library(Seurat)
...@@ -28,7 +28,7 @@ if(!file.exists("eils-lung.cells.ucsc.edu_meta.tsv")) { ...@@ -28,7 +28,7 @@ if(!file.exists("eils-lung.cells.ucsc.edu_meta.tsv")) {
meta <- read.table("eils-lung.cells.ucsc.edu_meta.tsv", header=T, sep="\t", as.is=T, row.names=1) meta <- read.table("eils-lung.cells.ucsc.edu_meta.tsv", header=T, sep="\t", as.is=T, row.names=1)
### Folder for writing out the results. Modify if necessary ### Folder for writing out the results. Modify if necessary
results.folder = "./lung_rmd_res/" results.folder <- "./lung_rmd_res/"
if(!dir.exists(results.folder)) { dir.create(results.folder) } if(!dir.exists(results.folder)) { dir.create(results.folder) }
``` ```
...@@ -50,7 +50,7 @@ I do not know whether we need to do QC steps on the dataset or is it done. ...@@ -50,7 +50,7 @@ I do not know whether we need to do QC steps on the dataset or is it done.
I will use the dataset as it is I expect that the quality filtering is already done. I will use the dataset as it is I expect that the quality filtering is already done.
Just to make sure let's make some QC plots. Just to make sure let's make some QC plots.
```{r plot Seurat features, cache=TRUE} ```{r plot Seurat features, cache=FALSE}
Seurat::VlnPlot(lung_sc_rna_seq, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) Seurat::VlnPlot(lung_sc_rna_seq, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- Seurat::FeatureScatter(lung_sc_rna_seq, feature1 = "nCount_RNA", feature2 = "percent.mt") plot1 <- Seurat::FeatureScatter(lung_sc_rna_seq, feature1 = "nCount_RNA", feature2 = "percent.mt")
...@@ -97,9 +97,11 @@ Let's set the meta data specific identity :-) ...@@ -97,9 +97,11 @@ Let's set the meta data specific identity :-)
```{r process identities(cntd)} ```{r process identities(cntd)}
Seurat::Idents(object = lung_sc_rna_seq) <- "new.ident" Seurat::Idents(object = lung_sc_rna_seq) <- "new.ident"
``` ```
```{r calculate and write down markers, cache=TRUE} ```{r calculate markers, cache=TRUE}
lung_sc_rna_seq.markers <- Seurat::FindAllMarkers(lung_sc_rna_seq, only.pos = TRUE, min.pct = 0.25, lung_sc_rna_seq.markers <- Seurat::FindAllMarkers(lung_sc_rna_seq, only.pos = TRUE, min.pct = 0.25,
logfc.threshold = 0.25) logfc.threshold = 0.25)
```
```{r write down markers, cache=FALSE}
write.table(lung_sc_rna_seq.markers, file = paste0(results.folder, "Lung_cell_specific_markers.txt"), write.table(lung_sc_rna_seq.markers, file = paste0(results.folder, "Lung_cell_specific_markers.txt"),
sep = "\t", quote=F, row.names = F) sep = "\t", quote=F, row.names = F)
``` ```
......
...@@ -80,7 +80,7 @@ degs_inf_vs_bys_12 <- get_degs(colon, 'Colon_12h_', "_Infected", 'Colon_12h_', " ...@@ -80,7 +80,7 @@ degs_inf_vs_bys_12 <- get_degs(colon, 'Colon_12h_', "_Infected", 'Colon_12h_', "
degs_inf_vs_bys_24 <- get_degs(colon, 'Colon_24h_', "_Infected", 'Colon_12h_', "_Bystander") degs_inf_vs_bys_24 <- get_degs(colon, 'Colon_24h_', "_Infected", 'Colon_12h_', "_Bystander")
``` ```
```{r colon: write results, cache=TRUE} ```{r colon: write results, cache=FALSE}
write.table(degs_12_vs_mock, file = file.path(results.folder,"comparing_all_cells_12h.txt"), write.table(degs_12_vs_mock, file = file.path(results.folder,"comparing_all_cells_12h.txt"),
sep = "\t", quote=F, row.names = F) sep = "\t", quote=F, row.names = F)
write.table(degs_24_vs_mock, file = file.path(results.folder, "comparing_all_cells_24h.txt"), write.table(degs_24_vs_mock, file = file.path(results.folder, "comparing_all_cells_24h.txt"),
...@@ -111,25 +111,33 @@ unique(Idents(illeum)) ...@@ -111,25 +111,33 @@ unique(Idents(illeum))
```{r ileum: calculate DEGs per cell type and write them down, cache=TRUE} ```{r ileum: calculate DEGs per cell type and write them down, cache=TRUE}
degs_inf_vs_mock_12_illeum <- get_degs(illeum, 'Illeum_12h_', "_Infected", 'Illeum_Mock_', "_Non-Infected") degs_inf_vs_mock_12_illeum <- get_degs(illeum, 'Illeum_12h_', "_Infected", 'Illeum_Mock_', "_Non-Infected")
write.table(degs_inf_vs_mock_12_illeum,
file = file.path(results.folder, "degs_inf_vs_mock_illeum_12h.txt"), sep = "\t", quote=F, row.names = F)
degs_inf_vs_mock_24_illeum <- get_degs(illeum, 'Illeum_24h_', "_Infected", 'Illeum_Mock_', "_Non-Infected") degs_inf_vs_mock_24_illeum <- get_degs(illeum, 'Illeum_24h_', "_Infected", 'Illeum_Mock_', "_Non-Infected")
write.table(degs_inf_vs_mock_24_illeum,
file = file.path(results.folder, "degs_inf_vs_mock_illeum_24h.txt"), sep = "\t", quote=F, row.names = F)
degs_bys_vs_mock_12_illeum <- get_degs(illeum, 'Illeum_12h_', "_Bystander", 'Illeum_Mock_', "_Non-Infected") degs_bys_vs_mock_12_illeum <- get_degs(illeum, 'Illeum_12h_', "_Bystander", 'Illeum_Mock_', "_Non-Infected")
write.table(degs_bys_vs_mock_12_illeum,
file = file.path(results.folder, "degs_bys_vs_mock_12h_illeum.txt"), sep = "\t", quote=F, row.names = F)
degs_bys_vs_mock_24_illeum <- get_degs(illeum, 'Illeum_24h_', "_Bystander", 'Illeum_Mock_', "_Non-Infected") degs_bys_vs_mock_24_illeum <- get_degs(illeum, 'Illeum_24h_', "_Bystander", 'Illeum_Mock_', "_Non-Infected")
write.table(degs_bys_vs_mock_24_illeum,
file = file.path(results.folder, "degs_bys_vs_mock_24h_illeum.txt"), sep = "\t", quote=F, row.names = F)
degs_inf_vs_bys_12_illeum <- get_degs(illeum, 'Illeum_12h_', "_Infected", 'Illeum_12h_', "_Bystander") degs_inf_vs_bys_12_illeum <- get_degs(illeum, 'Illeum_12h_', "_Infected", 'Illeum_12h_', "_Bystander")
write.table(degs_inf_vs_bys_12_illeum,
file = file.path(results.folder, "degs_inf_vs_bys_12h_illeum.txt"), sep = "\t", quote=F, row.names = F)
degs_inf_vs_bys_24_illeum <- get_degs(illeum, 'Illeum_24h_', "_Infected", 'Illeum_24h_', "_Bystander") degs_inf_vs_bys_24_illeum <- get_degs(illeum, 'Illeum_24h_', "_Infected", 'Illeum_24h_', "_Bystander")
```
```{r ileum: write results, cache=FALSE}
write.table(degs_inf_vs_mock_12_illeum,
file = file.path(results.folder, "degs_inf_vs_mock_illeum_12h.txt"),
sep = "\t", quote=F, row.names = F)
write.table(degs_inf_vs_mock_24_illeum,
file = file.path(results.folder, "degs_inf_vs_mock_illeum_24h.txt"),
sep = "\t", quote=F, row.names = F)
write.table(degs_bys_vs_mock_12_illeum,
file = file.path(results.folder, "degs_bys_vs_mock_12h_illeum.txt"),
sep = "\t", quote=F, row.names = F)
write.table(degs_bys_vs_mock_24_illeum,
file = file.path(results.folder, "degs_bys_vs_mock_24h_illeum.txt"),
sep = "\t", quote=F, row.names = F)
write.table(degs_inf_vs_bys_12_illeum,
file = file.path(results.folder, "degs_inf_vs_bys_12h_illeum.txt"),
sep = "\t", quote=F, row.names = F)
write.table(degs_inf_vs_bys_24_illeum, write.table(degs_inf_vs_bys_24_illeum,
file = file.path(results.folder, "degs_inf_vs_bys_24h_illeum.txt"), sep = "\t", quote=F, row.names = F) file = file.path(results.folder, "degs_inf_vs_bys_24h_illeum.txt"),
sep = "\t", quote=F, row.names = F)
``` ```
Writing down the cell counts for each type of data. This can be useful to investigate the data. Writing down the cell counts for each type of data. This can be useful to investigate the data.
......
##################################################
## Helper Functions of over-represetation analysis
## Date: 13.04.2021
## Author: Marek Ostaszewski, Dezso Modos
##################################################
source("https://git-r3lab.uni.lu/covid/models/-/raw/master/Integration/MINERVA_access/minerva_access_functions.R")
### A function retrieving element identifiers from MINERVA
get_mnv_refs <- function(fcomponents, id_type) {
### Get MINERVA HGNCs as a named list
message("Retrieving MINERVA HGNCs collection")
mnv_refs <- sapply(fcomponents$map_elements, function(x) get_annotation(x$references, "HGNC_SYMBOL"))
names(mnv_refs) <- fcomponents$models$name
return(mnv_refs)
}
### For handling Reactome pathways
### Get HGNCs of UniProts
translate_up_hgnc <- function(upid, hard_filter = T) {
resp <- ask_GET(paste0("https://www.ebi.ac.uk/proteins/api/genecentric/", upid))
if(is.null(resp)) { return("") }
res <- fromJSON(resp)
if(is.null(res$relatedGene)) { return("") }
### This is a patchy reference; the correct way would be to get HGNC ID (numeric) and retrieve the symbol based on this
### but I'd like to avoid yet another API call
hgncs <- with(res$relatedGene, geneName[accession == upid & geneNameType == "Gene name"])
### Gene symbols retrieved this way may not be HGNCs; we hard filter out non-uppercased or too short ids
if(hard_filter) {
hgncs <- hgncs[(nchar(hgncs) > 2) & (toupper(hgncs) == hgncs)]
}
return(hgncs)
}
### Get participants of a pathway and translate them into HGNCs
get_reactome_pathway_contents <- function(reactome_path_id) {
res <- fromJSON(ask_GET(paste0("https://reactome.org/ContentService/data/participants/", reactome_path_id)))
ids <- sapply(res$refEntities, "[[", "identifier")
symbols <- sapply(unique(unlist(ids)), translate_up_hgnc)
return(unlist(unique(symbols)))
}
### A function retrieving element identifiers from WikiPathway diagrams
get_wp_refs <- function() {
message("Retrieving WP collection")
### This file contains all the pathways in the WP COVID-19 repo
wpids <- readLines("https://raw.githubusercontent.com/wikipathways/SARS-CoV-2-WikiPathways/master/pathways.txt")
require(rWikiPathways)
message("Retrieving WP HGNCs")
wp_hgncs <- sapply(wpids, getXrefList, "H")
names(wp_hgncs) <- wpids
return(wp_hgncs)
}
### A function retrieving element identifiers from Reactome diagrams
get_rct_refs <- function() {
message("Retrieving Reactome collection")
rct_cov_hgncs <- sapply(c("R-HSA-9678110", "R-HSA-9679504", "R-HSA-9679514", "R-HSA-9683701", "R-HSA-9679509"),
get_reactome_pathway_contents)
rct_cov2_hgncs <- sapply(c("R-HSA-9694614", "R-HSA-9694676", "R-HSA-9694682", "R-HSA-9694635", "R-HSA-9694322"),
get_reactome_pathway_contents)
return(c(rct_cov_hgncs, rct_cov2_hgncs))
}
get_hypergeometric_pvalue <- function(signature, genesets, background,
verbose = F) {
### This function is a modified version of the hypergeometric function from the R package hyperR
### (https://github.com/montilab/hypeR/blob/master/R/hyper_enrichment.R).
### The used background in this case is a vector of gene IDs. It checks whether certain genes are in the background.
### Without such check thehypergeomteric tests can understimate the enrichmnet.
if (!is(signature, "vector")) stop("Expected signature to be a vector of symbols\n")
if (!is(genesets, "list")) stop("Expected genesets to be a list of genesets\n")
if (!is(background, "vector")) stop("Expected background to be a vector of gene symbols\n")
if (length(intersect(signature, background))==0) stop ("The singiture is not in the background. Some mapping misstake.\n")
signature <- unique(signature)
genesets <- lapply(genesets, unique)
background <- unique(background)
signature.found <- signature[signature %in% unique(unlist(genesets))]
n.hits <- sapply(genesets, function(x, y) length(intersect(x, y)), signature.found)
n.drawn <- length(signature)
n.genesets <- sapply(genesets, function(x, y) length(intersect(x, y)), background)
n.left <- length(background)-n.genesets
print (list(n.hits, n.drawn, n.genesets, n.left))
pvals <- suppressWarnings(stats::phyper(q=n.hits-1,
m=n.genesets,
n=n.left,
k=n.drawn,
lower.tail=FALSE))
if(verbose) { message(pvals)}
data <- data.frame(label=names(genesets),
pval=signif(pvals, 2),
fdr=signif(stats::p.adjust(pvals, method="fdr"), 2),
signature=length(signature),
geneset=n.genesets,
overlap=n.hits,
left=n.left,
hits=sapply(genesets, function(x, y) paste(intersect(x, y), collapse=','), signature.found),
stringsAsFactors=FALSE)
return(data)
}
group_refs <- function(refs, groups) {
### Using by-name groupings provided, group unique HGNCs and return
grefs <- sapply(groups, function(x) setdiff(unique(unlist(refs[x])), c("", NA)))
names(grefs) <- names(groups)
return(grefs)
}
get_degs <- function(data, ident1a, ident1b, ident2a, ident2b){
#'Get diferretailly expressed genes comapring two idnetifiers in the data.
#'The identifiers are either at the beginning (ident1a ident2a) or at the end
#'of the cell tyep (ident2a, ident2b)
#'data is asureat object
#'ident1a, ident1b, ident2a, ident2b are strigns.
# Empty table fo DEG results
all_degs <- data.frame()
# Calculate DEGs between colon 24 and mock for each cell type
for (i in unique(data$CellTypes)){
print(i)
identity1 <- paste0(ident1a, i, ident1b)
identity2 <- paste0(ident2a, i, ident2b)
identifyers <- c(Idents(data))
selection1 <- Idents(data) == identity1
selection2 <- Idents(data) == identity2
if (length(identifyers[selection1])>2 & length(identifyers[selection2])>2){
# Get DEGs for all cell
degs <- FindMarkers(data, ident.1 = identity1, ident.2 = identity2, assay='RNA',test.use = "MAST")
# Put gene names as column not row names
degs$gene <- rownames(degs)
# Add comparison as column
degs <- degs %>% mutate(comparison = paste0(paste0(identity1, '_v_',identity2)))
# Append to previous results
all_degs <- rbind(all_degs, degs)
}
}
return(all_degs)
}
# Filter for p adj <= 0.05 and lfc >= 0.5
filtering_for_degs <- function (degsf, fdr=0.05, fc=0.5){
degsf <- degsf %>% dplyr::filter((p_val_adj <= fdr) & (abs(avg_log2FC>=fc)))
return(degsf)
}
plot_group_intersect <- function(overrepresentation, map_grefs, title, file_name) {
edge_table <- data.frame("Source" = character(0),"Target" = character(0),
"Weight" = integer(0), "Overlap percetage" = numeric(0))
node_table <- data.frame("Function" = character(0), "fdr" = numeric(0),
"Overlap percentage" = numeric(0), "Size" = integer(0))
#DEGs hits in the over-representation
all_deg_hits <- unique(unlist(strsplit(overrepresentation$hits, split = ",")))
#Creating the graph tables
for (i in 1:length(map_grefs)) {
for (j in 1:length(map_grefs)) {
if (i != j & i > j ) {
groups_isect <- intersect(map_grefs[[i]], map_grefs[[j]])
my.row <- data.frame("Source" = names(map_grefs)[i], "Target" = names(map_grefs)[j], "Weight" = length(groups_isect),
"Overlap percetage" = length(intersect(groups_isect,all_deg_hits))/length(groups_isect))
edge_table <- rbind(edge_table, my.row)
}
}
function_size <- length(all_deg_hits)
if(function_size == 0) { function_size <- 1 }
node_table_row <- data.frame("Function" = names(map_grefs)[i],
"fdr" = overrepresentation[names(map_grefs)[i], "fdr"],
"Overlap percentage" = length(intersect(map_grefs[[i]], all_deg_hits))/function_size,
"Size" = function_size)
node_table <- rbind(node_table, node_table_row)
}
g <- graph_from_data_frame(edge_table, directed=FALSE, vertices=node_table)
png(file_name, 3000, 3000)
g$layout <- layout_in_circle
#The palette is optimized to overlap percentage. If you want it to FDR you need to use the second palette.
g_palette = multiRamp(rbind(c(0,0.9),c(0.9,1)),list(c("blue","red"),c("red", "red")))
V(g)$color <- g_palette(V(g)$Overlap.percentage)
E(g)$color <- g_palette(edge.attributes(g)$Overlap.percetage)
plot.igraph(g,
main = title,
vertex.label = c("Virus replication cycle","ER stress and \nunfolded protein response",
"Autophagy and \nprotein degradation","Apoptosis\n pathway","Renin-angiotensin system",
"Coagulopathy\n pathway","PAMP signalling",
"Induction of interferons\n and the cytokine storm",
"Altered host\n metabolism", "Non-mechanistic diagrams"),
edge.width= 0.4*edge_attr(g,"Weight")+3,
vertex.size = 60, vertex.size2 = 15,
vertex.label.color ="white",
edge.curved = FALSE,
vertex.label.cex = 4,
edge.arrow.size = 6,
vertex.shape = "rectangle")
dev.off()
}
creating_overrepresetation_graphs_per_condtions <- function(data, map_grefs, root_dir = "") {
results <- list()
filterd_condtions <- filtering_for_degs(data)
background <- unique(data$gene)
cell_types <- unique(filterd_condtions["comparison"])$comparison
for (comparison in cell_types) {
print(comparison)
#Doing the enrichment
filterd_condtions <- filtering_for_degs(data, fdr=0.1, fc=0.3)
degs <- filterd_condtions[filterd_condtions["comparison"]==comparison, "gene"]
result <- get_hypergeometric_pvalue(degs, map_grefs, background)
results[[comparison]] <- result
plot_group_intersect(result, map_grefs, title = comparison, file_name = paste0(root_dir, comparison,"plot_05_05.png"))
write.table(result, paste0(root_dir, comparison,".txt"), quote=FALSE, sep ="\t")
}
names(results) <- cell_types
return(results)
}
\ No newline at end of file
...@@ -5,71 +5,34 @@ ...@@ -5,71 +5,34 @@
## Author: Marek Ostaszewski ## Author: Marek Ostaszewski
################################################## ##################################################
create_minerva_overlay <- function(deg_table, map_components, library(dplyr)
min_lfc = 0.3, upper_cap = 3, lower_cap = -3,
to_file = NULL) { ### A helper function to flatten the distribution by assigning values over max = max and under min = min
# Remove low lfc genes trim_tails <- function(x, max, min) {
res <- deg_table[abs(deg_table$avg_log2FC) >= min_lfc,] x[x > max] <- max
# Reduce to minimal data frame x[x < min] <- min
res <- data.frame(name = res$gene, value = res$avg_log2FC) return(x)
res <- res[!is.na(res$name),] }
tres <- table(res$name)
res <- res[!(res$name %in% names(tres[tres > 1])),] create_minerva_overlay <- function(deg_table, name_hgnc_map,
# Cap to upper threshold and normalise positive values cap = 3, to_file = NULL) {
res$value[res$value > upper_cap] <- upper_cap ### Reduce to minimal data frame, cap to thresholds, normalise
res$value[res$value > 0] <- res$value[res$value > 0]/upper_cap res <- transmute(deg_table, name = gene, value = trim_tails(avg_log2FC, cap, -cap)) %>%
# Cap to upper threshold and normalise negative values filter(!is.na(name)) %>%
res$value[res$value < lower_cap] <- lower_cap mutate(value = value/cap)
res$value[res$value < 0] <- res$value[res$value < 0]/abs(lower_cap)
hgnc_refs <- get_components_annotations(map_components, "HGNC_SYMBOL")
all_named_hgncs <- list()
for(h in 1:length(hgnc_refs)) {
names(hgnc_refs[[h]]) <- map_components$map_elements[[h]]$name
hgnc_refs[[h]] <- hgnc_refs[[h]][!is.na(hgnc_refs[[h]]) & sapply(hgnc_refs[[h]], length) > 0]
for(n in names(hgnc_refs[[h]])) {
all_named_hgncs[[n]] <- unique(c(all_named_hgncs[[n]], hgnc_refs[[h]][[n]]))
}
}
overlay <- data.frame(name = names(all_named_hgncs), value = 0)
for(i in 1:nrow(overlay)) {
overlay[i,2] <- mean(res[res$name %in% all_named_hgncs[[i]],"value"])
}
overlay <- overlay[!is.na(overlay$value),] ### Calculate mean value for multiple HGNCs
overlay <- sapply(name_hgnc_map, function(x) mean(res[res$name %in% x,]$value))
overlay <- overlay[!is.nan(overlay)]
if(!is.null(to_file)) { if(!is.null(to_file)) {
# Write a MINERVA-friendly version of the table, return processing-friendly version # Write a MINERVA-friendly version of the table, return processing-friendly version
write.table(overlay, file = to_file, sep = "\t", col.names = T, row.names = F, quote = F) write.table(data.frame(name = names(overlay), value = overlay),
file = to_file, sep = "\t", col.names = T, row.names = F, quote = F)
} }
return(res) return(res)
} }
filter_data <- function(scdf, adjp = 0.1, minlfc = 0.3) {
scdf[abs(scdf$avg_log2FC) > minlfc & scdf$p_val_adj < adjp,]
}
pick_genes <- lapply(system("ls _notgit/rmd_res/all_*", intern = T), read.table, header = T, sep = "\t")
filter_genes <- lapply(pick_genes, filter_data)
mod_names <- sapply(filter_genes, function(x) x$comparison[1])
mod_names <- gsub("_Non-Infected", "_Mock", mod_names)
mod_names <- gsub("_Mock_", "_", mod_names)
mod_names <- gsub("_Illeum_", "_", mod_names)
mod_names <- gsub("_Colon_", "_", mod_names)
mod_names <- gsub("_v_Enterocyte 1_", "_vs_", mod_names)
mod_names <- gsub("_v_Inmature Enterocyte 2_", "_vs_", mod_names)
mod_names <- gsub("_v_Enterocyte 2_", "_vs_", mod_names)
mod_names <- gsub("_v_12h_Enterocyte 2_", "_vs_", mod_names)
mod_names <- gsub("_v_12h_Enterocyte 1_", "_vs_", mod_names)
mod_names <- gsub("_v_24h_Inmature Enterocyte 2_", "_vs_", mod_names)
mod_names <- gsub("_Enterocyte 1_", "_Enterocyte_1_", mod_names)
mod_names <- gsub("_Enterocyte 2_", "_Enterocyte_2_", mod_names)
mod_names <- gsub("_Inmature Enterocyte 2_", "_Imm_Enterocyte_2_", mod_names)
names(filter_genes) <- mod_names
### Helper functions to access and retrieve content from MINERVA-hosted diagrams ### Helper functions to access and retrieve content from MINERVA-hosted diagrams
source("https://git-r3lab.uni.lu/covid/models/-/raw/master/Integration/MINERVA_access/minerva_access_functions.R") source("https://git-r3lab.uni.lu/covid/models/-/raw/master/Integration/MINERVA_access/minerva_access_functions.R")
...@@ -78,24 +41,60 @@ map <- "https://covid19map.elixir-luxembourg.org/minerva/api/" ...@@ -78,24 +41,60 @@ map <- "https://covid19map.elixir-luxembourg.org/minerva/api/"
### Get the components of MINERVA and WikiPathways (see minerva_access.R) ### Get the components of MINERVA and WikiPathways (see minerva_access.R)
map_components <- get_map_components(map) map_components <- get_map_components(map)
### Detailed lists ### Detailed lists
mnv_refs <- get_components_annotations(map_components, "HGNC_SYMBOL") hgnc_refs <- get_components_annotations(map_components, "HGNC_SYMBOL")
mnv_refs <- sapply(mnv_refs, function(x) unique(unlist(x))) ### An element in MINERVA can have many HGNCs; to solve these conflicts
### create a list of name - HGNC mappings; each named element gets a list of associated HGNCs
scores <- data.frame(sapply(filter_genes, function(x) sapply(mnv_refs, function(y) length(intersect(y, x$gene))))) ### these are passed to "create_minerva_overlay"
colnames(scores) <- sapply(filter_genes, function(x) x$comparison[1]) name_hgncs <- list()
scores["overview",] <- colSums(scores) for(h in 1:length(hgnc_refs)) {
names(hgnc_refs[[h]]) <- map_components$map_elements[[h]]$name
hgnc_refs[[h]] <- hgnc_refs[[h]][!is.na(hgnc_refs[[h]]) & sapply(hgnc_refs[[h]], length) > 0]
for(n in names(hgnc_refs[[h]])) {
name_hgncs[[n]] <- unique(c(name_hgncs[[n]], hgnc_refs[[h]][[n]]))
}
}
selected_overlays <- c("Illeum_24h_Imm_Enterocyte_2_Bystander_vs_Mock", ### Read airway DEG files (generated by "SC_analysis_airways.Rmd"), filter and create overlay
"Illeum_24h_Imm_Enterocyte_2_Infected_vs_Mock") ### Airway cells' DEGs are all in the same file, with "cluster" defining their label
if(!file.exists("./lung_rmd_res/Lung_cell_specific_markers.txt")) {
message("File './lung_rmd_res/Lung_cell_specific_markers.txt' does not exist, run 'SC_analysis_airways.Rmd' to generate it.")
} else {
mnv_airway_cells <- read.table(file = "./lung_rmd_res/Lung_cell_specific_markers.txt",
sep = "\t", header = T) %>%
filter(abs(avg_log2FC) > 0.3 & p_val_adj < 0.1)
### We select three "secretory" subtypes to create overlays
ariw_sec1_cells <- create_minerva_overlay(filter(mnv_airway_cells, cluster == "Secretory1"),
name_hgnc_map = name_hgncs, to_file = "airway_secretory1.txt")
ariw_sec2_cells <- create_minerva_overlay(filter(mnv_airway_cells, cluster == "Secretory2"),
name_hgnc_map = name_hgncs, to_file = "airway_secretory2.txt")
ariw_sec3_cells <- create_minerva_overlay(filter(mnv_airway_cells, cluster == "Secretory3"),
name_hgnc_map = name_hgncs, to_file = "airway_secretory3.txt")
}