Upload New File

e32c8424 · Alberto Valdeolivas · bffedeb7 · e32c8424
Commit e32c8424 authored 3 years ago by Alberto Valdeolivas
--- a/02_Carnival_Transcriptomics_phosphoprotemicsMann.Rmd
+++ b/02_Carnival_Transcriptomics_phosphoprotemicsMann.Rmd
+---
+title: "CARNIVAL for causal integration of Phospho and transcriptomics data after SARS-CoV-2 infection"
+author: "Alberto Valdeolivas: alberto.valdeolivas@bioquant.uni-heidelberg.de; Date:"
+date: "09/07/2020"
+output: github_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+### License Info
+
+This program is free software: you can redistribute it and/or modify it under 
+the terms of the GNU General Public License as published by the Free Software 
+Foundation, either version 3 of the License, or (at your option) any later 
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY 
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 
+A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+Please check http://www.gnu.org/licenses/.
+
+## Introduction
+
+The present script computes TF activity from transcriptomics data on the A549 
+human lung derived cell line after SARS-CoV-2 infection. It also takes kinase
+activity derived from phosphoproteomics data in the same cell line. Both 
+experiments were conducted 24 hours after SARS-CoV-2 infection, but they come 
+from two different independent publications (see references)
+
+## Getting the inputs for CARNIPHAL  
+
+We first load the required libraries and define a function to export CARNIVAL
+results to cytoscape.
+
+```{r, message=FALSE, warning=FALSE}
+library(readr)
+library(DESeq2)
+library(CARNIVAL)
+library(OmnipathR)
+library(progeny)
+library(dorothea)
+library(dplyr)
+library(readr)
+library(tidyr)
+library(tibble)
+library(ggplot2)
+
+## We also define a function to format the CARNIVAL output to cytoscape
+OutputCyto <- function(CarnivalResults, outputFile) {
+    CarnivalNetwork <- 
+        as.data.frame(CarnivalResults$weightedSIF, stringsAsFactors = FALSE) %>%
+        dplyr::mutate(Sign = as.numeric(Sign), Weight = as.numeric(Weight)) %>% 
+        dplyr::mutate(Weight = Sign * Weight) %>%
+        dplyr::select(Node1, Weight, Node2)
+        
+    CarnivalNetworkNodes <- 
+        unique(c(CarnivalNetwork$Node1,CarnivalNetwork$Node2))
+    
+    CarnivalAttributes <- CarnivalResults$nodesAttributes %>% 
+        as.data.frame() %>%
+        dplyr::filter(Node %in% CarnivalNetworkNodes) %>%
+        dplyr::mutate(NodeType = as.character(NodeType)) %>%
+        dplyr::mutate(NodeType=if_else(NodeType =="", "I", NodeType))
+            
+    nameOutputNetwork <- paste0(outputFile, "Network.sif")
+    nameOutputAttributes <-  paste0(outputFile, "Attributes.txt")    
+    
+    write.table(CarnivalNetwork, file = nameOutputNetwork,
+        quote = FALSE, row.names = FALSE, col.names = FALSE, sep = " ")
+    
+    write.table(CarnivalAttributes, file = nameOutputAttributes,
+        quote = FALSE, row.names = FALSE, col.names = TRUE, sep = "\t")
+}
+```
+
+### Differential expression analysis on the transcriptomics data
+
+We take the series 5 from the following dataset (A549 mock treated versus 
+SARS-CoV-2 infected): 
+
+<https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE147507>
+
+which is related to the following publication:
+
+<https://www.cell.com/cell/pdf/S0092-8674(20)30489-X.pdf?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS009286742030489X%3Fshowall%3Dtrue>
+
+```{r, message=FALSE}
+## Raw counts table
+GSE147507_raw_counts <- 
+    read.csv("TranscriptomicsData/GSE147507_RawReadCounts_Human.tsv", sep = "\t")
+
+count_A549vsCOV2_df <- GSE147507_raw_counts[,c(22:27)]
+row.names(count_A549vsCOV2_df) <- GSE147507_raw_counts$X
+
+## Define conditions
+targets_A549vsCOV2 <- 
+    as.data.frame(matrix(NA,length(names(count_A549vsCOV2_df)),1))
+names(targets_A549vsCOV2) <- c("condition")
+row.names(targets_A549vsCOV2) <- names(count_A549vsCOV2_df)
+targets_A549vsCOV2$condition <- 
+    gsub("Series5_", "", row.names(targets_A549vsCOV2))
+targets_A549vsCOV2$condition <- 
+    factor(gsub("_[1-3]$", "", targets_A549vsCOV2$condition))
+targets_A549vsCOV2
+
+## Create deseq2 object
+dds_A549vsCOV2 <- 
+    DESeqDataSetFromMatrix(countData = as.matrix(count_A549vsCOV2_df), 
+    colData = targets_A549vsCOV2, design = ~ condition)
+
+## Set control
+dds_A549vsCOV2$condition <- relevel(dds_A549vsCOV2$condition, 
+    ref = levels(targets_A549vsCOV2$condition)[1])
+
+## Carry out diff exp
+dds_A549vsCOV2 <- DESeq(dds_A549vsCOV2)
+
+## See the comparisons carried out
+comparison_A549vsCOV2 <- resultsNames(dds_A549vsCOV2)
+
+## Get results table
+results_A549vsCOV2 <- 
+    results(dds_A549vsCOV2, name=comparison_A549vsCOV2[2])
+```
+
+### Pathway activity estimation using Progeny
+
+We first estimate the pathway activity using the Progeny package. In particular,
+we compute the normalised enriched score (NEs) of the different pathways by 
+running Progeny using the statistic from the differential express analysis.
+
+```{r}
+dds_A549vsCOV2_df <- as.data.frame(results_A549vsCOV2) %>% 
+    rownames_to_column(var = "GeneID") %>% 
+    dplyr::select(GeneID, stat) %>% 
+    dplyr::filter(!is.na(stat)) %>% 
+    column_to_rownames(var = "GeneID") 
+
+pathways_A549vsCOV2_zscore <- t(progeny(as.matrix(dds_A549vsCOV2_df), 
+    scale=TRUE, organism="Human", top = 100, perm = 10000, z_scores = TRUE))
+colnames(pathways_A549vsCOV2_zscore) <- "NES"
+
+## I also need to run progeny in such a way to have values between 1 and -1 to
+## use as CARNIVAL input
+pathways_A549vsCOV2_zscore_inputCarnival <- 
+  t(progeny(as.matrix(dds_A549vsCOV2_df), 
+    scale=TRUE, organism="Human", top = 100, perm = 10000, z_scores = FALSE))
+colnames(pathways_A549vsCOV2_zscore_inputCarnival) <- "Activity"
+```
+
+We now display the normalized enrichment scores (NES) in a bar plot.
+
+```{r, dpi=300}
+pathways_A549vsCOV2_zscore_df <- as.data.frame(pathways_A549vsCOV2_zscore) %>% 
+    rownames_to_column(var = "Pathway") %>%
+    dplyr::arrange(NES) %>%
+    dplyr::mutate(Pathway = factor(Pathway))
+
+ggplot(pathways_A549vsCOV2_zscore_df,aes(x = reorder(Pathway, NES), y = NES)) + 
+    geom_bar(aes(fill = NES), stat = "identity") +
+    scale_fill_gradient2(low = "darkblue", high = "indianred", 
+        mid = "whitesmoke", midpoint = 0) + 
+    theme_minimal() +
+    theme(axis.title = element_text(face = "bold", size = 12),
+        axis.text.x = 
+            element_text(angle = 45, hjust = 1, size =10, face= "bold"),
+        axis.text.y = element_text(size =10, face= "bold"),
+        panel.grid.major = element_blank(), 
+        panel.grid.minor = element_blank()) +
+    xlab("Pathways")
+```
+
+### Transcription Factor activity with Dorothea and Viper
+
+Now, we estimate the transcription factor (TF) activity using the dorothea 
+package. We select Dorothea interactions with confidence level A, B and C.
+
+```{r}
+## We load Dorothea Regulons
+data(dorothea_hs, package = "dorothea")
+regulons <- dorothea_hs %>%
+  dplyr::filter(confidence %in% c("A", "B","C"))
+```
+
+We run Viper using the statistic from the different expression analysis.
+
+```{r, message=FALSE}
+dds_A549vsCOV2_stat <-  as.data.frame(results_A549vsCOV2) %>% 
+    rownames_to_column(var = "GeneID") %>% 
+    dplyr::select(GeneID, stat) %>% 
+    dplyr::filter(!is.na(stat)) %>% 
+    column_to_rownames(var = "GeneID") %>%
+    as.matrix()
+
+tf_activities_A549vsCOV2_stat <- 
+    dorothea::run_viper(as.matrix(dds_A549vsCOV2_stat), regulons,
+    options =  list(minsize = 5, eset.filter = FALSE, 
+    cores = 1, verbose = FALSE, nes = TRUE))
+```
+
+We now display the top 25 normalized enrichment scores (NES) for the TF in a 
+bar plot.
+
+```{r, dpi=300}
+tf_activities_A549vsCOV2_top25 <- tf_activities_A549vsCOV2_stat %>%
+    as.data.frame() %>% 
+    rownames_to_column(var = "GeneID") %>%
+    dplyr::rename(NES = "stat") %>%
+    dplyr::top_n(25, wt = abs(NES)) %>%
+    dplyr::arrange(NES) %>% 
+    dplyr::mutate(GeneID = factor(GeneID))
+
+ggplot(tf_activities_A549vsCOV2_top25,aes(x = reorder(GeneID, NES), y = NES)) + 
+    geom_bar(aes(fill = NES), stat = "identity") +
+    scale_fill_gradient2(low = "darkblue", high = "indianred", 
+        mid = "whitesmoke", midpoint = 0) + 
+    theme_minimal() +
+    theme(axis.title = element_text(face = "bold", size = 12),
+        axis.text.x = 
+            element_text(angle = 45, hjust = 1, size =10, face= "bold"),
+        axis.text.y = element_text(size =10, face= "bold"),
+        panel.grid.major = element_blank(), 
+        panel.grid.minor = element_blank()) +
+    xlab("Transcription Factors")
+```
+
+### Reading Kinase Activity
+
+We here read the kinase activity computed by Danish Memon <memon@ebi.ac.uk> from
+the phosphoproteomics data extracted from the following publication:
+
+<https://www.biorxiv.org/content/10.1101/2020.06.17.156455v1>
+
+```{r, message=FALSE}
+## We take the kinase activity after 24 hours to be in line with the transcriptomics
+## data
+DIA_kinase_activity <- read_csv("Mann/diaCovidKinaseActMat.csv") %>% 
+    dplyr::rename(kinase = "X1") %>% 
+    dplyr::select(kinase, `24h`)
+```
+
+
+### Generating the Prior Knowledge Network from Omnipath
+
+We use the OmnipathR package to fetch the Omnipath database and generate the 
+prior knowledge network. We take the signed and directed protein-protein 
+interactions. 
+
+```{r}
+ia_omnipath <- import_omnipath_interactions() %>% as_tibble()
+ia_pwextra <- import_pathwayextra_interactions() %>% as_tibble()
+ia_kinaseextra <- import_kinaseextra_interactions() %>% as_tibble()
+
+## We bind the datasets
+interactions <- as_tibble(
+    bind_rows(
+        ia_omnipath %>% mutate(type = 'ppi'),
+        ia_pwextra %>% mutate(type = 'ppi'),
+        ia_kinaseextra %>% mutate(type = 'ppi')))
+
+signed_directed_interactions <- 
+    dplyr::filter(interactions, consensus_direction==1) %>%
+    filter(consensus_stimulation == 1 | consensus_inhibition == 1) %>% 
+    dplyr::mutate(sign = if_else(consensus_stimulation==1,1,-1))  %>%
+    dplyr::select(source_genesymbol, sign,  target_genesymbol) %>%
+    dplyr::rename(source ="source_genesymbol", target ="target_genesymbol")
+
+carnival_pkn <- signed_directed_interactions %>%
+    dplyr::distinct(source, target, .keep_all = TRUE)
+
+all_source_nodes <- unique(carnival_pkn$source)
+all_target_nodes <- unique(carnival_pkn$target)
+all_nodes_network <- unique(c(all_source_nodes,all_target_nodes))
+```
+
+## Results
+
+We are going to run CARNIVAL twice using the TF activity as the CARNIVAL 
+measurements and the kinase activities as perturbations. In the second run,
+we will additionally use pathway scores from progeny as weights for the network. 
+For these runs, we are going to select the top 30 TFs and the top 10 kinases. 
+
+```{r}
+top_10_kinases <- DIA_kinase_activity %>% 
+    dplyr::filter(kinase %in% all_source_nodes) %>% 
+    dplyr::top_n(10,abs(`24h`)) %>%  
+    tibble::column_to_rownames(var = "kinase") %>% 
+    t() %>% sign() %>% as.data.frame()
+
+top_30_tf <- tf_activities_A549vsCOV2_stat %>% 
+  as.data.frame() %>% 
+  rownames_to_column(var = "GeneID") %>%
+  dplyr::filter(GeneID %in% all_nodes_network) %>%
+  dplyr::arrange(desc(abs(stat))) %>%
+  dplyr::top_n(30, wt = abs(stat)) %>%
+  column_to_rownames(var = "GeneID") %>%
+  t() %>% as.data.frame()
+
+progeny_weigths <- pathways_A549vsCOV2_zscore_inputCarnival %>% 
+    as.data.frame() %>% t()
+```
+
+### CARNIVAL without progeny weights 
+
+```{r, message=FALSE, warning=FALSE, eval=FALSE}
+carnival_results_noprogeny <-runCARNIVAL(
+    solverPath="/opt/ibm/ILOG/CPLEX_Studio129/cplex/bin/x86-64_linux/cplex",
+    netObj=carnival_pkn,
+    measObj=top_30_tf,
+    inputObj = top_10_kinases,
+    # dir_name="Carnival_Results",
+    # weightObj=as.data.frame(t(Kin_activity_LNCaP_noInhib_t1_EGF)),
+    # nodeID = 'gene',
+    timelimit = 900,
+    solver = "cplex")
+saveRDS(carnival_results_noprogeny, 
+    file = "Carnival_Results/carnival_results_noprogeny.rds")
+OutputCyto(carnival_results_noprogeny, 
+    outputFile="Carnival_Results/carnival_results_noprogeny")
+```
+
+Network with the results: Rectangles are the most active transcription factors 
+after infection and the inverse triangles are the most active kinases. Ellipses 
+are signaling intermediates proteins linking those kinsases and TFs. 
+Red means positive activity or overphosphorylation after infection and blue the 
+opposite. 
+
+
+<br><br>
+![](Carnival_Results/carnival_results_noprogenyNetwork.sif.png)
+<br><br>
+
+### CARNIVAL with progeny weights
+
+```{r, message=FALSE, warning=FALSE, eval=FALSE}
+carnival_results_withprogeny <-runCARNIVAL(
+    solverPath="/opt/ibm/ILOG/CPLEX_Studio129/cplex/bin/x86-64_linux/cplex",
+    netObj=carnival_pkn,
+    measObj=top_30_tf,
+    inputObj = top_10_kinases,
+    # dir_name="Carnival_Results",
+    weightObj=progeny_weigths,
+    # nodeID = 'gene',
+    timelimit = 900,
+    solver = "cplex")
+saveRDS(carnival_results_withprogeny, 
+    file = "Carnival_Results/carnival_results_withprogeny.rds")
+OutputCyto(carnival_results_withprogeny, 
+    outputFile="Carnival_Results/carnival_results_withprogeny")
+```
+
+<br><br>
+![](Carnival_Results/carnival_results_withprogenyNetwork.sif.png)
+<br><br>
+
+## Session Info Details
+
+```{r, echo=FALSE, eval=TRUE}
+sessionInfo()
+```
\ No newline at end of file