568Edu_647Sox2.Rmd

---
title: "R Notebook"
output: html_notebook
---

```{r}
library(ggplot2)
library(pheatmap)
library(RColorBrewer)
#display.brewer.all()
library(tidyverse)
library(ggpubr)
library(stringr)
library(ggsignif)
library(openxlsx)
library(viridis)
library(xgboost)
library(data.table)
library(readxl)
library(janitor)
library(dplyr)
library(jcolors)
library(stringr)
library(rstatix)
library(FSA)
library(dunn.test)
```
Data import
```{r}


setwd("//atlas.uni.lux/users/isabel.rosety/GBA/Stainings/Plots/488Ki67_568Edu_647Sox2/")


data1<-read.csv("//atlas.uni.lux/users/isabel.rosety/GBA/Stainings/Plots/488Ki67_568Edu_647Sox2/IR_20220915_488Ki67_568Edu_647Sox2_RS_e51e52e54d30Edu8h_20220915_175904.csv",sep = ";")
data2<-read.csv("//atlas.uni.lux/users/isabel.rosety/GBA/Stainings/Plots/488Ki67_568Edu_647Sox2/IR_20220915_488Ki67_568Edu_647Sox2_RS_e53d30edu8h_20220919_134031.csv",sep = ";")
#data<-read.csv("//atlas.uni.lux/users/isabel.rosety/GBA/Stainings/Plots/488Ki67_568Edu_647Sox2/IR_20220915_488Ki67_568Edu_647Sox2_RS_e52to54d30edu48h_20220915_175904.csv",sep = ";")
data3<-read.csv("//atlas.uni.lux/users/isabel.rosety/GBA/Stainings/Plots/488Ki67_568Edu_647Sox2/IR_20230523_568Edu_647Sox2_RS_b55b56b57d30_20230521_183416.csv",sep = ";")

data = bind_rows(data1,data2,data3) 

tmp=do.call(rbind,strsplit(data$AreaName,"_")) #stringsplit will take the column Areaname and will split this depending on the underscore, then we put them otgeter one ofter the other


data$Condition = tmp[,1]

data %>% 
   mutate(Condition = str_replace_all(Condition, 
            pattern = "MUT", replacement = "GBA_PD")) %>%
     mutate(Condition = str_replace_all(Condition, 
            pattern = "WT", replacement = "CTRL")) ->data
data %>% 
  mutate(CellLine = AreaName) ->data

data %>% 
   mutate(CellLine = str_replace_all(CellLine, 
            pattern = "WT_56", replacement = "CTRL1")) %>%
    mutate(Condition = str_replace_all(Condition, 
            pattern = "WT_39", replacement = "CTRL2"))  %>%
    mutate(CellLine = str_replace_all(CellLine, 
            pattern = "WT_68", replacement = "CTRL3")) %>%
    mutate(CellLine = str_replace_all(CellLine, 
            pattern = "MUT_309", replacement = "PD1")) %>%
    mutate(CellLine = str_replace_all(CellLine, 
            pattern = "MUT_KTI6", replacement = "PD2")) %>%
    mutate(CellLine = str_replace_all(CellLine, 
            pattern = "MUT_SGO1", replacement = "PD3")) ->data


data$TimepointEdu<-paste0(data$Condition,"_",data$Timepoint) #stringr package needed


toselect = c("Barcode","CellLine","Condition","TimepointEdu", "Batch","OrganoidIdx","Day","Batch","EduInSox2MFI","Sox2ByNucAlive","EduByNucAlive","EduAndSox2BySox2")
#toselect = c("Barcode","AreaName","Condition","Batch","OrganoidIdx","Day","Batch","Sox2ByNucAlive","Ki67ByNucAlive","EduByNucAlive","EduAndSox2BySox2","Ki67AndSox2BySox2","Ki67AndEduAndSox2BySox2")


data %>% #same 
  dplyr::select(toselect)->data
feature_names<-colnames(data[,8:ncol(data)])
#feature_names<-"TUNELByNuc"
#Mean of replicates
data%>%
    pivot_longer(feature_names,names_to = "Features", values_to = "Measure") -> data_all_long
data2 = drop_na(data_all_long)  

#Getting the mean of the technical replicates
data2 %>%
  group_by(Condition,CellLine,TimepointEdu,Batch,Features, Barcode,Day) %>%
  summarize(MeanFeatures = mean(Measure))-> data_Mean

#Normalizing to mean of controls
data_Mean %>%
    dplyr::filter(Condition=="CTRL") %>% 
    #group_by(Condition, Day, Features,Batch) %>% 
    group_by(Condition,Features,Batch) %>% #View
    summarise(baseline = mean(MeanFeatures)) %>% #View 
    ungroup() %>%
    dplyr::select(-Condition) %>%
    #full_join(data_Mean, by=c("Day","Features","Batch")) %>%
    full_join(data_Mean, by=c("Features","Batch")) %>% 
    mutate(Foldchange = MeanFeatures /baseline) ->data_all_based
    included_vars =c("CellLine", "Condition","Batch","Barcode","Day") # here we already have the mean of each section
    data_all_based %>% 
    pivot_wider(id_cols=all_of(included_vars),names_from = Features,values_from = Foldchange)->data_all_based_wide2
    

#If I want to get the wide table without normalizing to mean of controls:
data_Mean %>%
   full_join(data_Mean, by=c("Features","MeanFeatures","Batch","CellLine", "TimepointEdu","Barcode", "Condition","Day")) ->data_Mean 
included_vars =c("CellLine", "Condition","Batch","TimepointEdu","Barcode","Day") 
data_Mean %>% 
    pivot_wider(all_of(included_vars),names_from = Features,values_from = MeanFeatures)->data_Mean_wide

#write.csv(data_Mean_wide, file = 'data_all_combined.csv')

```

KW test loop
```{r}

apply(data_Mean_wide [7:ncol(data_Mean_wide)],2,shapiro.test)->shapiro_test #normality hypothesis rejected when p-value <0.05 -> use non parametric

kruskal.test(MeanFeatures ~ Features, data = data_Mean)

kruskal.test(EduAndSox2BySox2 ~ TimepointEdu, data = data_Mean_wide ) # if p-value <0.05, then there is a statistically significant difference between the groups

#Dunn test
#apply(data_Mean_wide [7:ncol(data_Mean_wide)],2,kruskal.test)
assign_significance = function(pvals){
  significance = rep(NA,length(pvals))
  significance[pvals >= 0.05] = 'ns'
  significance[pvals <0.05 & pvals >0.01] = '*' 
  significance[pvals <=0.01 & pvals >0.001] = '**'
  significance[pvals <=0.001 & pvals >0.0001] = '***'
  significance[pvals <=0.0001] = '****'
  return(significance)
}
Features=c("EduAndSox2BySox2")
wanted =c("CTRL_T0 - CTRL_T7","CTRL_T0 - GBA_PD_T0", "CTRL_T7 - GBA_PD_T7","GBA_PD_T0 - GBA_PD_T7")

results = list()

for(feat in Features){
  kpvals = kruskal.test(data_Mean_wide[[feat]],as.factor(data_Mean_wide$TimepointEdu))$p.value #first time run this line and next line to see the order of the features, define feat
  tmp = dunn.test(data_Mean_wide[[feat]],as.factor(data_Mean_wide$TimepointEdu), list=TRUE)
  pvals = tmp$P[match(wanted,tmp$comparisons)]
  padjusted =p.adjust(pvals,method="BH",n=length(wanted)*length(Features))
  results[[feat]] = cbind(wanted,feat,kpvals,pvals,padjusted)
}

table_results = as.data.table(do.call(rbind,results))
table_results$significancekw = assign_significance(as.double(table_results$kpvals))
table_results$significancedt = assign_significance(as.double(table_results$pvals))
table_results$significancedtadj = assign_significance(as.double(table_results$padjusted))
table_results = dplyr::select(table_results,"wanted", "feat", "kpvals","significancekw","pvals","significancedt","padjusted","significancedtadj")

#write.csv(table_results, file = 'KW_table_results.csv')

```

Add group1 and group2
```{r}
#table_results|>column_to_rownames(var="wanted")->table_results
tmp <- str_split_fixed(table_results$wanted, ' - ', 2)
table_results$group1 = tmp[,1]
table_results$group2 = tmp[,2]

```


Test to add significance bars with Kruskal Wallis (loop)
```{r}
for (i in 1:length(Features)) {
data_Mean_wide %>%
  pivot_longer(cols=Features, names_to = "feature", values_to = "value") %>%
  filter(feature %in% Features[i]) %>%
  ggplot( aes(x=factor(TimepointEdu, level = c("CTRL_T0","CTRL_T7", "GBA_PD_T0","GBA_PD_T7")), y=value)) +
  #geom_violin( aes(fill=Condition),show.legend = T, trim=T),
  geom_violin( aes(fill=TimepointEdu),show.legend = T,scale = "width", trim=T)+
  geom_dotplot(binaxis = "y",stackdir = "center",dotsize=0.5)+
    scale_fill_manual(values= c("#1565C0",alpha(("#1565C0"),0.4),"#CC0000",alpha(("#CC0000"),0.4)),name = "TimepointEdu")+
    #scale_y_continuous(expand = c(0, 0))+
    ylim(0, 0.09)+
    scale_color_jcolors("pal7")+
    theme(legend.key=element_blank()) +
    stat_pvalue_manual(table_results, xmin = "group1", label="significancedtadj", y.position = 0.07, step.increase = 0.09,size=8, hide.ns =F)+
    
    #facet_grid(~fct_relevel(Day, "d30","d60"), scales="free") +
    labs(x     = "",
       y     = paste(Features[i]),
       #y     = paste(names[i]),
       fill  = "Condition",
       #title = paste(feature_names[i])) +
       title = "" )+
    theme_bw() +
    theme(
    axis.line = element_line(colour = 'black', linewidth = 0.5) ,
    axis.title.x = element_blank(),
    axis.text.x = element_text(size=8, color="black"),
    axis.title.y = element_text(size = 12),
    axis.text.y = element_text(size=10, color="black"),
    axis.ticks.y = element_line(),
    axis.ticks.length=unit(.25, "cm"),
    #change legend text font size)
    #legend.key.size = unit(0.7, "cm"),
    #legend.key.width = unit(0.6,"cm"),
    legend.key=element_blank(),
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    plot.title = element_text(size = 20, hjust=0.5, vjust= 1, face = "bold"),
    plot.subtitle = element_blank(),#element_text(size = 2, hjust=0.5)
    strip.text = element_text(size=12, vjust=0.5),
    strip.background = element_rect(fill="lightgray"),
   # panel.border = element_rect(fill = NA, color = "black"),
    panel.spacing.y = unit(0.8, "lines"),
    strip.switch.pad.wrap=unit(20, "lines"),
    legend.position="right",
    legend.text = element_text(size=17),
    legend.title = element_text(size=19)
    )  -> p
  print(p)
  #t<- cowplot::ggdraw(cowplot::add_sub(p, "Wilcox-test, ***p=0.001, **p=0.01, *p=0.05",hjust=-0.2, size=13))
  ##ggsave(paste0(Sys.Date(),"_", names[i], ".pdf"), plot=t)
#ggsave(paste0(Sys.Date(),(sprintf("Plot_%s_Edu_All_Timepoints_test_ViolinTrimmed.pdf",feature_names[i]))),height=3,width=5)
}

```