Commit 017b15d8 authored by Marek Ostaszewski's avatar Marek Ostaszewski
Browse files

data processing scripts by Stephane

parent 9a3b5434
Pipeline #44631 passed with stages
in 43 seconds
# Analysis of the colon data with Seurat
# 10X data and targeted assay
#-----------------------
# libraries
#-----------------------
library(reticulate)
library(dplyr)
library(Matrix)
library(Seurat)
library(future)
library(ggplot2)
library(cowplot)
library(viridis)
library(patchwork)
library(readxl)
library(glue)
#-----------------------
# variables
#-----------------------
# working directory
wrkDir <- "PATH_TO_WORKING_DIR"
# path to 10X folder with one subfolder per cellranger run:
inpDir <- "PATH_TO_10X_DATA_DIR"
# atlases
atlasDir <- "PATH_TO_ATLAS_DIR"
# targeted data
targetedDir <- "PATH_TO_TARGETED_DIR"
# output dir
outDir <- "PATH_TO_OUTPUT_DIR"
setwd(wrkDir)
#-----------------------
# Load 10X data
#-----------------------
# load 10X filtered matrices and make Seurat objects
# then concat
print('Loading 10X filtered matrices ...')
# Colon_Mock
studyId <- "Colon_Mock"
filtMat <- Read10X(data.dir = glue("{inpDir}/{studyId}/outs/filtered_feature_bc_matrix/"))
filtMat <- CreateSeuratObject(counts = filtMat,
project = studyId,
min.cells = 3,
min.features = 200)
Colon_Mock <- RenameCells(filtMat, add.cell.id = studyId)
rm(studyId, filtMat)
# Colon_12h
studyId <- "Colon_12h"
filtMat <- Read10X(data.dir = glue("{inpDir}/{studyId}/outs/filtered_feature_bc_matrix/"))
filtMat <- CreateSeuratObject(counts = filtMat,
project = studyId,
min.cells = 3,
min.features = 200)
Colon_12h <- RenameCells(filtMat, add.cell.id = studyId)
rm(studyId, filtMat)
# Colon_24h
studyId <- "Colon_24h"
filtMat <- Read10X(data.dir = glue("{inpDir}/{studyId}/outs/filtered_feature_bc_matrix/"))
filtMat <- CreateSeuratObject(counts = filtMat,
project = studyId,
min.cells = 3,
min.features = 200)
Colon_24h <- RenameCells(filtMat, add.cell.id = studyId)
rm(studyId, filtMat)
# merge sets
#-----------------------
print('Merging sets ...')
MergeSI <- merge(x = Colon_Mock,
y = c(Colon_12h, Colon_24h))
#-----------------------
# Preprocessing
#-----------------------
print('Preprocessing ...')
# QC
#-----------------------
# mitochondrial content
MergeSI[["percent.mt"]] <- PercentageFeatureSet(object = MergeSI,
pattern = "^MT-")
# set tissue name
MergeSI$Test <- ifelse(MergeSI$orig.ident %in% c("Colon_Mock","Colon_12h","Colon_24h"),
"Colon",
"Ileum")
# QC violin plots
VlnPlot(MergeSI,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
group.by = "Test",
ncol = 3,
pt.size = 0)
# cell filtering
#-----------------------
# filter cells, keep cells with:
# - number of genes detected: in [1500; 9000]
# - library size: < 90000 UMI
# - mitochondrial content: < 30%
# number of genes detected: in [1500; 9000]
MergeSI2 <- subset(MergeSI,
subset = nFeature_RNA > 1500 & nFeature_RNA < 9000)
# library size: < 90000 UMI
MergeSI2 <- subset(MergeSI2,
subset = nCount_RNA < 90000)
# mitochondrial content: < 30%
MergeSI2 <- subset(MergeSI2,
subset = percent.mt < 30)
# draw ridgeplot distribution of number of genes detected
RidgePlot(MergeSI2, features = "nFeature_RNA")
# QC violin plots
VlnPlot(MergeSI2,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 3,
pt.size = 0)
# split by time point
#-----------------------
MergeSI.list <- SplitObject(MergeSI2, split.by = "orig.ident")
#-----------------------
# normalisation
#-----------------------
# run SCTransform with lib size and mito content
print('Normalize data sets separately')
for (i in 1:length(MergeSI.list)) {
MergeSI.list[[i]] <- SCTransform(MergeSI.list[[i]],
vars.to.regress = "percent.mt",
verbose = T)
print(i)
}
#-----------------------
# integration
#-----------------------
print('Integrate data sets')
# select features for downstream integration
SI.features <- SelectIntegrationFeatures(object.list = MergeSI.list,
nfeatures = 3000)
# run PrepSCTIntegration to compute Pearson residuals
MergeSI.list <- PrepSCTIntegration(object.list = MergeSI.list,
anchor.features = SI.features,
verbose = T)
# find anchors
print('Find anchors')
# set params for parallelization
#plan("multiprocess", workers = 4)
#plan()
options(future.globals.maxSize = 4000 * 1024^2)
# run FindIntegrationAnchors
SI.anchors <- FindIntegrationAnchors(object.list = MergeSI.list,
normalization.method = "SCT",
anchor.features = SI.features,
verbose = T)
# get names of shared features:
all_features <- lapply(MergeSI.list, row.names) %>%
Reduce(intersect, .)
# integrate data sets:
print('Integrate data sets')
Colon <- IntegrateData(anchorset = SI.anchors,
normalization.method = "SCT",
verbose = T,
features.to.integrate = SI.features2)
#-----------------------
# clustering
#-----------------------
DefaultAssay(Colon) <- "integrated"
# pca
Colon <- RunPCA(Colon, verbose = FALSE,npcs = 50)
ElbowPlot(Colon,ndims = 51)
# clustering
Colon <- FindNeighbors(object = Colon, dims = 1:50)
Colon <- FindClusters(object = Colon, resolution = 0.5)
# dimensionality reduction for visualisation
Colon <- RunUMAP(object = Colon,
dims = 1:50,
seed.use = 2019,
assay = 'integrated',
verbose = T,
features = NULL,
seed.use = 2020)
Colon <- RunTSNE(object = Colon,
dims = 1:50,
seed.use = 2019)
#-----------------------
# cell type annotation
#-----------------------
# - Smillie et al atlas
# - SI atlas
# - organoids internal atlas
# Smillie
#-----------------------
print("Cell type annotation: Smillie et al")
smillie.seurat.integrated <- readRDS(glue("{atlasDir}/smillie.seurat.SCT.rds"))
DefaultAssay(smillie.seurat.integrated) <- 'SCT'
DefaultAssay(Colon) <- 'integrated'
smillie.anchors <- FindTransferAnchors(reference = smillie.seurat.integrated,
query = Colon,
dims = 1:30,
normalization.method = "SCT")
smillie.predictions <- TransferData(anchorset = smillie.anchors,
refdata = smillie.seurat.integrated$Cluster,
dims = 1:30)
Colon <- AddMetaData(Colon,
metadata = smillie.predictions$predicted.id
col.name = "Smillie")
# SI reference
#-----------------------
print("Cell type annotation: SI reference")
Reference <- readRDS(glue("{atlasDir}/SI.reference.rds"))
DefaultAssay(Reference) <- 'integrated'
DefaultAssay(Colon) <- 'integrated'
si.anchors <- FindTransferAnchors(reference = Reference,
query = Colon,
dims = 1:30,
normalization.method = "SCT",
project.query = T)
si.predictions <- TransferData(anchorset = si.anchors,
refdata = Reference$Cell_types,
dims = 1:30)
Colon <- AddMetaData(Colon,
metadata = si.predictions$predicted.id,
col.name ="Ileum_Tissue")
# organoids internal atlas
#-----------------------
print("Cell type annotation: organoids")
Reference <- readRDS(glue("{atlasDir}/Org.reference.rds"))
DefaultAssay(Reference.Org) <- 'integrated'
DefaultAssay(Colon) <- 'integrated'
org.anchors <- FindTransferAnchors(reference = Reference.Org,
query = Colon,
dims = 1:30,
normalization.method = "SCT")
org.predictions <- TransferData(anchorset = org.anchors,
refdata = Reference.Org$Celltype,
dims = 1:30)
Colon <- AddMetaData(Colon,
metadata = org.predictions$predicted.id,
col.name ="Ileum_org")
# write to file
#-----------------------
saveRDS(Colon,"Colon_Final.rds")
rm(Colon)
# resume
# read file back in
# into Colon_H
#-----------------------
Colon_H <- readRDS("Colon_Final.rds")
#-----------------------
# add targeted assay
#-----------------------
tagetGenes <- c("ACE2","APOA4","CA4","CHGB","FABP6","FCGBP","ISG15","LYZ",
"MKI67","OLFM4","COVID19","SLC2A2","SMOC2")
# load data
#-----------------------
loadDGE <- function(Path) {
df <- vroom(Path, delim = "\t",col_names = T)
df <- tibble::column_to_rownames(df,"GENE")
rownames(df)[rownames(df) == "SARS_COV2"] <- "COVID19"
df <- df[tagetGenes,]
colnames(df) <- paste0(colnames(df),"-1")
return(df)
}
# Colon_Mock_TG
glue("{targetedDir}/SI.reference.rds")
Colon_Mock_TG_dge <- loadDGE(glue("{targetedDir}/Colon_Mock/dge.txt"))
# Colon_12h_TG
Colon_12h_TG_dge <- loadDGE(glue("{targetedDir}/Colon_12h/dge.txt"))
# Colon_24h_TG
Colon_24h_TG_dge <- loadDGE(glue("{targetedDir}/Colon_24h/dge.txt"))
colnames(Colon_Mock_TG_dge) <- paste0("Colon_Mock_", colnames(Colon_Mock_TG_dge))
colnames(Colon_12h_TG_dge) <- paste0("Colon_12h_", colnames(Colon_12h_TG_dge))
colnames(Colon_24h_TG_dge) <- paste0("Colon_24h_", colnames(Colon_24h_TG_dge))
# combine:
#-----------------------
Matrix_TG <- cbind(Colon_Mock_TG_dge,
Colon_12h_TG_dge,
Colon_24h_TG_dge)
# names of cells names:
cellsToGet <- colnames(Matrix_TG) %in% colnames(Colon_H) # logical
cellsToGet <- colnames(Matrix_TG)[cellsToGet]) # names
# subset into new object Colon_H_T:
Colon_H_T <- subset(Colon_H, cells = cellsToGet)
# make targeted assay:
#-----------------------
cellsToGet <- colnames(Matrix_TG) %in% colnames(Colon_H_T) # logical
Colon_H_T[["Targeted"]] <- CreateAssayObject(Matrix_TG[, cellsToGet])
# log normalise targeted counts:
#-----------------------
Colon_H_T <- SetAssayData(object = Colon_H_T,
assay = "Targeted",
slot = "data",
new.data = log1p((Colon_H_T@assays$Targeted@data/Colon_H_T$nCount_RNA)*10000))
# copy raw targeted counts:
Colon_H_T$Corrected_COVID19 <- GetAssayData(Colon_H_T,
slot = "counts",
assay = "Targeted")["COVID19",]
# plot expression on embeddings:
FeaturePlot(Colon_H_T, features = "nCount_RNA")
# update targeted normalised counts:
# center by retrieving mean of non-null counts
Colon_H_T$Corrected_COVID19 <- Colon_H_T@meta.data %>%
group_by(orig.ident) %>%
mutate(Corrected_COVID19 =
Corrected_COVID19 - mean(Corrected_COVID19[Corrected_COVID19>0])
) %>%
pull(Corrected_COVID19)
# if Colon_Mock then 0:
Colon_H_T$Corrected_COVID19[Colon_H_T$orig.ident == "Colon_Mock"] <- 0
# if negative then 0:
Colon_H_T$Corrected_COVID19[Colon_H_T$Corrected_COVID19 < 0] <- 0
# if NA then 0:
Colon_H_T$Corrected_COVID19[is.na(Colon_H_T$Corrected_COVID19)] <- 0
# re-normalise after update:
Colon_H_T$Normalized_Corrected_COVID19 <- log1p((Colon_H_T$Corrected_COVID19/
Colon_H_T$nCount_RNA)*10000)
# set data set levels:
Colon_H_T$orig.ident <- factor(Colon_H_T$orig.ident,
levels = c("Colon_Mock","Colon_12h","Colon_24h"))
# set 'Targeted' as default:
DefaultAssay(Colon_H_T) <- "Targeted"
# write to file:
#-----------------------
saveRDS(Colon_H_T, glue("{outDir}/Colon_H_T.rds"))
# Analysis of the illeum data with Seurat
# 10X data and targeted assay
#-----------------------
# libraries
#-----------------------
library(reticulate)
library(dplyr)
library(Matrix)
library(Seurat)
library(future)
library(ggplot2)
library(cowplot)
library(viridis)
library(patchwork)
library(readxl)
library(glue)
#-----------------------
# variables
#-----------------------
# working directory
wrkDir <- "PATH_TO_WORKING_DIR"
# path to 10X folder with one subfolder per cellranger run:
inpDir <- "PATH_TO_10X_DATA_DIR"
# atlases
atlasDir <- "PATH_TO_ATLAS_DIR"
# targeted data
targetedDir <- "PATH_TO_TARGETED_DIR"
# output dir
outDir <- "PATH_TO_OUTPUT_DIR"
setwd(wrkDir)
#-----------------------
# Load 10X data
#-----------------------
# load 10X filtered matrices and make Seurat objects
# then concat
print('Loading 10X filtered matrices ...')
# Colon_Mock
studyId <- "Illeum_Mock"
filtMat <- Read10X(data.dir = glue("{inpDir}/{studyId}/outs/filtered_feature_bc_matrix/"))
filtMat <- CreateSeuratObject(counts = filtMat,
project = studyId,
min.cells = 3,
min.features = 200)
Illeum_Mock <- RenameCells(filtMat, add.cell.id = studyId)
rm(studyId, filtMat)
# Illeum_12h
studyId <- "Illeum_12h"
filtMat <- Read10X(data.dir = glue("{inpDir}/{studyId}/outs/filtered_feature_bc_matrix/"))
filtMat <- CreateSeuratObject(counts = filtMat,
project = studyId,
min.cells = 3,
min.features = 200)
Illeum_12h <- RenameCells(filtMat, add.cell.id = studyId)
rm(studyId, filtMat)
# Illeum_24h
studyId <- "Illeum_24h"
filtMat <- Read10X(data.dir = glue("{inpDir}/{studyId}/outs/filtered_feature_bc_matrix/"))
filtMat <- CreateSeuratObject(counts = filtMat,
project = studyId,
min.cells = 3,
min.features = 200)
Illeum_24h <- RenameCells(filtMat, add.cell.id = studyId)
rm(studyId, filtMat)
# merge sets
#-----------------------
print('Merging sets ...')
MergeSI <- merge(x = Illeum_Mock,
y = c(Illeum_12h, Illeum_24h))
#-----------------------
# Preprocessing
#-----------------------
print('Preprocessing ...')
# QC
#-----------------------
# mitochondrial content
MergeSI[["percent.mt"]] <- PercentageFeatureSet(object = MergeSI,
pattern = "^MT-")
# set tissue name
MergeSI$Test <- ifelse(MergeSI$orig.ident %in% c("Illeum_Mock","Illeum_12h","Illeum_24h"),
"Illeum",
"Ileum")
# QC violin plots
VlnPlot(MergeSI,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
group.by = "Test",
ncol = 3,
pt.size = 0)
# cell filtering
#-----------------------
# filter cells, keep cells with:
# - number of genes detected: in [1500; 9000]
# - library size: < 90000 UMI
# - mitochondrial content: < 30%
# number of genes detected: in [1500; 9000]
MergeSI2 <- subset(MergeSI,
subset = nFeature_RNA > 1500 & nFeature_RNA < 9000)
# library size: < 90000 UMI
MergeSI2 <- subset(MergeSI2,
subset = nCount_RNA < 90000)
# mitochondrial content: < 30%
MergeSI2 <- subset(MergeSI2,
subset = percent.mt < 30)
# draw ridgeplot distribution of number of genes detected
RidgePlot(MergeSI2, features = "nFeature_RNA")
# QC violin plots
VlnPlot(MergeSI2,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 3,
pt.size = 0)
# split by time point
#-----------------------
MergeSI.list <- SplitObject(MergeSI2, split.by = "orig.ident")
#-----------------------
# normalisation
#-----------------------
# run SCTransform with lib size and mito content
print('Normalize data sets separately')
for (i in 1:length(MergeSI.list)) {
MergeSI.list[[i]] <- SCTransform(MergeSI.list[[i]],
vars.to.regress = "percent.mt",
verbose = T)
print(i)
}
#-----------------------
# integration
#-----------------------
print('Integrate data sets')
# select features for downstream integration
SI.features <- SelectIntegrationFeatures(object.list = MergeSI.list,
nfeatures = 3000)
# run PrepSCTIntegration to compute Pearson residuals
MergeSI.list <- PrepSCTIntegration(object.list = MergeSI.list,
anchor.features = SI.features,
verbose = T)
# find anchors
print('Find anchors')
# set params for parallelization
#plan("multiprocess", workers = 4)
#plan()
options(future.globals.maxSize = 4000 * 1024^2)
# run FindIntegrationAnchors
SI.anchors <- FindIntegrationAnchors(object.list = MergeSI.list,
normalization.method = "SCT",
anchor.features = SI.features,
verbose = T)
# get names of shared features:
all_features <- lapply(MergeSI.list, row.names) %>%
Reduce(intersect, .)
# integrate data sets:
print('Integrate data sets')
Illeum <- IntegrateData(anchorset = SI.anchors,
normalization.method = "SCT",
verbose = T,
features.to.integrate = SI.features2)
#-----------------------
# clustering
#-----------------------
DefaultAssay(Illeum) <- "integrated"
# pca
Illeum <- RunPCA(Illeum, verbose = FALSE,npcs = 50)
ElbowPlot(Illeum,ndims = 51)
# clustering
Illeum <- FindNeighbors(object = Illeum, dims = 1:50)
Illeum <- FindClusters(object = Illeum, resolution = 0.5)
# dimensionality reduction for visualisation
Illeum <- RunUMAP(object = Illeum,dims = 1:50,
seed.use = 2019,