From 11aebe6f0cde426edbc37083d56e2af43d005550 Mon Sep 17 00:00:00 2001 From: Patrycja Mulica <patrycja.mulica@uni.lu> Date: Wed, 8 Feb 2023 14:51:14 +0100 Subject: [PATCH] Update Figure3-4-5/rnaseq_analyses_patrycja.R --- Figure3-4-5/rnaseq_analyses_patrycja.R | 228 +++++++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 Figure3-4-5/rnaseq_analyses_patrycja.R diff --git a/Figure3-4-5/rnaseq_analyses_patrycja.R b/Figure3-4-5/rnaseq_analyses_patrycja.R new file mode 100644 index 0000000..1e92ee1 --- /dev/null +++ b/Figure3-4-5/rnaseq_analyses_patrycja.R @@ -0,0 +1,228 @@ +# +# RNA-seq data pre-processing and analysis +# + +library('Rsubread') +# Rsubread 2.0.1 + +############################ +# Step 1: aligning the reads +############################ + +# retrieve fastq files +fastq.files <- list.files(path = "/mnt/data/AnneProjects", pattern = ".fq.gz$", full.names = TRUE, recursive = TRUE) + +# check that all files are present +print(fastq.files) + +# replace fq.gz by bam in file name +targets = data.frame(inputfile=fastq.files, outputfile=gsub("fq.gz","bam",fastq.files)) + +r1files = as.matrix(targets$inputfile)[grep("\\_1\\.fq",as.matrix(targets$inputfile))] +r2files = as.matrix(targets$inputfile)[grep("\\_2\\.fq",as.matrix(targets$inputfile))] + +outfiles = gsub("fq.gz","bam", gsub("\\_1\\.fq","\\.fq",r1files)) + +align(index="new_index",readfile1=r1files, readfile2=r2files, output_file=as.character(outfiles),output_format="BAM", nthreads=10) + + + +############################ +# Step 2: Quality check +############################ + + +bam.files <- list.files(path = "/mnt/data/AnneProjects", pattern = ".bam$", full.names = TRUE, recursive = TRUE) +print(bam.files) + +props <- propmapped(files=bam.files) +print(props) + + + +# Extract quality scores for individual fastq.files (one example) +qs <- qualityScores(filename=fastq.files[1],nreads=100) + + + +############################# +# Step 3: Determine counts +############################# + + +# version: +gtf <- list.files("./",pattern=".*gtf$",full.names = T) + +# +# ispairedEnd = TRUE +# + +fc <- featureCounts(files = as.character(outfiles), GTF.featureType="exon",GTF.attrType="gene_name", annot.ext=gtf, isGTFAnnotationFile=TRUE, isPairedEnd=TRUE, nthreads=10) + + + +############################## +# Functions for data analysis +############################## + +# +# PLS-DA ploot +# + +plsdaplot = function(combdat, phenotype, filestr="plsda_plot.pdf"){ + + comblab = match(phenotype, unique(phenotype)) + comblabunq = unique(comblab) + + const = which(apply(combdat, 1, var)==0) + combfilt = combdat + + if(length(const)) + combfilt = combdat[-const,] + + # variance filter + vars = apply(combfilt, 1, var) + var1000 = order(vars, decreasing=T)[1:1000] + + source("osplda_rainbow.R") + + require('bitops') + require('pls') + mods <- make.OSC.PLS.model(comblab, pls.data = as.data.frame(t((combfilt[var1000,]))), comp = 2, OSC.comp = 0, validation = "LOO", method = "oscorespls", cv.scale = FALSE, progress = FALSE) + pdf(filestr) + plot.OSC.results(mods, plot = "scores", groups = phenotype) + dev.off() + +} + + +# +# Volcano plot +# +plotvolcano = function(dat, logfc, fdr, logfc_thresh1 = 1, fdr_thresh = 0.05, lablogfc_thresh = 1.5, labfdr_thresh = 0.001, cexlab=2, filestr="volcano_plot_with_labels.pdf") +{ + mat = data.frame(x=logfc, y=-log10(fdr)) + matorange = data.frame(x=logfc, y=-log10(fdr))[which(abs(logfc)>logfc_thresh1),] + matgreen = data.frame(x=logfc, y=-log10(fdr))[intersect(which(fdr<fdr_thresh),which(abs(logfc)>logfc_thresh1)),] + matred =data.frame(x=logfc, y=-log10(fdr))[which(fdr<fdr_thresh),] + + matlab = data.frame(x=logfc, y=-log10(fdr))[intersect(which(fdr<labfdr_thresh),which(abs(logfc)>lablogfc_thresh)),] + + + topgenes = rownames(dat) + matlabels = topgenes[intersect(which(fdr<labfdr_thresh),which(abs(logfc)>lablogfc_thresh))] + + + if(!require('ggrepel')) + { + install.packages("ggrepel") + } + require('ggplot2') + # write volcano plot to pdf + pdf(filestr) + p = ggplot(mat, aes(x, y)) + geom_point(color = 'gray', size=0.8, fill="white") + theme_classic(base_size = 16) + geom_point(data = matorange, color="orange") + geom_point(data = matred, color="red") + geom_point(data = matgreen, color="green") + geom_text_repel(data = matlab, label=matlabels, nudge_y=0.1, size=cexlab) + xlab('Log fold change') + ylab('-log10(P-value)') + theme(axis.text=element_text(size=12), axis.title=element_text(size=12,face="bold")) + print(p) + dev.off() + +} + +# +# Differential expression analysis +# +differential_analyses <- function(countsdat, condition1_ids, condition2_ids, condition1_name, condition2_name, filename_prefix="case_vs_control", logfc_thresh1 = 1, use_nominal = FALSE, fdr_thresh = 0.05, lablogfc_thresh = 0.5, labfdr_thresh = 0.05, cex.axis = 1, filtmean=NULL){ + + require('DESeq2') + + # convert outcome variable into factor (e.g. "PD" vs. "control") + outcome = c(rep(condition1_name,length(condition1_ids)),rep(condition2_name,length(condition2_ids))) + + + cts = countsdat[,c(condition1_ids,condition2_ids)] + colnames(cts)=outcome + + coldata = factor(outcome, levels=c(condition1_name, condition2_name)) + names(coldata)=outcome + + dds = DESeqDataSetFromMatrix(countData = cts, colData = DataFrame(coldata), design = ~coldata) + + + library("edgeR") + dge <- DGEList(counts=cts) + keep <- filterByExpr(dge, design=model.matrix(~0+outcome)) + + dds_filt <- dds[keep,] + + topgenes_rnaseq <- DESeq(dds_filt) + + res_case_vs_control <- results(topgenes_rnaseq) + + res_case_vs_control <- as.matrix(res_case_vs_control[order(res_case_vs_control$pvalue),]) + + if(!is.null(filtmean)){ + res_case_vs_control = res_case_vs_control[-which(res_case_vs_control[,1] < filtmean),] + } + + + pdf( paste(filename_prefix,"_boxplots_top100.pdf",collapse=""), width=7, height=7) + par(mfrow=c(2,2)) + for(i in 1:100) + boxplot(as.matrix(cts[match(rownames(res_case_vs_control),rownames(cts))[i],]) ~ outcome, col=c("lightblue","pink"), main=rownames(res_case_vs_control)[i], ylab="preprocessed expression count", xlab="", cex.axis=cex.axis, outline=T) + dev.off() + + # ranking table + write.xlsx(cbind(round(res_case_vs_control[,1:4], 3), format(res_case_vs_control[,5:6], digits=3)), paste(filename_prefix,"_ranking.xlsx",collapse="")) + + if(!use_nominal){ + plotvolcano(res_case_vs_control, res_case_vs_control[,"log2FoldChange"], res_case_vs_control[,"padj"], logfc_thresh1 = logfc_thresh1, fdr_thresh = fdr_thresh, lablogfc_thresh = lablogfc_thresh, labfdr_thresh = labfdr_thresh, filestr=paste(filename_prefix,"_volcano_plot_with_labels.pdf",collapse="")) + } else { + plotvolcano(res_case_vs_control, res_case_vs_control[,"log2FoldChange"], res_case_vs_control[,"pvalue"], logfc_thresh1 = logfc_thresh1, fdr_thresh = fdr_thresh, lablogfc_thresh = lablogfc_thresh, labfdr_thresh = labfdr_thresh, filestr=paste(filename_prefix,"_volcano_plot_with_labels.pdf",collapse="")) + } + + return(res_case_vs_control) +} + + + +############################## +# Astrocyte analysis +############################## + +library(xlsx) +astrocytes_annot <- xlsx::read.xlsx("samples_astrocytes.xlsx", 1) + +astrocytes_samp = grep("Oksanen|Palm",colnames(fc$counts)) + +phenotype = rep("oksanen_t129",length(astrocytes_samp)) +phenotype[grep("48.*Oksanen",colnames(fc$counts)[astrocytes_samp])] = rep("oksanen_48",length(grep("48.*Oksanen",colnames(fc$counts)[astrocytes_samp]))) +phenotype[grep("T12.*Palm",colnames(fc$counts)[astrocytes_samp])] = rep("palm_t129",length(grep("T12.*Palm",colnames(fc$counts)[astrocytes_samp]))) +phenotype[grep("48.*Palm",colnames(fc$counts)[astrocytes_samp])] = rep("palm_48",length(grep("48.*Palm",colnames(fc$counts)[astrocytes_samp]))) + +print(table(phenotype)) +#phenotype +# oksanen_48 oksanen_t129 palm_48 palm_t129 +# 3 3 3 3 + +oksanen_t129 = which(phenotype=="oksanen_t129") +oksanen_48 = which(phenotype=="oksanen_48") +palm_48 = which(phenotype=="palm_48") +palm_t129 = which(phenotype=="palm_t129") + +plsdaplot(fc$counts[,astrocytes_samp], phenotype, filestr="plsda_rnaseq_astrocytes_2022.pdf") + + + +############################## +# DESeq2 analysis +############################## + +res_oksanen_vs_palm_all = differential_analyses(fc$counts[,astrocytes_samp], c(oksanen_t129,oksanen_48), c(palm_t129,palm_48), "oksanen", "palm", filename_prefix="oksanen_vs_palm_deseq2_2022", logfc_thresh1 = 1, use_nominal=F, fdr_thresh = 0.05, lablogfc_thresh = 0.5, labfdr_thresh = 0.05) + +res_t129_vs_48_all = differential_analyses(fc$counts[,astrocytes_samp], c(oksanen_t129,palm_t129), c(oksanen_48,palm_48), "T12.9", "#48", filename_prefix="t129_vs_48_deseq2_2022", logfc_thresh1 = 1, use_nominal=F, fdr_thresh = 0.05, lablogfc_thresh = 0.5, labfdr_thresh = 0.05) + +res_t129_oksanen_vs_palm_all = differential_analyses(fc$counts[,astrocytes_samp], oksanen_t129, palm_t129, "T12.9_oksanen", "T12.9_palm", filename_prefix="t129_oksanen_vs_palm_deseq2_2022", logfc_thresh1 = 1, use_nominal=F, fdr_thresh = 0.05, lablogfc_thresh = 0.5, labfdr_thresh = 0.05) + +res_48_oksanen_vs_palm_all = differential_analyses(fc$counts[,astrocytes_samp], oksanen_48, palm_48, "#48_oksanen", "#48_palm", filename_prefix="48_oksanen_vs_palm_deseq2_2022", logfc_thresh1 = 1, use_nominal=F, fdr_thresh = 0.05, lablogfc_thresh = 0.5, labfdr_thresh = 0.05) + + + + -- GitLab