Update Figure3-4-5/rnaseq_analyses_patrycja.R

11aebe6f · Patrycja Mulica · cd71637e · 11aebe6f
Commit 11aebe6f authored 2 years ago by Patrycja Mulica
--- a/Figure3-4-5/rnaseq_analyses_patrycja.R
+++ b/Figure3-4-5/rnaseq_analyses_patrycja.R
+#
+# RNA-seq data pre-processing and analysis
+#
+library('Rsubread')
+# Rsubread 2.0.1
+############################
+# Step 1: aligning the reads
+############################
+# retrieve fastq files
+fastq.files <- list.files(path = "/mnt/data/AnneProjects", pattern = ".fq.gz$", full.names = TRUE, recursive = TRUE)
+# check that all files are present
+print(fastq.files)
+# replace fq.gz by bam in file name
+targets = data.frame(inputfile=fastq.files, outputfile=gsub("fq.gz","bam",fastq.files))
+r1files = as.matrix(targets$inputfile)[grep("\\_1\\.fq",as.matrix(targets$inputfile))]
+r2files =  as.matrix(targets$inputfile)[grep("\\_2\\.fq",as.matrix(targets$inputfile))]
+outfiles = gsub("fq.gz","bam", gsub("\\_1\\.fq","\\.fq",r1files))
+align(index="new_index",readfile1=r1files, readfile2=r2files, output_file=as.character(outfiles),output_format="BAM", nthreads=10)
+############################
+# Step 2: Quality check
+############################
+bam.files <- list.files(path = "/mnt/data/AnneProjects", pattern = ".bam$", full.names = TRUE, recursive = TRUE)
+print(bam.files)
+props <- propmapped(files=bam.files)
+print(props)
+# Extract quality scores for individual fastq.files (one example)
+qs <- qualityScores(filename=fastq.files[1],nreads=100)
+#############################
+# Step 3: Determine counts
+#############################
+# version: 
+gtf <- list.files("./",pattern=".*gtf$",full.names = T)
+#
+# ispairedEnd = TRUE
+#
+fc <- featureCounts(files = as.character(outfiles), GTF.featureType="exon",GTF.attrType="gene_name", annot.ext=gtf, isGTFAnnotationFile=TRUE, isPairedEnd=TRUE, nthreads=10)
+##############################
+# Functions for data analysis
+##############################
+#
+# PLS-DA ploot
+#
+plsdaplot = function(combdat, phenotype, filestr="plsda_plot.pdf"){
+	comblab = match(phenotype, unique(phenotype))
+	comblabunq = unique(comblab)
+	const = which(apply(combdat, 1, var)==0)
+	combfilt = combdat
+	if(length(const))
+		combfilt = combdat[-const,]
+	# variance filter
+	vars = apply(combfilt, 1, var)
+	var1000 = order(vars, decreasing=T)[1:1000]
+	source("osplda_rainbow.R")
+	require('bitops')
+	require('pls')
+	mods <- make.OSC.PLS.model(comblab, pls.data = as.data.frame(t((combfilt[var1000,]))), comp = 2, OSC.comp = 0, validation = "LOO", method = "oscorespls", cv.scale = FALSE, progress = FALSE)
+	pdf(filestr)
+	 plot.OSC.results(mods, plot = "scores", groups = phenotype)
+	dev.off()
+}
+#
+# Volcano plot
+#
+plotvolcano = function(dat, logfc, fdr, logfc_thresh1 = 1, fdr_thresh = 0.05, lablogfc_thresh = 1.5, labfdr_thresh = 0.001, cexlab=2, filestr="volcano_plot_with_labels.pdf")
+{
+	mat = data.frame(x=logfc, y=-log10(fdr))
+	matorange = data.frame(x=logfc, y=-log10(fdr))[which(abs(logfc)>logfc_thresh1),]
+	matgreen = data.frame(x=logfc, y=-log10(fdr))[intersect(which(fdr<fdr_thresh),which(abs(logfc)>logfc_thresh1)),]
+	matred =data.frame(x=logfc, y=-log10(fdr))[which(fdr<fdr_thresh),]
+	matlab = data.frame(x=logfc, y=-log10(fdr))[intersect(which(fdr<labfdr_thresh),which(abs(logfc)>lablogfc_thresh)),]
+	topgenes = rownames(dat)
+	matlabels = topgenes[intersect(which(fdr<labfdr_thresh),which(abs(logfc)>lablogfc_thresh))]
+	if(!require('ggrepel'))
+	{
+		install.packages("ggrepel")
+	}
+	require('ggplot2')
+	# write volcano plot to pdf
+	pdf(filestr)
+		p = ggplot(mat, aes(x, y))  +  geom_point(color = 'gray', size=0.8, fill="white") +  theme_classic(base_size = 16) + geom_point(data = matorange, color="orange") + geom_point(data = matred, color="red") + geom_point(data = matgreen, color="green") + geom_text_repel(data = matlab, label=matlabels, nudge_y=0.1, size=cexlab) + xlab('Log fold change') + ylab('-log10(P-value)') + theme(axis.text=element_text(size=12), axis.title=element_text(size=12,face="bold"))
+		print(p)
+	dev.off()
+}
+#
+# Differential expression analysis
+#
+differential_analyses <- function(countsdat, condition1_ids, condition2_ids, condition1_name, condition2_name, filename_prefix="case_vs_control", logfc_thresh1 = 1, use_nominal = FALSE, fdr_thresh = 0.05, lablogfc_thresh = 0.5, labfdr_thresh = 0.05, cex.axis = 1, filtmean=NULL){
+	require('DESeq2')
+	# convert outcome variable into factor (e.g. "PD" vs. "control")
+	outcome = c(rep(condition1_name,length(condition1_ids)),rep(condition2_name,length(condition2_ids)))
+	cts = countsdat[,c(condition1_ids,condition2_ids)]
+	colnames(cts)=outcome
+	coldata = factor(outcome, levels=c(condition1_name, condition2_name))
+	names(coldata)=outcome
+	dds = DESeqDataSetFromMatrix(countData = cts, colData = DataFrame(coldata), design = ~coldata)
+	library("edgeR")
+	dge <- DGEList(counts=cts)
+	keep <- filterByExpr(dge, design=model.matrix(~0+outcome))
+	dds_filt <- dds[keep,]
+	topgenes_rnaseq <- DESeq(dds_filt)
+	res_case_vs_control <- results(topgenes_rnaseq)
+	res_case_vs_control <- as.matrix(res_case_vs_control[order(res_case_vs_control$pvalue),])
+	if(!is.null(filtmean)){
+		res_case_vs_control = res_case_vs_control[-which(res_case_vs_control[,1] < filtmean),]
+	}
+	pdf( paste(filename_prefix,"_boxplots_top100.pdf",collapse=""), width=7, height=7)
+	par(mfrow=c(2,2))
+	for(i in 1:100)
+	 boxplot(as.matrix(cts[match(rownames(res_case_vs_control),rownames(cts))[i],]) ~ outcome, col=c("lightblue","pink"), main=rownames(res_case_vs_control)[i], ylab="preprocessed expression count", xlab="", cex.axis=cex.axis, outline=T)
+	dev.off()
+	# ranking table
+	write.xlsx(cbind(round(res_case_vs_control[,1:4], 3), format(res_case_vs_control[,5:6], digits=3)), paste(filename_prefix,"_ranking.xlsx",collapse=""))
+	if(!use_nominal){
+	plotvolcano(res_case_vs_control, res_case_vs_control[,"log2FoldChange"], res_case_vs_control[,"padj"], logfc_thresh1 = logfc_thresh1, fdr_thresh = fdr_thresh, lablogfc_thresh = lablogfc_thresh, labfdr_thresh = labfdr_thresh, filestr=paste(filename_prefix,"_volcano_plot_with_labels.pdf",collapse=""))
+	} else {
+		plotvolcano(res_case_vs_control, res_case_vs_control[,"log2FoldChange"], res_case_vs_control[,"pvalue"], logfc_thresh1 = logfc_thresh1, fdr_thresh = fdr_thresh, lablogfc_thresh = lablogfc_thresh, labfdr_thresh = labfdr_thresh, filestr=paste(filename_prefix,"_volcano_plot_with_labels.pdf",collapse=""))
+	}
+	return(res_case_vs_control)
+}
+##############################
+# Astrocyte analysis
+##############################
+library(xlsx)
+astrocytes_annot <- xlsx::read.xlsx("samples_astrocytes.xlsx", 1)
+astrocytes_samp = grep("Oksanen|Palm",colnames(fc$counts))
+phenotype = rep("oksanen_t129",length(astrocytes_samp))
+phenotype[grep("48.*Oksanen",colnames(fc$counts)[astrocytes_samp])] = rep("oksanen_48",length(grep("48.*Oksanen",colnames(fc$counts)[astrocytes_samp])))
+phenotype[grep("T12.*Palm",colnames(fc$counts)[astrocytes_samp])] = rep("palm_t129",length(grep("T12.*Palm",colnames(fc$counts)[astrocytes_samp])))
+phenotype[grep("48.*Palm",colnames(fc$counts)[astrocytes_samp])] = rep("palm_48",length(grep("48.*Palm",colnames(fc$counts)[astrocytes_samp])))
+print(table(phenotype))
+#phenotype
+#  oksanen_48 oksanen_t129      palm_48    palm_t129 
+#           3            3            3            3
+oksanen_t129 = which(phenotype=="oksanen_t129")
+oksanen_48 = which(phenotype=="oksanen_48")
+palm_48 = which(phenotype=="palm_48")
+palm_t129 = which(phenotype=="palm_t129")
+plsdaplot(fc$counts[,astrocytes_samp], phenotype, filestr="plsda_rnaseq_astrocytes_2022.pdf")
+##############################
+# DESeq2 analysis
+##############################
+res_oksanen_vs_palm_all = differential_analyses(fc$counts[,astrocytes_samp], c(oksanen_t129,oksanen_48), c(palm_t129,palm_48), "oksanen", "palm", filename_prefix="oksanen_vs_palm_deseq2_2022", logfc_thresh1 = 1, use_nominal=F, fdr_thresh = 0.05, lablogfc_thresh = 0.5, labfdr_thresh = 0.05)
+res_t129_vs_48_all = differential_analyses(fc$counts[,astrocytes_samp], c(oksanen_t129,palm_t129), c(oksanen_48,palm_48), "T12.9", "#48", filename_prefix="t129_vs_48_deseq2_2022", logfc_thresh1 = 1, use_nominal=F, fdr_thresh = 0.05, lablogfc_thresh = 0.5, labfdr_thresh = 0.05)
+res_t129_oksanen_vs_palm_all = differential_analyses(fc$counts[,astrocytes_samp], oksanen_t129, palm_t129, "T12.9_oksanen", "T12.9_palm", filename_prefix="t129_oksanen_vs_palm_deseq2_2022", logfc_thresh1 = 1, use_nominal=F, fdr_thresh = 0.05, lablogfc_thresh = 0.5, labfdr_thresh = 0.05)
+res_48_oksanen_vs_palm_all = differential_analyses(fc$counts[,astrocytes_samp], oksanen_48, palm_48, "#48_oksanen", "#48_palm", filename_prefix="48_oksanen_vs_palm_deseq2_2022", logfc_thresh1 = 1, use_nominal=F, fdr_thresh = 0.05, lablogfc_thresh = 0.5, labfdr_thresh = 0.05)