Commit 2ea8953b authored by Leon-Charles Tranchevent's avatar Leon-Charles Tranchevent
Browse files

Preliminary version of the limma analyses.

parent ae29e367
#!/usr/bin/env Rscript
# ================================================================================================
# Libraries
# ================================================================================================
library("yaml")
library("zeallot")
library("Biobase")
library("limma")
message(paste0("[", Sys.time(), "] Libraries loaded."))
# ================================================================================================
# Configuration
# ================================================================================================
options(bitmapType = "cairo")
global_config <- yaml::read_yaml("../global_config.yml")
local_config <- yaml::read_yaml("./local_config.yml")
raw_data_dir <- global_config$global_raw_data_dir
input_data_dir <- paste0(global_config$global_data_dir, local_config$local_input_data_dir)
output_data_dir <- paste0(global_config$global_data_dir, local_config$local_data_dir)
# Optional command line parameters to process only one dataset.
args <- commandArgs(trailingOnly = TRUE)
selected_dataset_name <- ""
if (length(args) > 0) {
selected_dataset_name <- args[1]
}
# ================================================================================================
# Functions
# ================================================================================================
### COPY OF THE FUNCTION OF STEP 03 --- PUT THEM IN LIB
#' This function loads the clinical data associated with a dataset. It returns an annotated
#' data-frame that contains the clinical data.
#'
#' The function assumes that the dataset is associated with a unique name (e.g., GEO identifier)
#' and that a folder with this name exists and contains the clinical data ('ClinicalData.tsv').
#'
#' Note: the function does not check for the existence of folders and files (this is not a package
#' function).
#'
#' @param dataset_name A string representing the name of the dataset for which the clinical data
#' are required.
#' @param raw_data_dir A string representing the folder that contains the input data.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return A annotated data-frame that contains the clinical data.
get_clinical_data <- function(dataset_name, raw_data_dir, verbose = TRUE) {
# We define the I/Os.
clinical_data_file <- paste0(raw_data_dir, dataset_name, "/", "ClinicalData.tsv")
# We load the clinical data as to make QC more useful (again).
pheno_data <- Biobase::AnnotatedDataFrame(read.delim(file = clinical_data_file,
row.names = 1,
colClasses = "factor"))
# We clean up and log information.
rm(clinical_data_file)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "][", dataset_name, "] Phenotypic data read."))
}
# We return the clinical data.
return(pheno_data)
}
### COPY OF THE FUNCTION OF STEP 03 --- PUT THEM IN LIB
#' This function loads the preprocessed data (obtained using SCAN) associated with a dataset.
#' It returns a matrix and an eset of the data.
#'
#' The function assumes that the dataset is associated with a unique name (e.g., GEO identifier)
#' and that a folder with this name exists and contains the processed data.
#'
#' Note: the function does not check for the existence of folders and files (this is not a package
#' function).
#'
#' @param dataset_name A string representing the name of the dataset to load.
#' @param data_dir A string representing the folder that contains the processed data.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return A list of two objects (matrix and ExpressionSet) containing the preprocessed
#' expression data.
get_scan_data <- function(dataset_name, data_dir, verbose = TRUE) {
# We define the I/Os.
processed_data_file <- paste0(data_dir, dataset_name, "/", dataset_name, "_normalized.tsv")
# We load the SCAN matrix and creates the associated eset object.
exprs_scan_mat <- as.matrix(read.table(processed_data_file,
header = TRUE,
sep = "\t",
row.names = 1,
as.is = TRUE))
exprs_scan_eset <- Biobase::ExpressionSet(assayData = exprs_scan_mat)
# We clean up and log information.
rm(processed_data_file)
if (verbose == TRUE) {
message(paste0("[", Sys.time(), "][", dataset_name, "] SCAN expression data read."))
}
# We return the data.
return(list(exprs_scan_mat, exprs_scan_eset))
}
# ================================================================================================
# Main
# ================================================================================================
# We do all datasets one by one.
for (i in 1:length(local_config$datasets)) {
# We get the dataset details (name, compress, array_type)
dataset_config <- local_config$datasets[[i]]
# We run the quality control of the current dataset (if necessary).
if (selected_dataset_name == "" || selected_dataset_name == dataset_config[1]) {
# We load the data (clinical and preproccesed data).
pheno_data <- get_clinical_data(dataset_config[1], raw_data_dir)
c(exprs_mat, exprs_eset) %<-% get_scan_data(dataset_config[1], input_data_dir)
# TODO: Additional QC
# plotMDS(exprs_eset)
#
}
}
# We log the session details for reproducibility.
sessionInfo()
## COPY
i = 3
dataset_config <- local_config$datasets[[i]]
pheno_data <- get_clinical_data(dataset_config[1], raw_data_dir)
c(exprs_mat, exprs_eset) %<-% get_scan_data(dataset_config[1], input_data_dir)
## END OF COPY
# Create my own design matrix (factorial design)
# Improve the clinical data (4 categories based on two descriptors)
clinical <- pData(pheno_data)
# TODO: uncomment.
newClinical <- paste(clinical$Gender, clinical$Disease.status, sep=".")
###newClinical <- paste(clinical$Disease.status, clinical$Disease.status, sep=".")
# TODO: uncomment.
newClinical <- factor(newClinical, levels=c("M.PD","M.Control","F.PD","F.Control"))
###newClinical <- factor(newClinical, levels=c("PD.PD","Control.Control"))
# We force to have no intercept (0+).
design <- model.matrix(~0+newClinical)
colnames(design) <- levels(newClinical)
# Factorial design so complex contrast to measure all effects.
# TODO: uncomment.
cont.matrix <- makeContrasts(DiseasevsControlinMale=M.PD-M.Control,
DiseasevsControlinFemale=F.PD-F.Control,
Diff=(F.PD-F.Control)-(M.PD-M.Control),
levels=design)
###cont.matrix <- makeContrasts(DizVsCtrl=PD.PD-Control.Control,
### levels=design)
# Important parameter: correlation when some samples are paired (moran (GSE8397): same patients and diff tissues) (p98-116)
# Important parameter: block to correct for batch effect (none for test)
# Important parameter: method = ls or robust (?)
fit <- lmFit(exprs_mat, design)
fit2 <- contrasts.fit(fit, cont.matrix)
# To compute stats
# Important parameter: proportion of estimated diff exp genes default = 0.01 (?) [little effect in practice]
# Important parameter: trend and robust (mostly for RNA-seq data?).
fit2<-eBayes(fit2)
# Stats again (// to decideTests)
# Important parameter: adjust.method between "BH", "BY", "holm" (more cons) or "hochberg", "hommel", "bonferonni" and "fdr"
# Important parameter: sort.by (to sort the rows) "p" or "B" most likely.
# Important parameter: confint (logical) for confidence intervals. [++]
# TODO: uncomment.
table_male<-topTable(fit2,coef="DiseasevsControlinMale",n=Inf, confint = TRUE)
head(table_male)
table_female<-topTable(fit2,coef="DiseasevsControlinFemale",n=Inf, confint = TRUE)
head(table_female)
table_diff<-topTable(fit2,coef="Diff",n=Inf, confint = TRUE)
head(table_diff)
###table_diff<-topTable(fit2,coef="DizVsCtrl",n=Inf, confint = TRUE)
###head(table_diff)
# Stats again (// to topTable)
# Important parameter: adjust.method between "BH", "BY", "holm" (more cons) or "hochberg", "hommel", "bonferroni" and "fdr"
# Important parameter: coefficient (to mimic topTable and since we probably don't want to have global stats based on several contrasts).
# Check it having multiple coeffs change the results without specifying the coeff param.
results <- decideTests(fit2, adjust.method = "bonferroni")
summary(results)
# Important parameter: include "both", "up", "down", or c("up","down")
# Important parameters: cex, lwd, circle.col, counts.col, show.include fr graphical options.
vennDiagram(results, include = c("up","down"), circle.col = "orange", counts.col = c("blue","green"), lwd = 3, show.include = TRUE)
vennDiagram(results, include = "both", circle.col = "orange", lwd = 3, show.include = TRUE)
# Results[,1] needs to be adapted according to cm.
# Important parameters: hl.pch and hl.cex to control the plotting characters.
plotMD(fit2,coef="DiseasevsControlinMale",status=results[,1],values=c(1,-1),hl.col=c("red","blue"), hl.pch = c(16,16), hl.cex = c(0.9,0.9))
plotMD(fit2,coef="DiseasevsControlinFemale",status=results[,2],values=c(1,-1),hl.col=c("red","blue"), hl.pch = c(16,16), hl.cex = c(0.9,0.9))
plotMD(fit2,coef="Diff",status=results[,3],values=c(1,-1),hl.col=c("red","blue"), hl.pch = c(16,16), hl.cex = c(0.9,0.9))
#LATER
library("annotationTools")
annotation_HGU133Plus2<-read.csv('HG-U133A.na36.annot.csv',colClasses='character',comment.char='#')
myPS<-row.names(table_zheng3)
gene_symbol<-listToCharacterVector(getGENESYMBOL(myPS,annotation_HGU133Plus2))
topTable_zheng3<-data.frame(X=row.names(table_zheng3),logFC=table_zheng3$logFC,
AveExpr=table_zheng3$AveExpr, t=table_zheng3$t,
P.Value=table_zheng3$P.Value, adj.P.Val=table_zheng3$adj.P.Val,
B=table_zheng3$B,gene_symbol=gene_symbol)
write.table(topTable_zheng3, file="topTable_zheng3.txt", sep = ",",row.names = FALSE)
\ No newline at end of file
local_input_data_dir: !!str '02/'
local_data_dir: !!str '05/'
local_code_dir: !!str '05-Get_DEGs/'
datasets:
- ['GSE20141']
- ['GSE20163']
- ['GSE20164']
- ['GSE20292']
- ['GSE7307']
- ['GSE7621']
- ['GSE8397']
- ['Simunovic']
- ['E-MEXP-1416']
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment