Commit f11162e7 authored by Marek Ostaszewski's avatar Marek Ostaszewski
Browse files

Dataset normalisation with HiPathia

parent 86c4bf30
Pipeline #43574 passed with stages
in 45 seconds
##########################################################
### Normalization of GSE152075 SARS-CoV-2 dataset #####
########################################################
library(pacman)
#### Normalize Data #####
### Load necessary packages
pacman::p_load("here", "rentrez","reutils","hipathia", "biomaRt", "utils",
"stringr", "AnnotationDbi", "org.Hs.eg.db","dplyr","tidyr",
"openxlsx", "preprocessCore", "edgeR")
### Download the file if it doesn't exist
if(!file.exists("GSE152075_raw_counts_GEO.txt.gz")) {
download.file(url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE152075&format=file&file=GSE152075%5Fraw%5Fcounts%5FGEO%2Etxt%2Egz",
destfile = "GSE152075_raw_counts_GEO.txt.gz")
}
### Load and clean RNA-seq expression set from raw counts file.
expreset_raw <- read.delim(file = gzfile("GSE152075_raw_counts_GEO.txt.gz"), header = T, sep = " ")
identifiers <- rownames(expreset_raw)
identifiers_df <- data.frame(id = identifiers,
entrez = mapIds(org.Hs.eg.db, keys = identifiers, column = "ENTREZID", keytype = "SYMBOL"),
stringsAsFactors = F) %>% .[!is.na(.$entrez),] ## identify weird codes which do not have entrez ID
expreset_raw <- expreset_raw[rownames(expreset_raw) %in% identifiers_df$id, ] ## here we clean the dataset from pseudogenes and rows with none entrez ids which will not be used by Hipathia
print("read and clean from non entrez codes ...done")
# Explore how our data is organized
hist(as.numeric(expreset_raw[2,]),breaks=100)
var(as.numeric(expreset_raw[2,]))
getVari <- apply(expreset_raw, 1, var)
hist(getVari,100)
# Normalise by TMM with "edgeR" package
dge <- DGEList(counts=expreset_raw)
print("dge...done")
tmm <- calcNormFactors(dge, method="TMM")
print("tmm...done")
logcpm <- cpm(tmm, prior.count=3, log=TRUE)
print("logcpm...done")
### Normalise by Hipathia
gExp = logcpm
gExp = normalize.quantiles(gExp)
rownames(gExp)<- rownames(logcpm)
colnames(gExp) <- colnames(logcpm)
### Construct the metadata file with the info from GEO GSE152075 (Virus vs Control)
metadata <- data.frame(fileName = colnames(gExp),
type = c(rep("Virus",430), rep("Control",54)))
### Translate by Hipathia
trans_data <- translate_data(gExp, "hsa")
#### Export results table for the Cov-Hipathia Disease Maps differential signaling example.
trans_data_export <- tibble::rownames_to_column(as.data.frame(trans_data), var ="X")
write.table(trans_data_export, file = "GSE152075_input_data.tsv", col.names = T, row.names = F, quote = F, sep = "\t")
write.table(metadata, file = "GSE152075_design_matrix.tsv", row.names = F, col.names = F, quote = F, sep = "\t" )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment