Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
ArrayUtils
Commits
673b161e
Commit
673b161e
authored
Aug 08, 2019
by
Leon-Charles Tranchevent
Browse files
Some refactoring, better logs, improved GC / mem usage.
parent
24846b85
Changes
18
Show whitespace changes
Inline
Side-by-side
R/correct_batch_effect.R
View file @
673b161e
...
...
@@ -14,27 +14,32 @@
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'.
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return The corrected expression data as an ESET object.
correct_batch_effect
<-
function
(
eset
,
input_data_dir
,
is_eset
=
TRUE
,
batch_filename
=
"Batch.tsv"
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We read the clinical and batch data.
batch_data_file
<-
paste0
(
input_data_dir
,
batch_filename
)
batch_data
<-
utils
::
read.delim
(
file
=
batch_data_file
,
row.names
=
1
)
pheno_data
<-
Biobase
::
pData
(
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
FALSE
))
remove
(
batch_data_file
)
verbose
=
verbose
))
# If necessary, we convert the input matrix to an ExpressionSet.
if
(
!
is_eset
)
{
eset
<-
Biobase
::
ExpressionSet
(
eset
)
}
# We clean up and log information.
rm
(
batch_data_file
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data read."
))
}
# We then build the mod matrix that contains the clinically relevant co-factors.
# By default, we consider we have only the disease status, but we then also
# try to add the gender and age as co-factors to guide Combat to correct for batch effect
...
...
@@ -63,9 +68,14 @@ correct_batch_effect <- function(eset,
clean_probe_list
<-
setdiff
(
clean_probe_list
,
probe_var_0
)
}
}
r
emove
(
b
,
samples_in_batch
,
exp_data_batch
,
probe_vars
,
probe_var_0
)
r
m
(
b
,
samples_in_batch
,
exp_data_batch
,
probe_vars
,
probe_var_0
)
eset
<-
Biobase
::
ExpressionSet
(
exp_data
[
clean_probe_list
,
])
remove
(
exp_data
,
clean_probe_list
)
# We clean up and log information.
rm
(
exp_data
,
clean_probe_list
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data ready."
))
}
# ComBat corrects for the batch effect.
data_bc
<-
sva
::
ComBat
(
dat
=
Biobase
::
exprs
(
eset
),
...
...
@@ -74,7 +84,7 @@ correct_batch_effect <- function(eset,
eset_bc
<-
Biobase
::
ExpressionSet
(
data_bc
)
# We clean up and log information.
r
emove
(
batch_data
,
pheno_data
,
mod_data
)
r
m
(
batch_data
,
pheno_data
,
mod_data
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Batch effect corrected."
))
}
...
...
R/extract_DEGs.R
View file @
673b161e
...
...
@@ -16,11 +16,14 @@
#' @param file_prefix A string used to prefix the file names. Default to "".
#' @param pval_adjust_method A string code indicating the multiple testing correction
#' method to use. Default to BH.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return NULL
extract_DEGs
<-
function
(
fit
,
limma_coeffs
,
k
,
output_data_dir
,
file_suffix
=
""
,
file_prefix
=
""
,
pval_adjust_method
=
"BH"
)
{
pval_adjust_method
=
"BH"
,
verbose
=
FALSE
)
{
# We create the output file names.
if
(
file_suffix
!=
""
)
{
...
...
@@ -29,7 +32,7 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
if
(
file_prefix
!=
""
)
{
file_prefix
<-
paste0
(
"_"
,
file_prefix
)
}
venn_f
ilename
<-
paste0
(
output_data_dir
,
file_suffix
,
"venn_"
,
venn_f
n
<-
paste0
(
output_data_dir
,
file_suffix
,
"venn_"
,
limma_coeffs
[
k
],
file_prefix
,
".png"
)
md_fn
<-
paste0
(
output_data_dir
,
file_suffix
,
"MD_"
,
limma_coeffs
[
k
],
file_prefix
,
".png"
)
...
...
@@ -49,9 +52,14 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
coefficient
=
paste0
(
"coeff"
,
k
),
adjust.method
=
pval_adjust_method
)
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Differentially expressed genes extracted."
))
}
# We plot Venn diagrams that summarize the results and show to which
# extent the analyses overlap.
grDevices
::
png
(
venn_f
ilename
)
grDevices
::
png
(
venn_f
n
)
limma
::
vennDiagram
(
results
,
names
=
limma_coeffs
,
include
=
c
(
"up"
,
"down"
),
...
...
@@ -62,6 +70,12 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
show.include
=
TRUE
)
grDevices
::
dev.off
()
# We clean and log information.
rm
(
venn_fn
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Venn diagram created."
))
}
# We then draw the mean-difference (MD) plot of the comparisons and highlight the significant genes.
grDevices
::
png
(
md_fn
)
limma
::
plotMD
(
fit
,
...
...
@@ -73,6 +87,18 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
hl.cex
=
c
(
0.9
,
0.9
))
grDevices
::
dev.off
()
# We clean and log information.
rm
(
md_fn
,
results
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] MD plot created."
))
}
# Save the top tables to TSV files.
utils
::
write.table
(
table
,
file
=
table_fn
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
# We clean and log information.
rm
(
table
,
table_fn
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Differentially expressed gene list saved."
))
}
}
R/get_gene_annots_from_file.R
View file @
673b161e
...
...
@@ -18,6 +18,7 @@ get_gene_annots_from_file <- function(folder, filename, entities) {
# We read the file into a data-frame.
gene_annots
<-
read.delim
(
file_path
,
row.names
=
NULL
,
stringsAsFactors
=
FALSE
)
rm
(
file_path
)
# We do a bit of cleaning.
gene_annots
[
is.na
(
gene_annots
)]
<-
""
...
...
R/get_gene_annots_from_package.R
View file @
673b161e
...
...
@@ -23,5 +23,7 @@ get_gene_annots_from_package <- function(affy_library_name, entities,
columns
=
columns
,
keytype
=
keytype
),
list
(
local_db
=
as.name
(
affy_library_name
)))
eval
(
cmd
)
rm
(
cmd
)
return
(
gene_annots
)
}
R/load_clinical_data.R
View file @
673b161e
...
...
@@ -10,12 +10,12 @@
#' @param clinical_file_name A string containing the file name. By default, this is 'ClinicalData.tsv'
#' @param use_factors A boolean stating whether the columns should be read as factors (default FALSE).
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return An annotated data-frame that contains the clinical data.
load_clinical_data
<-
function
(
data_dir
,
clinical_file_name
=
"ClinicalData.tsv"
,
use_factors
=
TRUE
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We define the I/Os.
clinical_data_file
<-
paste0
(
data_dir
,
clinical_file_name
)
...
...
@@ -40,6 +40,7 @@ load_clinical_data <- function(data_dir,
if
(
verbose
==
TRUE
)
{
data_dimensions
<-
paste0
(
dim
(
pheno_data
),
collapse
=
" * "
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Clinical data read ("
,
data_dimensions
,
")."
))
rm
(
data_dimensions
)
}
# We return the clinical data.
...
...
R/preprocess_data.R
View file @
673b161e
...
...
@@ -25,7 +25,7 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data
<-
function
(
input_data_dir
,
output_data_files
,
platform
=
"Affymetix"
,
...
...
@@ -34,7 +34,7 @@ preprocess_data <- function(input_data_dir, output_data_files,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We launch the correct function depending on the array platform and desired method.
esets
<-
NULL
...
...
@@ -73,7 +73,7 @@ preprocess_data <- function(input_data_dir, output_data_files,
clean_samples
=
clean_samples
,
verbose
=
verbose
)
}
else
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Platform "
,
platform
,
message
(
paste0
(
"["
,
Sys.time
(),
"]
[WARNING]
Platform "
,
platform
,
" not yet supported (no preprocessing done)."
))
}
...
...
R/preprocess_data_affymetrix_gcrma.R
View file @
673b161e
...
...
@@ -19,24 +19,29 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_affymetrix_gcrma
<-
function
(
input_data_dir
,
output_data_files
,
compressed
=
FALSE
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We define the I/Os.
raw_data_input_dir
<-
paste0
(
input_data_dir
,
"RAW/"
)
# We run the RMA pre-processing method on the data.
input_data_files
<-
affy
::
list.celfiles
(
raw_data_input_dir
,
full.names
=
TRUE
)
remove
(
raw_data_input_dir
)
batch
<-
affy
::
ReadAffy
(
filenames
=
input_data_files
,
compress
=
compressed
,
verbose
=
verbose
)
eset
<-
gcrma
::
gcrma
(
batch
)
# We clean up and log information.
rm
(
raw_data_input_dir
,
input_data_files
,
batch
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data processed."
))
}
# We remove the probes that have 0 variance across the samples.
exp_data
<-
Biobase
::
exprs
(
eset
)
probe_vars
<-
apply
(
exp_data
,
1
,
var
)
...
...
@@ -44,33 +49,51 @@ preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_files,
if
(
length
(
probe_var_0
)
>
0
)
{
clean_probe_list
<-
setdiff
(
rownames
(
exp_data
),
probe_var_0
)
eset
<-
Biobase
::
ExpressionSet
(
exp_data
[
clean_probe_list
,
])
remove
(
clean_probe_list
)
rm
(
clean_probe_list
)
}
# We clean up and log information.
rm
(
exp_data
,
probe_vars
,
probe_var_0
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step I)."
))
}
remove
(
exp_data
,
probe_vars
,
probe_var_0
)
# We correct for the batch effect if necessary.
eset_bc
<-
NULL
if
(
batch_correction
!=
"FALSE"
)
{
eset_bc
<-
correct_batch_effect
(
eset
=
eset
,
input_data_dir
=
input_data_dir
)
input_data_dir
=
input_data_dir
,
verbose
=
verbose
)
# We log some information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Batch effect corrected."
))
}
if
(
batch_correction
==
"TRUE"
)
{
eset
<-
eset_bc
r
emove
(
eset_bc
)
r
m
(
eset_bc
)
}
}
else
{
r
emove
(
eset_bc
)
r
m
(
eset_bc
)
}
# If necessary, we remove the samples that do not have clinical data.
if
(
clean_samples
)
{
# We load the clinical data so as to get the samples to keep.
samples
<-
rownames
(
Biobase
::
pData
(
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
FALSE
)))
verbose
=
verbose
)))
# We only keep the samples with clinical data.
eset
<-
eset
[,
samples
]
if
(
batch_correction
==
"BOTH"
)
{
eset_bc
<-
eset_bc
[,
samples
]
}
# We clean up and log information.
rm
(
samples
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step II)."
))
}
}
# We save the eset data as TSV file.
...
...
@@ -82,10 +105,9 @@ preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_files,
quote
=
FALSE
)
}
# We clean up and log information.
remove
(
input_data_files
,
batch
)
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"]
Expression data pre-processed with RMA
."
))
message
(
paste0
(
"["
,
Sys.time
(),
"]
Processed data written to files
."
))
}
# We return the created ESET(s).
...
...
R/preprocess_data_affymetrix_scan.R
View file @
673b161e
...
...
@@ -19,14 +19,14 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_affymetrix_scan
<-
function
(
input_data_dir
,
output_data_files
,
compressed
=
FALSE
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We define the I/Os.
raw_data_input_dir
<-
paste0
(
input_data_dir
,
"RAW/"
)
...
...
@@ -34,9 +34,14 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
# We run the SCAN pre-processing method on the data.
# We do not run the fast analysis (by default).
input_data_regexp
<-
paste0
(
raw_data_input_dir
,
"*"
)
remove
(
raw_data_input_dir
)
eset
<-
SCAN.UPC
::
SCAN
(
input_data_regexp
,
outFilePath
=
output_data_files
[
1
])
# We clean up and log information.
rm
(
raw_data_input_dir
,
input_data_regexp
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data processed."
))
}
# We remove the probes that have 0 variance across the samples.
exp_data
<-
Biobase
::
exprs
(
eset
)
probe_vars
<-
apply
(
exp_data
,
1
,
var
)
...
...
@@ -44,33 +49,51 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
if
(
length
(
probe_var_0
)
>
0
)
{
clean_probe_list
<-
setdiff
(
rownames
(
exp_data
),
probe_var_0
)
eset
<-
Biobase
::
ExpressionSet
(
exp_data
[
clean_probe_list
,
])
remove
(
clean_probe_list
)
rm
(
clean_probe_list
)
}
# We clean up and log information.
rm
(
exp_data
,
probe_vars
,
probe_var_0
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step I)."
))
}
remove
(
exp_data
,
probe_vars
,
probe_var_0
)
# We correct for the batch effect if necessary.
eset_bc
<-
NULL
if
(
batch_correction
!=
"FALSE"
)
{
eset_bc
<-
correct_batch_effect
(
eset
=
eset
,
input_data_dir
=
input_data_dir
)
input_data_dir
=
input_data_dir
,
verbose
=
verbose
)
# We log some information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Batch effect corrected."
))
}
if
(
batch_correction
==
"TRUE"
)
{
eset
<-
eset_bc
r
emove
(
eset_bc
)
r
m
(
eset_bc
)
}
}
else
{
r
emove
(
eset_bc
)
r
m
(
eset_bc
)
}
# If necessary, we remove the samples that do not have clinical data.
if
(
clean_samples
)
{
# We load the clinical data so as to get the samples to keep.
samples
<-
rownames
(
Biobase
::
pData
(
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
FALSE
)))
verbose
=
verbose
)))
# We only keep the samples with clinical data.
eset
<-
eset
[,
samples
]
if
(
batch_correction
==
"BOTH"
)
{
eset_bc
<-
eset_bc
[,
samples
]
}
# We clean up and log information.
rm
(
samples
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step II)."
))
}
}
# We save the eset_bc data as TSV file. ESET was already done as part of SCAN.
...
...
@@ -81,10 +104,9 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
quote
=
FALSE
)
}
# We clean up and log information.
rm
(
input_data_regexp
)
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"]
Expression data pre-processed with SCAN
."
))
message
(
paste0
(
"["
,
Sys.time
(),
"]
Processed data written to files
."
))
}
# We return the created ESET(s).
...
...
R/preprocess_data_agilent_limma.R
View file @
673b161e
...
...
@@ -19,14 +19,14 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_agilent_limma
<-
function
(
input_data_dir
,
output_data_files
,
compressed
=
FALSE
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We define the I/Os.
raw_data_input_dir
<-
paste0
(
input_data_dir
,
"RAW/"
)
...
...
@@ -40,9 +40,14 @@ preprocess_data_agilent_limma <- function(input_data_dir, output_data_files,
verbose
=
TRUE
)
batch_data
<-
log2
(
batch
$
E
)
# We change the probe ids (1 to 45015) to be the probe names instead (A_XX_PXXXX).
# We change the probe ids (
e.g.,
1 to 45015) to be the probe names instead (
e.g.,
A_XX_PXXXX).
rownames
(
batch_data
)
<-
(
batch
$
genes
)
$
ProbeName
# nolint
remove
(
raw_data_input_dir
,
batch
)
# We clean up and log information.
rm
(
raw_data_input_dir
,
raw_file_list
,
batch
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data read."
))
}
# We run the LIMMA pre-processing method on the data.
# First, background correction.
...
...
@@ -53,12 +58,18 @@ preprocess_data_agilent_limma <- function(input_data_dir, output_data_files,
# Second, perform quantile normalization.
batch_data_norm
<-
limma
::
normalizeBetweenArrays
(
batch_data_bg
,
method
=
"quantile"
)
# We clean up and log information.
rm
(
batch_data_bg
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data processed."
))
}
# We remove the duplicate rows (based on the row names only).
probe_id_counts
<-
table
(
row.names
(
batch_data
))
unique_probe_ids
<-
setdiff
(
names
(
probe_id_counts
),
names
(
probe_id_counts
[
probe_id_counts
>
1
]))
batch_data_norm
<-
batch_data_norm
[
unique_probe_ids
,
]
rm
(
probe_id_counts
,
unique_probe_ids
,
batch_data
)
# We remove the probes that have 0 variance across the samples.
probe_vars
<-
apply
(
batch_data_norm
,
1
,
var
)
...
...
@@ -66,33 +77,52 @@ preprocess_data_agilent_limma <- function(input_data_dir, output_data_files,
if
(
length
(
probe_var_0
)
>
0
)
{
clean_probe_list
<-
setdiff
(
rownames
(
batch_data_norm
),
probe_var_0
)
batch_data_norm
<-
batch_data_norm
[
clean_probe_list
,
]
remove
(
clean_probe_list
)
rm
(
clean_probe_list
)
}
# We clean up and log information.
rm
(
probe_vars
,
probe_var_0
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step I)."
))
}
remove
(
probe_vars
,
probe_var_0
)
# We correct for the batch effect if necessary.
batch_data_norm_bc
<-
NULL
if
(
batch_correction
!=
"FALSE"
)
{
batch_data_norm_bc
<-
correct_batch_effect
(
eset
=
batch_data_norm
,
input_data_dir
=
input_data_dir
,
verbose
=
verbose
,
is_eset
=
FALSE
)
# We log some information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Batch effect corrected."
))
}
if
(
batch_correction
==
"TRUE"
)
{
batch_data_norm
<-
batch_data_norm_bc
r
emove
(
batch_data_norm_bc
)
r
m
(
batch_data_norm_bc
)
}
}
else
{
r
emove
(
batch_data_norm_bc
)
r
m
(
batch_data_norm_bc
)
}
# If necessary, we remove the samples that do not have clinical data.
if
(
clean_samples
)
{
# We load the clinical data so as to get the samples to keep.
samples
<-
rownames
(
Biobase
::
pData
(
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
FALSE
)))
verbose
=
verbose
)))
# We only keep the samples with clinical data.
batch_data_norm
<-
batch_data_norm
[,
samples
]
if
(
batch_correction
==
"BOTH"
)
{
batch_data_norm_bc
<-
batch_data_norm_bc
[,
samples
]
}
# We clean up and log information.
rm
(
samples
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step II)."
))
}
}
# We save the eset data as TSV file.
...
...
@@ -105,12 +135,12 @@ preprocess_data_agilent_limma <- function(input_data_dir, output_data_files,
utils
::
write.table
(
batch_data_norm_bc
,
file
=
output_data_files
[
2
],
sep
=
"\t"
,
quote
=
FALSE
)
rm
(
batch_data_norm_bc
)
}
else
{
r
emove
(
eset_bc
)
r
m
(
eset_bc
)
}
# We
clean up and
log information.
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"]
Expression data pre-processed with LIMMA
."
))
message
(
paste0
(
"["
,
Sys.time
(),
"]
Processed data written to files
."
))
}
# We return the created ESET(s).
...
...
R/preprocess_data_illumina_beadarray.R
View file @
673b161e
...
...
@@ -20,14 +20,14 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_illumina_beadarray
<-
function
(
input_data_dir
,
output_data_files
,
compressed
=
FALSE
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We define the I/Os.
raw_data_input_dir
<-
paste0
(
input_data_dir
,
"RAW/"
)
...
...
@@ -36,7 +36,12 @@ preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_files
matrix_filename
<-
list.files
(
raw_data_input_dir
,
full.names
=
TRUE
)[
1
]
gse_eset
<-
GEOquery
::
getGEO
(
filename
=
matrix_filename
)
gse_data
<-
methods
::
as
(
gse_eset
,
"ExpressionSetIllumina"
)
# We clean up and log information.
rm
(
raw_data_input_dir
,
matrix_filename
,
gse_eset
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data read."
))
}
# We apply a different normalization depending on the array, that is, whether we have
# control probes (v3) or not (v2). The decision is based on the probe_quality field.
...
...
@@ -53,7 +58,12 @@ preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_files
offset
<-
1.11
-
min_value
}
gse_data_filt
<-
log2
(
offset
+
Biobase
::
exprs
(
gse_data_norm
))
rm
(
gse_data
,
gse_data_norm
)
# We clean up and log information.
rm
(
gse_data
,
gse_data_norm
,
offset
,
min_value
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data processed."
))
}
}
else
{
# A bit of cleaning (specific to Illumina arrays).
...
...
@@ -66,14 +76,21 @@ preprocess_data_illumina_beadarray <- function(input_data_dir, output_data_files
# We run the beadarray pre-processing method on the data.
# Background correction and normalization at once.
gse_data_norm
<-
beadarray
::
normaliseIllumina
(
gse_data
,
method
=
"neqc"
,
status
=
probe_status
)
gse_data_norm
<-
beadarray
::
normaliseIllumina
(
gse_data
,
method
=
"neqc"
,
status
=
probe_status
)
# Additional cleaning (after normalization - also Illumina specific).
ids
<-
as.character
(
Biobase
::
featureNames
(
gse_data_norm
))
qual
<-
unlist
(
mget
(
ids
,
get
(
"illuminaHumanv3PROBEQUALITY"
),
ifnotfound
=
NA
))
rem
<-
qual
==
"No match"
|
qual
==
"Bad"
|
is.na
(
qual
)
gse_data_filt
<-
Biobase
::
exprs
(
gse_data_norm
[
!
rem
,
])
rm
(
gse_data
,
gse_data_norm
)
# We clean up and log information.
rm
(
gse_data
,
gse_data_norm
,
probe_status
,
ids
,
qual
,
rem
)
if
(
verbose
==
TRUE
)
{