Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
ArrayUtils
Commits
673b161e
Commit
673b161e
authored
Aug 08, 2019
by
Leon-Charles Tranchevent
Browse files
Some refactoring, better logs, improved GC / mem usage.
parent
24846b85
Changes
18
Hide whitespace changes
Inline
Side-by-side
R/correct_batch_effect.R
View file @
673b161e
...
...
@@ -14,38 +14,43 @@
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'.
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return The corrected expression data as an ESET object.
correct_batch_effect
<-
function
(
eset
,
input_data_dir
,
is_eset
=
TRUE
,
batch_filename
=
"Batch.tsv"
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We read the clinical and batch data.
batch_data_file
<-
paste0
(
input_data_dir
,
batch_filename
)
batch_data
<-
utils
::
read.delim
(
file
=
batch_data_file
,
row.names
=
1
)
pheno_data
<-
Biobase
::
pData
(
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
FALSE
))
remove
(
batch_data_file
)
verbose
=
verbose
))
# If necessary, we convert the input matrix to an ExpressionSet.
if
(
!
is_eset
)
{
eset
<-
Biobase
::
ExpressionSet
(
eset
)
}
# We clean up and log information.
rm
(
batch_data_file
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data read."
))
}
# We then build the mod matrix that contains the clinically relevant co-factors.
# By default, we consider we have only the disease status, but we then also
# try to add the gender and age as co-factors to guide Combat to correct for batch effect
# without removing the biologically relevant signal.
mod_data
<-
stats
::
model.matrix
(
~
Disease.status
,
data
=
pheno_data
)
mod_data
<-
stats
::
model.matrix
(
~
Disease.status
,
data
=
pheno_data
)
if
(
sum
(
is.na
(
pheno_data
$
Gender
))
==
0
)
{
if
(
sum
(
is.na
(
pheno_data
$
Age
))
==
0
)
{
mod_data
<-
stats
::
model.matrix
(
~
Disease.status
+
Gender
+
as.numeric
(
as.character
(
Age
)),
mod_data
<-
stats
::
model.matrix
(
~
Disease.status
+
Gender
+
as.numeric
(
as.character
(
Age
)),
data
=
pheno_data
)
}
else
{
mod_data
<-
stats
::
model.matrix
(
~
Disease.status
+
Gender
,
data
=
pheno_data
)
mod_data
<-
stats
::
model.matrix
(
~
Disease.status
+
Gender
,
data
=
pheno_data
)
}
}
...
...
@@ -55,26 +60,31 @@ correct_batch_effect <- function(eset,
clean_probe_list
<-
rownames
(
exp_data
)
for
(
b
in
unique
(
batch_data
$
Batch
))
{
# We take care of batch b.
samples_in_batch
<-
which
(
batch_data
$
Batch
==
b
)
exp_data_batch
<-
exp_data
[,
samples_in_batch
]
probe_vars
<-
apply
(
exp_data_batch
,
1
,
var
)
probe_var_0
<-
names
(
probe_vars
[
probe_vars
==
0
])
samples_in_batch
<-
which
(
batch_data
$
Batch
==
b
)
exp_data_batch
<-
exp_data
[,
samples_in_batch
]
probe_vars
<-
apply
(
exp_data_batch
,
1
,
var
)
probe_var_0
<-
names
(
probe_vars
[
probe_vars
==
0
])
if
(
length
(
probe_var_0
)
>
0
)
{
clean_probe_list
<-
setdiff
(
clean_probe_list
,
probe_var_0
)
}
}
r
emove
(
b
,
samples_in_batch
,
exp_data_batch
,
probe_vars
,
probe_var_0
)
r
m
(
b
,
samples_in_batch
,
exp_data_batch
,
probe_vars
,
probe_var_0
)
eset
<-
Biobase
::
ExpressionSet
(
exp_data
[
clean_probe_list
,
])
remove
(
exp_data
,
clean_probe_list
)
# We clean up and log information.
rm
(
exp_data
,
clean_probe_list
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data ready."
))
}
# ComBat corrects for the batch effect.
data_bc
<-
sva
::
ComBat
(
dat
=
Biobase
::
exprs
(
eset
),
batch
=
batch_data
$
Batch
,
mod
=
mod_data
)
data_bc
<-
sva
::
ComBat
(
dat
=
Biobase
::
exprs
(
eset
),
batch
=
batch_data
$
Batch
,
mod
=
mod_data
)
eset_bc
<-
Biobase
::
ExpressionSet
(
data_bc
)
# We clean up and log information.
r
emove
(
batch_data
,
pheno_data
,
mod_data
)
r
m
(
batch_data
,
pheno_data
,
mod_data
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Batch effect corrected."
))
}
...
...
R/extract_DEGs.R
View file @
673b161e
...
...
@@ -16,11 +16,14 @@
#' @param file_prefix A string used to prefix the file names. Default to "".
#' @param pval_adjust_method A string code indicating the multiple testing correction
#' method to use. Default to BH.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return NULL
extract_DEGs
<-
function
(
fit
,
limma_coeffs
,
k
,
output_data_dir
,
file_suffix
=
""
,
file_prefix
=
""
,
pval_adjust_method
=
"BH"
)
{
pval_adjust_method
=
"BH"
,
verbose
=
FALSE
)
{
# We create the output file names.
if
(
file_suffix
!=
""
)
{
...
...
@@ -29,12 +32,12 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
if
(
file_prefix
!=
""
)
{
file_prefix
<-
paste0
(
"_"
,
file_prefix
)
}
venn_f
ilename
<-
paste0
(
output_data_dir
,
file_suffix
,
"venn_"
,
limma_coeffs
[
k
],
file_prefix
,
".png"
)
md_fn
<-
paste0
(
output_data_dir
,
file_suffix
,
"MD_"
,
limma_coeffs
[
k
],
file_prefix
,
".png"
)
table_fn
<-
paste0
(
output_data_dir
,
file_suffix
,
"toptable_"
,
limma_coeffs
[
k
],
file_prefix
,
".tsv"
)
venn_f
n
<-
paste0
(
output_data_dir
,
file_suffix
,
"venn_"
,
limma_coeffs
[
k
],
file_prefix
,
".png"
)
md_fn
<-
paste0
(
output_data_dir
,
file_suffix
,
"MD_"
,
limma_coeffs
[
k
],
file_prefix
,
".png"
)
table_fn
<-
paste0
(
output_data_dir
,
file_suffix
,
"toptable_"
,
limma_coeffs
[
k
],
file_prefix
,
".tsv"
)
# We extract the DEGs using toptable.
table
<-
limma
::
topTable
(
fit
,
...
...
@@ -49,9 +52,14 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
coefficient
=
paste0
(
"coeff"
,
k
),
adjust.method
=
pval_adjust_method
)
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Differentially expressed genes extracted."
))
}
# We plot Venn diagrams that summarize the results and show to which
# extend the analyses overlap.
grDevices
::
png
(
venn_f
ilename
)
grDevices
::
png
(
venn_f
n
)
limma
::
vennDiagram
(
results
,
names
=
limma_coeffs
,
include
=
c
(
"up"
,
"down"
),
...
...
@@ -62,6 +70,12 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
show.include
=
TRUE
)
grDevices
::
dev.off
()
# We clean and log information.
rm
(
venn_fn
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Venn diagram created."
))
}
# We then plot the Mean vs Deviation of the comparisons and highlight the significant genes.
grDevices
::
png
(
md_fn
)
limma
::
plotMD
(
fit
,
...
...
@@ -73,6 +87,18 @@ extract_DEGs <- function(fit, limma_coeffs, k, output_data_dir,
hl.cex
=
c
(
0.9
,
0.9
))
grDevices
::
dev.off
()
# We clean and log information.
rm
(
md_fn
,
results
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] MD plot created."
))
}
# Save the top tables to TSV files.
utils
::
write.table
(
table
,
file
=
table_fn
,
sep
=
"\t"
,
quote
=
FALSE
,
col.names
=
NA
)
# We clean and log information.
rm
(
table
,
table_fn
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Differentially expressed gene list saved."
))
}
}
R/get_gene_annots_from_file.R
View file @
673b161e
...
...
@@ -18,6 +18,7 @@ get_gene_annots_from_file <- function(folder, filename, entities) {
# We read the file into a data-frame.
gene_annots
<-
read.delim
(
file_path
,
row.names
=
NULL
,
stringsAsFactors
=
FALSE
)
rm
(
file_path
)
# We do a bit of cleaning.
gene_annots
[
is.na
(
gene_annots
)]
<-
""
...
...
R/get_gene_annots_from_package.R
View file @
673b161e
...
...
@@ -23,5 +23,7 @@ get_gene_annots_from_package <- function(affy_library_name, entities,
columns
=
columns
,
keytype
=
keytype
),
list
(
local_db
=
as.name
(
affy_library_name
)))
eval
(
cmd
)
rm
(
cmd
)
return
(
gene_annots
)
}
R/load_clinical_data.R
View file @
673b161e
...
...
@@ -10,12 +10,12 @@
#' @param clinical_file_name A string containing the file name. By default, this is 'ClinicalData.tsv'
#' @param use_factors A boolean stating whether the columns should be read as factors (default FALSE).
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return An annotated data-frame that contains the clinical data.
load_clinical_data
<-
function
(
data_dir
,
clinical_file_name
=
"ClinicalData.tsv"
,
use_factors
=
TRUE
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We define the I/Os.
clinical_data_file
<-
paste0
(
data_dir
,
clinical_file_name
)
...
...
@@ -40,6 +40,7 @@ load_clinical_data <- function(data_dir,
if
(
verbose
==
TRUE
)
{
data_dimensions
<-
paste0
(
dim
(
pheno_data
),
collapse
=
" * "
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Clinical data read ("
,
data_dimensions
,
")."
))
rm
(
data_dimensions
)
}
# We return the clinical data.
...
...
R/preprocess_data.R
View file @
673b161e
...
...
@@ -25,7 +25,7 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data
<-
function
(
input_data_dir
,
output_data_files
,
platform
=
"Affymetix"
,
...
...
@@ -34,7 +34,7 @@ preprocess_data <- function(input_data_dir, output_data_files,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We launch the correct function depending on the array platform and desired method.
esets
<-
NULL
...
...
@@ -73,7 +73,7 @@ preprocess_data <- function(input_data_dir, output_data_files,
clean_samples
=
clean_samples
,
verbose
=
verbose
)
}
else
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Platform "
,
platform
,
message
(
paste0
(
"["
,
Sys.time
(),
"]
[WARNING]
Platform "
,
platform
,
" not yet supported (no preprocessing done)."
))
}
...
...
R/preprocess_data_affymetrix_gcrma.R
View file @
673b161e
...
...
@@ -19,24 +19,29 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_affymetrix_gcrma
<-
function
(
input_data_dir
,
output_data_files
,
compressed
=
FALSE
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We define the I/Os.
raw_data_input_dir
<-
paste0
(
input_data_dir
,
"RAW/"
)
# We run the RMA pre-processing method on the data.
input_data_files
<-
affy
::
list.celfiles
(
raw_data_input_dir
,
full.names
=
TRUE
)
remove
(
raw_data_input_dir
)
batch
<-
affy
::
ReadAffy
(
filenames
=
input_data_files
,
compress
=
compressed
,
verbose
=
verbose
)
eset
<-
gcrma
::
gcrma
(
batch
)
# We clean up and log information.
rm
(
raw_data_input_dir
,
input_data_files
,
batch
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data processed."
))
}
# We remove the probes that have 0 variance accross the samples.
exp_data
<-
Biobase
::
exprs
(
eset
)
probe_vars
<-
apply
(
exp_data
,
1
,
var
)
...
...
@@ -44,33 +49,51 @@ preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_files,
if
(
length
(
probe_var_0
)
>
0
)
{
clean_probe_list
<-
setdiff
(
rownames
(
exp_data
),
probe_var_0
)
eset
<-
Biobase
::
ExpressionSet
(
exp_data
[
clean_probe_list
,
])
remove
(
clean_probe_list
)
rm
(
clean_probe_list
)
}
# We clean up and log information.
rm
(
exp_data
,
probe_vars
,
probe_var_0
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step I)."
))
}
remove
(
exp_data
,
probe_vars
,
probe_var_0
)
# We correct for the batch effect if necesary.
eset_bc
<-
NULL
if
(
batch_correction
!=
"FALSE"
)
{
eset_bc
<-
correct_batch_effect
(
eset
=
eset
,
input_data_dir
=
input_data_dir
)
input_data_dir
=
input_data_dir
,
verbose
=
verbose
)
# We log some information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Batch effect corrected."
))
}
if
(
batch_correction
==
"TRUE"
)
{
eset
<-
eset_bc
r
emove
(
eset_bc
)
r
m
(
eset_bc
)
}
}
else
{
r
emove
(
eset_bc
)
r
m
(
eset_bc
)
}
# If necessary, we remove the samples that do not have clinical data.
if
(
clean_samples
)
{
# We load the clinical data as to get the samples to keep.
samples
<-
rownames
(
Biobase
::
pData
(
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
FALSE
)))
verbose
=
verbose
)))
# We only keep the samples with clinical data.
eset
<-
eset
[,
samples
]
if
(
batch_correction
==
"BOTH"
)
{
eset_bc
<-
eset_bc
[,
samples
]
}
# We clean up and log information.
rm
(
samples
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step II)."
))
}
}
# We save the eset data as TSV file.
...
...
@@ -82,10 +105,9 @@ preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_files,
quote
=
FALSE
)
}
# We clean up and log information.
remove
(
input_data_files
,
batch
)
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"]
Expression data pre-processed with RMA
."
))
message
(
paste0
(
"["
,
Sys.time
(),
"]
Processed data written to files
."
))
}
# We return the created ESET(s).
...
...
R/preprocess_data_affymetrix_scan.R
View file @
673b161e
...
...
@@ -19,14 +19,14 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_affymetrix_scan
<-
function
(
input_data_dir
,
output_data_files
,
compressed
=
FALSE
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We define the I/Os.
raw_data_input_dir
<-
paste0
(
input_data_dir
,
"RAW/"
)
...
...
@@ -34,9 +34,14 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
# We run the SCAN pre-processing method on the data.
# We do not run the fast analysis (by default).
input_data_regexp
<-
paste0
(
raw_data_input_dir
,
"*"
)
remove
(
raw_data_input_dir
)
eset
<-
SCAN.UPC
::
SCAN
(
input_data_regexp
,
outFilePath
=
output_data_files
[
1
])
# We clean up and log information.
rm
(
raw_data_input_dir
,
input_data_regexp
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data processed."
))
}
# We remove the probes that have 0 variance accross the samples.
exp_data
<-
Biobase
::
exprs
(
eset
)
probe_vars
<-
apply
(
exp_data
,
1
,
var
)
...
...
@@ -44,33 +49,51 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
if
(
length
(
probe_var_0
)
>
0
)
{
clean_probe_list
<-
setdiff
(
rownames
(
exp_data
),
probe_var_0
)
eset
<-
Biobase
::
ExpressionSet
(
exp_data
[
clean_probe_list
,
])
remove
(
clean_probe_list
)
rm
(
clean_probe_list
)
}
# We clean up and log information.
rm
(
exp_data
,
probe_vars
,
probe_var_0
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step I)."
))
}
remove
(
exp_data
,
probe_vars
,
probe_var_0
)
# We correct for the batch effect if necesary.
eset_bc
<-
NULL
if
(
batch_correction
!=
"FALSE"
)
{
eset_bc
<-
correct_batch_effect
(
eset
=
eset
,
input_data_dir
=
input_data_dir
)
input_data_dir
=
input_data_dir
,
verbose
=
verbose
)
# We log some information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Batch effect corrected."
))
}
if
(
batch_correction
==
"TRUE"
)
{
eset
<-
eset_bc
r
emove
(
eset_bc
)
r
m
(
eset_bc
)
}
}
else
{
r
emove
(
eset_bc
)
r
m
(
eset_bc
)
}
# If necessary, we remove the samples that do not have clinical data.
if
(
clean_samples
)
{
# We load the clinical data as to get the samples to keep.
samples
<-
rownames
(
Biobase
::
pData
(
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
FALSE
)))
verbose
=
verbose
)))
# We only keep the samples with clinical data.
eset
<-
eset
[,
samples
]
if
(
batch_correction
==
"BOTH"
)
{
eset_bc
<-
eset_bc
[,
samples
]
}
# We clean up and log information.
rm
(
samples
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step II)."
))
}
}
# We save the eset_bc data as TSV file. ESET was already done as part of SCAN.
...
...
@@ -81,10 +104,9 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
quote
=
FALSE
)
}
# We clean up and log information.
rm
(
input_data_regexp
)
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"]
Expression data pre-processed with SCAN
."
))
message
(
paste0
(
"["
,
Sys.time
(),
"]
Processed data written to files
."
))
}
# We return the created ESET(s).
...
...
R/preprocess_data_agilent_limma.R
View file @
673b161e
...
...
@@ -19,30 +19,35 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is
TRU
E by default.
#' is
FALS
E by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_agilent_limma
<-
function
(
input_data_dir
,
output_data_files
,
compressed
=
FALSE
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
TRU
E
)
{
verbose
=
FALS
E
)
{
# We define the I/Os.
raw_data_input_dir
<-
paste0
(
input_data_dir
,
"RAW/"
)
# We read the data (aka agilent MA images).
raw_file_list
<-
list.files
(
path
=
raw_data_input_dir
)
batch
<-
limma
::
read.maimages
(
files
=
raw_file_list
,
source
=
"agilent"
,
path
=
raw_data_input_dir
,
green.only
=
TRUE
,
verbose
=
TRUE
)
batch
<-
limma
::
read.maimages
(
files
=
raw_file_list
,
source
=
"agilent"
,
path
=
raw_data_input_dir
,
green.only
=
TRUE
,
verbose
=
TRUE
)
batch_data
<-
log2
(
batch
$
E
)
# We change the probe ids (1 to 45015) to be the probe names instead (A_XX_PXXXX).
# We change the probe ids (
e.g.,
1 to 45015) to be the probe names instead (
e.g.,
A_XX_PXXXX).
rownames
(
batch_data
)
<-
(
batch
$
genes
)
$
ProbeName
# nolint
remove
(
raw_data_input_dir
,
batch
)
# We clean up and log information.
rm
(
raw_data_input_dir
,
raw_file_list
,
batch
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data read."
))
}
# We run the LIMMA pre-processing method on the data.
# First, background correcttion.
...
...
@@ -53,46 +58,71 @@ preprocess_data_agilent_limma <- function(input_data_dir, output_data_files,
# Second, perform quantile normalization.
batch_data_norm
<-
limma
::
normalizeBetweenArrays
(
batch_data_bg
,
method
=
"quantile"
)
# We clean up and log information.
rm
(
batch_data_bg
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data processed."
))
}
# We remove the duplicate rows (based on the row names only).
probe_id_counts
<-
table
(
row.names
(
batch_data
))
unique_probe_ids
<-
setdiff
(
names
(
probe_id_counts
),
names
(
probe_id_counts
[
probe_id_counts
>
1
]))
batch_data_norm
<-
batch_data_norm
[
unique_probe_ids
,
]
rm
(
probe_id_counts
,
unique_probe_ids
,
batch_data
)
# We remove the probes that have 0 variance accross the samples.
probe_vars
<-
apply
(
batch_data_norm
,
1
,
var
)
probe_var_0
<-
names
(
probe_vars
[
probe_vars
==
0
])
if
(
length
(
probe_var_0
)
>
0
)
{
clean_probe_list
<-
setdiff
(
rownames
(
batch_data_norm
),
probe_var_0
)
batch_data_norm
<-
batch_data_norm
[
clean_probe_list
,
]
remove
(
clean_probe_list
)
batch_data_norm
<-
batch_data_norm
[
clean_probe_list
,
]
rm
(
clean_probe_list
)
}
# We clean up and log information.
rm
(
probe_vars
,
probe_var_0
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step I)."
))
}
remove
(
probe_vars
,
probe_var_0
)
# We correct for the batch effect if necesary.
batch_data_norm_bc
<-
NULL
if
(
batch_correction
!=
"FALSE"
)
{
batch_data_norm_bc
<-
correct_batch_effect
(
eset
=
batch_data_norm
,
input_data_dir
=
input_data_dir
,
verbose
=
verbose
,
is_eset
=
FALSE
)
# We log some information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Batch effect corrected."
))
}
if
(
batch_correction
==
"TRUE"
)
{
batch_data_norm
<-
batch_data_norm_bc
r
emove
(
batch_data_norm_bc
)
r
m
(
batch_data_norm_bc
)
}
}
else
{
r
emove
(
batch_data_norm_bc
)
r
m
(
batch_data_norm_bc
)
}
# If necessary, we remove the samples that do not have clinical data.
if
(
clean_samples
)
{
# We load the clinical data as to get the samples to keep.
samples
<-
rownames
(
Biobase
::
pData
(
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
FALSE
)))
verbose
=
verbose
)))
# We only keep the samples with clinical data.
batch_data_norm
<-
batch_data_norm
[,
samples
]
if
(
batch_correction
==
"BOTH"
)
{
batch_data_norm_bc
<-
batch_data_norm_bc
[,
samples
]
}
# We clean up and log information.
rm
(
samples
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step II)."
))
}
}
# We save the eset data as TSV file.
...
...
@@ -105,12 +135,12 @@ preprocess_data_agilent_limma <- function(input_data_dir, output_data_files,
utils
::
write.table
(
batch_data_norm_bc
,
file
=
output_data_files
[
2
],
sep
=
"\t"
,
quote
=
FALSE
)
rm
(
batch_data_norm_bc
)
}
else
{
r
emove
(
eset_bc
)
r
m
(
eset_bc
)
}
# We
clean up and
log information.
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"]
Expression data pre-processed with LIMMA
."
))
message
(
paste0
(
"["
,
Sys.time
(),
"]
Processed data written to files
."
))
}
# We return the created ESET(s).
...
...
R/preprocess_data_illumina_beadarray.R
View file @
673b161e
...
...
@@ -20,14 +20,14 @@
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This