Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
ArrayUtils
Commits
688d482a
Commit
688d482a
authored
Oct 01, 2019
by
Leon-Charles Tranchevent
Browse files
Added the support for GCRMA data generated using APT (Affymetrix Power Tools).
parent
be579cbe
Changes
5
Hide whitespace changes
Inline
Side-by-side
R/preprocess_data.R
View file @
688d482a
...
...
@@ -3,7 +3,7 @@
#' @description This function preprocess a dataset and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (
as
cel files).
#' The function assumes that a folder containing the raw data exists (
with
cel files).
#' It currently supports Affymetrix, Illumina and Agilent arrays. This function is
#' just a handler over the platform dedicated functions.
#'
...
...
@@ -16,6 +16,8 @@
#' Default to Affymetrix.
#' @param method A string representing the preprocessing method to use (for Affymetrix arrays
#' where multiple methods are supported). Default to SCAN for Affymetrix data.
#' @param exprs_raw A matrix corresponding to the pre-processed data (in the case of APT-GCRMA).
#' Default to NULL since in most cases, we do not have pre-processed data already.
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param batch_correction A String indicating whether batch correction should
...
...
@@ -30,6 +32,7 @@
preprocess_data
<-
function
(
input_data_dir
,
output_data_files
,
platform
=
"Affymetix"
,
method
=
"SCAN"
,
exprs_raw
=
NULL
,
compressed
=
FALSE
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
...
...
@@ -55,6 +58,14 @@ preprocess_data <- function(input_data_dir, output_data_files,
batch_filename
=
batch_filename
,
clean_samples
=
clean_samples
,
verbose
=
verbose
)
}
else
if
(
method
==
"APT-GCRMA"
)
{
esets
<-
preprocess_data_affymetrix_aptgcrma
(
exprs_raw
,
input_data_dir
,
output_data_files
,
batch_correction
=
batch_correction
,
batch_filename
=
batch_filename
,
clean_samples
=
clean_samples
,
verbose
=
verbose
)
}
}
else
if
(
platform
==
"Agilent"
)
{
esets
<-
preprocess_data_agilent_limma
(
input_data_dir
,
...
...
R/preprocess_data_affymetrix_aptgcrma.R
0 → 100644
View file @
688d482a
#' @title Preprocess an Affymetrix dataset with APT-GC-RMA.
#'
#' @description This function preprocess an Affymetrix dataset that was already analysed
#' with APT-GCRMA in order to perform the cleaning / filtering / batch corection steps.
#' The results are saves in TSV file(s). In addition, it returns the ESET object(s).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param exprs_raw A matrix corresponding to the pre-processed APT-GCRMA data.
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet suported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_affymetrix_aptgcrma
<-
function
(
exprs_raw
,
input_data_dir
,
output_data_files
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
FALSE
)
{
# We create an ESET object from the existing expression matrix.
eset
<-
Biobase
::
ExpressionSet
(
exprs_raw
)
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data read"
))
}
# We remove the probes that have 0 variance accross the samples.
exp_data
<-
Biobase
::
exprs
(
eset
)
probe_vars
<-
apply
(
exp_data
,
1
,
var
)
probe_var_0
<-
names
(
probe_vars
[
probe_vars
==
0
])
if
(
length
(
probe_var_0
)
>
0
)
{
clean_probe_list
<-
setdiff
(
rownames
(
exp_data
),
probe_var_0
)
eset
<-
Biobase
::
ExpressionSet
(
exp_data
[
clean_probe_list
,
])
rm
(
clean_probe_list
)
}
# We clean up and log information.
rm
(
exp_data
,
probe_vars
,
probe_var_0
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step I)."
))
}
# We correct for the batch effect if necesary.
eset_bc
<-
NULL
if
(
batch_correction
!=
"FALSE"
)
{
eset_bc
<-
correct_batch_effect
(
eset
=
eset
,
input_data_dir
=
input_data_dir
,
verbose
=
verbose
)
# We log some information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Batch effect corrected."
))
}
if
(
batch_correction
==
"TRUE"
)
{
eset
<-
eset_bc
rm
(
eset_bc
)
}
}
else
{
rm
(
eset_bc
)
}
# If necessary, we remove the samples that do not have clinical data.
if
(
clean_samples
)
{
# We load the clinical data as to get the samples to keep.
samples
<-
rownames
(
Biobase
::
pData
(
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
verbose
)))
# We only keep the samples with clinical data.
eset
<-
eset
[,
samples
]
if
(
batch_correction
==
"BOTH"
)
{
eset_bc
<-
eset_bc
[,
samples
]
}
# We clean up and log information.
rm
(
samples
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step II)."
))
}
}
# We save the eset data as TSV file.
utils
::
write.table
(
Biobase
::
exprs
(
eset
),
file
=
output_data_files
[
1
],
sep
=
"\t"
,
quote
=
FALSE
)
if
(
batch_correction
==
"BOTH"
)
{
utils
::
write.table
(
Biobase
::
exprs
(
eset_bc
),
file
=
output_data_files
[
2
],
sep
=
"\t"
,
quote
=
FALSE
)
}
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Processed data written to files."
))
}
# We return the created ESET(s).
if
(
batch_correction
==
"BOTH"
)
{
return
(
list
(
eset
,
eset_bc
))
}
else
{
return
(
eset
)
}
}
R/preprocess_data_affymetrix_gcrma.R
View file @
688d482a
#' @title Preprocess an Affymetrix dataset with GC-RMA.
#'
#' @description This function preprocess an Affymetrix dataset using RMA and saves the
#' @description This function preprocess an Affymetrix dataset using
GC
RMA and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel files).
...
...
@@ -48,7 +48,7 @@ preprocess_data_affymetrix_gcrma <- function(input_data_dir, output_data_files,
probe_var_0
<-
names
(
probe_vars
[
probe_vars
==
0
])
if
(
length
(
probe_var_0
)
>
0
)
{
clean_probe_list
<-
setdiff
(
rownames
(
exp_data
),
probe_var_0
)
eset
<-
Biobase
::
ExpressionSet
(
exp_data
[
clean_probe_list
,
])
eset
<-
Biobase
::
ExpressionSet
(
exp_data
[
clean_probe_list
,
])
rm
(
clean_probe_list
)
}
...
...
R/preprocess_data_affymetrix_scan.R
View file @
688d482a
...
...
@@ -34,7 +34,7 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
# We run the SCAN pre-processing method on the data.
# We do not run the fast analysis (by default).
input_data_regexp
<-
paste0
(
raw_data_input_dir
,
"*"
)
eset
<-
SCAN.UPC
::
SCAN
(
input_data_regexp
)
eset
<-
SCAN.UPC
::
SCAN
(
input_data_regexp
)
# We clean up and log information.
rm
(
raw_data_input_dir
,
input_data_regexp
)
...
...
@@ -48,7 +48,7 @@ preprocess_data_affymetrix_scan <- function(input_data_dir, output_data_files,
probe_var_0
<-
names
(
probe_vars
[
probe_vars
==
0
])
if
(
length
(
probe_var_0
)
>
0
)
{
clean_probe_list
<-
setdiff
(
rownames
(
exp_data
),
probe_var_0
)
eset
<-
Biobase
::
ExpressionSet
(
exp_data
[
clean_probe_list
,
])
eset
<-
Biobase
::
ExpressionSet
(
exp_data
[
clean_probe_list
,
])
rm
(
clean_probe_list
)
}
...
...
R/run_limma.R
View file @
688d482a
...
...
@@ -77,7 +77,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Design matrix created."
))
}
}
else
{
# correct_for_batch == TRUE
}
else
{
# Here we have correct_for_batch = TRUE
# We do not have a block, we do not include age but we do include batch.
batches
<-
factor
(
pheno_data
[[
"Batch"
]])
design
<-
stats
::
model.matrix
(
~
0
+
clinical
+
batches
)
...
...
@@ -89,7 +90,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
message
(
paste0
(
"["
,
Sys.time
(),
"] Design matrix created (batches)."
))
}
}
}
else
{
# correct_for_age == TRUE
}
else
{
# Here we have correct_for_age = TRUE
if
(
correct_for_batch
==
FALSE
)
{
# We do not have a block, we do include age, but not batch.
ages
<-
as.numeric
(
pheno_data
[[
"Age"
]])
...
...
@@ -101,7 +103,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Design matrix created (ages)."
))
}
}
else
{
# correct_for_batch == TRUE
}
else
{
# Here we have correct_for_batch = TRUE
# We do not have a block, we do include age and batch.
ages
<-
as.numeric
(
pheno_data
[[
"Age"
]])
batches
<-
factor
(
pheno_data
[[
"Batch"
]])
...
...
@@ -115,7 +118,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
}
}
}
}
else
{
# has_block == TRUE
}
else
{
# Here we have has_block = TRUE
if
(
correct_for_age
==
FALSE
)
{
if
(
correct_for_batch
==
FALSE
)
{
# We do have a block, we do not include age, nor batch.
...
...
@@ -128,7 +132,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Design matrix created (pairs)."
))
}
}
else
{
# correct_for_batch == TRUE
}
else
{
# Here we have correct_for_batch = TRUE
# We do have a block, we do not include age but we do include batch.
clinical_cofactor
<-
factor
(
pheno_data
[[
cofactor_name
]])
# nolint
batches
<-
factor
(
pheno_data
[[
"Batch"
]])
...
...
@@ -143,7 +148,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
message
(
paste0
(
"["
,
Sys.time
(),
"] Design matrix created (pairs, batches)."
))
}
}
}
else
{
# correct_for_age == TRUE
}
else
{
# Here we have correct_for_age = TRUE
if
(
correct_for_batch
==
FALSE
)
{
# We do have a block, we do include age, but not batch.
clinical_cofactor
<-
factor
(
pheno_data
[[
cofactor_name
]])
# nolint
...
...
@@ -156,12 +162,14 @@ run_limma <- function(pheno_data, eset, limma_parameters,
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Design matrix created (pairs, ages)."
))
}
}
else
{
# correct_for_batch == TRUE
}
else
{
# Here we have correct_for_batch = TRUE
# We do have a block, we do include age and batch.
clinical_cofactor
<-
factor
(
pheno_data
[[
cofactor_name
]])
# nolint
ages
<-
as.numeric
(
pheno_data
[[
"Age"
]])
batches
<-
factor
(
pheno_data
[[
"Batch"
]])
design
<-
stats
::
model.matrix
(
~
0
+
clinical
+
ages
+
batches
+
clinical_cofactor
)
design
<-
stats
::
model.matrix
(
~
0
+
clinical
+
ages
+
batches
+
clinical_cofactor
)
colnames
(
design
)
<-
c
(
levels
(
clinical
),
"Age"
,
levels
(
batches
)[
2
:
length
(
levels
(
batches
))],
...
...
@@ -181,7 +189,7 @@ run_limma <- function(pheno_data, eset, limma_parameters,
contrast
<-
eval
(
substitute
(
limma
::
makeContrasts
(
coeff1
=
c1
,
levels
=
design
),
list
(
c1
=
coefficients
)))
if
(
run_topconfects
)
{
design_topconfects
<-
limma
::
contrastAsCoef
(
design
,
contrast
)
$
design
design_topconfects
<-
limma
::
contrastAsCoef
(
design
,
contrast
)
$
design
}
# We log information.
...
...
@@ -233,7 +241,7 @@ run_limma <- function(pheno_data, eset, limma_parameters,
# We need to check whether we have any confect at all (if no effect is > 0 at FDR = 0.05,
# then all confects are set to NA values).
confects_genes
<-
confects
$
table
%>%
filter
(
!
is.na
(
confect
))
confects_genes
<-
confects
$
table
%>%
filter
(
!
is.na
(
confect
))
#nolint
if
(
nrow
(
confects_genes
)
>
0
)
{
# Log info.
...
...
@@ -255,8 +263,8 @@ run_limma <- function(pheno_data, eset, limma_parameters,
tcplot_fn
<-
paste0
(
output_data_dir
,
file_prefix
,
"topconfects_"
,
limma_parameters
$
name
,
file_suffix
,
".png"
)
grDevices
::
png
(
tcplot_fn
)
local_n
<-
min
(
50
,
nrow
(
confects_genes
))
p
<-
topconfects
::
confects_plot
(
confects
,
n
=
local_n
)
local_n
<-
min
(
50
,
nrow
(
confects_genes
))
p
<-
topconfects
::
confects_plot
(
confects
,
n
=
local_n
)
print
(
p
)
grDevices
::
dev.off
()
rm
(
p
,
tcplot_fn
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment