Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
ArrayUtils
Commits
056b87fa
Commit
056b87fa
authored
Mar 16, 2021
by
Leon-Charles Tranchevent
Browse files
Added RNA-seq support.
parent
4299aa24
Changes
2
Hide whitespace changes
Inline
Side-by-side
R/preprocess_data.R
View file @
056b87fa
...
...
@@ -20,6 +20,8 @@
#' Default to NULL since in most cases, we do not have pre-processed data already.
#' @param compressed A boolean representing whether the cel files are compressed. This
#' is FALSE by default.
#' @param expression_filename A String indicating where the expression matrix can be found,
#' default to 'Expdata.tsv'.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
...
...
@@ -30,14 +32,15 @@
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data
<-
function
(
input_data_dir
,
output_data_files
,
platform
=
"Affymetix"
,
method
=
"SCAN"
,
exprs_raw
=
NULL
,
compressed
=
FALSE
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
FALSE
)
{
platform
=
"Affymetix"
,
method
=
"SCAN"
,
exprs_raw
=
NULL
,
compressed
=
FALSE
,
expression_filename
=
"ExpData.tsv"
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
verbose
=
FALSE
)
{
# We launch the correct function depending on the array platform and desired method.
esets
<-
NULL
...
...
@@ -83,6 +86,14 @@ preprocess_data <- function(input_data_dir, output_data_files,
batch_filename
=
batch_filename
,
clean_samples
=
clean_samples
,
verbose
=
verbose
)
}
else
if
(
platform
==
"RNAseq"
)
{
esets
<-
preprocess_data_rnaseq
(
input_data_dir
,
output_data_files
,
expression_filename
=
expression_filename
,
batch_correction
=
batch_correction
,
batch_filename
=
batch_filename
,
clean_samples
=
clean_samples
,
verbose
=
verbose
)
}
else
{
message
(
paste0
(
"["
,
Sys.time
(),
"] [WARNING] Platform "
,
platform
,
" not yet supported (no preprocessing done)."
))
...
...
R/preprocess_data_rnaseq.R
0 → 100644
View file @
056b87fa
#' @title Post preprocess a RNA-seq dataset.
#'
#' @description This function preprocess a dataset that was already pre-processed
#' in order to perform the cleaning / filtering / batch corection steps.
#' The results are saves in TSV file(s). In addition, it returns the ESET object(s).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_files An array of strings representing the files that should contain the
#' preprocessed data. At least one value, maximum two if batch_correction is "BOTH".
#' @param expression_filename A String indicating where the expression matrix can be found,
#' default to 'Expdata.tsv'.
#' @param batch_correction A String indicating whether batch correction should
#' be performed. Options are "TRUE", "FALSE", "BOTH", default to "FALSE".
#' @param batch_filename A string indicating where the batch information can be found,
#' default to 'Batch.tsv'. Note: not yet suported.
#' @param clean_samples A boolean indicating whether the dataset should be cleaned by removing
#' the samples that do not have clinical data. Default to FALSE.
#' @param remove_low_counts A boolean indicating whether low counts should be removed prior
#' to the normalization, TRUE by default.
#' @param low_count_threshold A numeric representing the maximum number of counts associated to
#' transcripts considered as not lowly expressed. Default value is 5.
#' @param verbose A boolean representing whether the function should display log information. This
#' is FALSE by default.
#' @return The expression data as ESET objects. Potentially only one object (therefore unlisted).
preprocess_data_rnaseq
<-
function
(
input_data_dir
,
output_data_files
,
expression_filename
=
"ExpData.tsv"
,
batch_correction
=
"FALSE"
,
batch_filename
=
"Batch.tsv"
,
clean_samples
=
FALSE
,
remove_low_counts
=
TRUE
,
low_count_threshold
=
5
,
verbose
=
FALSE
)
{
# We read the pre-processed data (RNA-seq).
exp_data_fn
<-
paste0
(
input_data_dir
,
expression_filename
)
exp_data
<-
ArrayUtils
::
read_eset
(
exp_data_fn
,
as_eset
=
FALSE
)
rm
(
exp_data_fn
)
# The RNA-seq data consists of counts so we need to post-process
# it to get some kind of microarray compatible data (for instance on a log2 scale).
# We first need to filter the low expressed transcripts.
if
(
remove_low_counts
)
{
keep
<-
rowSums
(
exp_data
)
>
low_count_threshold
exp_data
<-
exp_data
[
keep
,
]
rm
(
keep
)
}
# We need to normalize the counts.
exp_data_dge
<-
edgeR
::
DGEList
(
counts
=
exp_data
)
exp_data_dge
<-
edgeR
::
calcNormFactors
(
exp_data_dge
)
rm
(
exp_data
)
# CPM then does the normalization and the log-transformation.
exp_data_norm
<-
limma
::
voom
(
exp_data_dge
,
normalize.method
=
"quantile"
)
$
E
rm
(
exp_data_dge
)
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Raw data read"
))
}
# We remove the probes that have 0 variance accross the samples.
probe_vars
<-
apply
(
exp_data_norm
,
1
,
var
)
probe_var_0
<-
names
(
probe_vars
[
probe_vars
==
0
])
eset
<-
NULL
if
(
length
(
probe_var_0
)
>
0
)
{
clean_probe_list
<-
setdiff
(
rownames
(
exp_data_norm
),
probe_var_0
)
eset
<-
Biobase
::
ExpressionSet
(
exp_data_norm
[
clean_probe_list
,
])
rm
(
clean_probe_list
)
}
else
{
eset
<-
Biobase
::
ExpressionSet
(
exp_data_norm
)
}
# We clean up and log information.
rm
(
exp_data_norm
,
probe_vars
,
probe_var_0
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step I)."
))
}
# We correct for the batch effect if necesary.
eset_bc
<-
NULL
if
(
batch_correction
!=
"FALSE"
)
{
eset_bc
<-
correct_batch_effect
(
eset
=
eset
,
input_data_dir
=
input_data_dir
,
verbose
=
verbose
)
# We log some information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Batch effect corrected."
))
}
if
(
batch_correction
==
"TRUE"
)
{
eset
<-
eset_bc
rm
(
eset_bc
)
}
}
else
{
rm
(
eset_bc
)
}
# If necessary, we remove the samples that do not have clinical data.
if
(
clean_samples
)
{
# We load the clinical data as to get the samples to keep.
samples
<-
rownames
(
Biobase
::
pData
(
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
verbose
)))
# We only keep the samples with clinical data.
eset
<-
eset
[,
samples
]
if
(
batch_correction
==
"BOTH"
)
{
eset_bc
<-
eset_bc
[,
samples
]
}
# We clean up and log information.
rm
(
samples
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Data cleaned (step II)."
))
}
}
# We save the eset data as TSV file.
utils
::
write.table
(
Biobase
::
exprs
(
eset
),
file
=
output_data_files
[
1
],
sep
=
"\t"
,
quote
=
FALSE
)
if
(
batch_correction
==
"BOTH"
)
{
utils
::
write.table
(
Biobase
::
exprs
(
eset_bc
),
file
=
output_data_files
[
2
],
sep
=
"\t"
,
quote
=
FALSE
)
}
# We log information.
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Processed data written to files."
))
}
# We return the created ESET(s).
if
(
batch_correction
==
"BOTH"
)
{
return
(
list
(
eset
,
eset_bc
))
}
else
{
return
(
eset
)
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment