Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Gitlab will go into maintenance Friday 3rd February from 9:00 to 10:00
Open sidebar
BDS
GeneDER
ArrayUtils
Commits
ada4dbce
Commit
ada4dbce
authored
Apr 17, 2019
by
Leon-Charles Tranchevent
Browse files
Preprocessing functions added.
parent
53aad4aa
Changes
5
Hide whitespace changes
Inline
Side-by-side
DESCRIPTION
View file @
ada4dbce
...
...
@@ -6,9 +6,9 @@ Author: Leon-Charles Tranchevent
Maintainer: Leon-Charles Tranchevent <leon-charles.tranchevent@uni.lu>
Description: This package contains functions to analyse microarray data.
It is more a set of useful functions than a real package.
License: The Unlicense
License: The Unlicense
(see LICENSE)
Encoding: UTF-8
LazyData: true
Imports:
utils,affy,Biobase,arrayQualityMetrics
utils,affy,Biobase,arrayQualityMetrics
,SCAN.UPC,doParallel
RoxygenNote: 6.1.1
R/load_clinical_data.R
0 → 100644
View file @
ada4dbce
#' @title Loads a table containing clinical data.
#'
#' @description This function loads the clinical data associated with a
#' dataset. It returns an annotated data-frame that contains the clinical data.
#'
#' The clinical data are read from a tab-delimited file (via
#' \code{utils::read.delim}). The first column of the file is used as row
#' names and the columns are read as factors.
#'
#' Note: the function assumes that a TSV file containing the clinical data
#' exists. In particular, it does not check for the existence of folders or
#' files.
#'
#' @param data_dir A string representing the folder that contains the clinical
#' data. A trailing path separator is optional: when it is missing, the folder
#' and the file name are joined with \code{file.path}.
#' @param clinical_file_name A string containing the file name. By default,
#' this is 'ClinicalData.tsv'.
#' @param verbose A boolean representing whether the function should display
#' log information. This is TRUE by default.
#' @return An annotated data-frame that contains the clinical data.
load_clinical_data <- function(data_dir,
                               clinical_file_name = "ClinicalData.tsv",
                               verbose = TRUE) {

  # We define the I/Os. Plain concatenation is kept when the folder is empty
  # or already ends with a separator, so that existing callers get the exact
  # same path as before. Otherwise a separator is inserted, since
  # "dir" + "file" would silently point to a non-existing "dirfile".
  if (nzchar(data_dir) && !grepl("[/\\\\]$", data_dir)) {
    clinical_data_file <- file.path(data_dir, clinical_file_name)
  } else {
    clinical_data_file <- paste0(data_dir, clinical_file_name)
  }
  # isTRUE() is used instead of '== TRUE' so that an NA or non-scalar value
  # simply disables logging instead of raising an obscure error in if().
  if (isTRUE(verbose)) {
    message(paste0("[", Sys.time(), "] File set to ", clinical_data_file))
  }

  # We load the clinical data.
  pheno_data <- Biobase::AnnotatedDataFrame(
    utils::read.delim(
      file = clinical_data_file,
      row.names = 1,
      colClasses = "factor"
    )
  )

  # We log information.
  if (isTRUE(verbose)) {
    data_dimensions <- paste0(dim(pheno_data), collapse = " * ")
    message(paste0(
      "[", Sys.time(), "] Clinical data read (", data_dimensions, ")."
    ))
  }

  # We return the clinical data.
  pheno_data
}
R/preprocess_data_scan.R
0 → 100644
View file @
ada4dbce
#' @title Preprocess a dataset with SCAN.
#'
#' @description This function preprocesses a dataset using SCAN and saves the
#' results in a given TSV file. In addition, it returns the ESET object.
#'
#' The function assumes that a folder containing the raw data exists (as cel
#' files). The raw files are looked up in the 'RAW/' subfolder of the input
#' folder (all files of that subfolder are passed to SCAN).
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the
#' input data. A trailing path separator is optional: it is added when missing.
#' @param output_data_file A string representing the file that should contain
#' the preprocessed data.
#' @param correct_for_batch_effect A boolean indicating whether batch
#' correction should be performed, default to FALSE. An error is raised if the
#' value can not be interpreted as a single TRUE or FALSE.
#' @param batch_filename A string indicating where the batch information can be
#' found (file name, relative to the input folder), default to 'Batch.tsv'.
#' Only used when batch correction is requested.
#' @param verbose A boolean representing whether the function should display
#' log information. This is TRUE by default.
#' @return The expression data as an ESET object (as returned by
#' \code{SCAN.UPC::SCAN}).
preprocess_data_scan <- function(input_data_dir,
                                 output_data_file,
                                 correct_for_batch_effect = FALSE,
                                 batch_filename = "Batch.tsv",
                                 verbose = TRUE) {

  # We validate the batch flag explicitly: an NA value would otherwise make
  # if() fail with a message that does not mention the faulty argument.
  use_batch <- as.logical(correct_for_batch_effect)
  if (length(use_batch) != 1L || is.na(use_batch)) {
    stop("'correct_for_batch_effect' must be a single TRUE or FALSE.",
         call. = FALSE)
  }

  # We define the I/Os. A separator is appended to the input folder only when
  # it is missing, so that folders already given with a trailing separator
  # lead to the exact same paths as before.
  if (nzchar(input_data_dir) && !grepl("[/\\\\]$", input_data_dir)) {
    input_data_dir <- paste0(input_data_dir, "/")
  }

  # We run the SCAN pre-processing method on the data.
  # We do not run the fast analysis (by default).
  input_data_regexp <- paste0(input_data_dir, "RAW/", "*")
  if (use_batch) {
    # We define the I/Os (for batch).
    batch_data_file <- paste0(input_data_dir, batch_filename)
    eset <- SCAN.UPC::SCAN(input_data_regexp,
                           outFilePath = output_data_file,
                           batchFilePath = batch_data_file)
  } else {
    eset <- SCAN.UPC::SCAN(input_data_regexp,
                           outFilePath = output_data_file)
  }

  # We log information.
  if (isTRUE(verbose)) {
    message(paste0(
      "[", Sys.time(), "] Expression data pre-processed with SCAN."
    ))
  }

  # We return the created ESET.
  eset
}
R/run_quality_control_on_preprocessed.R
0 → 100644
View file @
ada4dbce
#' @title Executes a quality control of a given microarray dataset (preprocessed data).
#'
#' @description This function executes a quality control of the dataset
#' defined by the input parameters. It starts by loading the associated
#' clinical data in order to annotate the preprocessed data, and then runs the
#' quality control of the annotated data. The function assumes that a folder
#' with the clinical data exists. It then creates a report that contains
#' various quality indicators and is stored as an HTML document. It does not
#' return any value.
#'
#' Note: the function does not check for the existence of folders or files.
#' Any existing content of the output folder may be overwritten (the QC is run
#' with \code{force = TRUE}).
#'
#' @param eset An ESET object that contains the preprocessed expression data.
#' @param input_data_dir A string representing the folder that contains the
#' input data (clinical data).
#' @param output_data_dir A string representing the folder that will contain
#' the output of the QC.
#' @param phenotype_groups A list (or character vector) of phenotype factor
#' names that can be used to highlight the samples in the QC report. This is
#' none by default.
#' @param verbose A boolean representing whether the function should display
#' log information. This is TRUE by default.
#' @return NULL (invisibly).
run_quality_control_on_preprocessed <- function(eset,
                                                input_data_dir,
                                                output_data_dir,
                                                phenotype_groups = vector(),
                                                verbose = TRUE) {

  # We load the clinical data as to annotate the ESET object and make QC more
  # useful.
  pheno_data <- ArrayUtils::load_clinical_data(input_data_dir,
                                               verbose = verbose)
  Biobase::phenoData(eset) <- pheno_data

  # arrayQualityMetrics documents 'intgroup' as a character vector, whereas
  # the default here is an empty logical vector and callers may provide a
  # list. We therefore flatten and coerce before the call.
  intgroup <- as.character(unlist(phenotype_groups))

  # Now, we do the QC on the normalized data.
  # NOTE(review): do.logtransform = TRUE asks arrayQualityMetrics to
  # log-transform the values. If the ESET already holds log-scale values
  # (presumably the case after SCAN or RMA - to be confirmed), this second
  # transformation might not be desirable.
  arrayQualityMetrics::arrayQualityMetrics(
    expressionset   = eset,
    outdir          = output_data_dir,
    force           = TRUE,
    do.logtransform = TRUE,
    intgroup        = intgroup
  )

  # We log information.
  # isTRUE() is used instead of '== TRUE' so that an NA or non-scalar value
  # simply disables logging instead of raising an obscure error in if().
  if (isTRUE(verbose)) {
    message(paste0(
      "[", Sys.time(), "] QC analysis performed (preprocessed data)."
    ))
  }

  # The function is only called for its side effects (the HTML report).
  invisible(NULL)
}
R/run_quality_control.R
→
R/run_quality_control
_on_raw
.R
View file @
ada4dbce
#' @title Executes a quality control of a given micro-array dataset.
#' @title Executes a quality control of a given microarray dataset (raw data).
#'
#' @description This function executes a quality control of the dataset defined by the input parameters.
#' It starts by loading the clinical data associated with the dataset, then loads the raw data and
#' last runs the quality control of the annotated data.
#'
#' The function assumes that the dataset is associated with a unique name (e.g., GEO identifier)
#' and that a folder with this name exists and contains both the clinical data ('ClinicalData.tsv')
#' and the raw data (as cel files in a '/RAW/' folder).
#' last runs the quality control of the annotated data. The function assumes that a folder with
#' both the clinical data and the raw data exists (in a subfolder '/RAW/'). It then creates a report
#' that contains various quality indicators and is stored as an HTML document. It does not return
#' any value.
#'
#' It then creates a report that contains various quality indicators and is stored as an HTML
#' document. It does not return any value.
#' Note: the function does not check for the existence of folders or files.
#'
#' Note: the function does not check for the existence of folders and files (not a real package
#' function).
#'
#' @param dataset_name A string representing the name of the dataset to analyse.
#' @param raw_data_dir A string representing the folder that contains the input data.
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' TRUE by default.
...
...
@@ -22,29 +17,18 @@
#' samples in the QC report. This is none by default.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return NULL
run_quality_control
<-
function
(
dataset_name
,
raw_data_dir
,
#' @return NULL
run_quality_control_on_raw
<-
function
(
input_data_dir
,
output_data_dir
,
compressed
=
TRUE
,
phenotype_groups
=
c
(),
phenotype_groups
=
vector
(),
verbose
=
TRUE
)
{
# We define the I/Os.
clinical_data_file
<-
paste0
(
raw_data_dir
,
dataset_name
,
"/"
,
"ClinicalData.tsv"
)
raw_data_input_dir
<-
paste0
(
raw_data_dir
,
dataset_name
,
"/"
,
"RAW/"
)
data_output_dir
<-
paste0
(
output_data_dir
,
dataset_name
,
"/"
)
raw_data_input_dir
<-
paste0
(
input_data_dir
,
"RAW/"
)
# We load the clinical data as to annotate the AffyBatch object and make QC more useful.
pheno_data
<-
Biobase
::
AnnotatedDataFrame
(
utils
::
read.delim
(
file
=
clinical_data_file
,
row.names
=
1
,
colClasses
=
"factor"
))
# We clean up and log information.
rm
(
clinical_data_file
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"]["
,
dataset_name
,
"] Phenotypic data read."
))
}
pheno_data
<-
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
verbose
)
# We load the CEL files to create the affyBatch object and then attach the clinical data.
raw_file_list
<-
affy
::
list.celfiles
(
raw_data_input_dir
,
full.names
=
TRUE
)
...
...
@@ -54,19 +38,19 @@ run_quality_control <- function(dataset_name,
# We clean up and log information.
rm
(
raw_file_list
,
pheno_data
,
raw_data_input_dir
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"]["
,
dataset_name
,
"] Expression data read."
))
message
(
paste0
(
"["
,
Sys.time
(),
"] Expression data read."
))
}
# We run the quality control itself.
arrayQualityMetrics
::
arrayQualityMetrics
(
expressionset
=
batch
,
outdir
=
data_
output_dir
,
outdir
=
output_
data_
dir
,
force
=
TRUE
,
do.logtransform
=
TRUE
,
intgroup
=
phenotype_groups
)
# We clean up and log information.
rm
(
data_output_dir
,
batch
)
rm
(
batch
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"]["
,
dataset_name
,
"] QC analysis performed."
))
message
(
paste0
(
"["
,
Sys.time
(),
"] QC analysis performed
(raw data)
."
))
}
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment