Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
BDS
GeneDER
ArrayUtils
Commits
13644b47
Commit
13644b47
authored
Jun 07, 2019
by
Leon-Charles Tranchevent
Browse files
Added Agilent support for quality control.
parent
64af219d
Changes
7
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
13644b47
...
...
@@ -2,3 +2,4 @@
.Rbuildignore
ArrayUtils.Rproj
man/*
.Rproj.user
DESCRIPTION
View file @
13644b47
...
...
@@ -20,5 +20,9 @@ Imports:
limma,
stringr,
AnnotationDbi,
statmod
statmod,
heatmaply,
ggplot2,
ggfortify,
reshape2
RoxygenNote: 6.1.1
NAMESPACE
View file @
13644b47
exportPattern("^[[:alpha:]]+")
importFrom("grDevices", "dev.off", "png")
importFrom("stats", "cor", "df", "prcomp")
R/load_clinical_data.R
View file @
13644b47
...
...
@@ -38,7 +38,7 @@ load_clinical_data <- function(data_dir,
# We clean up and log information.
rm
(
clinical_data_file
)
if
(
verbose
==
TRUE
)
{
data_dimensions
=
paste0
(
dim
(
pheno_data
),
collapse
=
" * "
)
data_dimensions
<-
paste0
(
dim
(
pheno_data
),
collapse
=
" * "
)
message
(
paste0
(
"["
,
Sys.time
(),
"] Clinical data read ("
,
data_dimensions
,
")."
))
}
...
...
R/run_quality_control_on_raw.R
View file @
13644b47
#' @title Executes a quality control of a given microarray dataset (raw data).
#'
#' @description This function executes a quality control of the dataset defined by the input parameters.
#' It starts by loading the clinical data associated with the dataset, then loads the raw data and
#' last runs the quality control of the annotated data. The function assumes that a folder with
#' both the clinical data and the raw data exists (in a subfolder '/RAW/'). It then creates a report
#' that contains various quality indicators and is stored as an HTML document. It does not return
#' any value.
#' It currently supports Affymetrix and Agilent arrays although the QC for Agilent produces only
#' a short report currently (mostly array images). This function is just a handler over the platform
#' dedicated functions.
#'
#' For Affymettrix arrays, it starts by loading the clinical data associated with the dataset, then
#' loads the raw data and last runs the quality control of the annotated data. The function assumes
#' that a folder with both the clinical data and the raw data exists (in a subfolder '/RAW/'). It
#' then creates a report that contains various quality indicators and is stored as an HTML document.
#' It does not return any value.
#'
#' For Agilent arrays, it ... TODO
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' @param platform A string representing the array platform among Affymetrix and Agilent.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' TRUE by default.
#' @param phenotype_groups A list of phenotype factor names that can be used to highlight the
#' samples in the QC report. This is none by default.
...
...
@@ -19,38 +26,26 @@
#' is TRUE by default.
#' @return NULL
run_quality_control_on_raw
<-
function
(
input_data_dir
,
output_data_dir
,
compressed
=
TRUE
,
phenotype_groups
=
vector
(),
verbose
=
TRUE
)
{
# We define the I/Os.
raw_data_input_dir
<-
paste0
(
input_data_dir
,
"RAW/"
)
# We load the clinical data as to annotate the AffyBatch object and make QC more useful.
pheno_data
<-
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
verbose
)
# We load the CEL files to create the affyBatch object and then attach the clinical data.
raw_file_list
<-
affy
::
list.celfiles
(
raw_data_input_dir
,
full.names
=
TRUE
)
batch
<-
affy
::
ReadAffy
(
filenames
=
raw_file_list
,
compress
=
compressed
,
verbose
=
verbose
)
Biobase
::
phenoData
(
batch
)
<-
pheno_data
# We clean up and log information.
rm
(
raw_file_list
,
pheno_data
,
raw_data_input_dir
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Expression data read."
))
}
# We run the quality control itself.
arrayQualityMetrics
::
arrayQualityMetrics
(
expressionset
=
batch
,
outdir
=
output_data_dir
,
force
=
TRUE
,
do.logtransform
=
TRUE
,
intgroup
=
phenotype_groups
)
output_data_dir
,
platform
,
compressed
=
TRUE
,
phenotype_groups
=
vector
(),
verbose
=
TRUE
)
{
# We clean up and log information.
rm
(
batch
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] QC analysis performed (raw data)."
))
# We launch the correct function depending on the array platform.
if
(
platform
==
"Affymetrix"
)
{
run_quality_control_on_raw_affymetrix
(
input_data_dir
,
output_data_dir
,
compressed
,
phenotype_groups
,
verbose
)
}
else
if
(
platform
==
"Agilent"
)
{
run_quality_control_on_raw_agilent
(
input_data_dir
,
output_data_dir
,
compressed
,
phenotype_groups
,
verbose
)
}
else
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Platform "
,
platform
,
" not yet supported (no QC done)."
))
}
}
R/run_quality_control_on_raw_affymetrix.R
0 → 100644
View file @
13644b47
#' @title Executes a quality control of a given Affymetrix microarray dataset (raw data).
#'
#' @description This function executes a quality control of the dataset defined by the input parameters.
#' It starts by loading the clinical data associated with the dataset, then
#' loads the raw data and last runs the quality control of the annotated data. The function assumes
#' that a folder with both the clinical data and the raw data exists (in a subfolder '/RAW/'). It
#' then creates a report that contains various quality indicators and is stored as an HTML document.
#' It does not return any value.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' TRUE by default.
#' @param phenotype_groups A list of phenotype factor names that can be used to highlight the
#' samples in the QC report. This is none by default.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return NULL
run_quality_control_on_raw_affymetrix
<-
function
(
input_data_dir
,
output_data_dir
,
compressed
=
TRUE
,
phenotype_groups
=
vector
(),
verbose
=
TRUE
)
{
# We define the I/Os.
raw_data_input_dir
<-
paste0
(
input_data_dir
,
"RAW/"
)
# We load the clinical data as to annotate the AffyBatch object and make QC more useful.
pheno_data
<-
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
verbose
)
# We load the CEL files to create the affyBatch object and then attach the clinical data.
raw_file_list
<-
affy
::
list.celfiles
(
raw_data_input_dir
,
full.names
=
TRUE
)
batch
<-
affy
::
ReadAffy
(
filenames
=
raw_file_list
,
compress
=
compressed
,
verbose
=
verbose
)
Biobase
::
phenoData
(
batch
)
<-
pheno_data
# We clean up and log information.
rm
(
raw_file_list
,
pheno_data
,
raw_data_input_dir
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Expression data read."
))
}
# We run the quality control itself.
arrayQualityMetrics
::
arrayQualityMetrics
(
expressionset
=
batch
,
outdir
=
output_data_dir
,
force
=
TRUE
,
do.logtransform
=
TRUE
,
intgroup
=
phenotype_groups
)
# We clean up and log information.
rm
(
batch
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] QC analysis performed (raw data)."
))
}
}
R/run_quality_control_on_raw_agilent.R
0 → 100644
View file @
13644b47
#' @title Executes a quality control of a given Agilent microarray dataset (raw data).
#'
#' @description This function executes a quality control of the dataset defined by the input parameters.
#' It ... TODO
#' It does not return any value.
#'
#' Note: the function does not check for the existence of folders or files.
#'
#' @param input_data_dir A string representing the folder that contains the input data.
#' @param output_data_dir A string representing the folder that contains the output data.
#' @param compressed A boolean representing whether the raw data are compressed or not. This is
#' TRUE by default.
#' @param phenotype_groups A list of phenotype factor names that can be used to highlight the
#' samples in the QC report. This is none by default.
#' @param verbose A boolean representing whether the function should display log information. This
#' is TRUE by default.
#' @return NULL
run_quality_control_on_raw_agilent
<-
function
(
input_data_dir
,
output_data_dir
,
compressed
=
TRUE
,
phenotype_groups
=
vector
(),
verbose
=
TRUE
)
{
# We define the I/Os.
raw_data_input_dir
<-
paste0
(
input_data_dir
,
"RAW/"
)
# We load the clinical data as to annotate the QC.
pheno_data_raw
<-
ArrayUtils
::
load_clinical_data
(
input_data_dir
,
verbose
=
verbose
)
pheno_data
<-
Biobase
::
pData
(
pheno_data_raw
)
# We load the data (aka agilent MA images).
raw_file_list
<-
list.files
(
path
=
raw_data_input_dir
)
batch
<-
limma
::
read.maimages
(
files
=
raw_file_list
,
source
=
"agilent"
,
path
=
raw_data_input_dir
,
green.only
=
TRUE
,
verbose
=
TRUE
)
batch
$
targets
<-
pheno_data
batch_log_data
=
log2
(
batch
$
E
)
# We clean up and log information.
rm
(
raw_file_list
,
raw_data_input_dir
,
pheno_data_raw
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] Expression data read."
))
}
# We do some kind of quality control (ourselves - no external packages).
dir.create
(
output_data_dir
)
# A- heatmap.
# Compute correlations.
cor_mat
<-
round
(
cor
(
batch_log_data
,
method
=
"pearson"
),
2
)
# Create the heatmap.
heatmap_filename
<-
paste0
(
output_data_dir
,
"hm.html"
)
heatmaply
::
heatmaply
(
cor_mat
,
file
=
heatmap_filename
,
showticklabels
=
c
(
FALSE
,
TRUE
),
row_side_colors
=
pheno_data
[,
1
:
2
])
dev.off
()
# We clean up.
rm
(
cor_mat
,
heatmap_filename
)
# B- PCA plots.
# Compute the PCA.
pca
<-
prcomp
(
t
(
batch_log_data
),
scale
=
FALSE
)
# Prepare the colors for the disease status and the gender.
ds_colors
<-
rep
(
heatmaply
::
RdBu
(
2
)[
1
],
dim
(
pheno_data
)[
1
])
gd_colors
<-
rep
(
heatmaply
::
PuOr
(
2
)[
1
],
dim
(
pheno_data
)[
1
])
ds_colors
[
pheno_data
$
Disease.status
==
"Control"
]
<-
heatmaply
::
RdBu
(
2
)[
2
]
gd_colors
[
pheno_data
$
Gender
==
"F"
]
<-
heatmaply
::
PuOr
(
2
)[
2
]
# Plots the first two PCs.
pca_ds_filename
<-
paste0
(
output_data_dir
,
"pca_status.png"
)
png
(
pca_ds_filename
)
pca_plot_ds
<-
ggplot2
::
autoplot
(
pca
,
colour
=
ds_colors
,
size
=
5
)
print
(
pca_plot_ds
)
dev.off
()
pca_gd_filename
<-
paste0
(
output_data_dir
,
"pca_gender.png"
)
png
(
pca_gd_filename
)
pca_plot_gd
<-
ggplot2
::
autoplot
(
pca
,
colour
=
gd_colors
,
size
=
5
)
print
(
pca_plot_gd
)
dev.off
()
# We clean up.
rm
(
pca
,
ds_colors
,
gd_colors
,
pca_ds_filename
,
pca_gd_filename
)
# C- boxplots (aka densities).
# We melt the data frame for ggplot2.
df_melted
<-
reshape2
::
melt
(
t
(
batch_log_data
))
# We plot the boxplots (one per array).
boxplot_filename
<-
paste0
(
output_data_dir
,
"bx.png"
)
png
(
boxplot_filename
)
boxplot_arrays
<-
ggplot2
::
ggplot
(
df_melted
,
ggplot2
::
aes
(
Var1
,
value
))
+
ggplot2
::
geom_boxplot
(
lwd
=
1.2
,
outlier.alpha
=
0.1
,
ggplot2
::
aes
(
colour
=
Var1
))
+
ggplot2
::
theme
(
legend.position
=
"none"
,
axis.title.x
=
ggplot2
::
element_blank
(),
panel.background
=
ggplot2
::
element_blank
(),
axis.line
=
ggplot2
::
element_line
(
colour
=
"black"
),
panel.grid.major
=
ggplot2
::
element_line
(
colour
=
"lightgrey"
),
axis.text.x
=
ggplot2
::
element_blank
())
print
(
boxplot_arrays
)
dev.off
()
# We clean up.
rm
(
df_melted
,
boxplot_filename
,
boxplot_arrays
)
# D- array images.
# We have to extract the layout of the array from the batch object.
# Basically to associate the correct value from the array (1..n) to
# the pixel (i, j).
rows
<-
batch
$
genes
$
Row
cols
<-
batch
$
genes
$
Col
nrows
<-
max
(
rows
)
ncols
<-
max
(
cols
)
array_data
<-
rep
(
NA
,
nrows
*
ncols
)
i
<-
(
rows
-
1
)
*
ncols
+
cols
# We now plot the arrays one by one.
for
(
j
in
seq_len
(
dim
(
batch_log_data
)[
2
]))
{
array_data
[
i
]
<-
batch_log_data
[
,
j
]
array_filename
<-
paste0
(
output_data_dir
,
"array_"
,
j
,
".png"
)
png
(
array_filename
)
limma
::
imageplot
(
array_data
,
main
=
rownames
(
df
)[
j
],
layout
=
list
(
ngrid.r
=
1
,
ngrid.c
=
1
,
nspot.r
=
nrows
,
nspot.c
=
ncols
))
dev.off
()
}
# We clean up.
rm
(
rows
,
cols
,
nrows
,
ncols
,
array_data
,
i
,
j
,
array_filename
)
# We clean up and log information.
rm
(
batch
,
batch_log_data
,
pheno_data
)
if
(
verbose
==
TRUE
)
{
message
(
paste0
(
"["
,
Sys.time
(),
"] QC analysis performed (raw data)."
))
}
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment