Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Armin Rauschenberger
joinet
Commits
3d09d6e4
Commit
3d09d6e4
authored
Sep 19, 2018
by
Rauschenberger
Browse files
automation
parent
49360ff7
Changes
57
Hide whitespace changes
Inline
Side-by-side
DESCRIPTION
View file @
3d09d6e4
Package:
spliceQTL
Package:
colasso
Version: 0.0.0
Title:
Alternative Splicing
Description: Implements
test for alternative splicing
.
Title:
colasso regression
Description: Implements
colasso
.
Depends: R (>= 3.0.0)
Imports:
lme4, globaltest, edgeR, snpStats, refGenome, R.utils, methods, SummarizedExperiment, vcfR
Imports:
glmnet, MASS
Suggests: knitr, testthat
Authors@R: c(person("Armin","Rauschenberger",email="a.rauschenberger@vumc.nl",role=c("aut","cre")),
person("Renee","Menezes",role=c("aut")))
Authors@R: person("Armin","Rauschenberger",email="a.rauschenberger@vumc.nl",role=c("aut","cre"))
VignetteBuilder: knitr
License: GPL-3
LazyData: true
RoxygenNote: 6.0.1
URL: https://github.com/rauschenberger/
spliceQTL
BugReports: https://github.com/rauschenberger/
spliceQTL
/issues
URL: https://github.com/rauschenberger/
colasso
BugReports: https://github.com/rauschenberger/
colasso
/issues
NAMESPACE
View file @
3d09d6e4
# Generated by roxygen2: do not edit by hand
export(adjust.samples)
export(adjust.variables)
export(drop.trivial)
export(get.exons.bbmri)
export(get.exons.geuvadis)
export(get.snps.bbmri)
export(get.snps.geuvadis)
export(grid)
export(map.exons)
export(map.genes)
export(map.snps)
export(match.samples)
export(test.multiple)
export(test.single)
export(visualise)
export(colasso)
export(colasso_compare)
export(colasso_covariate_weights)
export(colasso_marginal_significance)
export(colasso_moderate)
export(colasso_simulate)
export(colasso_weighted_correlation)
R/functions.R
View file @
3d09d6e4
#' @name spliceQTL-package
#' @md
#' @aliases spliceQTL
#'
#' @title
#'
#' Alternative Splicing
#'
#' @description
#'
#' This R package includes various functions
#' for applying the global test of alternative splicing.
#' Some functions only work on the virtual machine (see below).
#'
#' @seealso
#'
#' Prepare BBMRI and Geuvadis data:
#' * \code{\link{get.snps.geuvadis}} (not VM)
#' * \code{\link{get.snps.bbmri}} (only VM)
#' * \code{\link{get.exons.geuvadis}} (only VM)
#' * \code{\link{get.exons.bbmri}} (only VM)
#'
#' Process samples and covariates:
#' * \code{\link{match.samples}}
#' * \code{\link{adjust.samples}}
#' * \code{\link{adjust.variables}}
#'
#' Search for exons and SNPs:
#' * \code{\link{map.genes}}
#' * \code{\link{map.exons}}
#' * \code{\link{map.snps}}
#' * \code{\link{drop.trivial}}
#'
#' Test for alternative splicing:
#' * \code{\link{test.single}}
#' * \code{\link{test.multiple}}
#'
#' @keywords documentation
#' @docType package
#'
NULL
#' @export
#' @aliases colasso-package
#' @title
#'
Get SNP data (Geuvadis)
#'
colasso
#'
#' @description
#' This function transforms SNP data (local machine):
#' downloads missing genotype data from ArrayExpress,
#' transforms variant call format to binary files,
#' removes SNPs with low minor allele frequency,
#' labels SNPs in the format "chromosome:position",
#' changes sample identifiers
#' This function ...
#'
#' @param chr
#' chromosome: integer \eqn{1-22}
#'
#' @param data
#' local directory for VCF (variant call format)
#' and SDRF (sample and data relationship format) files;
#'
#' @param path
#' local directory for output
#'
#' @examples
#' path <- "C:/Users/a.rauschenbe/Desktop/spliceQTL/data"
#'
get.snps.geuvadis
<-
function
(
chr
,
data
=
NULL
,
path
=
getwd
()){
if
(
is.null
(
data
)){
data
<-
path
# download SNP data
file
<-
paste0
(
"GEUVADIS.chr"
,
chr
,
".PH1PH2_465.IMPFRQFILT_BIALLELIC_PH.annotv2.genotypes.vcf.gz"
)
url
<-
paste0
(
"http://www.ebi.ac.uk/arrayexpress/files/E-GEUV-1/genotypes/"
,
file
)
destfile
<-
file.path
(
data
,
file
)
if
(
!
file.exists
(
destfile
)){
utils
::
download.file
(
url
=
url
,
destfile
=
destfile
,
method
=
"auto"
)
}
# transform with PLINK
setwd
(
data
)
system
(
paste0
(
"plink --vcf GEUVADIS.chr"
,
chr
,
".PH1PH2_465.IMPFRQFILT_BIALLELIC_PH.annotv2.genotypes.vcf.gz"
,
" --maf 0.05 --geno 0 --make-bed --out snps"
,
chr
),
invisible
=
FALSE
)
# obtain identifiers
file
<-
"E-GEUV-1.sdrf.txt"
url
<-
paste
(
"http://www.ebi.ac.uk/arrayexpress/files/E-GEUV-1/"
,
file
,
sep
=
""
)
destfile
<-
file.path
(
data
,
file
)
if
(
!
file.exists
(
destfile
)){
utils
::
download.file
(
url
=
url
,
destfile
=
destfile
,
method
=
"auto"
)
}
}
# read into R
bed
<-
file.path
(
data
,
paste
(
"snps"
,
chr
,
".bed"
,
sep
=
""
))
bim
<-
file.path
(
data
,
paste
(
"snps"
,
chr
,
".bim"
,
sep
=
""
))
fam
<-
file.path
(
data
,
paste
(
"snps"
,
chr
,
".fam"
,
sep
=
""
))
X
<-
snpStats
::
read.plink
(
bed
=
bed
,
bim
=
bim
,
fam
=
fam
)
X
$
fam
<-
NULL
;
all
(
diff
(
X
$
map
$
position
)
>
0
)
# filter MAF
maf
<-
snpStats
::
col.summary
(
X
$
genotypes
)
$
MAF
cond
<-
maf
>=
0.05
X
$
genotypes
<-
X
$
genotypes
[,
cond
]
X
$
map
<-
X
$
map
[
cond
,]
# format
colnames
(
X
$
genotypes
)
<-
paste0
(
X
$
map
$
chromosome
,
":"
,
X
$
map
$
position
)
snps
<-
methods
::
as
(
object
=
X
$
genotypes
,
Class
=
"numeric"
)
class
(
snps
)
<-
"integer"
# change identifiers
samples
<-
utils
::
read.delim
(
file
=
file.path
(
data
,
"E-GEUV-1.sdrf.txt"
))
match
<-
match
(
rownames
(
snps
),
samples
$
Source.Name
)
rownames
(
snps
)
<-
samples
$
Comment.ENA_RUN.
[
match
]
snps
<-
snps
[
!
is.na
(
rownames
(
snps
)),]
save
(
object
=
snps
,
file
=
file.path
(
path
,
paste0
(
"Geuvadis.chr"
,
chr
,
".RData"
)))
}
#' @export
#' @title
#' Get SNP data (BBMRI)
#'
#' @description
#' This function transforms SNP data (virtual machine):
#' limits analysis to specified biobanks,
#' reads in genotype data in chunks,
#' removes SNPs with missing values (multiple biobanks/technologies),
#' removes SNPs with low minor allele frequency,
#' fuses data from multiple biobanks/technologies
#'
#' @param chr
#' chromosome: integer \eqn{1-22}
#' @param y
#' response\strong{:}
#' vector of length \eqn{n}
#'
#' @param biobank
#' character "CODAM", "LL", "LLS", "NTR", "PAN", "RS", or NULL (all)
#' @param X
#' covariates\strong{:}
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (variables)
#'
#' @param
path
#'
data directory
#' @param
nfold
#'
number of folds
#'
#' @param size
#' maximum number of SNPs to read in at once;
#' trade-off between memory usage (low) and speed (high)
#' @param alpha
#' elastic net parameter
#'
#' @examples
#' path <- "/virdir/Scratch/arauschenberger/trial"
#'
get.snps.bbmri
<-
function
(
chr
,
biobank
=
NULL
,
path
=
getwd
(),
size
=
500
*
10
^
3
){
start
<-
Sys.time
()
message
(
rep
(
"-"
,
times
=
20
),
" chromosome "
,
chr
,
" "
,
rep
(
"-"
,
times
=
20
))
p
<-
5
*
10
^
6
# (maximum number of SNPs per chromosome, before filtering)
skip
<-
seq
(
from
=
0
,
to
=
p
,
by
=
size
)
if
(
is.null
(
biobank
)){
study
<-
c
(
"CODAM"
,
"LL"
,
"LLS0"
,
"LLS1"
,
"NTR0"
,
"NTR1"
,
"PAN"
,
"RS"
)
}
else
if
(
biobank
==
"LLS"
){
study
<-
c
(
"LLS0"
,
"LLS1"
)
}
else
if
(
biobank
==
"NTR"
){
study
<-
c
(
"NTR0"
,
"NTR1"
)
}
else
if
(
biobank
%in%
c
(
"CODAM"
,
"LL"
,
"PAN"
,
"RS"
)){
study
<-
biobank
}
else
{
stop
(
"Invalid biobank."
,
call.
=
FALSE
)
}
collect
<-
matrix
(
list
(),
nrow
=
length
(
skip
),
ncol
=
length
(
study
))
colnames
(
collect
)
<-
study
for
(
i
in
seq_along
(
skip
)){
message
(
"\n"
,
"chunk "
,
i
,
": "
,
appendLF
=
FALSE
)
for
(
j
in
seq_along
(
study
)){
message
(
study
[
j
],
" "
,
appendLF
=
FALSE
)
# Locating files on virtual machine.
dir
<-
study
[
j
]
if
(
study
[
j
]
==
"LLS0"
){
dir
<-
"LLS/660Q"
}
if
(
study
[
j
]
==
"LLS1"
){
dir
<-
"LLS/OmniExpr"
}
if
(
study
[
j
]
==
"NTR0"
){
dir
<-
"NTR/Affy6"
}
if
(
study
[
j
]
==
"NTR1"
){
dir
<-
"NTR/GoNL"
}
path0
<-
file.path
(
"/mnt/virdir/Backup/RP3_data/HRCv1.1_Imputation"
,
dir
)
path1
<-
path
file0
<-
paste0
(
"chr"
,
chr
,
".dose.vcf.gz"
)
file1
<-
paste0
(
study
[
j
],
".chr"
,
chr
,
".dose.vcf.gz"
)
file2
<-
paste0
(
study
[
j
],
".chr"
,
chr
,
".dose.vcf"
)
# Reading in files.
#vcf <- vcfR::read.vcfR(file=file.path(path1,file2),skip=skip[i],nrows=size,verbose=FALSE)
vcf
<-
vcfR
::
read.vcfR
(
file
=
file.path
(
path0
,
file0
),
skip
=
skip
[
i
],
nrows
=
size
,
verbose
=
FALSE
)
vcf
<-
vcf
[
vcf
@
fix
[,
"CHROM"
]
!=
""
,]
# bug fix
vcf
@
fix
[,
"ID"
]
<-
paste0
(
vcf
@
fix
[,
"ID"
],
"_"
,
seq_len
(
dim
(
vcf
)[
"variants"
]))
collect
[
i
,
j
][[
1
]]
<-
vcf
stop
<-
dim
(
vcf
)[
"variants"
]
==
0
final
<-
dim
(
vcf
)[
"variants"
]
<
size
if
(
stop
){
break
}
}
print
(
utils
::
object.size
(
collect
),
units
=
"Gb"
)
end
<-
Sys.time
()
if
(
stop
){
break
}
#' n <- 100; p <- 20
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' #y <- rbinom(n=n,size=1,prob=0.2)
#' y <- rnorm(n=n)
#' #y[1] <- 0.5
#' #a <- glmnet::glmnet(y=y,x=x,family="binomial")
#' #b <- stats::glm(y~x,family="binomial")
colasso
<-
function
(
y
,
X
,
nfold
=
5
,
alpha
=
1
){
# properties
n
<-
nrow
(
X
);
p
<-
ncol
(
X
)
if
(
length
(
y
)
!=
n
){
stop
(
"sample size"
)}
foldid
<-
sample
(
x
=
rep
(
x
=
seq_len
(
5
),
length.out
=
n
))
pi
<-
seq
(
from
=
0
,
to
=
0.5
,
by
=
0.1
)
# adapt this
# model fitting
fit
<-
list
()
Y
<-
colasso_moderate
(
y
=
y
,
X
=
X
,
pi
=
pi
)
for
(
j
in
seq_along
(
pi
)){
fit
[[
j
]]
<-
glmnet
::
glmnet
(
y
=
Y
[,
j
],
x
=
X
,
alpha
=
alpha
)
}
# inner cross-validation
pred
<-
lapply
(
pi
,
function
(
x
)
matrix
(
data
=
NA
,
nrow
=
length
(
y
),
ncol
=
100
))
for
(
k
in
sort
(
unique
(
foldid
))){
y0
<-
y
[
foldid
!=
k
]
y1
<-
y
[
foldid
==
k
]
X0
<-
X
[
foldid
!=
k
,,
drop
=
FALSE
]
X1
<-
X
[
foldid
==
k
,,
drop
=
FALSE
]
# only retain SNPs measured in all studies
position
<-
lapply
(
seq_along
(
study
),
function
(
j
)
collect
[
i
,
j
][[
1
]]
@
fix
[,
"POS"
])
common
<-
Reduce
(
f
=
intersect
,
x
=
position
)
for
(
j
in
seq_along
(
study
)){
cond
<-
match
(
x
=
common
,
table
=
position
[[
j
]])
collect
[
i
,
j
][[
1
]]
<-
collect
[
i
,
j
][[
1
]][
cond
,]
Y0
<-
colasso_moderate
(
y
=
y0
,
X
=
X0
,
pi
=
pi
)
for
(
j
in
seq_along
(
pi
)){
glmnet
<-
glmnet
::
glmnet
(
y
=
Y0
[,
j
],
x
=
X0
,
alpha
=
alpha
)
# was lambda=lambda
temp
<-
stats
::
predict
(
object
=
glmnet
,
newx
=
X1
,
type
=
"response"
,
s
=
fit
[[
j
]]
$
lambda
)
pred
[[
j
]][
foldid
==
k
,
seq_len
(
ncol
(
temp
))]
<-
temp
}
# Calculating minor allele frequency.
num
<-
numeric
();
maf
<-
list
()
for
(
j
in
seq_along
(
study
)){
num
[
j
]
<-
dim
(
collect
[
i
,
j
][[
1
]])[
"gt_cols"
]
# replace by adjusted sample sizes?
maf
[[
j
]]
<-
num
[
j
]
*
vcfR
::
maf
(
collect
[
i
,
j
][[
1
]])[,
"Frequency"
]
}
cond
<-
rowSums
(
do.call
(
what
=
"cbind"
,
args
=
maf
))
/
sum
(
num
)
>
0.05
if
(
sum
(
cond
)
==
0
){
if
(
final
){
break
}
else
{
next
}}
# Extracting genotypes.
for
(
j
in
seq_along
(
study
)){
gt
<-
vcfR
::
extract.gt
(
collect
[
i
,
j
][[
1
]][
cond
,])
gt
[
gt
==
"0|0"
]
<-
0
gt
[
gt
==
"0|1"
|
gt
==
"1|0"
]
<-
1
gt
[
gt
==
"1|1"
]
<-
2
storage.mode
(
gt
)
<-
"integer"
collect
[
i
,
j
][[
1
]]
<-
gt
}
if
(
final
){
break
}
}
# Removing empty rows.
cond
<-
apply
(
collect
,
1
,
function
(
x
)
all
(
sapply
(
x
,
length
)
==
0
))
collect
<-
collect
[
!
cond
,,
drop
=
FALSE
]
# Fusing all matrices.
snps
<-
NULL
for
(
i
in
seq_len
(
nrow
(
collect
))){
inner
<-
NULL
for
(
j
in
seq_len
(
ncol
(
collect
))){
add
<-
collect
[
i
,
j
][[
1
]]
colnames
(
add
)
<-
paste0
(
colnames
(
collect
)[
j
],
":"
,
colnames
(
add
))
inner
<-
cbind
(
inner
,
add
)
}
snps
<-
rbind
(
snps
,
inner
)
}
attributes
(
snps
)
$
time
<-
end
-
start
rownames
(
snps
)
<-
sapply
(
strsplit
(
x
=
rownames
(
snps
),
split
=
"_"
),
function
(
x
)
x
[[
1
]])
snps
<-
t
(
snps
)
# Filter samples.
rownames
(
snps
)
<-
sub
(
x
=
rownames
(
snps
),
pattern
=
"LLS0|LLS1"
,
replacement
=
"LLS"
)
rownames
(
snps
)
<-
sub
(
x
=
rownames
(
snps
),
pattern
=
"NTR0|NTR1"
,
replacement
=
"NTR"
)
if
(
is.null
(
biobank
)){
save
(
object
=
snps
,
file
=
file.path
(
path1
,
paste0
(
"BBMRI.chr"
,
chr
,
".RData"
)))
}
else
{
save
(
object
=
snps
,
file
=
file.path
(
path1
,
paste0
(
biobank
,
".chr"
,
chr
,
".RData"
)))
# loss sequence
for
(
i
in
seq_along
(
pi
)){
fit
[[
i
]]
$
cvm
<-
apply
(
X
=
pred
[[
i
]],
MARGIN
=
2
,
FUN
=
function
(
x
)
mean
((
x
-
y
)
^
2
))
fit
[[
i
]]
$
lambda.min
<-
fit
[[
i
]]
$
lambda
[
which.min
(
fit
[[
i
]]
$
cvm
)]
}
}
#' @export
#' @title
#' Get exon data (Geuvadis)
#'
#' @description
#' This function transforms exon data (virtual machine):
#' retains exons on the autosomes,
#' labels exons in the format "chromosome_start_end",
#' extracts corresponding gene names
#'
#' @param path
#' data directory
#'
#' @examples
#' path <- "/virdir/Scratch/arauschenberger/trial"
#'
get.exons.geuvadis
<-
function
(
path
=
getwd
()){
nrows
<-
303544
file
<-
"/virdir/Scratch/rmenezes/data_counts.txt"
exons
<-
utils
::
read.table
(
file
=
file
,
header
=
TRUE
,
nrows
=
nrows
)
exons
<-
exons
[
exons
[,
"chr"
]
%in%
1
:
22
,]
# autosomes
rownames
(
exons
)
<-
exon_id
<-
paste0
(
exons
[,
"chr"
],
"_"
,
exons
[,
"start"
],
"_"
,
exons
[,
"end"
])
gene_id
<-
as.character
(
exons
[,
4
])
exons
<-
t
(
exons
[,
-
c
(
1
:
4
)])
save
(
list
=
c
(
"exons"
,
"exon_id"
,
"gene_id"
),
file
=
file.path
(
path
,
"Geuvadis.exons.RData"
))
}
#' @export
#' @title
#' Get exon data (BBMRI)
#'
#' @description
#' This function transforms exon data (virtual machine):
#' loads quality controlled gene expression data,
#' extracts sample identifiers,
#' removes samples without SNP data,
#' loads exon expression data,
#' extracts sample identifiers,
#' retains samples that passed quality control,
#' retains exons on the autosomes
#'
#' @param path
#' data directory
#'
#' @examples
#' path <- "/virdir/Scratch/arauschenberger/trial"
#'
get.exons.bbmri
<-
function
(
path
=
getwd
()){
# sample identifiers:
# (1) loading quality controlled gene expression data
# (2) extracting sample identifiers
# (3) removing identifiers without SNP data
# (4) translating identifiers
utils
::
data
(
rnaSeqData_ReadCounts_BIOS_cleaned
,
package
=
"BBMRIomics"
)
# (1)
cd
<-
SummarizedExperiment
::
colData
(
counts
)[,
c
(
"biobank_id"
,
"imputation_id"
,
"run_id"
)]
# (2)
counts
<-
NULL
names
(
cd
)
<-
substr
(
names
(
cd
),
start
=
1
,
stop
=
3
)
# abbreviate names
cd
<-
cd
[
!
is.na
(
cd
$
imp
),]
# (3)
cd
$
id
<-
NA
# (4)
cd
$
id
[
cd
$
bio
==
"CODAM"
]
<-
sapply
(
strsplit
(
x
=
cd
$
imp
[
cd
$
bio
==
"CODAM"
],
split
=
"_"
),
function
(
x
)
x
[[
2
]])
cd
$
id
[
cd
$
bio
==
"LL"
]
<-
sub
(
pattern
=
"1_LLDeep_"
,
replacement
=
""
,
x
=
cd
$
imp
[
cd
$
bio
==
"LL"
])
cd
$
id
[
cd
$
bio
==
"LLS"
]
<-
sapply
(
strsplit
(
x
=
cd
$
imp
[
cd
$
bio
==
"LLS"
],
split
=
"_"
),
function
(
x
)
x
[[
2
]])
cd
$
id
[
cd
$
bio
==
"NTR"
]
<-
sapply
(
strsplit
(
x
=
cd
$
imp
[
cd
$
bio
==
"NTR"
],
split
=
"_"
),
function
(
x
)
x
[[
2
]])
cd
$
id
[
cd
$
bio
==
"PAN"
]
<-
cd
$
imp
[
cd
$
bio
==
"PAN"
]
cd
$
id
[
cd
$
bio
==
"RS"
]
<-
sub
(
pattern
=
"RS1_|RS2_|RS3_"
,
replacement
=
""
,
x
=
cd
$
imp
[
cd
$
bio
==
"RS"
])
# Identify individual not with "id" but with "bio:id".
any
(
duplicated
(
cd
$
id
))
# TRUE
sapply
(
unique
(
cd
$
bio
),
function
(
x
)
any
(
duplicated
(
cd
$
id
[
x
])))
# FALSE
# selection
cvm
<-
sapply
(
fit
,
function
(
x
)
x
$
cvm
[
which
(
x
$
lambda
==
x
$
lambda.min
)])
sel
<-
which.min
(
cvm
)
# BUT "sel <- which.max(cvm)" FOR AUC!
fit
[[
length
(
pi
)
+1
]]
<-
fit
[[
sel
]]
# exon data:
# (1) loading exon expression data
# (2) extracting sample identifiers
# (3) retaining autosomes
# (4) retaining samples from above
load
(
"/virdir/Backup/RP3_data/RNASeq/v2.1.3/exon_base/exon_base_counts.RData"
)
# (1)
colnames
(
counts
)
<-
sub
(
pattern
=
".exon.base.count.gz"
,
replacement
=
""
,
x
=
colnames
(
counts
))
# (2)
autosomes
<-
sapply
(
strsplit
(
x
=
rownames
(
counts
),
split
=
"_"
),
function
(
x
)
x
[[
1
]]
%in%
1
:
22
)
# (3)
exons
<-
counts
[
autosomes
,
cd
$
run
]
# (3) and (4)
exon_id
<-
exon_id
[
autosomes
]
# (3)
gene_id
<-
gene_id
[
autosomes
]
# (3)
colnames
(
exons
)
<-
paste0
(
cd
$
bio
,
":"
,
cd
$
id
)
exons
<-
t
(
exons
)
save
(
list
=
c
(
"exons"
,
"exon_id"
,
"gene_id"
),
file
=
file.path
(
path
,
"BBMRI.exons.RData"
))
#graphics::plot(cvm); graphics::abline(v=sel,lty=2)
names
(
fit
)
<-
c
(
"standard"
,
paste0
(
"pi"
,
pi
[
-1
]),
"select"
)
return
(
fit
)
}
#' @export
#' @title
#' Prepare data matrices
#'
#' @description
#' This function removes duplicate samples from each matrix,
#' only retains samples appearing in all matrices,
#' and brings samples into the same order.
#'
#' @param ...
#' matrices with samples in the rows and variables in the columns,
#' with sample identifiers as rows names
#'
#' @param message
#' display messages\strong{:} logical
#'
#' @examples
#' X <- matrix(rnorm(6),nrow=3,ncol=2,dimnames=list(c("A","B","C")))
#' Z <- matrix(rnorm(9),nrow=3,ncol=3,dimnames=list(c("B","A","B")))
#' match.samples(X,Z)
#'
match.samples
<-
function
(
...
,
message
=
TRUE
){
list
<-
list
(
...
)
if
(
length
(
list
)
==
1
&
is.list
(
list
[[
1
]])){
list
<-
list
[[
1
]]}
if
(
is.null
(
names
(
list
))){
names
(
list
)
<-
sapply
(
substitute
(
list
(
...
))[
-1
],
deparse
)
}
names
<-
names
(
list
)
# check input
cond
<-
sapply
(
list
,
function
(
x
)
!
is.matrix
(
x
))
if
(
any
(
cond
)){
stop
(
"Provide matrices!"
,
call.
=
FALSE
)
}
cond
<-
sapply
(
list
,
function
(
x
)
is.null
(
rownames
(
x
)))
if
(
any
(
cond
)){
stop
(
"Provide row names!"
,
call.
=
FALSE
)
}
# remove duplicated samples
duplic
<-
lapply
(
list
,
function
(
x
)
duplicated
(
rownames
(
x
)))
for
(
i
in
seq_along
(
list
)){
number
<-
round
(
100
*
mean
(
duplic
[[
i
]]))
if
(
message
){
message
(
number
,
" duplicates in \""
,
names
[
i
],
"\""
)}
list
[[
i
]]
<-
list
[[
i
]][
!
duplic
[[
i
]],,
drop
=
FALSE
]
}
# retain overlapping samples
all
<-
Reduce
(
f
=
intersect
,
x
=
lapply
(
list
,
rownames
))
for
(
i
in
seq_along
(
list
)){
percent
<-
round
(
100
*
mean
(
rownames
(
list
[[
i
]])
%in%
all
))
if
(
message
){
message
(
percent
,
"% overlap in \""
,
names
[
i
],
"\""
)}
list
[[
i
]]
<-
list
[[
i
]][
all
,,
drop
=
FALSE
]
}
# check output
cond
<-
sapply
(
list
,
function
(
x
)
any
(
duplicated
(
rownames
(
x
))))
if
(
any
(
cond
)){
stop
(
"Duplicate samples!"
,
call.
=
FALSE
)
}
cond
<-
sapply
(
list
,
function
(
x
)
nrow
(
x
)
!=
nrow
(
list
[[
1
]]))
if
(
any
(
cond
)){
stop
(
"Different sample sizes!"
,
call.
=
FALSE
)
}
cond
<-
sapply
(
list
,
function
(
x
)
any
(
rownames
(
x
)
!=
rownames
(
list
[[
1
]])))
if
(
any
(
cond
)){
stop
(
"Different sample names!"
,
call.
=
FALSE
)
}
return
(
list
)
}
#' @export
#' @title
#' Adjust library sizes
#'
#' @description
#' This function adjusts RNA-seq expression data for different library sizes.
#'
#' @param x
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (variables)
#'
#' @examples
#' n <- 5; p <- 10
#' x <- matrix(rnbinom(n=n*p,mu=5,size=1/0.5),nrow=n,ncol=p)
#' x[1,] <- 10*x[1,]
#' adjust.samples(x)
#'
adjust.samples
<-
function
(
x
){
if
(
!
is.matrix
(
x
)){
stop
(
"no matrix argument"
,
call.
=
FALSE
)
}
if
(
!
is.numeric
(
x
)){
stop
(
"no numeric argument"
,
call.
=
FALSE
)
}
if
(
!
is.integer
(
x
)
&&
any
(
round
(
x
)
!=
x
)){
warning
(
"non-integer values"
,
call.
=
FALSE
)
}
if
(
any
(
x
<
0
)){
warning
(
"negative values"
,
call.
=
FALSE
)
}
n
<-
nrow
(
x
);
p
<-
ncol
(
x
)
lib.size
<-
rowSums
(
x
)
norm.factors
<-
edgeR
::
calcNormFactors
(
object
=
t
(
x
),
lib.size
=
lib.size
)
gamma
<-
norm.factors
*
lib.size
/
mean
(
lib.size
)
gamma
<-
matrix
(
gamma
,
nrow
=
n
,
ncol
=
p
,
byrow
=
FALSE
)
x
<-
x
/
gamma
return
(
x
)
}
#' @export
#' @title
#' Adjust exon length
#'
#' @description
#' This function adjusts exon expression data for different exon lengths.
#'
#' @param x
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (exons)
#'
#' @param offset
#' exon length\strong{:} vector of length \eqn{p}
#'
#' @param group
#' gene names\strong{:} vector of length \eqn{p}
#'
#' @examples
#' NA
#'
adjust.variables
<-
function
(
x
,
offset
,
group
){
if
(
!
is.numeric
(
x
)
|!
is.matrix
(
x
)){
stop
(
"Argument \"x\" is no numeric matrix."
,
call.
=
FALSE
)
}
if
(
!
is.numeric
(
offset
)
|!
is.vector
(
offset
)){
stop
(
"Argument \"offset\" is no numeric vector."
,
call.
=
FALSE
)
}