Gitlab is now using https://gitlab.lcsb.uni.lu as it's primary address. Please update your bookmarks. FAQ.

Commit c9b0ed92 authored by Rauschenberger's avatar Rauschenberger
Browse files

initialise

parents
^Readme\.Rmd$
^\.travis\.yml$
^_pkgdown\.yml$
^docs$
^cran-comments\.md$
^appveyor\.yml$
# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
language: R
os:
- linux
sudo: false
cache: packages
use_bioc: true
bioc_required: true
r:
- devel
r_packages:
- covr
after_success:
- Rscript -e 'library(covr); codecov()'
Package: spliceQTL
Version: 0.0.0
Title: Alternative Splicing
Description: Implements test for alternative splicing.
Depends: R (>= 3.0.0)
Imports: lme4, globaltest, edgeR, snpStats, refGenome, R.utils, methods
Suggests: knitr, testthat
Authors@R: c(person("Armin","Rauschenberger",email="a.rauschenberger@vumc.nl",role=c("aut","cre")),
person("Renee","Menezes",role=c("aut")))
VignetteBuilder: knitr
License: GPL-3
LazyData: true
RoxygenNote: 6.0.1
URL: https://github.com/rauschenberger/spliceQTL
BugReports: https://github.com/rauschenberger/spliceQTL/issues
# Generated by roxygen2: do not edit by hand
export(adjust.exon.length)
export(adjust.library.sizes)
export(drop.trivial.genes)
export(map.exons)
export(map.genes)
export(map.snps)
export(prepare.data.matrices)
export(spliceQTL)
## spliceQTL 0.0.0 (2018-06-12)
* Added functions
\ No newline at end of file
#' @export
#' @title
#' Prepare data matrices
#'
#' @description
#' This function removes duplicate samples, retains overlapping samples, and
#' orders samples.
#'
#' @param Y
#' matrix with \eqn{n_y} rows (samples) and \eqn{p_y} columns (exons)
#'
#' @param X
#' matrix with \eqn{n_x} rows (samples) and \eqn{p_x} columns (SNPs)
#'
#' @examples
#' NA
#'
prepare.data.matrices <- function(Y,X){
# check input
if(!is.matrix(Y)|!is.matrix(X)){
stop("Provide X and Y as matrices!",call.=FALSE)
}
if(is.null(rownames(Y))|is.null(rownames(X))){
stop("Missing sample names!",call.=FALSE)
}
# remove duplicate samples
dup_y <- duplicated(rownames(Y))
dup_x <- duplicated(rownames(X))
message("Duplicates: removing ",round(mean(dup_y),2),"% of Y.")
message("Duplicates: removing ",round(mean(dup_x),2),"% of X.")
Y <- Y[!dup_y,]
X <- X[!dup_x,]
# retain overlapping samples
both <- intersect(x=rownames(Y),y=rownames(X))
message("Overlap: retaining ",round(mean(rownames(Y) %in% both),2),"% of Y.")
message("Overlap: retaining ",round(mean(rownames(X) %in% both),2),"% of X.")
Y <- Y[both,]
X <- X[both,]
# check output
if(any(duplicated(rownames(Y))) | any(duplicated(rownames(X)))){
stop("Duplicate samples!",call.=FALSE)
}
if(nrow(Y)!=nrow(X)){
stop("Different sample sizes!",call.=FALSE)
}
if(any(rownames(Y)!=rownames(X))){
stop("Different sample names!",call.=FALSE)
}
return(list(Y=Y,X=X))
}
#' @export
#' @title
#' Adjust library sizes
#'
#' @description
#' This function adjusts exon expression data for different library sizes.
#'
#' @param y
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (exons)
#'
#' @examples
#' NA
#'
adjust.library.sizes <- function(y){
n <- nrow(y); p <- ncol(y)
lib.size <- colSums(y)
norm.factors <- edgeR::calcNormFactors(object=y,lib.size=lib.size)
gamma <- norm.factors*lib.size/mean(lib.size)
gamma <- matrix(gamma,nrow=p,ncol=n,byrow=TRUE)
return(y/gamma)
}
#' @export
#' @title
#' Adjust exon length
#'
#' @description
#' This function adjusts exon expression data for different exon lengths.
#'
#' @param y
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (exons)
#'
#' @param gene
#' gene (not exon) names\strong{:} vector of length \eqn{p}
#'
#' @param start
#' exon start positions\strong{:} vector of length \eqn{p}
#'
#' @param end
#' exon end positions\strong{:} vector of length \eqn{p}
#'
#' @details
#' No information on chromosomes required.
#'
#' @examples
#' NA
#'
adjust.exon.length <- function(y,gene,start,end){
n <- nrow(y); p <- ncol(y); names <- dimnames(y)
y <- as.numeric(y)
x <- rep(end-start,times=n)
gene <- rep(gene,times=n)
lmer <- lme4::lmer(y ~ x + (1|gene))
y <- matrix(stats::residuals(lmer),nrow=p,ncol=n,dimnames=names)
return(y - min(y))
}
#' @export
#' @title
#' Search for genes
#'
#' @description
#' This function retrieves all genes on a chromosome.
#'
#' @param chr
#' chromosome\strong{:} integer 1-22
#'
#' @param path
#' path to gene transfer format files (.gtf)
#'
#' @param release
#' character "NCBI36", "GRCh37", or "GRCh38"
#'
#' @param build
#' integer 49-91
#'
#' @details
#' This functions ...
#'
#' @examples
#' NA
#'
map.genes <- function(chr,path=getwd(),release="GRCh37",build="71"){
file <- paste0("Homo_sampiens.",release,".",build,".gtf")
if(!file.exists(file)){
url <- paste0("ftp://ftp.ensembl.org/pub/release-",build,
"/gtf/homo_sapiens/",file,".gz")
destfile <- file.path(path,paste0(file,".gz"))
utils::download.file(url=url,destfile=destfile,method="auto")
R.utils::gunzip(filename=destfile,remove=FALSE,overwrite=TRUE)
}
object <- refGenome::ensemblGenome()
refGenome::basedir(object) <- path
refGenome::read.gtf(object,filename=file)
genes <- refGenome::getGenePositions(object=object,by="gene_id")
genes <- genes[genes$seqid==chr & genes$gene_biotype=="protein_coding",]
genes <- genes[,c("gene_id","seqid","start","end")]
colnames(genes)[colnames(genes)=="seqid"] <- "chr"
return(genes)
}
#' @export
#' @title
#' Search for exons
#'
#' @description
#' This function
#'
#' @param gene_id
#' gene names\strong{:} vector with one entry per gene
#'
#' @param exon_id
#' exon names\strong{:} vector with one entry per exon
#'
#' @details
#' The exon names should contain the gene names. For each gene, this function
#' returns the indices of the exons.
#'
#' @examples
#' NA
#'
map.exons <- function(gene_id,exon_id){
if(length(gene_id)!=length(exon_id)){stop("Invalid.",call.=FALSE)}
p <- length(gene_id)
exons <- list()
pb <- utils::txtProgressBar(min=0,max=p,style=3)
for(i in seq_len(p)){
utils::setTxtProgressBar(pb=pb,value=i)
which <- as.integer(grep(pattern=gene_id[i],x=exon_id)) # Why not "=="?
exons[[i]] <- which
}
return(exons)
}
#' @export
#' @title
#' Search for SNPs
#'
#' @description
#' This function
#'
#' @param gene.chr
#' chromosome\strong{:}
#' numeric vector with one entry per gene
#'
#' @param gene.start
#' start position\strong{:}
#' numeric vector with one entry per gene
#'
#' @param gene.end
#' end position\strong{:}
#' numeric vector with one entry per gene
#'
#' @param snp.chr
#' integer 1-22
#'
#' @param snp.pos
#' chromosomal position of SNPs\strong{:}
#' numeric vector with one entry per SNP
#'
#' @details
#' This functions ...
#'
#' @examples
#' NA
#'
map.snps <- function(gene.chr,gene.start,gene.end,snp.chr,snp.pos){
if(length(gene.chr)!=length(gene.start)|length(gene.chr)!=length(gene.end)){
stop("Invalid.",call.=FALSE)
}
p <- length(gene.start)
snps <- data.frame(from=integer(length=p),to=integer(length=p))
pb <- utils::txtProgressBar(min=0,max=p,style=3)
for(i in seq_len(p)){ #
utils::setTxtProgressBar(pb=pb,value=i)
chr <- snp.chr == gene.chr[i]
if(!any(chr)){next}
start <- snp.pos >= gene.start[i] - 1*10^3
end <- snp.pos <= gene.end[i] + 0
which <- as.integer(which(chr & start & end))
if(length(which)==0){next}
snps$from[i] <- min(which)
snps$to[i] <- max(which)
if(length(which)==1){next}
if(!all(diff(which)==1)){stop("SNPs are in wrong order!")}
}
return(snps)
}
#' @export
#' @title
#' Drop "trivial" genes
#'
#' @description
#' This function
#'
#' @param map
#' list with names "genes", "exons", and "snps"
#' (output from \code{map.genes}, \code{map.exons}, and \code{map.snps})
#'
#' @details
#' This functions drops genes without SNPs or with a single exon.
#'
#' @examples
#' NA
#'
drop.trivial.genes <- function(map){
p <- length(map$genes)
pass <- rep(NA,times=p)
pb <- utils::txtProgressBar(min=0,max=p,style=3)
for(i in seq_len(p)){ # seq_len(p)
utils::setTxtProgressBar(pb=pb,value=i)
ys <- map$exons[[i]]
check <- logical()
# Exclude genes without SNPs:
check[1] <- map$snps$from[i] > 0
check[2] <- map$snps$to[i] > 0
# Exclude genes with single exon:
check[3] <- length(ys) > 1
pass[i] <- all(check)
}
map$genes <- map$genes[pass,]
map$exons <- map$exons[pass]
map$snps <- map$snps[pass,]
return(map)
}
#' @export
#' @title
#' Conduct tests
#'
#' @description
#' This function
#'
#' @param Y
#' exon expression\strong{:}
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (exons)
#'
#' @param X
#' SNP genotype\strong{:}
#' matrix with \eqn{n} rows (samples) and \eqn{q} columns (SNPs)
#'
#' @param map
#' list with names "genes", "exons", and "snps"
#' (output from \code{map.genes}, \code{map.exons}, and \code{map.snps})
#'
#' @param i
#' gene index\strong{:}
#' integer between \eqn{1} and \code{nrow(map$genes)}
#'
#' @param limit
#' cutoff for rounding \code{p}-values
#'
#' @param steps
#' size of permutation chunks\strong{:}
#' integer vector
#'
#' @details
#' The maximum number of permutations equals \code{sum(steps)}. Permutations is
#' interrupted if at least \code{limit} test statistics for the permuted data
#' are larger than the test statistic for the observed data.
#'
#' @examples
#' NA
#'
spliceQTL <- function(Y,X,map,i,limit,steps){
### extraction ###
ys <- map$exons[[i]]
y <- list()
y$norm <- t(as.matrix(Y$norm[ys,,drop=FALSE]))
y$counts <- t(Y$counts[ys,,drop=FALSE])
xs <- seq(from=map$snps$from[i],to=map$snps$to[i],by=1)
x <- methods::as(X$genotypes[,xs],"numeric")
pvalue <- list()
### level 1: spliceQTL test ###
for(type in c("norm","counts")){
for(rho in c(0,0.2,0.5,1)){
tstat <- G2.multin(dep.data=y[[type]],indep.data=x,
nperm=steps[1]-1,rho=rho)$Sg
for(nperm in steps[-1]){
tstat <- c(tstat,G2.multin(dep.data=y[[type]],indep.data=x,
nperm=nperm,rho=rho)$Sg[-1])
if(sum(tstat >= tstat[1]) >= limit){break}
}
pvalue[[paste(type,rho,sep="")]] <-
mean(tstat >= tstat[1],na.rm=TRUE)
}
}
#### level 2: global test ###
pvalue$global <- rep(NA,times=length(ys))
for(j in seq_along(ys)){
gt <- globaltest::gt(response=y$norm[,j],alternative=x)
pvalue$global[j] <- globaltest::p.value(gt)
}
### level 3: pairwise test ###
pvalue$pair <- matrix(NA,nrow=length(ys),ncol=length(xs))
for(j in seq_along(ys)){
for(k in seq_along(xs)){
lm <- stats::lm(y$norm[,j] ~ x[,k])
pvalue$pair[j,k] <- summary(lm)$coef["x[, k]","Pr(>|t|)"]
}
}
return(pvalue)
}
#--- spliceQTL test functions --------------------------------------------------
# Function: G2.multin
# This is to compute the G2 test statistic under the assumption that the response follows a multinomial distribution
### Input
### dep data and indep data with samples on the rows and genes on the columns
### grouping: Either a logical value = F or a matrix with a single column and same number of rows as samples.
### Column name should be defined.
### Contains clinical information of the samples.
### Should have two groups only.
### nperm : number of permutations
### rho: the null correlation between SNPs
### mu: the null correlation between observations corresponding to different exons and different individuals
### Output
### A list containing G2 p.values and G2 test statistics
### Example : G2T = G2(dep.data = cgh, indep.data = expr, grouping=F, stand=TRUE, nperm=1000)
### G2 p.values : G2T$G2p
### G2 TS : G2T$$Sg
G2.multin <- function(dep.data,indep.data,stand = TRUE, nperm=100,grouping=F,rho=0,mu=0){
nperm = nperm
## check for the number of samples in dep and indep data
if (nrow(dep.data)!=nrow(indep.data)){
cat("number of samples not same in dep and indep data","\n")
}
if(any(abs(rho)>1)){
cat("correlations rho larger than abs(1) are not allowed")
}
nresponses <- ncol(dep.data)
ncovariates <- ncol(indep.data)
### centering and standardizing the data are not done in this case
# dep.data = scale(dep.data,center=T,scale=stand)
# indep.data = scale(indep.data,center=T,scale=stand)
#### No grouping of the samples.
## Calculate U=(I-H)Y and UU', where Y has observations on rows; also tau.mat=X*W.rho*X',
## where X has observations on rows and variables on columns
## and W.rho = I + rho*(J-I), a square matrix with as many rows as columns in X
## NOTE: this formulation uses X with n obs on the rows and m covariates no the columns, so it is the transpose of the first calculations
nsamples <- nrow(dep.data)
n.persample <- rowSums(dep.data)
n.all <- sum(dep.data)
H <- (1/n.all)*matrix( rep(n.persample,each=nsamples),nrow=nsamples,byrow=T)
U <- (diag(rep(1,nsamples)) - H) %*% dep.data
## Now we may have a vector of values for rho - so we define tau.mat as an array, with the 3rd index corresponding to the value of rho
tau.mat <- array(0,dim=c(nsamples,nsamples,length(rho)))
for(xk in 1:length(rho))
{
if (rho[xk]==0) { tau.mat[,,xk] <- tcrossprod(indep.data) }
else { w.rho <- diag(rep(1,ncovariates)) + rho[xk]*(tcrossprod(rep(1,ncovariates)) -diag(rep(1,ncovariates)) )
tau.mat[,,xk] <- indep.data %*% w.rho %*% t(indep.data)}
}
######################################
### NOTES ARMIN START ################
# all(X %*% t(X) == tau.mat[,,1]) # rho = 0 -> TRUE
# all(X %*% (t(X) %*% X) %*% t(X) == tau.mat[,,1]) # rho = 1
# plot(as.numeric(X %*% (t(X) %*% X) %*% t(X)),as.numeric(tau.mat[,,1]))
### NOTES ARMIN END ##################
######################################
samp_names = 1:nsamples ## this was rownames(indep.data), but I now do this so that rownames do not have to be added to the array tau.mat
Sg = get.g2stat.multin(U,mu=mu,rho=rho,tau.mat)
### now we will have a vector as result, with one value per combination of values of rho and mu
#
### G2
### Permutations
# When using permutations: only the rows of tau.mat are permuted
# To check how the permutations can be efficiently applied, see tests_permutation_g2_multin.R
perm_samp = matrix(0, nrow=nrow(indep.data), ncol=nperm) ## generate the permutation matrix
for(i in 1:ncol(perm_samp)){
perm_samp[,i] = samp_names[sample(1:length(samp_names),length(samp_names))]
}
## permutation starts - recompute tau.mat (or recompute U each time)
for (perm in 1:nperm){
tau.mat.perm = tau.mat[perm_samp[,perm],,,drop=FALSE] # permute rows
tau.mat.perm = tau.mat.perm[,perm_samp[,perm],,drop=FALSE] # permute columns
Sg = c(Sg, get.g2stat.multin(U, mu=mu,rho=rho,tau.mat.perm) )
}
########################################################################
#### G2 test statistic
# *** recompute for a vector of values for each case - just reformat the result with as many rows as permutations + 1,
# and as many columns as combinations of values of rho and mu
Sg = matrix(Sg,nrow=nperm+1,ncol=length(mu)*length(rho))
colnames(Sg) <- paste(rep("rho",ncol(Sg)),rep(1:length(rho),each=length(mu)),rep("mu",ncol(Sg)),rep(1:length(mu),length(rho)) )
### Calculte G2 pval
G2p = apply(Sg,2,get.pval.percol)
return (list(perm = perm_samp,G2p = G2p,Sg = Sg))
}
# Function: get.g2stat.multin
# Computes the G2 test statistic given two data matrices, under a multinomial distribution
# This is used internally by the G2 function
# Inputs:
# U = (I-H)Y, a n*K matrix where n=number obs and K=number multinomial responses possible
# tau.mat = X' W.rho X, a n*n matrix : both square, symmetric matrices with an equal number of rows
# Output: test statistic (single value)
#
get.g2stat.multin <- function(U, mu, rho, tau.mat)
{
g2tstat <- NULL
for(xk in 1:length(rho))
{
for(xj in 1:length(mu))
{
if(mu[xj]==0) { g2tstat <- c(g2tstat, sum( diag( tcrossprod(U) %*% tau.mat[,,xk] ) ) )
} else {
g2tstat <- c(g2tstat, (1-mu[xj])*sum(diag( tcrossprod(U) %*% tau.mat[,,xk] ) ) + mu[xj]*sum( t(U) %*% tau.mat[,,xk] %*% U ) )
}
}
}
g2tstat
}
# Function: get.pval.percol
# This function takes a vector containing the observed test stat as the first entry, followed by values generated by permutation,
# and computed the estimated p-value
# Input
# x: a vector with length nperm+1
# Output
# the pvalue computed
get.pval.percol <- function(x){
pval = mean(x[1]<= c(Inf , x[2:length(x)]))
pval
}
---
output: github_document
---
<!-- Modify xxx.Rmd, not xxx.md! -->
```{r, echo = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.path = "README-"
)
```
## Scope
spliceQTL
## Installation
```{r,eval=FALSE}
#install.packages("devtools")
devtools::install_github("rauschenberger/spliceQTL")
```
## Reference
<!-- Modify xxx.Rmd, not xxx.md! -->
## Scope