Commit 3d09d6e4 authored by Rauschenberger's avatar Rauschenberger
Browse files

automation

parent 49360ff7
Package: spliceQTL
Package: colasso
Version: 0.0.0
Title: Alternative Splicing
Description: Implements test for alternative splicing.
Title: colasso regression
Description: Implements colasso.
Depends: R (>= 3.0.0)
Imports: lme4, globaltest, edgeR, snpStats, refGenome, R.utils, methods, SummarizedExperiment, vcfR
Imports: glmnet, MASS
Suggests: knitr, testthat
Authors@R: c(person("Armin","Rauschenberger",email="a.rauschenberger@vumc.nl",role=c("aut","cre")),
person("Renee","Menezes",role=c("aut")))
Authors@R: person("Armin","Rauschenberger",email="a.rauschenberger@vumc.nl",role=c("aut","cre"))
VignetteBuilder: knitr
License: GPL-3
LazyData: true
RoxygenNote: 6.0.1
URL: https://github.com/rauschenberger/spliceQTL
BugReports: https://github.com/rauschenberger/spliceQTL/issues
URL: https://github.com/rauschenberger/colasso
BugReports: https://github.com/rauschenberger/colasso/issues
# Generated by roxygen2: do not edit by hand
export(adjust.samples)
export(adjust.variables)
export(drop.trivial)
export(get.exons.bbmri)
export(get.exons.geuvadis)
export(get.snps.bbmri)
export(get.snps.geuvadis)
export(grid)
export(map.exons)
export(map.genes)
export(map.snps)
export(match.samples)
export(test.multiple)
export(test.single)
export(visualise)
export(colasso)
export(colasso_compare)
export(colasso_covariate_weights)
export(colasso_marginal_significance)
export(colasso_moderate)
export(colasso_simulate)
export(colasso_weighted_correlation)
#' @name spliceQTL-package
#' @md
#' @aliases spliceQTL
#'
#' @title
#'
#' Alternative Splicing
#'
#' @description
#'
#' This R package includes various functions
#' for applying the global test of alternative splicing.
#' Some functions only work on the virtual machine (see below).
#'
#' @seealso
#'
#' Prepare BBMRI and Geuvadis data:
#' * \code{\link{get.snps.geuvadis}} (not VM)
#' * \code{\link{get.snps.bbmri}} (only VM)
#' * \code{\link{get.exons.geuvadis}} (only VM)
#' * \code{\link{get.exons.bbmri}} (only VM)
#'
#' Process samples and covariates:
#' * \code{\link{match.samples}}
#' * \code{\link{adjust.samples}}
#' * \code{\link{adjust.variables}}
#'
#' Search for exons and SNPs:
#' * \code{\link{map.genes}}
#' * \code{\link{map.exons}}
#' * \code{\link{map.snps}}
#' * \code{\link{drop.trivial}}
#'
#' Test for alternative splicing:
#' * \code{\link{test.single}}
#' * \code{\link{test.multiple}}
#'
#' @keywords documentation
#' @docType package
#'
NULL
#' @export
#' @aliases colasso-package
#' @title
#' Get SNP data (Geuvadis)
#' colasso
#'
#' @description
#' This function transforms SNP data (local machine):
#' downloads missing genotype data from ArrayExpress,
#' transforms variant call format to binary files,
#' removes SNPs with low minor allele frequency,
#' labels SNPs in the format "chromosome:position",
#' changes sample identifiers
#' This function ...
#'
#' @param chr
#' chromosome: integer \eqn{1-22}
#'
#' @param data
#' local directory for VCF (variant call format)
#' and SDRF (sample and data relationship format) files;
#'
#' @param path
#' local directory for output
#'
#' @examples
#' path <- "C:/Users/a.rauschenbe/Desktop/spliceQTL/data"
#'
get.snps.geuvadis <- function(chr,data=NULL,path=getwd()){
if(is.null(data)){
data <- path
# download SNP data
file <- paste0("GEUVADIS.chr",chr,".PH1PH2_465.IMPFRQFILT_BIALLELIC_PH.annotv2.genotypes.vcf.gz")
url <- paste0("http://www.ebi.ac.uk/arrayexpress/files/E-GEUV-1/genotypes/",file)
destfile <- file.path(data,file)
if(!file.exists(destfile)){
utils::download.file(url=url,destfile=destfile,method="auto")
}
# transform with PLINK
setwd(data)
system(paste0("plink --vcf GEUVADIS.chr",chr,".PH1PH2_465.IMPFRQFILT_BIALLELIC_PH.annotv2.genotypes.vcf.gz",
" --maf 0.05 --geno 0 --make-bed --out snps",chr),invisible=FALSE)
# obtain identifiers
file <- "E-GEUV-1.sdrf.txt"
url <- paste("http://www.ebi.ac.uk/arrayexpress/files/E-GEUV-1/",file,sep="")
destfile <- file.path(data,file)
if(!file.exists(destfile)){
utils::download.file(url=url,destfile=destfile,method="auto")
}
}
# read into R
bed <- file.path(data,paste("snps",chr,".bed",sep=""))
bim <- file.path(data,paste("snps",chr,".bim",sep=""))
fam <- file.path(data,paste("snps",chr,".fam",sep=""))
X <- snpStats::read.plink(bed=bed,bim=bim,fam=fam)
X$fam <- NULL; all(diff(X$map$position) > 0)
# filter MAF
maf <- snpStats::col.summary(X$genotypes)$MAF
cond <- maf >= 0.05
X$genotypes <- X$genotypes[,cond]
X$map <- X$map[cond,]
# format
colnames(X$genotypes) <- paste0(X$map$chromosome,":",X$map$position)
snps <- methods::as(object=X$genotypes,Class="numeric")
class(snps) <- "integer"
# change identifiers
samples <- utils::read.delim(file=file.path(data,"E-GEUV-1.sdrf.txt"))
match <- match(rownames(snps),samples$Source.Name)
rownames(snps) <- samples$Comment.ENA_RUN.[match]
snps <- snps[!is.na(rownames(snps)),]
save(object=snps,file=file.path(path,paste0("Geuvadis.chr",chr,".RData")))
}
#' @export
#' @title
#' Get SNP data (BBMRI)
#'
#' @description
#' This function transforms SNP data (virtual machine):
#' limits analysis to specified biobanks,
#' reads in genotype data in chunks,
#' removes SNPs with missing values (multiple biobanks/technologies),
#' removes SNPs with low minor allele frequency,
#' fuses data from multiple biobanks/technologies
#'
#' @param chr
#' chromosome: integer \eqn{1-22}
#' @param y
#' response\strong{:}
#' vector of length \eqn{n}
#'
#' @param biobank
#' character "CODAM", "LL", "LLS", "NTR", "PAN", "RS", or NULL (all)
#' @param X
#' covariates\strong{:}
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (variables)
#'
#' @param path
#' data directory
#' @param nfold
#' number of folds
#'
#' @param size
#' maximum number of SNPs to read in at once;
#' trade-off between memory usage (low) and speed (high)
#' @param alpha
#' elastic net parameter
#'
#' @examples
#' path <- "/virdir/Scratch/arauschenberger/trial"
#'
get.snps.bbmri <- function(chr,biobank=NULL,path=getwd(),size=500*10^3){
start <- Sys.time()
message(rep("-",times=20)," chromosome ",chr," ",rep("-",times=20))
p <- 5*10^6 # (maximum number of SNPs per chromosome, before filtering)
skip <- seq(from=0,to=p,by=size)
if(is.null(biobank)){
study <- c("CODAM","LL","LLS0","LLS1","NTR0","NTR1","PAN","RS")
} else if(biobank=="LLS"){
study <- c("LLS0","LLS1")
} else if(biobank=="NTR"){
study <- c("NTR0","NTR1")
} else if(biobank %in% c("CODAM","LL","PAN","RS")){
study <- biobank
} else{
stop("Invalid biobank.",call.=FALSE)
}
collect <- matrix(list(),nrow=length(skip),ncol=length(study))
colnames(collect) <- study
for(i in seq_along(skip)){
message("\n","chunk ",i,": ",appendLF=FALSE)
for(j in seq_along(study)){
message(study[j]," ",appendLF=FALSE)
# Locating files on virtual machine.
dir <- study[j]
if(study[j]=="LLS0"){dir <- "LLS/660Q"}
if(study[j]=="LLS1"){dir <- "LLS/OmniExpr"}
if(study[j]=="NTR0"){dir <- "NTR/Affy6"}
if(study[j]=="NTR1"){dir <- "NTR/GoNL"}
path0 <- file.path("/mnt/virdir/Backup/RP3_data/HRCv1.1_Imputation",dir)
path1 <- path
file0 <- paste0("chr",chr,".dose.vcf.gz")
file1 <- paste0(study[j],".chr",chr,".dose.vcf.gz")
file2 <- paste0(study[j],".chr",chr,".dose.vcf")
# Reading in files.
#vcf <- vcfR::read.vcfR(file=file.path(path1,file2),skip=skip[i],nrows=size,verbose=FALSE)
vcf <- vcfR::read.vcfR(file=file.path(path0,file0),skip=skip[i],nrows=size,verbose=FALSE)
vcf <- vcf[vcf@fix[,"CHROM"]!="",] # bug fix
vcf@fix[,"ID"] <- paste0(vcf@fix[,"ID"],"_",seq_len(dim(vcf)["variants"]))
collect[i,j][[1]] <- vcf
stop <- dim(vcf)["variants"]==0
final <- dim(vcf)["variants"]<size
if(stop){break}
}
print(utils::object.size(collect),units="Gb")
end <- Sys.time()
if(stop){break}
#' n <- 100; p <- 20
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' #y <- rbinom(n=n,size=1,prob=0.2)
#' y <- rnorm(n=n)
#' #y[1] <- 0.5
#' #a <- glmnet::glmnet(y=y,x=x,family="binomial")
#' #b <- stats::glm(y~x,family="binomial")
colasso <- function(y,X,nfold=5,alpha=1){
# properties
n <- nrow(X); p <- ncol(X)
if(length(y)!=n){stop("sample size")}
foldid <- sample(x=rep(x=seq_len(5),length.out=n))
pi <- seq(from=0,to=0.5,by=0.1) # adapt this
# model fitting
fit <- list()
Y <- colasso_moderate(y=y,X=X,pi=pi)
for(j in seq_along(pi)){
fit[[j]] <- glmnet::glmnet(y=Y[,j],x=X,alpha=alpha)
}
# inner cross-validation
pred <- lapply(pi,function(x) matrix(data=NA,nrow=length(y),ncol=100))
for(k in sort(unique(foldid))){
y0 <- y[foldid!=k]
y1 <- y[foldid==k]
X0 <- X[foldid!=k,,drop=FALSE]
X1 <- X[foldid==k,,drop=FALSE]
# only retain SNPs measured in all studies
position <- lapply(seq_along(study),function(j) collect[i,j][[1]]@fix[,"POS"])
common <- Reduce(f=intersect,x=position)
for(j in seq_along(study)){
cond <- match(x=common,table=position[[j]])
collect[i,j][[1]] <- collect[i,j][[1]][cond,]
Y0 <- colasso_moderate(y=y0,X=X0,pi=pi)
for(j in seq_along(pi)){
glmnet <- glmnet::glmnet(y=Y0[,j],x=X0,alpha=alpha) # was lambda=lambda
temp <- stats::predict(object=glmnet,newx=X1,type="response",s=fit[[j]]$lambda)
pred[[j]][foldid==k,seq_len(ncol(temp))] <- temp
}
# Calculating minor allele frequency.
num <- numeric(); maf <- list()
for(j in seq_along(study)){
num[j] <- dim(collect[i,j][[1]])["gt_cols"] # replace by adjusted sample sizes?
maf[[j]] <- num[j]*vcfR::maf(collect[i,j][[1]])[,"Frequency"]
}
cond <- rowSums(do.call(what="cbind",args=maf))/sum(num)>0.05
if(sum(cond)==0){if(final){break}else{next}}
# Extracting genotypes.
for(j in seq_along(study)){
gt <- vcfR::extract.gt(collect[i,j][[1]][cond,])
gt[gt=="0|0"] <- 0
gt[gt=="0|1"|gt=="1|0"] <- 1
gt[gt=="1|1"] <- 2
storage.mode(gt) <- "integer"
collect[i,j][[1]] <- gt
}
if(final){break}
}
# Removing empty rows.
cond <- apply(collect,1,function(x) all(sapply(x,length)==0))
collect <- collect[!cond,,drop=FALSE]
# Fusing all matrices.
snps <- NULL
for(i in seq_len(nrow(collect))){
inner <- NULL
for(j in seq_len(ncol(collect))){
add <- collect[i,j][[1]]
colnames(add) <- paste0(colnames(collect)[j],":",colnames(add))
inner <- cbind(inner,add)
}
snps <- rbind(snps,inner)
}
attributes(snps)$time <- end-start
rownames(snps) <- sapply(strsplit(x=rownames(snps),split="_"),function(x) x[[1]])
snps <- t(snps)
# Filter samples.
rownames(snps) <- sub(x=rownames(snps),pattern="LLS0|LLS1",replacement="LLS")
rownames(snps) <- sub(x=rownames(snps),pattern="NTR0|NTR1",replacement="NTR")
if(is.null(biobank)){
save(object=snps,file=file.path(path1,paste0("BBMRI.chr",chr,".RData")))
} else {
save(object=snps,file=file.path(path1,paste0(biobank,".chr",chr,".RData")))
# loss sequence
for(i in seq_along(pi)){
fit[[i]]$cvm <- apply(X=pred[[i]],MARGIN=2,FUN=function(x) mean((x-y)^2))
fit[[i]]$lambda.min <- fit[[i]]$lambda[which.min(fit[[i]]$cvm)]
}
}
#' @export
#' @title
#' Get exon data (Geuvadis)
#'
#' @description
#' This function transforms exon data (virtual machine):
#' retains exons on the autosomes,
#' labels exons in the format "chromosome_start_end",
#' extracts corresponding gene names
#'
#' @param path
#' data directory
#'
#' @examples
#' path <- "/virdir/Scratch/arauschenberger/trial"
#'
get.exons.geuvadis <- function(path=getwd()){
nrows <- 303544
file <-"/virdir/Scratch/rmenezes/data_counts.txt"
exons <- utils::read.table(file=file,header=TRUE,nrows=nrows)
exons <- exons[exons[,"chr"] %in% 1:22,] # autosomes
rownames(exons) <- exon_id <- paste0(exons[,"chr"],"_",exons[,"start"],"_",exons[,"end"])
gene_id <- as.character(exons[,4])
exons <- t(exons[,-c(1:4)])
save(list=c("exons","exon_id","gene_id"),file=file.path(path,"Geuvadis.exons.RData"))
}
#' @export
#' @title
#' Get exon data (BBMRI)
#'
#' @description
#' This function transforms exon data (virtual machine):
#' loads quality controlled gene expression data,
#' extracts sample identifiers,
#' removes samples without SNP data,
#' loads exon expression data,
#' extracts sample identifiers,
#' retains samples that passed quality control,
#' retains exons on the autosomes
#'
#' @param path
#' data directory
#'
#' @examples
#' path <- "/virdir/Scratch/arauschenberger/trial"
#'
get.exons.bbmri <- function(path=getwd()){
# sample identifiers:
# (1) loading quality controlled gene expression data
# (2) extracting sample identifiers
# (3) removing identifiers without SNP data
# (4) translating identifiers
utils::data(rnaSeqData_ReadCounts_BIOS_cleaned,package="BBMRIomics") # (1)
cd <- SummarizedExperiment::colData(counts)[,c("biobank_id","imputation_id","run_id")] # (2)
counts <- NULL
names(cd) <- substr(names(cd),start=1,stop=3) # abbreviate names
cd <- cd[!is.na(cd$imp),] # (3)
cd$id <- NA # (4)
cd$id[cd$bio=="CODAM"] <- sapply(strsplit(x=cd$imp[cd$bio=="CODAM"],split="_"),function(x) x[[2]])
cd$id[cd$bio=="LL"] <- sub(pattern="1_LLDeep_",replacement="",x=cd$imp[cd$bio=="LL"])
cd$id[cd$bio=="LLS"] <- sapply(strsplit(x=cd$imp[cd$bio=="LLS"],split="_"),function(x) x[[2]])
cd$id[cd$bio=="NTR"] <- sapply(strsplit(x=cd$imp[cd$bio=="NTR"],split="_"),function(x) x[[2]])
cd$id[cd$bio=="PAN"] <- cd$imp[cd$bio=="PAN"]
cd$id[cd$bio=="RS"] <- sub(pattern="RS1_|RS2_|RS3_",replacement="",x=cd$imp[cd$bio=="RS"])
# Identify individual not with "id" but with "bio:id".
any(duplicated(cd$id)) # TRUE
sapply(unique(cd$bio),function(x) any(duplicated(cd$id[x]))) # FALSE
# selection
cvm <- sapply(fit,function(x) x$cvm[which(x$lambda==x$lambda.min)])
sel <- which.min(cvm) # BUT "sel <- which.max(cvm)" FOR AUC!
fit[[length(pi)+1]] <- fit[[sel]]
# exon data:
# (1) loading exon expression data
# (2) extracting sample identifiers
# (3) retaining autosomes
# (4) retaining samples from above
load("/virdir/Backup/RP3_data/RNASeq/v2.1.3/exon_base/exon_base_counts.RData") # (1)
colnames(counts) <- sub(pattern=".exon.base.count.gz",replacement="",x=colnames(counts)) # (2)
autosomes <- sapply(strsplit(x=rownames(counts),split="_"),function(x) x[[1]] %in% 1:22) # (3)
exons <- counts[autosomes,cd$run] # (3) and (4)
exon_id <- exon_id[autosomes] # (3)
gene_id <- gene_id[autosomes] # (3)
colnames(exons) <- paste0(cd$bio,":",cd$id)
exons <- t(exons)
save(list=c("exons","exon_id","gene_id"),file=file.path(path,"BBMRI.exons.RData"))
#graphics::plot(cvm); graphics::abline(v=sel,lty=2)
names(fit) <- c("standard",paste0("pi",pi[-1]),"select")
return(fit)
}
#' @export
#' @title
#' Prepare data matrices
#'
#' @description
#' This function removes duplicate samples from each matrix,
#' only retains samples appearing in all matrices,
#' and brings samples into the same order.
#'
#' @param ...
#' matrices with samples in the rows and variables in the columns,
#' with sample identifiers as rows names
#'
#' @param message
#' display messages\strong{:} logical
#'
#' @examples
#' X <- matrix(rnorm(6),nrow=3,ncol=2,dimnames=list(c("A","B","C")))
#' Z <- matrix(rnorm(9),nrow=3,ncol=3,dimnames=list(c("B","A","B")))
#' match.samples(X,Z)
#'
match.samples <- function(...,message=TRUE){
list <- list(...)
if(length(list)==1 & is.list(list[[1]])){list <- list[[1]]}
if(is.null(names(list))){
names(list) <- sapply(substitute(list(...))[-1],deparse)
}
names <- names(list)
# check input
cond <- sapply(list,function(x) !is.matrix(x))
if(any(cond)){
stop("Provide matrices!",call.=FALSE)
}
cond <- sapply(list,function(x) is.null(rownames(x)))
if(any(cond)){
stop("Provide row names!",call.=FALSE)
}
# remove duplicated samples
duplic <- lapply(list,function(x) duplicated(rownames(x)))
for(i in seq_along(list)){
number <- round(100*mean(duplic[[i]]))
if(message){message(number," duplicates in \"",names[i],"\"")}
list[[i]] <- list[[i]][!duplic[[i]],,drop=FALSE]
}
# retain overlapping samples
all <- Reduce(f=intersect,x=lapply(list,rownames))
for(i in seq_along(list)){
percent <- round(100*mean(rownames(list[[i]]) %in% all))
if(message){message(percent,"% overlap in \"",names[i],"\"")}
list[[i]] <- list[[i]][all,,drop=FALSE]
}
# check output
cond <- sapply(list,function(x) any(duplicated(rownames(x))))
if(any(cond)){
stop("Duplicate samples!",call.=FALSE)
}
cond <- sapply(list,function(x) nrow(x)!=nrow(list[[1]]))
if(any(cond)){
stop("Different sample sizes!",call.=FALSE)
}
cond <- sapply(list,function(x) any(rownames(x)!=rownames(list[[1]])))
if(any(cond)){
stop("Different sample names!",call.=FALSE)
}
return(list)
}
#' @export
#' @title
#' Adjust library sizes
#'
#' @description
#' This function adjusts RNA-seq expression data for different library sizes.
#'
#' @param x
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (variables)
#'
#' @examples
#' n <- 5; p <- 10
#' x <- matrix(rnbinom(n=n*p,mu=5,size=1/0.5),nrow=n,ncol=p)
#' x[1,] <- 10*x[1,]
#' adjust.samples(x)
#'
adjust.samples <- function(x){
if(!is.matrix(x)){
stop("no matrix argument",call.=FALSE)
}
if(!is.numeric(x)){
stop("no numeric argument",call.=FALSE)
}
if(!is.integer(x)&&any(round(x)!=x)){
warning("non-integer values",call.=FALSE)
}
if(any(x<0)){
warning("negative values",call.=FALSE)
}
n <- nrow(x); p <- ncol(x)
lib.size <- rowSums(x)
norm.factors <- edgeR::calcNormFactors(object=t(x),lib.size=lib.size)
gamma <- norm.factors*lib.size/mean(lib.size)
gamma <- matrix(gamma,nrow=n,ncol=p,byrow=FALSE)
x <- x/gamma
return(x)
}
#' @export
#' @title
#' Adjust exon length
#'
#' @description
#' This function adjusts exon expression data for different exon lengths.
#'
#' @param x
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (exons)
#'
#' @param offset
#' exon length\strong{:} vector of length \eqn{p}
#'
#' @param group
#' gene names\strong{:} vector of length \eqn{p}
#'
#' @examples
#' NA
#'
adjust.variables <- function(x,offset,group){
if(!is.numeric(x)|!is.matrix(x)){
stop("Argument \"x\" is no numeric matrix.",call.=FALSE)
}
if(!is.numeric(offset)|!is.vector(offset)){
stop("Argument \"offset\" is no numeric vector.",call.=FALSE)
}