Commit 557394d2 authored by Rauschenberger's avatar Rauschenberger
Browse files

automation

parent 601e5a7c
...@@ -7,4 +7,5 @@ export(map.exons) ...@@ -7,4 +7,5 @@ export(map.exons)
export(map.genes) export(map.genes)
export(map.snps) export(map.snps)
export(match.samples) export(match.samples)
export(test.multiple)
export(test.single) export(test.single)
...@@ -6,17 +6,19 @@ ...@@ -6,17 +6,19 @@
#' @description #' @description
#' This function removes duplicate samples from each matrix, #' This function removes duplicate samples from each matrix,
#' only retains samples appearing in all matrices, #' only retains samples appearing in all matrices,
#' and brings them into the same order. #' and brings samples into the same order.
#' #'
#' @param ... #' @param ...
#' matrices with samples in the rows and variables in the columns #' matrices with samples in the rows and variables in the columns,
#' (separated by commas) #' with sample identifiers as rows names
#' #'
#' @param message #' @param message
#' display messages\strong{:} logical #' display messages\strong{:} logical
#' #'
#' @examples #' @examples
#' NA #' X <- matrix(rnorm(6),nrow=3,ncol=2,dimnames=list(c("A","B","C")))
#' Z <- matrix(rnorm(9),nrow=3,ncol=3,dimnames=list(c("B","A","B")))
#' match.samples(X,Z)
#' #'
match.samples <- function(...,message=TRUE){ match.samples <- function(...,message=TRUE){
...@@ -27,7 +29,6 @@ match.samples <- function(...,message=TRUE){ ...@@ -27,7 +29,6 @@ match.samples <- function(...,message=TRUE){
} }
names <- names(list) names <- names(list)
# check input # check input
cond <- sapply(list,function(x) !is.matrix(x)) cond <- sapply(list,function(x) !is.matrix(x))
if(any(cond)){ if(any(cond)){
...@@ -39,11 +40,11 @@ match.samples <- function(...,message=TRUE){ ...@@ -39,11 +40,11 @@ match.samples <- function(...,message=TRUE){
} }
# remove duplicated samples # remove duplicated samples
duplic <- lapply(list,function(x) duplicated(x)) duplic <- lapply(list,function(x) duplicated(rownames(x)))
for(i in seq_along(list)){ for(i in seq_along(list)){
percent <- round(100*mean(duplic[[i]])) number <- round(100*mean(duplic[[i]]))
if(message){message(percent,"% duplicates in \"",names[i],"\"")} if(message){message(number," duplicates in \"",names[i],"\"")}
list[[i]] <- list[[i]][!duplic[[i]],] list[[i]] <- list[[i]][!duplic[[i]],,drop=FALSE]
} }
# retain overlapping samples # retain overlapping samples
...@@ -51,7 +52,7 @@ match.samples <- function(...,message=TRUE){ ...@@ -51,7 +52,7 @@ match.samples <- function(...,message=TRUE){
for(i in seq_along(list)){ for(i in seq_along(list)){
percent <- round(100*mean(rownames(list[[i]]) %in% all)) percent <- round(100*mean(rownames(list[[i]]) %in% all))
if(message){message(percent,"% overlap in \"",names[i],"\"")} if(message){message(percent,"% overlap in \"",names[i],"\"")}
list[[i]] <- list[[i]][all,] list[[i]] <- list[[i]][all,,drop=FALSE]
} }
# check output # check output
...@@ -82,7 +83,10 @@ match.samples <- function(...,message=TRUE){ ...@@ -82,7 +83,10 @@ match.samples <- function(...,message=TRUE){
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (variables) #' matrix with \eqn{n} rows (samples) and \eqn{p} columns (variables)
#' #'
#' @examples #' @examples
#' NA #' n <- 5; p <- 10
#' x <- matrix(rnbinom(n=n*p,mu=5,size=1/0.5),nrow=n,ncol=p)
#' x[1,] <- 10*x[1,]
#' adjust.samples(x)
#' #'
adjust.samples <- function(x){ adjust.samples <- function(x){
if(!is.matrix(x)){ if(!is.matrix(x)){
...@@ -91,10 +95,10 @@ adjust.samples <- function(x){ ...@@ -91,10 +95,10 @@ adjust.samples <- function(x){
if(!is.numeric(x)){ if(!is.numeric(x)){
stop("no numeric argument",call.=FALSE) stop("no numeric argument",call.=FALSE)
} }
if(!is.integer(x)){ if(!is.integer(x)&&any(round(x)!=x)){
warning("non-integer values",call.=FALSE) warning("non-integer values",call.=FALSE)
} }
if(any(x)<0){ if(any(x<0)){
warning("negative values",call.=FALSE) warning("negative values",call.=FALSE)
} }
n <- nrow(x); p <- ncol(x) n <- nrow(x); p <- ncol(x)
...@@ -116,15 +120,11 @@ adjust.samples <- function(x){ ...@@ -116,15 +120,11 @@ adjust.samples <- function(x){
#' @param x #' @param x
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (exons) #' matrix with \eqn{n} rows (samples) and \eqn{p} columns (exons)
#' #'
#' @param group
#' gene (not exon) names\strong{:} vector of length \eqn{p}
#'
#' @param offset #' @param offset
#' exon start positions\strong{:} vector of length \eqn{p} #' exon length\strong{:} vector of length \eqn{p}
#' exon end positions\strong{:} vector of length \eqn{p}
#' #'
#' @details #' @param group
#' No information on chromosomes required. #' gene names\strong{:} vector of length \eqn{p}
#' #'
#' @examples #' @examples
#' NA #' NA
...@@ -182,7 +182,19 @@ adjust.covariates <- function(x,offset,group){ ...@@ -182,7 +182,19 @@ adjust.covariates <- function(x,offset,group){
#' @examples #' @examples
#' NA #' NA
#' #'
map.genes <- function(chr,path=getwd(),release="GRCh37",build="71"){ map.genes <- function(chr,path=getwd(),release="GRCh37",build=71){
# check input
if(chr %in% 1:22){
stop("Invalid argument \"chr\".",call.=FALSE)
}
if(!release %in% c("NCBI36","GRCh37","GRCh38")){
stop("Invalid argument \"release\".",call.=FALSE)
}
if(!build %in% 49:91){
stop("Invalid argument \"build\".",call.=FALSE)
}
file <- paste0("Homo_sapiens.",release,".",build,".gtf") file <- paste0("Homo_sapiens.",release,".",build,".gtf")
if(!file.exists(file.path(path,file))){ if(!file.exists(file.path(path,file))){
url <- paste0("ftp://ftp.ensembl.org/pub/release-",build, url <- paste0("ftp://ftp.ensembl.org/pub/release-",build,
...@@ -194,12 +206,12 @@ map.genes <- function(chr,path=getwd(),release="GRCh37",build="71"){ ...@@ -194,12 +206,12 @@ map.genes <- function(chr,path=getwd(),release="GRCh37",build="71"){
object <- refGenome::ensemblGenome() object <- refGenome::ensemblGenome()
refGenome::basedir(object) <- path refGenome::basedir(object) <- path
refGenome::read.gtf(object,filename=file) refGenome::read.gtf(object,filename=file)
genes <- refGenome::getGenePositions(object=object,by="gene_id") x <- refGenome::getGenePositions(object=object,by="gene_id")
genes <- genes[genes$seqid==chr & genes$gene_biotype=="protein_coding",] x <- x[x$seqid==chr & x$gene_biotype=="protein_coding",]
genes <- genes[,c("gene_id","seqid","start","end")] x <- x[,c("gene_id","seqid","start","end")]
rownames(genes) <- NULL rownames(x) <- NULL
colnames(genes)[colnames(genes)=="seqid"] <- "chr" colnames(x)[colnames(x)=="seqid"] <- "chr"
return(genes) return(x)
} }
#' @export #' @export
...@@ -210,29 +222,35 @@ map.genes <- function(chr,path=getwd(),release="GRCh37",build="71"){ ...@@ -210,29 +222,35 @@ map.genes <- function(chr,path=getwd(),release="GRCh37",build="71"){
#' This function #' This function
#' #'
#' @param gene_id #' @param gene_id
#' gene names\strong{:} vector with one entry per gene (gene names!) #' gene names\strong{:} vector with one entry per gene,
#' including the gene names
#' #'
#' @param exon_id #' @param exon_id
#' exon names\strong{:} vector with one entry per exon #' exon names\strong{:} vector with one entry per exon,
#' (also gene names! separated by comma if multiple genes) #' including the corresponding \emph{gene} names
#' (separated by comma if multiple gene names)
#' #'
#' @details #' @details
#' The exon names should contain the gene names. For each gene, this function #' The exon names should contain the gene names. For each gene, this function
#' returns the indices of the exons. #' returns the indices of the exons.
#' #'
#' @examples #' @examples
#' NA #' gene <- c("A","B","C")
#' exon <- c("A","A,B","B","B,C","C")
#' map.exons(gene,exon)
#' #'
map.exons <- function(gene_id,exon_id){ map.exons <- function(gene,exon){
p <- length(gene_id) p <- length(gene)
exons <- list() x <- list()
pb <- utils::txtProgressBar(min=0,max=p,style=3) pb <- utils::txtProgressBar(min=0,max=p,style=3)
for(i in seq_len(p)){ for(i in seq_len(p)){
utils::setTxtProgressBar(pb=pb,value=i) utils::setTxtProgressBar(pb=pb,value=i)
which <- as.integer(grep(pattern=gene_id[i],x=exon_id)) which <- as.integer(grep(pattern=gene[i],x=exon))
exons[[i]] <- which x[[i]] <- which
} }
return(exons) close(con=pb)
names(x) <- gene
return(x)
} }
#' @export #' @export
...@@ -261,33 +279,45 @@ map.exons <- function(gene_id,exon_id){ ...@@ -261,33 +279,45 @@ map.exons <- function(gene_id,exon_id){
#' chromosomal position of SNPs\strong{:} #' chromosomal position of SNPs\strong{:}
#' numeric vector with one entry per SNP #' numeric vector with one entry per SNP
#' #'
#' @param dist
#' number of base pairs before start position\strong{:}
#' integer
#'
#' @details #' @details
#' This functions ... #' This function ...
#' #'
#' @examples #' @examples
#' NA #' gene.chr <- rep(1,times=5)
#' gene.start <- 1:5
#' gene.end <- 2:6
#' #'
map.snps <- function(gene.chr,gene.start,gene.end,snp.chr,snp.pos){ #' snp.chr <- rep(1,times=100)
#' snp.pos <- seq(from=1,to=4.9,length.out=100)
#'
#' map.snps(gene.chr,gene.start,gene.end,snp.chr,snp.pos,dist=0)
#'
map.snps <- function(gene.chr,gene.start,gene.end,snp.chr,snp.pos,dist=10^3){
if(length(gene.chr)!=length(gene.start)|length(gene.chr)!=length(gene.end)){ if(length(gene.chr)!=length(gene.start)|length(gene.chr)!=length(gene.end)){
stop("Invalid.",call.=FALSE) stop("Invalid.",call.=FALSE)
} }
p <- length(gene.start) p <- length(gene.start)
snps <- data.frame(from=integer(length=p),to=integer(length=p)) x <- data.frame(from=integer(length=p),to=integer(length=p))
pb <- utils::txtProgressBar(min=0,max=p,style=3) pb <- utils::txtProgressBar(min=0,max=p,style=3)
for(i in seq_len(p)){ # for(i in seq_len(p)){ #
utils::setTxtProgressBar(pb=pb,value=i) utils::setTxtProgressBar(pb=pb,value=i)
chr <- snp.chr == gene.chr[i] chr <- snp.chr == gene.chr[i]
if(!any(chr)){next} if(!any(chr)){next}
start <- snp.pos >= gene.start[i] - 1*10^3 start <- snp.pos >= (gene.start[i] - dist)
end <- snp.pos <= gene.end[i] + 0 end <- snp.pos <= gene.end[i] + 0
which <- as.integer(which(chr & start & end)) which <- as.integer(which(chr & start & end))
if(length(which)==0){next} if(length(which)==0){next}
snps$from[i] <- min(which) x$from[i] <- min(which)
snps$to[i] <- max(which) x$to[i] <- max(which)
if(length(which)==1){next} if(length(which)==1){next}
if(!all(diff(which)==1)){stop("SNPs are in wrong order!")} if(!all(diff(which)==1)){stop("SNPs are in wrong order!")}
} }
return(snps) close(con=pb)
return(x)
} }
#' @export #' @export
...@@ -332,6 +362,7 @@ drop.trivial.genes <- function(map){ ...@@ -332,6 +362,7 @@ drop.trivial.genes <- function(map){
check[3] <- length(ys) > 1 check[3] <- length(ys) > 1
pass[i] <- all(check) pass[i] <- all(check)
} }
close(con=pb)
# check output # check output
if(any(pass[map$snps$to==0 & map$snps$from==0])){ if(any(pass[map$snps$to==0 & map$snps$from==0])){
...@@ -430,40 +461,62 @@ test.single <- function(Y,X,map,i,limit=NULL,steps=NULL,rho=c(0,0.5,1)){ ...@@ -430,40 +461,62 @@ test.single <- function(Y,X,map,i,limit=NULL,steps=NULL,rho=c(0,0.5,1)){
} }
#' @export
#' @title
#' Conduct tests
#'
#' @description
#' This function
#'
#' @param Y
#' exon expression\strong{:}
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (exons)
#'
#' @param X
#' SNP genotype\strong{:}
#' matrix with \eqn{n} rows (samples) and \eqn{q} columns (SNPs)
#'
#' @param rho
#' correlation\strong{:}
#' numeric vector with values between \eqn{0} and \eqn{1}
#'
#' @details
#' Automatic adjustment of the number of permutations
#' such that Bonferroni-significant p-values are possible.
#'
#' @examples
#' NA
#'
test.multiple <- function(Y,X,map,rho=c(0,0.5,1)){ test.multiple <- function(Y,X,map,rho=c(0,0.5,1)){
p <- nrow(map$genes) p <- nrow(map$genes)
# permutations # permutations
min <- 5 if(FALSE){
max <- p/0.05+1 min <- 5
limit <- ceiling(0.05*max/p) max <- p/0.05+1
base <- 1.5 # adjust sequence limit <- ceiling(0.05*max/p)
from <- log(min,base=base) base <- 1.5 # adjust sequence
to <- log(max,base=base) from <- log(min,base=base)
steps <- c(min,diff(unique(round(base^(seq(from=from,to=to,length.out=20)))))) to <- log(max,base=base)
steps <- c(min,diff(unique(round(base^(seq(from=from,to=to,length.out=20))))))
#max <- p/0.05+1 }
#limit <- ceiling(0.05*max/p)
#perms <- exp(seq(from=log(limit),to=log(max),by=0.5*log(limit))) if(TRUE){
#perms[length(steps)] <- max max <- p/0.05+1
#diff(perms) limit <- ceiling(0.05*max/p)
steps <- diff(limit^seq(from=1,to=log(max)/log(limit),length.out=pmin(p,20)))
##steps <- c(min,exp(seq(from=from,to=to,length.out=20))) steps <- c(limit,round(steps))
##steps <- seq(from=0,to=log(exp(1)),length.out=20) steps[length(steps)] <- max-sum(steps[-length(steps)])
##steps <- diff(exp(seq(from=log(min),to=log(max),length.out=20))) }
##steps[1] <- min
##rev(steps)[1] <- max-sum(rev(steps)[-1])
if(max != sum(steps)){stop("Invalid combination?",call.=FALSE)} if(max != sum(steps)){stop("Invalid combination?",call.=FALSE)}
# parallel computation # parallel computation
type <- ifelse(test=.Platform$OS.type=="windows",yes="PSOCK",no="FORK") type <- ifelse(test=.Platform$OS.type=="windows",yes="PSOCK",no="FORK")
cluster <- parallel::makeCluster(spec=8,type=type) cluster <- parallel::makeCluster(spec=8,type=type)
parallel::clusterSetRNGStream(cl=cluster,iseed=1) parallel::clusterSetRNGStream(cl=cluster,iseed=1)
#test.single <- spliceQTL::test.single
#parallel::clusterExport(cl=cluster,varlist="test.single")
parallel::clusterExport(cl=cluster,varlist=c("Y","X","map","limit","steps","rho"),envir=environment()) parallel::clusterExport(cl=cluster,varlist=c("Y","X","map","limit","steps","rho"),envir=environment())
#parallel::clusterEvalQ(cl=cluster,expr=library(spliceQTL))
start <- Sys.time() start <- Sys.time()
pvalue <- parallel::parLapply(cl=cluster,X=seq_len(p),fun=function(i) spliceQTL::test.single(Y=Y,X=X,map=map,i=i,limit=limit,steps=steps,rho=rho)) pvalue <- parallel::parLapply(cl=cluster,X=seq_len(p),fun=function(i) spliceQTL::test.single(Y=Y,X=X,map=map,i=i,limit=limit,steps=steps,rho=rho))
end <- Sys.time() end <- Sys.time()
...@@ -471,12 +524,8 @@ test.multiple <- function(Y,X,map,rho=c(0,0.5,1)){ ...@@ -471,12 +524,8 @@ test.multiple <- function(Y,X,map,rho=c(0,0.5,1)){
rm(cluster) rm(cluster)
# tyding up # tyding up
#names <- names(pvalue[[1]])
#pvalue <- lapply(names,function(x) lapply(pvalue,function(y) y[[x]]))
#names(pvalue) <- names
#attributes(pvalue)$time <- end - start
pvalue <- do.call(what=rbind,args=pvalue) pvalue <- do.call(what=rbind,args=pvalue)
colnames(pvalue) <- rho colnames(pvalue) <- paste0("rho=",rho)
rownames(pvalue) <- map$genes$gene_id rownames(pvalue) <- map$genes$gene_id
return(pvalue) return(pvalue)
...@@ -485,6 +534,8 @@ test.multiple <- function(Y,X,map,rho=c(0,0.5,1)){ ...@@ -485,6 +534,8 @@ test.multiple <- function(Y,X,map,rho=c(0,0.5,1)){
#--- spliceQTL test functions -------------------------------------------------- #--- spliceQTL test functions --------------------------------------------------
# Function: G2.multin # Function: G2.multin
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment