Gitlab migration complete. If you have any issue please read the FAQ.

Commit a2b2ad73 authored by Armin Rauschenberger's avatar Armin Rauschenberger
Browse files

automation

parent 0dc4a4b9
This diff is collapsed.
Package: colasso
Version: 0.0.0
Title: colasso regression
Title: Stable Sparse Regression
Description: Implements colasso.
Depends: R (>= 3.0.0)
Imports: glmnet, MASS, weights
......@@ -9,6 +9,6 @@ Authors@R: person("Armin","Rauschenberger",email="a.rauschenberger@vumc.nl",role
VignetteBuilder: knitr
License: GPL-3
LazyData: true
RoxygenNote: 6.0.1
RoxygenNote: 6.1.0
URL: https://github.com/rauschenberger/colasso
BugReports: https://github.com/rauschenberger/colasso/issues
......@@ -2,8 +2,5 @@
export(colasso)
export(colasso_compare)
export(colasso_covariate_weights)
export(colasso_marginal_significance)
export(colasso_moderate)
export(colasso_simulate)
export(colasso_weighted_correlation)
#' @export
#' @aliases colasso-package
#' @title
#' colasso
#'
#' @description
#' This function ...
#' Implements penalised regression with response moderation.
#'
#' @param y
#' response\strong{:}
#' vector of length \eqn{n}
#'
#' @param Y
#' response\strong{:}
#' matrix with \eqn{n} rows and \eqn{p} columns,
#' or vector of length \eqn{n} (see details)
#'
#' @param X
#' covariates\strong{:}
#' matrix with \eqn{n} rows (samples) and \eqn{p} columns (variables)
#'
#' @param alpha
#' elastic net parameter\strong{:}
#' numeric between \eqn{0} and \eqn{1};
#' \eqn{alpha=1} for lasso,
#' \eqn{alpha=0} for ridge
#'
#' @param nfolds
#' number of folds
#'
#' @param alpha
#' elastic net parameter
#' @param family
#' see glmnet
#'
#' @param type.measure
#' see glmnet
#'
#' @examples
#' n <- 100; p <- 20
......@@ -30,236 +43,94 @@
#' #y[1] <- 0.5
#' #a <- glmnet::glmnet(y=y,x=x,family="binomial")
#' #b <- stats::glm(y~x,family="binomial")
colasso <- function(y,X,alpha=1,nfolds=10){
# properties
n <- nrow(X); p <- ncol(X)
if(length(y)!=n){stop("sample size")}
foldid <- sample(x=rep(x=seq_len(nfolds),length.out=n))
pi <- seq(from=0,to=0.5,by=0.1) # adapt this
#'
colasso <- function(y,Y,X,alpha=1,nfolds=10,family="gaussian",type.measure="deviance"){
# properties
n <- nrow(X); p <- ncol(X)
if(length(y)!=n){stop("sample size")}
foldid <- sample(x=rep(x=seq_len(nfolds),length.out=n))
# weights
pi <- seq(from=0,to=1,by=0.2) # adapt this
# model fitting
fit <- list()
Y <- colasso_moderate(y=y,X=X,pi=pi)
for(j in seq_along(pi)){
fit[[j]] <- glmnet::glmnet(y=Y[,j],x=X,alpha=alpha)
}
# model fitting
fit <- list()
ym <- colasso_moderate(Y=Y) # trial
for(i in seq_along(pi)){
weights <- rep(c(1-pi[[i]],pi[[i]]),each=n)
fit[[i]] <- glmnet::glmnet(y=c(y,ym),x=rbind(X,X),weights=weights,family=family,alpha=alpha)
}
# inner cross-validation
pred <- lapply(pi,function(x) matrix(data=NA,nrow=length(y),ncol=100))
for(k in sort(unique(foldid))){
y0 <- y[foldid!=k]
#y1 <- y[foldid==k]
Y0 <- Y[foldid!=k,,drop=FALSE]
#Y1 <- Y[foldid==k,,drop=FALSE]
X0 <- X[foldid!=k,,drop=FALSE]
X1 <- X[foldid==k,,drop=FALSE]
# inner cross-validation
pred <- lapply(pi,function(x) matrix(data=NA,nrow=length(y),ncol=100))
for(k in sort(unique(foldid))){
y0 <- y[foldid!=k]
y1 <- y[foldid==k]
X0 <- X[foldid!=k,,drop=FALSE]
X1 <- X[foldid==k,,drop=FALSE]
Y0 <- colasso_moderate(y=y0,X=X0,pi=pi)
for(j in seq_along(pi)){
glmnet <- glmnet::glmnet(y=Y0[,j],x=X0,alpha=alpha) # was lambda=lambda
temp <- stats::predict(object=glmnet,newx=X1,type="response",s=fit[[j]]$lambda)
pred[[j]][foldid==k,seq_len(ncol(temp))] <- temp
}
}
# loss sequence
y0m <- colasso_moderate(Y=Y0)
for(i in seq_along(pi)){
fit[[i]]$cvm <- apply(X=pred[[i]],MARGIN=2,FUN=function(x) mean((x-y)^2))
fit[[i]]$lambda.min <- fit[[i]]$lambda[which.min(fit[[i]]$cvm)]
weights <- rep(c(1-pi[[i]],pi[[i]]),each=sum(foldid!=k)) # trial
glmnet <- glmnet::glmnet(y=c(y0,y0m),x=rbind(X0,X0),weights=weights,family=family,alpha=alpha)
temp <- stats::predict(object=glmnet,newx=X1,type="response",s=fit[[i]]$lambda)
pred[[i]][foldid==k,seq_len(ncol(temp))] <- temp
}
# selection
cvm <- sapply(fit,function(x) x$cvm[which(x$lambda==x$lambda.min)])
sel <- which.min(cvm) # BUT "sel <- which.max(cvm)" FOR AUC!
fit[[length(pi)+1]] <- fit[[sel]]
#graphics::plot(cvm); graphics::abline(v=sel,lty=2)
names(fit) <- c("standard",paste0("pi",pi[-1]),"select")
return(fit)
}
# loss sequence
for(i in seq_along(pi)){
fit[[i]]$cvm <- apply(X=pred[[i]],MARGIN=2,FUN=function(x) mean((x-y)^2))
fit[[i]]$lambda.min <- fit[[i]]$lambda[which.min(fit[[i]]$cvm)]
}
# selection
cvm <- sapply(fit,function(x) x$cvm[which(x$lambda==x$lambda.min)])
if(type.measure=="AUC"){
sel <- which.max(cvm)
} else {
sel <- which.min(cvm)
}
fit[[length(pi)+1]] <- fit[[sel]]
#graphics::plot(cvm); graphics::abline(v=sel,lty=2)
names(fit) <- c("standard",paste0("pi",pi[-1]),"select")
return(fit)
}
# colasso <- function(y,X,alpha=1,nfolds=10){
# n <- nrow(X); p <- ncol(X)
# if(length(y)!=n){stop("sample size")}
# foldid <- sample(x=rep(x=seq_len(nfolds),length.out=n))
# pi <- seq(from=0,to=0.5,by=0.1) # adapt this
# lambda <- glmnet::glmnet(y=y,x=X,alpha=alpha)$lambda
#
# pred <- lapply(pi,function(x) matrix(data=NA,nrow=length(y),ncol=100))
# for(k in sort(unique(foldid))){
# y0 <- y[foldid!=k]
# y1 <- y[foldid==k]
# X0 <- X[foldid!=k,,drop=FALSE]
# X1 <- X[foldid==k,,drop=FALSE]
#
# Y0 <- colasso_moderate(y=y0,X=X0,pi=pi)
# for(j in seq_along(pi)){
# glmnet <- glmnet::glmnet(y=Y0[,j],x=X0,alpha=alpha) # was lambda=lambda
# temp <- stats::predict(object=glmnet,newx=X1,type="response",s=lambda)
# pred[[j]][foldid==k,seq_len(ncol(temp))] <- temp
# }
# }
#
# Y <- colasso_moderate(y=y,X=X,pi=pi)
# fit <- list()
# for(j in seq_along(pi)){
# fit[[j]] <- glmnet::glmnet(y=Y[,j],x=X,alpha=alpha) # was lambda=lambda
# #fit[[j]]$cvm <- palasso::loss.trial(y=y,fit=pred[[j]],family=family,type.measure=type.measure)[[1]]
# fit[[j]]$cvm <- apply(X=pred[[j]],MARGIN=2,FUN=function(x) mean((x-y)^2))
# fit[[j]]$lambda.min <- lambda[which.min(fit[[j]]$cvm)]
# }
#
# cvm <- sapply(fit,function(x) x$cvm[which.min(abs(x$lambda-x$lambda.min))[1]])
#
# #sel <- which.max(cvm) # only for AUC
# sel <- which.min(cvm)
#
# fit[[length(pi)+1]] <- fit[[sel]]
# #graphics::plot(cvm); graphics::abline(v=sel,lty=2)
# names(fit) <- c("standard",paste0("pi",pi[-1]),"select")
# return(fit)
# }
#' @export
#' @title
#' marginal significance
#'
#' @description
#' This function ...
#'
#' @inheritParams colasso
#'
#' @examples
#' NA
#'
# obtain p-values --------------------------------------------------------------
# input: y, X; output: p-value
colasso_marginal_significance <- function(y,X){
# X = scale(X)
x <- vector()
for(i in seq_len(ncol(X))){
if(stats::var(X[,i])==0){
x[i] <- 0
} else {
x[i] <- summary(stats::lm(y~X[,i]))$coefficients["X[, i]","Pr(>|t|)"]
}
}
return(x)
}
#' @export
#' @title
#' covariate weights
#'
#' @description
#' This function ...
#'
#' @param x
#' \eqn{p}-values\strong{:}
#' vector with entries between \eqn{0} and \eqn{1}
#'
#' @param max
#' maximum \eqn{p}-value to receive weight one
#'
#' @param min
#' minimum \eqn{p}-value to receive weight zero
#'
#' @param version
#' (temporary argument)
#'
#' @examples
#' # alternative weights: weight <- abs(cor(y,X))
#'
colasso_covariate_weights <- function(x,max=0.05/length(x),min=0.05,version=1){ # was min=0.2
if(version==1){
weight <- pmin(-log10(max),pmax(-log10(x)+log10(min),0))
weight <- weight/(-log10(max))
# more compact: (log10(x)-log10(min))/(log10(max)) # (and then truncate)
} else {
weight <- pmax(1-(1/min)*x,0)
}
message("non-zero weight: ",100*round(mean(weight!=0),digits=2),"%")
return(weight)
}
#' @export
#' @title
#' weighted correlation
#'
#' @description
#' This function ...
#'
#' @inheritParams colasso
#'
#' @param w
#' covariate weights\strong{:}
#' vector of length \eqn{p}, with entries between \eqn{0} and \eqn{1}
#'
#' @examples
#' NA
#'
# weighted covariate correlation -----------------------------------------------
# input: X, weights; output: correlation
colasso_weighted_correlation <- function(X,w=NULL){
n <- nrow(X); p <- ncol(X)
if(is.null(w)){w <- rep(x=1,times=p)}
mx <- rep(x=NA,times=p)
for(i in seq_len(p)){
mx[i] <- sum(w*X[,i])/sum(w)
}
sx <- cor <- matrix(NA,nrow=p,ncol=p)
diag(cor) <- 1
for(i in seq(from=1,to=p,by=1)){
sx[i,i] <- sum(w*(X[,i]-mx[i])^2)/sum(w)
}
for(i in seq(from=1,to=p,by=1)){
for(j in seq(from=i,to=p,by=1)){
sx[i,j] <- sx[j,i] <- sum(w*(X[,i]-mx[i])*(X[,j]-mx[j]))/sum(w)
cor[i,j] <- cor[j,i] <- sx[i,j]/sqrt(sx[i,i]*sx[j,j])
}
}
return(cor)
}
#' @export
#' @title
#' moderated response
#'
#'
#' @description
#' This function ...
#'
#'
#' @inheritParams colasso
#'
#'
#' @param pi
#' vector with entries between \eqn{0} and \eqn{1} (rename argument)
#'
#'
#' @param plot
#' logical
#'
#'
#'
#'
#' @examples
#' NA
#'
colasso_moderate <- function(y,X,pi,plot=FALSE){
pvalue <- colasso_marginal_significance(y=y,X=X)
weight <- colasso_covariate_weights(x=pvalue)
#cor <- abs(colasso_weighted_correlation(t(X),w=weight)) # robust
cor <- abs(weights::wtd.cors(t(X),weight=weight)) # fast
Y <- matrix(data=NA,nrow=length(y),ncol=length(pi),dimnames=list(NULL,pi))
for(i in seq_along(y)){
for(j in seq_along(pi)){
Y[i,j] <- (1-pi[j])*y[i]+pi[j]*sum(cor[,i]*y)/sum(cor[,i])
}
}
if(plot){
#plot(ecdf(x)); abline(a=0,b=1,lty=2)
#plot(x=x,y=weight,ylim=c(0,1)); abline(v=0.05,lty=2)
#graphics::image(cor)
#graphics::image(Y)
}
return(Y)
colasso_moderate <- function(Y){
# (most basic version possible)
y <- rowMeans(Y)
y <- apply(Y,1,median)
if(all(y) %in% c(0,0.5,1)){
y[y==0.5] <- 1
warning("Invalid unless binomial family.")
}
return(y)
}
#' @export
......@@ -284,7 +155,7 @@ colasso_moderate <- function(y,X,pi,plot=FALSE){
#' @examples
#' # CONTINUE HERE
#'
colasso_simulate <- function(n=100,p=500,cor="constant",plot=TRUE){
colasso_simulate <- function(n=100,p=500,cor="constant",family="gaussian",plot=TRUE){
# correlation matrix
if(cor=="none"){
Sigma <- matrix(data=0,nrow=p,ncol=p)
......@@ -292,14 +163,13 @@ colasso_simulate <- function(n=100,p=500,cor="constant",plot=TRUE){
Sigma <- matrix(data=0.05,nrow=p,ncol=p)
} else if(cor=="autoregressive"){
# adjust 0.9 to p, such that mean(Sigma)=0.05
#sum(2*(p-seq_len(p)+1)*0.9^seq_len(p))/(p*p)
# sum(2*(p-seq_len(p)+1)*0.9^seq_len(p))/(p*p)
Sigma <- 0.9^abs(col(diag(p))-row(diag(p)))
} else if(cor=="unstructured"){
Sigma <- matrix(data=stats::rbeta(n=p,shape1=0.05,shape2=1),nrow=p,ncol=p)
}
diag(Sigma) <- 1
X <- MASS::mvrnorm(n=n,mu=rep(0,p),Sigma=Sigma)
stats::median(abs(as.numeric(cor(X))))
......@@ -309,24 +179,36 @@ colasso_simulate <- function(n=100,p=500,cor="constant",plot=TRUE){
#y <- stats::rnorm(n=n,mean=mu)
# sparse effects
beta <- rep(1,times=p)
beta[stats::rbinom(n=p,size=1,prob=0.97)==1] <- 0
beta <- rep(x=1,times=p)
beta[stats::rbinom(n=p,size=1,prob=0.95)==1] <- 0
mu <- X %*% beta
y <- stats::rnorm(n=n,mean=mu,sd=10)
if(family=="gaussian"){
Y <- replicate(n=10,expr=stats::rnorm(n=n,mean=mu,sd=10))
} else if(family=="binomial"){
prob <- exp(mu)/(1+exp(mu))
Y <- replicate(n=10,expr=stats::rbinom(n=n,size=1,prob=prob))
} else if(family=="poisson"){
lambda <- exp(mu)
Y <- replicate(n=10,expr=stats::rpois(n=n,lambda=lambda))
} else if(family=="cox"){
warning("Cox regression not implemented!")
}
#y <- stats::rnorm(n=n,mean=mu,sd=10)
y <- Y[,1]
# predictivity -----------------------------------------------------------------
if(plot){
graphics::par(mar=c(3,3,1,1))
test <- glmnet::cv.glmnet(x=X,y=y)
graphics::plot(x=log(test$lambda),y=test$cvm)
graphics::abline(h=test$cvm[test$lambda==max(test$lambda)],lty=2)
}
return(list(y=y,X=X))
return(list(y=y,Y=Y,X=X))
}
#' @export
#' @title
#' External cross-validation
......@@ -335,7 +217,10 @@ colasso_simulate <- function(n=100,p=500,cor="constant",plot=TRUE){
#' This function ...
#'
#' @param y
#' response
#' response vector
#'
#' @param Y
#' response moderation matrix
#'
#' @param X
#' covariates
......@@ -349,14 +234,14 @@ colasso_simulate <- function(n=100,p=500,cor="constant",plot=TRUE){
#' @examples
#' NA
#'
colasso_compare <- function(y,X,plot=TRUE,nfolds.int=10){
colasso_compare <- function(y,Y,X,plot=TRUE,nfolds.int=10,family="gaussian",type.measure="deviance"){
fold <- sample(x=rep(x=seq_len(5),length.out=length(y)))
pred <- matrix(data=NA,nrow=length(y),ncol=8)
select <- list()
for(i in sort(unique(fold))){
cat("i =",i,"\n")
fit <- colasso(y=y[fold!=i],X=X[fold!=i,],alpha=1,nfolds=nfolds.int)
fit <- colasso(y=y[fold!=i],Y=Y[fold!=i,],X=X[fold!=i,],alpha=1,nfolds=nfolds.int,type.measure=type.measure)
for(j in seq_along(fit)){
pred[fold==i,j] <- glmnet::predict.glmnet(object=fit[[j]],
newx=X[fold==i,],
......@@ -367,7 +252,14 @@ colasso_compare <- function(y,X,plot=TRUE,nfolds.int=10){
pred[fold==i,8] <- mean(y[fold!=i]) # intercept-only model
}
colnames(pred) <- c(names(fit),"intercept")
loss <- apply(X=pred,MARGIN=2,FUN=function(x) sum((y-x)^2))
if(family=="gaussian" & type.measure=="deviance" | family=="binomial" & type.measure=="mse"){
loss <- apply(X=pred,MARGIN=2,FUN=function(x) sum((y-x)^2))
} else if(family=="binomial" & type.measure=="class"){
loss <- apply(X=pred,MARGIN=2,FUN=function(x) mean(y!=x))
} else {
warning("Implement other loss functions!")
}
### start temporary ###
#stability <- numeric()
......@@ -400,5 +292,3 @@ colasso_compare <- function(y,X,plot=TRUE,nfolds.int=10){
return(loss)
}
#'
#'
#' #' @export
#' #' @aliases colasso-package
#' #' @title
#' #' colasso
#' #'
#' #' @description
#' #' This function ...
#' #'
#' #' @param y
#' #' response\strong{:}
#' #' vector of length \eqn{n}
#' #'
#' #' @param Y
#' #' moderated response (Trial)
#' #'
#' #' @param X
#' #' covariates\strong{:}
#' #' matrix with \eqn{n} rows (samples) and \eqn{p} columns (variables)
#' #'
#' #' @param nfolds
#' #' number of folds
#' #'
#' #' @param alpha
#' #' elastic net parameter
#' #'
#' #' @examples
#' #' n <- 100; p <- 20
#' #' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' #' #y <- rbinom(n=n,size=1,prob=0.2)
#' #' y <- rnorm(n=n)
#' #' #y[1] <- 0.5
#' #' #a <- glmnet::glmnet(y=y,x=x,family="binomial")
#' #' #b <- stats::glm(y~x,family="binomial")
#' colasso <- function(y,Y,X,alpha=1,nfolds=10){
#'
#' # properties
#' n <- nrow(X); p <- ncol(X)
#' if(length(y)!=n){stop("sample size")}
#' foldid <- sample(x=rep(x=seq_len(nfolds),length.out=n))
#'
#' # weights
#' pi <- seq(from=0,to=1,by=0.2) # adapt this
#'
#' # model fitting
#' fit <- list()
#' ym <- colasso_moderate(Y=Y) # trial
#' for(i in seq_along(pi)){
#' weights <- rep(c(1-pi[[i]],pi[[i]]),each=n)
#' fit[[i]] <- glmnet::glmnet(y=c(y,ym),x=rbind(X,X),weights=weights,alpha=alpha)
#' }
#'
#' # inner cross-validation
#' pred <- lapply(pi,function(x) matrix(data=NA,nrow=length(y),ncol=100))
#' for(k in sort(unique(foldid))){
#' y0 <- y[foldid!=k]
#' y1 <- y[foldid==k]
#' Y0 <- Y[foldid!=k,,drop=FALSE] # trial
#' Y1 <- Y[foldid==k,,drop=FALSE] # trial
#' X0 <- X[foldid!=k,,drop=FALSE]
#' X1 <- X[foldid==k,,drop=FALSE]
#'
#' y0m <- colasso_moderate(Y=Y0) # trial
#' for(i in seq_along(pi)){
#' weights <- rep(c(1-pi[[i]],pi[[i]]),each=sum(foldid!=k)) # trial
#' glmnet <- glmnet::glmnet(y=c(y0,y0m),x=rbind(X0,X0),weights=weights,alpha=alpha)
#' temp <- stats::predict(object=glmnet,newx=X1,type="response",s=fit[[i]]$lambda)
#' pred[[i]][foldid==k,seq_len(ncol(temp))] <- temp
#' }
#' }
#'
#' # loss sequence
#' for(i in seq_along(pi)){
#' fit[[i]]$cvm <- apply(X=pred[[i]],MARGIN=2,FUN=function(x) mean((x-y)^2))
#' fit[[i]]$lambda.min <- fit[[i]]$lambda[which.min(fit[[i]]$cvm)]
#' }
#'
#' # selection
#' cvm <- sapply(fit,function(x) x$cvm[which(x$lambda==x$lambda.min)])
#' sel <- which.min(cvm) # BUT "sel <- which.max(cvm)" FOR AUC!
#' fit[[length(pi)+1]] <- fit[[sel]]
#'
#' #graphics::plot(cvm); graphics::abline(v=sel,lty=2)
#' names(fit) <- c("standard",paste0("pi",pi[-1]),"select")
#' return(fit)
#' }
#'
#'
#' # colasso <- function(y,ym,X,alpha=1,nfolds=10){
#' #
#' # # properties
#' # n <- nrow(X); p <- ncol(X)
#' # if(length(y)!=n){stop("sample size")}
#' # foldid <- sample(x=rep(x=seq_len(nfolds),length.out=n))
#' # pi <- seq(from=0,to=0.5,by=0.1) # adapt this
#' #
#' # # model fitting
#' # fit <- list()
#' # Y <- colasso_moderate(y=y,ym=ym,pi=pi) # trial
#' # for(j in seq_along(pi)){
#' # fit[[j]] <- glmnet::glmnet(y=Y[,j],x=X,alpha=alpha)
#' # }
#' #
#' # # inner cross-validation
#' # pred <- lapply(pi,function(x) matrix(data=NA,nrow=length(y),ncol=100))
#' # for(k in sort(unique(foldid))){
#' # y0 <- y[foldid!=k]
#' # y1 <- y[foldid==k]
#' # ym0 <- ym[foldid!=k] # trial
#' # ym1 <- ym[foldid==k] # trial
#' # X0 <- X[foldid!=k,,drop=FALSE]
#' # X1 <- X[foldid==k,,drop=FALSE]