#--- import unexported functions:
# FUNCTION <- get("FUNCTION",envir=asNamespace("PACKAGE"))

#--- deactivate on Solaris:
# if(!grepl('SunOS',Sys.info()['sysname'])){}

#--- Main function -------------------------------------------------------------

#' @export
#' @title
#' Multivariate Elastic Net Regression
#'
#' @description
#' Implements multivariate elastic net regression.
#'
#' @param Y
#' outputs\strong{:}
#' numeric matrix with \eqn{n} rows (samples)
#' and \eqn{q} columns (variables)
#'
#' @param X
#' inputs\strong{:}
#' numeric matrix with \eqn{n} rows (samples)
#' and \eqn{p} columns (variables)
#'
#' @param family
#' distribution\strong{:}
#' vector of length \eqn{1} or \eqn{q} with entries
#' \code{"gaussian"}, \code{"binomial"} or \code{"poisson"}
#'
#' @param nfolds
#' number of folds
#'
#' @param foldid
#' fold identifiers\strong{:}
#' vector of length \eqn{n} with entries between \eqn{1} and \code{nfolds};
#' or \code{NULL} (balance)
#'
#' @param type.measure
#' loss function\strong{:}
#' vector of length \eqn{1} or \eqn{q} with entries
#' \code{"deviance"}, \code{"class"}, \code{"mse"} or \code{"mae"}
#' (see \code{\link[glmnet]{cv.glmnet}})
#'
#' @param alpha.base
#' elastic net mixing parameter for base learners\strong{:}
#' numeric between \eqn{0} (ridge) and \eqn{1} (lasso)
#'
#' @param alpha.meta
#' elastic net mixing parameter for meta learners\strong{:}
#' numeric between \eqn{0} (ridge) and \eqn{1} (lasso)
#'
#' @param constraint
#' non-negativity constraints\strong{:}
#' logical (see details)
#'
#' @param weight
#' inclusion/exclusion of variables\strong{:}
#' logical matrix with \eqn{q} rows and \eqn{p} columns,
#' or \code{NULL} (see details)
#'
#' @param ...
#' further arguments passed to \code{\link[glmnet]{glmnet}}
#'
#' @references
#' Armin Rauschenberger and Enrico Glaab (2021)
#' "Predicting correlated outcomes from molecular data".
#' \emph{Manuscript in preparation.}
#'
#' @details
#' \strong{non-negativity constraints:}
#' If it is reasonable to assume that the outcomes
#' are \emph{positively} correlated
#' (potentially after changing the sign of some outcomes),
#' we recommend setting \code{constraint=TRUE}.
#' Non-negativity constraints are then imposed on the meta learners.
#'
#' \strong{inclusion/exclusion of variables:}
#' The entry in the \eqn{k}th row and the \eqn{j}th column of \code{weight}
#' indicates whether the \eqn{j}th feature may be used for
#' modelling the \eqn{k}th outcome
#' (where \eqn{0} means \code{FALSE} and
#' \eqn{1} means \code{TRUE}).
#'
#' \strong{elastic net:}
#' \code{alpha.base} controls the input-output effects, and
#' \code{alpha.meta} controls the output-output effects;
#' the lasso renders sparse models (\code{alpha}\eqn{=1}),
#' whereas ridge regression renders dense models (\code{alpha}\eqn{=0}).
#'
#' @return
#' This function returns an object of class \code{joinet}.
#' Available methods include
#' \code{\link[=predict.joinet]{predict}},
#' \code{\link[=coef.joinet]{coef}},
#' and \code{\link[=weights.joinet]{weights}}.
#' The slots \code{base} and \code{meta} each contain
#' \eqn{q} \code{\link[glmnet]{cv.glmnet}}-like objects.
#'
#' @seealso
#' \code{\link{cv.joinet}}, vignette
#'
#' @examples
#' \dontshow{
#' if(!grepl('SunOS',Sys.info()['sysname'])){
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
#' object <- joinet(Y=Y,X=X)}}
#' \dontrun{
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
#' object <- joinet(Y=Y,X=X)}
#'
#' \dontrun{
#' browseVignettes("joinet") # further examples}
#'
joinet <- function(Y,X,family="gaussian",nfolds=10,foldid=NULL,type.measure="deviance",alpha.base=1,alpha.meta=1,constraint=TRUE,weight=NULL,...){

  # IMPLEMENT CODE FOR CONSTRAINT AND WEIGHT!

  #--- temporary ---
  # family <- "gaussian"; nfolds <- 10; foldid <- NULL; type.measure <- "deviance"; alpha.base <- alpha.meta <- 1; constraint <- TRUE; weight <- NULL

  #--- checks ---
  Y <- as.matrix(Y)
  X <- as.matrix(X)

  cornet:::.check(x=Y,type="matrix",miss=TRUE)
  if(constraint){
    for(i in 1:(ncol(Y)-1)){
      for(j in (i+1):ncol(Y)){ # pairwise test, excluding the diagonal
        cor <- stats::cor.test(Y[,i],Y[,j],use="pairwise.complete.obs")
        if(cor$statistic<0 & cor$p.value<0.05){
          warning(paste("Columns",i,"and",j,"are negatively correlated. Consider using constraint=FALSE."),call.=FALSE)
        }
      }
    }
  }
  #if(any(stats::cor(Y,use="pairwise.complete.obs")<0,na.rm=TRUE)){warning("Negative correlation!",call.=FALSE)}
  cornet:::.check(x=X,type="matrix")
  #cornet:::.check(x=family,type="vector",values=c("gaussian","binomial","poisson"))
  if(nrow(Y)!=nrow(X)){stop("Contradictory sample size.",call.=FALSE)}
  cornet:::.check(x=nfolds,type="scalar",min=3)
  cornet:::.check(x=foldid,type="vector",values=seq_len(nfolds),null=TRUE)
  cornet:::.check(x=type.measure,type="string",values=c("deviance","class","mse","mae")) # not auc (min/max confusion)
  cornet:::.check(x=alpha.base,type="scalar",min=0,max=1)
  cornet:::.check(x=alpha.meta,type="scalar",min=0,max=1)
  cornet:::.check(x=weight,type="matrix",min=0,max=1,null=TRUE)
  if(!is.null(c(list(...)$lower.limits,list(...)$upper.limits))){
    stop("Reserved arguments \"lower.limits\" and \"upper.limits\".",call.=FALSE)
  }

  #--- dimensionality ---
  n <- nrow(Y)
  q <- ncol(Y)
  p <- ncol(X)

  if(is.null(weight)){
    pf <- matrix(1,nrow=q,ncol=p)
  } else {
    pf <- 1/weight # infinite penalty factor excludes a feature
  }

  #--- family ---
  if(length(family)==1){
    family <- rep(family,times=q)
  } else if(length(family)!=q){
    stop("Invalid argument family.",call.=FALSE)
  }

  #--- fold identifiers ---
  # provide foldid as matrix?
  if(is.null(foldid)){
    foldid <- palasso:::.folds(y=Y[,1],nfolds=nfolds) # temporary Y[,1]
  } else {
    nfolds <- length(unique(foldid))
  }

  #--- full fit ---
  nlambda <- numeric()
  base <- lapply(seq_len(q),function(x) list())
  for(i in seq_len(q)){
    cond <- !is.na(Y[,i])
    #if(sum(cond)==0){nlambda[i] <- 0; next}
    base[[i]]$glmnet.fit <- glmnet::glmnet(y=Y[cond,i],x=X[cond,],family=family[i],alpha=alpha.base,penalty.factor=pf[i,],...) # ellipsis
    base[[i]]$lambda <- base[[i]]$glmnet.fit$lambda
    nlambda[i] <- length(base[[i]]$glmnet.fit$lambda)
  }

  #--- predictions ---
  link <- list()
  for(i in seq_len(q)){
    link[[i]] <- matrix(data=NA,nrow=n,ncol=nlambda[i])
  }

  #--- base cross-validation ---
  for(k in seq_len(nfolds)){
    Y0 <- Y[foldid!=k,,drop=FALSE]
    Y1 <- Y[foldid==k,,drop=FALSE]
    X0 <- X[foldid!=k,,drop=FALSE]
    X1 <- X[foldid==k,,drop=FALSE]
    for(i in seq_len(q)){
      cond <- !is.na(Y0[,i])
      #if(sum(cond)==0){next}
      object <- glmnet::glmnet(y=Y0[cond,i],x=X0[cond,],family=family[i],alpha=alpha.base,penalty.factor=pf[i,],...) # ellipsis
      temp <- stats::predict(object=object,newx=X1,type="link",s=base[[i]]$glmnet.fit$lambda)
      link[[i]][foldid==k,seq_len(ncol(temp))] <- temp
    }
  }

  #--- tune base lambdas ---
  for(i in seq_len(q)){
    fit <- .mean.function(link[[i]],family=family[i])
    cond <- !is.na(Y[,i])
    base[[i]]$cvm <- palasso:::.loss(y=Y[cond,i],fit=fit[cond,],family=family[i],type.measure=type.measure)[[1]]
    base[[i]]$lambda.min <- base[[i]]$lambda[which.min(base[[i]]$cvm)]
    class(base[[i]]) <- "cv.glmnet" # trial 2020-01-10
  }

  #--- predictions ---
  hat <- matrix(NA,nrow=n,ncol=q)
  for(i in seq_len(q)){
    hat[,i] <- link[[i]][,base[[i]]$lambda==base[[i]]$lambda.min]
  }

  #--- meta cross-validation ---
  meta <- list()
  for(i in seq_len(q)){
    cond <- !is.na(Y[,i])
    meta[[i]] <- glmnet::cv.glmnet(y=Y[cond,i],x=hat[cond,],
                                   lower.limits=ifelse(constraint,0,-Inf), # important: 0 (was lower.limits=0)
                                   upper.limits=Inf, # important: Inf
                                   foldid=foldid[cond],
                                   family=family[i],
                                   type.measure=type.measure,
                                   intercept=TRUE, # with intercept
                                   alpha=alpha.meta,...) # ellipsis
    # consider trying different alpha.meta and selecting best one
  }

  #--- return ---
  names(base) <- names(meta) <- paste0("y",seq_len(q))
  info <- data.frame(q=q,p=p,family=family,type.measure=type.measure)
  list <- list(base=base,meta=meta,info=info)
  class(list) <- "joinet"
  return(list)
}
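#--- usage sketch: inclusion/exclusion of variables ---
# A minimal sketch (not run) of the `weight` argument, on simulated data.
# Internally, joinet() turns `weight` into glmnet penalty factors (pf <- 1/weight),
# so a 0 entry gives an infinite penalty and excludes feature j for outcome k,
# while a 1 entry leaves the penalty unchanged. All names below are illustrative.
if(FALSE){
  n <- 50; p <- 100; q <- 3
  X <- matrix(rnorm(n*p),nrow=n,ncol=p)
  Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
  weight <- matrix(1,nrow=q,ncol=p) # q outcomes (rows), p features (columns)
  weight[1,51:100] <- 0 # exclude features 51 to 100 for the first outcome
  object <- joinet(Y=Y,X=X,weight=weight)
}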
.mean.function <- function(x,family){
  # inverse link function: maps the linear predictor to the conditional mean
  if(family %in% c("gaussian","cox")){
    return(x)
  } else if(family=="binomial"){
    return(1/(1+exp(-x)))
  } else if(family=="poisson"){
    return(exp(x))
  } else {
    stop("Family not implemented.",call.=FALSE)
  }
}

.link.function <- function(x,family){
  # link function: maps the conditional mean to the linear predictor
  if(family %in% c("gaussian","cox")){
    return(x)
  } else if(family=="binomial"){
    if(any(x<0|x>1)){stop("Invalid!",call.=FALSE)}
    return(log(x/(1-x)))
  } else if(family=="poisson"){
    if(any(x<0)){stop("Invalid!",call.=FALSE)}
    return(log(x))
  } else {
    stop("Family not implemented.",call.=FALSE)
  }
}
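#--- consistency sketch: link and mean functions ---
# A quick check (not run) that .link.function() inverts .mean.function()
# for each implemented family; purely illustrative.
if(FALSE){
  eta <- seq(from=-2,to=2,by=0.5)
  for(family in c("gaussian","binomial","poisson")){
    mu <- .mean.function(eta,family=family)
    stopifnot(isTRUE(all.equal(.link.function(mu,family=family),eta)))
  }
}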
#--- Methods for class "joinet" -----------------------------------------------

#' @export
#' @title
#' Make Predictions
#'
#' @description
#' Predicts the outcomes from the features with the stacked model.
#'
#' @param object
#' \link[joinet]{joinet} object
#'
#' @param newx
#' covariates\strong{:}
#' numeric matrix with \eqn{n} rows (samples)
#' and \eqn{p} columns (variables)
#'
#' @param type
#' character \code{"link"} or \code{"response"}
#'
#' @param ...
#' further arguments (not applicable)
#'
#' @return
#' This function returns the predictions from the base and meta learners.
#' The slots \code{base} and \code{meta} each contain a matrix
#' with \eqn{n} rows (samples) and \eqn{q} columns (variables).
#'
#' @examples
#' \dontshow{
#' if(!grepl('SunOS',Sys.info()['sysname'])){
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
#' Y[,1] <- 1*(Y[,1]>median(Y[,1]))
#' object <- joinet(Y=Y,X=X,family=c("binomial","gaussian","gaussian"))
#' predict(object,newx=X)}}
#' \dontrun{
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
#' Y[,1] <- 1*(Y[,1]>median(Y[,1]))
#' object <- joinet(Y=Y,X=X,family=c("binomial","gaussian","gaussian"))
#' predict(object,newx=X)}
#'
predict.joinet <- function(object,newx,type="response",...){
  if(length(list(...))!=0){warning("Ignoring argument.",call.=FALSE)}

  x <- object; rm(object)

  newx <- as.matrix(newx)
  cornet:::.check(x=newx,type="matrix")

  q <- length(x$base)
  n <- nrow(newx)
  base <- meta <- matrix(NA,nrow=n,ncol=q)

  # base learners
  for(i in seq_len(q)){
    base[,i] <- as.numeric(stats::predict(object=x$base[[i]]$glmnet.fit,newx=newx,s=x$base[[i]]$lambda.min,type="link"))
    #base[,i] <- as.numeric(glmnet:::predict.cv.glmnet(object=x$base[[i]],newx=newx,s="lambda.min",type="link"))
    # check whether fine for "binomial" family
  }

  # meta learners
  for(i in seq_len(q)){
    meta[,i] <- as.numeric(stats::predict(object=x$meta[[i]],newx=base,s="lambda.min",type="link"))
  }

  if(type=="response"){
    for(i in seq_len(q)){
      base[,i] <- .mean.function(x=base[,i],family=x$info$family[i])
      meta[,i] <- .mean.function(x=meta[,i],family=x$info$family[i])
    }
  } else if(type!="link"){
    stop("Invalid type.",call.=FALSE)
  }

  list <- list(base=base,meta=meta)
  return(list)
}
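#--- usage sketch: link versus response scale ---
# A minimal sketch (not run) relating the two prediction types: for a
# binomial outcome, the response is the inverse logit of the link.
# The objects below (a fit with a binomial first outcome) are illustrative.
if(FALSE){
  link <- predict(object,newx=X,type="link")
  prob <- predict(object,newx=X,type="response")
  # first outcome binomial: response = 1/(1+exp(-link))
  stopifnot(isTRUE(all.equal(1/(1+exp(-link$meta[,1])),prob$meta[,1])))
}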
#' @export
#' @title
#' Extract Coefficients
#'
#' @description
#' Extracts the pooled coefficients.
#' (The meta learner linearly combines
#' the coefficients from the base learners.)
#'
#' @param object
#' \link[joinet]{joinet} object
#'
#' @param ...
#' further arguments (not applicable)
#'
#' @return
#' This function returns the pooled coefficients.
#' The slot \code{alpha} contains the intercepts
#' in a vector of length \eqn{q},
#' and the slot \code{beta} contains the slopes
#' in a matrix with \eqn{p} rows (inputs) and \eqn{q} columns (outputs).
#'
#' @examples
#' \dontshow{
#' if(!grepl('SunOS',Sys.info()['sysname'])){
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
#' object <- joinet(Y=Y,X=X)
#' coef <- coef(object)}}
#' \dontrun{
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
#' object <- joinet(Y=Y,X=X)
#' coef <- coef(object)}
#'
coef.joinet <- function(object,...){
  if(length(list(...))!=0){warning("Ignoring argument.",call.=FALSE)}

  # base coefficients
  base <- list()
  coef <- sapply(object$base,function(x) stats::coef(object=x$glmnet.fit,s=x$lambda.min))
  base$alpha <- sapply(coef,function(x) x[1,])
  base$beta <- sapply(coef,function(x) x[-1,])
  names(base$alpha) <- colnames(base$beta) <- names(object$base)

  # meta coefficients
  meta <- list()
  weights <- weights.joinet(object)
  meta$alpha <- weights[1,]
  meta$beta <- weights[-1,]

  # pooled coefficients
  pool <- list()
  pool$alpha <- meta$alpha + base$alpha %*% meta$beta
  pool$beta <- base$beta %*% meta$beta

  # equivalent elementwise computation:
  # q <- unique(object$info$q)
  # p <- unique(object$info$p)
  # pool$alpha <- rep(NA,times=q)
  # for(i in seq_len(q)){
  #   pool$alpha[i] <- meta$alpha[i] + sum(meta$beta[,i] * base$alpha)
  # }
  # pool$beta <- matrix(NA,nrow=p,ncol=q)
  # for(i in seq_len(p)){
  #   for(j in seq_len(q)){
  #     pool$beta[i,j] <- sum(meta$beta[,j] * base$beta[i,])
  #   }
  # }

  return(pool)
}
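#--- consistency sketch: pooled coefficients ---
# A sketch (not run) of what "pooling" means: on the link scale, the pooled
# intercepts and slopes reproduce the meta-learner predictions, i.e.
# link = alpha + newx %*% beta. The fitted `object` and data `X` are illustrative.
if(FALSE){
  coef <- coef(object)
  link <- matrix(coef$alpha,nrow=nrow(X),ncol=length(object$meta),byrow=TRUE) + X %*% coef$beta
  stopifnot(isTRUE(all.equal(as.numeric(link),
                             as.numeric(predict(object,newx=X,type="link")$meta))))
}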
#' @export
#' @importFrom stats weights
#' @title
#' Extract Weights
#'
#' @description
#' Extracts the coefficients from the meta learners,
#' i.e. the weights for the base learners.
#'
#' @param object
#' \link[joinet]{joinet} object
#'
#' @param ...
#' further arguments (not applicable)
#'
#' @return
#' This function returns a matrix with
#' \eqn{1+q} rows and \eqn{q} columns.
#' The first row contains the intercepts,
#' and the other rows contain the slopes,
#' i.e. the effects of the outcome
#' in the row on the outcome in the column.
#'
#' @examples
#' \dontshow{
#' if(!grepl('SunOS',Sys.info()['sysname'])){
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
#' object <- joinet(Y=Y,X=X)
#' weights(object)}}
#' \dontrun{
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
#' object <- joinet(Y=Y,X=X)
#' weights(object)}
#'
weights.joinet <- function(object,...){
  if(length(list(...))!=0){warning("Ignoring argument.",call.=FALSE)}

  coef <- lapply(object$meta,function(x) stats::coef(object=x,s=x$lambda.min))
  coef <- do.call(what="cbind",args=coef)
  coef <- as.matrix(coef)
  colnames(coef) <- names(object$meta)
  return(coef)
}

print.joinet <- function(x,...){
  cat(paste0("joinet object"),"\n")
}
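#--- usage sketch: non-negativity constraints ---
# A sketch (not run) of how `constraint` shows up in the weights: with
# constraint=TRUE, the meta learners are fitted with lower.limits=0, so
# all slopes (every row except the intercepts) are non-negative.
# The fitted `object` is illustrative.
if(FALSE){
  W <- weights(object) # (1+q) x q matrix
  all(W[-1,]>=0) # TRUE whenever the object was fitted with constraint=TRUE
}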
#--- Manuscript functions ------------------------------------------------------

#' @export
#' @title
#' Model comparison
#'
#' @description
#' Compares univariate and multivariate regression.
#'
#' @inheritParams joinet
#'
#' @param nfolds.ext
#' number of external folds
#'
#' @param nfolds.int
#' number of internal folds
#'
#' @param foldid.ext
#' external fold identifiers\strong{:}
#' vector of length \eqn{n} with entries
#' between \eqn{1} and \code{nfolds.ext};
#' or \code{NULL}
#'
#' @param foldid.int
#' internal fold identifiers\strong{:}
#' vector of length \eqn{n} with entries
#' between \eqn{1} and \code{nfolds.int};
#' or \code{NULL}
#'
#' @param compare
#' experimental argument\strong{:}
#' character vector with entries \code{"mnorm"}, \code{"spls"}, \code{"mrce"},
#' \code{"sier"}, \code{"mtps"}, \code{"rmtl"}, \code{"gpm"} and others
#' (requires packages \code{spls}, \code{MRCE}, \code{SiER}, \code{MTPS}, \code{RMTL} or \code{GPM})
#'
#' @param mice
#' missing data imputation\strong{:}
#' logical (\code{mice=TRUE} requires package \code{mice})
#'
#' @param cvpred
#' return cross-validated predictions\strong{:}
#' logical
#'
#' @param times
#' measure computation time\strong{:}
#' logical
#'
#' @param ...
#' further arguments passed to \code{\link[glmnet]{glmnet}}
#' and \code{\link[glmnet]{cv.glmnet}}
#'
#' @return
#' This function returns a matrix with \eqn{q} columns,
#' including the cross-validated loss from the univariate models
#' (\code{base}), the multivariate models (\code{meta}),
#' and the intercept-only models (\code{none}).
#'
#' @examples
#' \dontshow{
#' if(!grepl('SunOS',Sys.info()['sysname'])){
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
#' cv.joinet(Y=Y,X=X)}}
#' \dontrun{
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
#' cv.joinet(Y=Y,X=X)}
#'
#' \dontrun{
#' # correlated features
#' n <- 50; p <- 100; q <- 3
#' mu <- rep(0,times=p)
#' Sigma <- 0.90^abs(col(diag(p))-row(diag(p)))
#' X <- MASS::mvrnorm(n=n,mu=mu,Sigma=Sigma)
#' mu <- rowSums(X[,sample(seq_len(p),size=5)])
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=mu))
#' #Y <- t(MASS::mvrnorm(n=q,mu=mu,Sigma=diag(n)))
#' cv.joinet(Y=Y,X=X)}
#'
#' \dontrun{
#' # other distributions
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' eta <- rowSums(X[,1:5])
#' Y <- replicate(n=q,expr=rbinom(n=n,size=1,prob=1/(1+exp(-eta))))
#' cv.joinet(Y=Y,X=X,family="binomial")
#' Y <- replicate(n=q,expr=rpois(n=n,lambda=exp(scale(eta))))
#' cv.joinet(Y=Y,X=X,family="poisson")}
#'
#' \dontrun{
#' # uncorrelated outcomes
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' y <- rnorm(n=n,mean=rowSums(X[,1:5]))
#' Y <- cbind(y,matrix(rnorm(n*(q-1)),nrow=n,ncol=q-1))
#' cv.joinet(Y=Y,X=X)}
#'
#' \dontrun{
#' # sparse and dense models
#' n <- 50; p <- 100; q <- 3
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#' Y <- replicate(n=q,expr=rnorm(n=n,mean=rowSums(X[,1:5])))
#' set.seed(1) # fix folds
#' cv.joinet(Y=Y,X=X,alpha.base=1) # lasso
#' set.seed(1)
#' cv.joinet(Y=Y,X=X,alpha.base=0) # ridge}
#'
cv.joinet <- function(Y,X,family="gaussian",nfolds.ext=5,nfolds.int=10,foldid.ext=NULL,foldid.int=NULL,type.measure="deviance",alpha.base=1,alpha.meta=1,compare=FALSE,mice=FALSE,cvpred=FALSE,times=FALSE,...){

  if(FALSE){
    fold <- foldid.ext
    family <- "gaussian"; nfolds.ext <- 5; nfolds.int <- 10; foldid.ext <- foldid.int <- NULL; type.measure <- "deviance"; alpha.base <- alpha.meta <- 1; mice <- cvpred <- times <- FALSE
    foldid.ext <- fold; nfolds.ext <- 1
    #nfolds.ext <- 1; nfolds.int <- 10; foldid.int <- NULL; compare <- TRUE
  }

  if(length(compare)==1 && compare==TRUE){
    if(all(family=="gaussian")){
      compare <- c("mnorm","mars","spls","mrce","map","mrf","sier","mcen","gpm","rmtl","mtps")
    } else if(all(family=="binomial")){
      compare <- c("mars","mcen","rmtl","mtps")
    } else {
      stop("Comparison not implemented for mixed families.",call.=FALSE)
    }
  }

  n <- nrow(Y)
  q <- ncol(Y)
  p <- ncol(X)

  #--- fold identifiers ---
  if(is.null(foldid.ext)){
    foldid.ext <- palasso:::.folds(y=Y[,1],nfolds=nfolds.ext) # temporary Y[,1]
  } else {
    #nfolds.ext <- length(unique(foldid.ext))
    nfolds.ext <- max(foldid.ext)
  }

  #--- family ---
  if(length(family)==1){
    family <- rep(family,times=q)
  } else if(length(family)!=q){
    stop("Invalid argument family.",call.=FALSE)
  }

  #--- check packages ---
  pkgs <- .packages(all.available=TRUE)
  if(is.character(compare)){
    for(i in seq_along(compare)){
      pkg <- switch(compare[i],mnorm="glmnet",mars="earth",spls="spls",
                    mrce="MRCE",map="remMap",mrf="MultivariateRandomForest",
                    sier="SiER",mcen="mcen",gpm="GPM",rmtl="RMTL",mtps="MTPS",
                    stop("Invalid method.",call.=FALSE))
      if(!pkg %in% pkgs){
        stop("Method \"",compare[i],"\" requires package \"",pkg,"\".",call.=FALSE)
      }
    }
  }

  #--- checks ---
  #if(any( & any(family!="gaussian")){
  #  stop("\"mnorm\", \"spls\", \"mrce\" and \"sier\" require family \"gaussian\".",call.=FALSE)
  #}
  #if(any(mtps,rmtl) & any(!family %in% c("gaussian","binomial"))){
  #  stop("\"mtps\" and \"rmtl\" require family \"gaussian\" or \"binomial\".",call.=FALSE)
  #}

  #--- cross-validated predictions ---
  models <- c("base","meta",compare,"none")
  pred <- lapply(X=models,function(x) matrix(data=NA,nrow=n,ncol=q))
  time <- lapply(X=models,function(x) NA)
  names(pred) <- names(time) <- models

  for(i in seq_len(nfolds.ext)){

    Y0 <- Y[foldid.ext!=i,,drop=FALSE]
    #Y1 <- Y[foldid.ext==i,,drop=FALSE]
    X0 <- X[foldid.ext!=i,,drop=FALSE]
    X1 <- X[foldid.ext==i,,drop=FALSE]

    # standardise features (trial)
    #mu <- apply(X=X0,MARGIN=2,FUN=function(x) mean(x))
    #sd <- apply(X=X0,MARGIN=2,FUN=function(x) stats::sd(x))
    #X0 <- t((t(X0)-mu)/sd)[,sd!=0]
    #X1 <- t((t(X1)-mu)/sd)[,sd!=0]
    # or standardise once before cv?

    # remove constant features
    cond <- apply(X=X0,MARGIN=2,FUN=function(x) stats::sd(x)!=0)
    X0 <- X0[,cond]; X1 <- X1[,cond]

    if(is.null(foldid.int)){
      foldid <- palasso:::.folds(y=Y0[,1],nfolds=nfolds.int) # temporary Y0[,1]
    } else {
      foldid <- foldid.int[foldid.ext!=i]
    }

    # base and meta learners
    start <- Sys.time()
    fit <- joinet(Y=Y0,X=X0,family=family,type.measure=type.measure,alpha.base=alpha.base,alpha.meta=alpha.meta,foldid=foldid)
    # add ellipsis (...), and do not standardise!
    temp <- predict.joinet(fit,newx=X1)
    pred$base[foldid.ext==i,] <- temp$base
    pred$meta[foldid.ext==i,] <- temp$meta
    end <- Sys.time()
    time$meta <- as.numeric(difftime(end,start,units="secs"))

    # missing values
    if(mice & any(is.na(Y0))){
      if(requireNamespace("mice",quietly=TRUE)){
        y0 <- as.matrix(mice::complete(data=mice::mice(Y0,m=1,method="pmm",seed=1,printFlag=FALSE),action="all")[[1]])
      } else {
        stop("Imputation by PMM requires install.packages(\"mice\").",call.=FALSE)
      }
    } else {
      y0 <- apply(X=Y0,MARGIN=2,FUN=function(x) ifelse(is.na(x),stats::median(x[!is.na(x)]),x))
    }
    #all(Y0==y0,na.rm=TRUE) # check imputation

    # other learners
    if("mnorm" %in% compare){
      cat("mnorm"," ")
      start <- Sys.time()
      if(any(family!="gaussian")){
        stop("mnorm requires \"gaussian\" family.",call.=FALSE)
      }
      net <- glmnet::cv.glmnet(x=X0,y=y0,family="mgaussian",foldid=foldid,alpha=alpha.base) # add ellipsis (...)
      pred$mnorm[foldid.ext==i,] <- stats::predict(object=net,newx=X1,s="lambda.min",type="response")
      end <- Sys.time()
      time$mnorm <- as.numeric(difftime(end,start,units="secs"))
    }

    if("mars" %in% compare){
      cat("mars"," ")
      start <- Sys.time()
      if(all(family=="gaussian")){
        object <- earth::earth(x=X0,y=y0) # equivalent: object <- mda::mars(x=X0,y=y0)
      } else if(all(family=="binomial")){
        object <- earth::earth(x=X0,y=y0,glm=list(family=stats::binomial))
      } else {
        stop("MARS requires either \"gaussian\" or \"binomial\" family.",call.=FALSE)
      }
      # pmethod="cv" not available for multivariate outputs
      ### start trial ###
      if(FALSE){
        #nk <- min(200, max(20, 2 * ncol(X0))) + 1
        #nprune <- round(seq(from=2,to=nk,length.out=10))
        #object <- list()
        #for(j in seq_along(nprune)){
        #  object[[j]] <- earth::earth(x=X0,y=y0,nprune=nprune[j],pmethod="cv",nfold=nfolds.int)
        #}
        #sapply(object,function(x) x$gcv)
        ## i.e. run earth/mars with tryCatch for each nprune
        ## and select run with best cvm (here gcv)
        # tune nprune (use default nk)!
      }
      ### end trial ###
      #pred$mars[foldid.ext==i,] <- earth:::predict.earth(object=object,newdata=X1,type="response") # original
      pred$mars[foldid.ext==i,] <- stats::predict(object=object,newdata=X1,type="response") # trial
      end <- Sys.time()
      time$mars <- as.numeric(difftime(end,start,units="secs"))
    }

    if("spls" %in% compare){
      cat("spls"," ")
      start <- Sys.time()
      if(any(family!="gaussian")){
        stop("spls requires \"gaussian\" family.",call.=FALSE)
      }
      invisible(utils::capture.output(cv.spls <- spls::cv.spls(x=X0,y=y0,fold=nfolds.int,K=seq_len(min(ncol(X0),10)),
                                                               eta=seq(from=0.0,to=0.9,by=0.1),plot.it=FALSE))) #scale.x=FALSE
      object <- spls::spls(x=X0,y=y0,K=cv.spls$K.opt,eta=cv.spls$eta.opt) #scale.x=FALSE
      pred$spls[foldid.ext==i,] <- spls::predict.spls(object=object,newx=X1,type="fit")
      end <- Sys.time()
      time$spls <- as.numeric(difftime(end,start,units="secs"))
    }

    if("mrce" %in% compare){
      cat("mrce"," ")
      start <- Sys.time()
      if(any(family!="gaussian")){
        stop("MRCE requires \"gaussian\" family.",call.=FALSE)
      }
      lam1 <- lam2 <- 10^seq(from=1,to=-4,length.out=11)
      invisible(utils::capture.output(trials <- lapply(lam2,function(x) tryCatch(expr=MRCE::mrce(X=X0,Y=y0,lam1.vec=lam1,lam2.vec=x,method="cv",kfold=nfolds.int),error=function(x) NULL))))
      cv.err <- sapply(trials,function(x) ifelse(is.null(x),Inf,min(x$cv.err)))
      object <- trials[[which.min(cv.err)]]
      pred$mrce[foldid.ext==i,] <- matrix(object$muhat,nrow=nrow(X1),ncol=q,byrow=TRUE) + X1 %*% object$Bhat
      end <- Sys.time()
      time$mrce <- as.numeric(difftime(end,start,units="secs"))
    }

    if("map" %in% compare){
      cat("map"," ")
      start <- Sys.time()
      if(any(family!="gaussian")){
        stop("map requires \"gaussian\" family.",call.=FALSE)
      }
      mean <- colMeans(y0)
      y0s <- y0-matrix(data=mean,nrow=nrow(X0),ncol=ncol(y0),byrow=TRUE)
      lamL1.v <- lamL2.v <- exp(seq(from=0,to=5,length.out=11))
      cv <- remMap::remMap.CV(X=X0,Y=y0s,lamL1.v=lamL1.v,lamL2.v=lamL2.v,fold=nfolds.int)
      #graphics::plot(x=lamL1.v,y=log(as.numeric(cv$ols.cv[,3])))
      pick <- which.min(as.vector(cv$ols.cv))
      lamL1 <- cv$l.index[1,pick]
      lamL2 <- cv$l.index[2,pick]
      # index <- which(cv$ols.cv==min(cv$ols.cv),arr.ind=TRUE)[1,]
      # rev(lamL1.v)[index[1]]
      # rev(lamL2.v)[index[2]]
      ##cat("lam1:",lamL1,", lam2:",lamL2)
      object <- remMap::remMap(X.m=X0,Y.m=y0s,lamL1=lamL1,lamL2=lamL2)
      pred$map[foldid.ext==i,] <- matrix(data=mean,nrow=nrow(X1),ncol=ncol(y0),byrow=TRUE) + X1 %*% object$phi
      end <- Sys.time()
      time$map <- as.numeric(difftime(end,start,units="secs"))
    }

    if("mrf" %in% compare){
      cat("mrf"," ")
      start <- Sys.time()
      if(any(family!="gaussian")){
        stop("mrf requires \"gaussian\" family.",call.=FALSE)
      }
      pred$mrf[foldid.ext==i,] <- MultivariateRandomForest::build_forest_predict(trainX=X0,trainY=y0,
                                                                                 n_tree=100,m_feature=min(ncol(X0)-1,5),min_leaf=min(nrow(X0),5),testX=X1)
      # use n_tree=500, m_feature=floor(ncol(X0)/3)
      # alternative: IntegratedMRF
      # Check why this does not work well!
      end <- Sys.time()
      time$mrf <- as.numeric(difftime(end,start,units="secs"))
    }

    if("sier" %in% compare){
      cat("sier"," ")
      start <- Sys.time()
      if(any(family!="gaussian")){
        stop("SiER requires \"gaussian\" family.",call.=FALSE)
      }
      invisible(utils::capture.output(object <- SiER::cv.SiER(X=X0,Y=y0,K.cv=3)))
      # trial with K.cv=3 (for speed-up)
      # use upper.comp=10 and thres=0.01 (changed for speed-up)
      pred$sier[foldid.ext==i,] <- SiER::pred.SiER(cv.fit=object,X.new=X1)
      end <- Sys.time()
      time$sier <- as.numeric(difftime(end,start,units="secs"))
    }

    if("mcen" %in% compare){
      cat("mcen"," ")
      start <- Sys.time()
      if(all(family=="gaussian")){
        type <- "mgaussian"
      } else if(all(family=="binomial")){
        type <- "mbinomial"
      } else {
        stop("mcen requires either \"gaussian\" or \"binomial\" family.",call.=FALSE)
      }
      object <- mcen::cv.mcen(x=X0,y=y0,family=type,folds=foldid,ky=1,
                              gamma_y=seq(from=0.1,to=5.1,by=1),ndelta=5)
      # TEMPORARY gamma_y=seq(from=0.1,to=5.1,length.out=3) and ndelta=3 (for speed-up)
      #temp <- mcen:::predict.cv.mcen(object=object,newx=X1) # original
      temp <- stats::predict(object=object,newx=X1) # trial
      pred$mcen[foldid.ext==i,] <- as.matrix(temp)
      # single cluster (ky=1) due to setting and error
      end <- Sys.time()
      time$mcen <- as.numeric(difftime(end,start,units="secs"))
    }

    if("gpm" %in% compare){
      cat("gpm"," ")
      start <- Sys.time()
      if(any(family!="gaussian")){
        stop("GPM requires \"gaussian\" family.",call.=FALSE)
      }
      object <- GPM::Fit(X=X0,Y=y0)
      pred$gpm[foldid.ext==i,] <- GPM::Predict(XF=X1,Model=object)$YF
      end <- Sys.time()
      time$gpm <- as.numeric(difftime(end,start,units="secs"))
    }

    if("rmtl" %in% compare){
      cat("rmtl"," ")
      start <- Sys.time()
      if(all(family=="gaussian")){
        type <- "Regression"
        y0l <- lapply(seq_len(ncol(y0)),function(i) y0[,i,drop=FALSE])
      } else if(all(family=="binomial")){
        type <- "Classification"
        y0l <- lapply(seq_len(ncol(y0)),function(i) 2*y0[,i,drop=FALSE]-1)
      } else {
        stop("RMTL requires either \"gaussian\" or \"binomial\" family.",call.=FALSE)
      }
      X0l <- lapply(seq_len(ncol(y0)),function(i) X0)
      X1l <- lapply(seq_len(ncol(y0)),function(i) X1)
      #---------------------------
      #--- manual tuning start ---
      Lam1_seq <- c(10^seq(from=1,to=-4,length.out=11),0)
      Lam2_seq <- c(10^seq(from=1,to=-4,length.out=11),0)
      cvMTL <- list()
      seed <- .Random.seed
      for(j in seq_along(Lam2_seq)){
        .Random.seed <- seed
        cvMTL[[j]] <- RMTL::cvMTL(X=X0l,Y=y0l,type=type,Lam1_seq=Lam1_seq,Lam2=Lam2_seq[j],nfolds=nfolds.int)
      }
      cvm <- vapply(X=cvMTL,FUN=function(x) min(x$cvm),FUN.VALUE=numeric(1))
      Lam1 <- cvMTL[[which.min(cvm)]]$Lam1.min
      Lam2 <- Lam2_seq[which.min(cvm)]
      #graphics::plot(x=Lam2_seq,y=cvm)
      #cat(Lam1," ",Lam2,"\n")
      #--- manual tuning end ----
      #--------------------------
      MTL <- RMTL::MTL(X=X0l,Y=y0l,type=type,Lam1=Lam1,Lam2=Lam2)
      #temp <- RMTL:::predict.MTL(object=MTL,newX=X1l) # original
      temp <- stats::predict(object=MTL,newX=X1l)
      pred$rmtl[foldid.ext==i,] <- do.call(what="cbind",args=temp)
      end <- Sys.time()
      time$rmtl <- as.numeric(difftime(end,start,units="secs"))
    }

    if("mtps" %in% compare){
      cat("mtps"," ")
      start <- Sys.time()
      if(alpha.base==0){
        step1 <- MTPS::glmnet.ridge
      } else if(alpha.base==1){
        step1 <- MTPS::glmnet.lasso
      } else {
        stop("MTPS requires alpha.base 0 or 1.",call.=FALSE)
      }
      #-------------------------------
      #--- manual lambda.min start ---
      #body <- body(step1)
      #body[[6]][[3]][[2]][[3]] <- "lambda.min"
      #body[[7]][[3]][[4]] <- "lambda.min"
      #body[[8]][[3]][[3]][[2]][[4]] <- "lambda.min"
      #body(step1) <- body
      #if(alpha.base==0){
      #  assignInNamespace(x="glmnet.ridge",value=step1,ns="MTPS")
      #} else if(alpha.base==1){
      #  assignInNamespace(x="glmnet.lasso",value=step1,ns="MTPS")
      #}
      #--- manual lambda.min end ---
      #-----------------------------
      #---------------------------
      #--- manual kmeans start ---
      #fun <- MTPS::cv.multiFit
      #body <- body(fun)
      #body[[11]][[3]][[3]] <- nrow(unique(y0))
      #body(fun) <- body
      #assignInNamespace(x="cv.multiFit",value=step1,ns="MTPS")
      #--- manual kmeans end ---
      #-------------------------
      if(all(family=="gaussian")){
        step2 <- MTPS::rpart1
      } else if(all(family=="binomial")){
        step2 <- MTPS::glm1
        #y0 <- as.data.frame(y0)
        #y0 <- as.data.frame(lapply(y0,function(x) factor(x)))
      } else {
        stop("MTPS requires family \"gaussian\" or \"binomial\".",call.=FALSE)
      }
      object <- MTPS::MTPS(xmat=X0,ymat=y0,family=family,nfold=nfolds.int,
                           method.step1=step1,method.step2=step2,cv=TRUE,residual=TRUE)
      pred$mtps[foldid.ext==i,] <- MTPS::predict.MTPS(object=object,newdata=X1,type="response")
      end <- Sys.time()
      time$mtps <- as.numeric(difftime(end,start,units="secs"))
      # now using cross-validation residual stacking (CVRS)
    }

    if(is.character(compare)){cat("\n")}

    # --- development ---
    if(FALSE){
      ## --- MLPUGS --- (binary outcome only)
      #X0f <- as.data.frame(X0)
      #y0f <- as.data.frame(y0)
      #X1f <- as.data.frame(X1)
      #object <- MLPUGS::ecc(x=X0f,y=y0f,.f=randomForest::randomForest)
      #pred_ecc <- predict(object,newdata=X1f,n.iters=300,burn.in=100,thin=2,
      #                    .f = function(rF,newdata){randomForest:::predict.randomForest(rF, newdata, type = "prob")})
      #pred$ecc[foldid.ext==i,] <- summary(pred_ecc,type="prob")
      ## --- MSGLasso --- (many user inputs)
      #MSGLasso::MSGLasso.cv(X=X0,Y=Y0)
      ## --- PMA --- (not for prediction?)
      ## --- MSP --- (not for hd data?)
      #object <- MBSP::mbsp.tpbn(X=X0,Y=Y0)
      #X1 %*% object$B # adjust for intercept
      ## --- bgsmtr --- (for SNPs only?)
      #temp <- bgsmtr::bgsmtr(X=t(X0),Y=t(Y0),group=rep(1,times=ncol(X0)))
      ## --- MGLM --- (for multinomial data only?)
      #MGLM::MGLMsparsereg.fit()
      #MGLM::MGLMtune()
    }

    pred$none[foldid.ext==i,] <- matrix(colMeans(Y0,na.rm=TRUE),nrow=sum(foldid.ext==i),ncol=ncol(Y0),byrow=TRUE)
  }

  #--- cross-validated loss ---
  loss <- matrix(data=NA,nrow=length(models),ncol=ncol(Y),
                 dimnames=list(models,colnames(Y)))
  for(j in models){
    for(i in seq_len(q)){
      cond <- !is.na(Y[,i]) & foldid.ext!=0 # added foldid.ext!=0
      loss[j,i] <- palasso:::.loss(y=Y[cond,i],fit=pred[[j]][cond,i],family=family[i],type.measure=type.measure)[[1]]
    }
  }

  #--- model refit ---
  #fit <- joinet(Y=Y,X=X,family=family,type.measure=type.measure,alpha.base=alpha.base,alpha.meta=alpha.meta) # add ,...
  #list <- list(loss=loss,fit=fit)

  if(cvpred){
    loss <- list(loss=loss,cvpred=pred$meta)
  }
  if(times){
    loss <- list(loss=loss,time=time)
  }
  return(loss)
}
plot.matrix <- function (X, margin = 0, labels = TRUE, las = 1, cex = 1, range = NULL, cutoff = 0, digits = 2)
{
  #margin <- 0; labels <- TRUE; las <- 1; cex <- 1; range <- NULL; cutoff <- 0; digits <- 2
  if(is.vector(X)){X <- as.matrix(X,ncol=1)}
  n <- nrow(X)
  p <- ncol(X)
  if(is.null(rownames(X))&n!=1){rownames(X) <- seq_len(n)}
  if(is.null(colnames(X))&p!=1){colnames(X) <- seq_len(p)}
  v <- ifelse(n==1,1,0.5/(n - 1))
  h <- ifelse(p==1,1,0.5/(p - 1))
  graphics::plot.new()
  graphics::plot.window(xlim = c(-h, 1 + h), ylim = c(-v, 1 + v))
  par_usr <- graphics::par()$usr
  graphics::par(usr = c(-h, 1 + h, -v, 1 + v))
  at <- (seq(nrow(X)) - 1)/(nrow(X) - 1)
  graphics::mtext(text = rev(rownames(X)), at = at, side = 2, las = las, cex = cex, line = 0.1)
  at <- (seq(ncol(X)) - 1)/(ncol(X) - 1)
  graphics::mtext(text = colnames(X), at = at, side = 3, las = las, cex = cex, line = 0.1)
  if(is.null(range)){range <- range(X,-X,na.rm=TRUE)}
  if(any(X<min(range),na.rm=TRUE)|any(X>max(range),na.rm=TRUE)){stop("Invalid.")}
  breaks <- c(seq(from=min(range),to=cutoff,length.out=101),
              seq(from=cutoff,to=max(range),length.out=101)[-1])
  col <- c(grDevices::colorRampPalette(c("darkblue","blue","white"))(100),
           grDevices::colorRampPalette(c("white","red","darkred"))(100))
  image <- t(X)[, seq(from = n, to = 1, by = -1), drop = FALSE]
  graphics::image(x = image, breaks = breaks, col = col, add = TRUE)
  if (any(margin==1)){
    graphics::segments(x0 = -h, x1 = 1 + h, y0 = seq(from = -v, to = 1 + v, by = 2 * v), col = "white", lwd = 3)
  }
  if (any(margin==2)){
    graphics::segments(x0 = seq(from = -h, to = 1 + h, by = 2 * h), y0 = 1 + v, y1 = 0 - v, col = "white", lwd = 3)
  }
  if (labels) {
    labels <- round(as.numeric(X), digits = digits)
    is.na <- is.na(labels)
    labels <- format(labels, digits = digits)
    labels[is.na] <- ""
    xs <- rep(seq_len(p), each = n)
    ys <- rep(seq_len(n), times = p)
    if(p==1){x <- 0.5}else{x <- (xs - 1)/(p - 1)}
    if(n==1){y <- 0.5}else{y <- (n - ys)/(n - 1)}
    graphics::text(x = x, y = y, labels = labels, col = "black", cex = cex)
  }
  graphics::par(usr = par_usr)
}
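#--- usage sketch: plotting a coefficient matrix ---
# A sketch (not run) of the internal plotting utility: blue-to-red shading
# for entries below/above `cutoff`, with optional white separators (`margin`).
# The fitted `object` below is illustrative.
if(FALSE){
  W <- weights(object) # meta-learner weights of a fitted joinet object
  plot.matrix(X=W[-1,],margin=2) # slopes only, with vertical separators
}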
#
# MTPS.MTPS <- function (xmat, ymat, family, cv = FALSE, residual = TRUE, nfold = 5,
#                        method.step1, method.step2, resid.type = c("deviance",
#                        "pearson", "raw"), resid.std = FALSE)
# {
#   resid.type <- match.arg(resid.type)
#   ny <- ncol(ymat)
#   if (length(family) == 1) {
#     if (!family %in% c("gaussian", "binomial")) {
#       stop("family must be gaussian or binomial")
#     }
#     if (family == "gaussian") {
#       family = rep("gaussian", ny)
#     }
#     else if (family == "binomial") {
#       family = rep("binomial", ny)
#     }
#   }
#   if (length(family) != ny) {
#     stop("length of family must be consistent with response")
#   }
#   if (sum(family %in% c("gaussian", "binomial")) != ny) {
#     stop("family must be gaussian or binomial or their combination")
#   }
#   if (length(method.step1) == 1) {
#     method.step1 <- rep(list(method.step1), ny)
#   }
#   if (length(method.step2) == 1) {
#     method.step2 <- rep(list(method.step2), ny)
#   }
#   if (length(method.step1) != ny) {
#     stop("length of method.step1 must be 1 or the same as response column")
#   }
#   if (length(method.step2) != ny) {
#     stop("length of method.step2 must be 1 or the same as response column")
#   }
#   #for (ii in 1:ny) {
#   #  if (!check.match(family[ii], FUN = method.step1[[ii]])) {
#   #    stop("method.step1 must be consistent with response category")
#   #  }
#   #}
#   if (!residual) {
#     for (ii in 1:ny) {
#       if (!MTPS::check.match(family[ii], FUN = method.step2[[ii]])) {
#         stop("method.step2 must be consistent with response category")
#       }
#     }
#   }
#   else {
#     for (ii in 1:ny) {
#       if (!MTPS::check.match("gaussian", FUN = method.step2[[ii]])) {
#         stop("residual stacking does not allow binary outcome model in second step")
#       }
#     }
#   }
#   if (cv) {
#     fit1 <- MTPS::cv.multiFit(xmat = xmat, ymat = ymat, nfold = nfold,
#                               method = method.step1, family = family)
#   }
#   else {
#     fit1 <- MTPS.multiFit(xmat = xmat, ymat = ymat, method = method.step1,
#                           family = family)
#   }
#   pred1 <- fit1$y.fitted
#   if (residual) {
#     fit2 <- MTPS::rs.multiFit(yhat = pred1, ymat = ymat, xmat = xmat,
#                               family = family, resid.type = resid.type, resid.std = resid.std,
#                               method = method.step2)
#   }
#   else {
#     fit2 <- MTPS.multiFit(xmat = pred1, ymat = ymat, method = method.step2,
#                           family = family)
#   }
#   fit <- list(fit1 = fit1, fit2 = fit2, cv = cv, residual = residual)
#   class(fit) <- "MTPS"
#   return(fit)
# }
#
# MTPS.multiFit <- function (xmat, ymat, method, family = family)
# {
#   ny <- ncol(ymat)
#   nx <- ncol(xmat)
#   if (length(family) == 1) {
#     if (!family %in% c("gaussian", "binomial")) {
#       stop("family must be gaussian or binomial")
#     }
#     if (family == "gaussian") {
#       family = rep("gaussian", ny)
#     }
#     else if (family == "binomial") {
#       family = rep("binomial", ny)
#     }
#   }
#   if (length(family) != ny) {
#     stop("length of family must be consistent with response")
#   }
#   if (sum(family %in% c("gaussian", "binomial")) != ny) {
#     stop("each family must be gaussian or binomial")
#   }
#   if (length(method) == 1) {
#     method <- rep(list(method), ny)
#   }
#   if (length(method) != ny) {
#     stop("length of method.step1 must be 1 or the same as response column")
#   }
#   #for (ii in 1:ny) {
#   #  if (!check.match(family[ii], FUN = method[[ii]])) {
#   #    stop("method.step1 must be consistent with response category")
#   #  }
#   #}
#   y.fitted <- ymat
#   y.fitted[!is.na(y.fitted)] <- NA
#   models <- vector("list", ny)
#   colnames(y.fitted) <- names(models) <- colnames(ymat)
#   fit <- vector("list", ny)
#   colnames(xmat) <- paste0("X", 1:nx)
#   for (kk in 1:ny) {
#     fit[[kk]] <- method[[kk]](xmat, ymat[, kk], family = family[kk])
#     models[[kk]] <- fit[[kk]]$model
#     y.fitted[, kk] <- fit[[kk]]$y.fitted
#   }
#   multiFit.fits <- list(fit = fit, y.fitted = y.fitted, model = models,
#                         method = method, family = family)
#   class(multiFit.fits) <- "multiFit"
#   return(multiFit.fits)
# }
#
# MTPS.glmnet.ridge <- function (xmat, ymat, family, alpha = 0, ...)
# {
#   tmp0 <- data.frame(yy = ymat, xmat)
#   if (family == "binomial")
#     ymat <- factor(ymat)
#   foldid <- MTPS::createFolds(ymat, k = 5, list = F)
#   model <- MTPS::cv.glmnet2(xmat, ymat, alpha = alpha, foldid = foldid,
#                             family = family, ...)
#   coef.mat <- as.numeric(coef(model, s = "lambda.min"))
#   y.fitted <- predict(model, xmat, s = "lambda.min", type = "response")
#   predFun <- function(model, xnew) {
#     predict(model, newx = as.matrix(xnew), s = "lambda.min",
#             type = "response")
#   }
#   return(list(model = model, y.fitted = y.fitted, predFun = predFun))
# }
#
# MTPS.glmnet.lasso <- function (xmat, ymat, family, alpha = 1, ...)
# {
#   tmp0 <- data.frame(yy = ymat, xmat)
#   if (family == "binomial")
#     ymat <- factor(ymat)
#   foldid <- MTPS::createFolds(ymat, k = 5, list = F)
#   model <- MTPS::cv.glmnet2(xmat, ymat, alpha = alpha, foldid = foldid,
#                             family = family, ...)
#   coef.mat <- as.numeric(coef(model, s = "lambda.min"))
#   y.fitted <- predict(model, xmat, s = "lambda.min", type = "response")
#   predFun <- function(model, xnew) {
#     predict(model, newx = as.matrix(xnew), s = "lambda.min",
#             type = "response")
#   }
#   return(list(model = model, y.fitted = y.fitted, predFun = predFun))
# }