R/squeezy.R

#main function: fit a group-adaptive elastic net for linear, logistic, or Cox models
squeezy <- function(Y,X,groupset,alpha=1,model=NULL,
                      X2=NULL,Y2=NULL,unpen=NULL,intrcpt=TRUE,
                      method=c("ecpcEN","MML","MML.noDeriv","CV"),
                      fold=10,compareMR = TRUE,selectAIC=FALSE,
                      fit.ecpc=NULL,
                      lambdas=NULL,lambdaglobal=NULL,
                      lambdasinit=NULL,sigmasq=NULL,
                      ecpcinit=TRUE,SANN=FALSE,minlam=10^-3,
                      standardise_Y=NULL,reCV=NULL,opt.sigma=NULL,
                      resultsAICboth=FALSE,silent=FALSE){
  #Y: response vector
  #X: observed data matrix (n samples x p covariates)
  #groupset: list with indices of covariates for each co-data group
  #alpha: elastic net mixing parameter (as in glmnet)
  #model: "linear", "logistic" or "cox"; guessed from Y if NULL
  #X2: independent (test) observed data
  #Y2: independent (test) response data
  #unpen: vector with indices of unpenalised covariates
  #intrcpt: should an intercept be included?
  #method: "ecpcEN", "MML", "MML.noDeriv" or "CV"
  #fold: number of folds used in cross-validation of the global lambda
  #compareMR: TRUE/FALSE to also return multiridge betas, and predictions for Y2 if X2 is given
  #selectAIC: TRUE/FALSE to compare the AIC of the multi-group and ordinary ridge models and return the best
  #fit.ecpc: optional ecpc fit used to initialise penalties and sigma^2
  #lambdas: group ridge penalties; if given, they are not optimised
  #lambdaglobal: global ridge penalty; estimated if NULL
  #lambdasinit: initial group penalties for the optimisation
  #sigmasq: (linear regression) noise level
  #ecpcinit: TRUE/FALSE to initialise the optimisation with lambdasinit
  #SANN: TRUE/FALSE to use simulated annealing (method "MML.noDeriv")
  #minlam: minimum value added to the penalties during optimisation
  #standardise_Y: TRUE/FALSE to standardise the response (linear model)
  #reCV: TRUE/FALSE to recalibrate the global penalty by cross-validation
  #opt.sigma: (linear model) TRUE/FALSE to optimise sigma^2 jointly with the penalties
  #resultsAICboth: TRUE/FALSE to return fits of both AIC-compared models (if selectAIC)
  #silent: TRUE/FALSE to suppress printed messages
  
  #Set-up variables ---------------------------------------------------------------------------
  n <- dim(X)[1] #number of samples
  p <- dim(X)[2] #number of covariates 
  if(length(lambdas)==p && missing(groupset)) groupset <- lapply(1:p, function(x) x)
  groupsets <- list(groupset)
  if(!is.null(X2)) n2<-dim(X2)[1] #number of samples in independent data set x2 if given
  
  if(is.null(model)){
    if(all(is.element(Y,c(0,1))) || is.factor(Y)){
      model <- "logistic" 
    } else if(is.numeric(Y) && !(is.matrix(Y) && dim(Y)[2]==2)){
      model <- "linear"
    }else{
      model <- "cox"
    }
  }
  
  #set default method if none was chosen
  if(length(method)==4){ #method argument left at its default vector
    method <- "MML"
  }
  
  switch(method,
         "ecpcEN"={
           if(is.null(fit.ecpc)) stop("provide ecpc fit results")
           if(is.null(lambdas)) lambdas <- fit.ecpc$sigmahat/(fit.ecpc$gamma*fit.ecpc$tauglobal)
           if(is.null(lambdaglobal)) lambdaglobal <- fit.ecpc$sigmahat/fit.ecpc$tauglobal
           if(is.null(sigmasq)) sigmasq <- fit.ecpc$sigmahat
           if(is.null(standardise_Y)) standardise_Y <- TRUE
           if(is.null(reCV)) reCV <- TRUE
           if(is.null(opt.sigma)) opt.sigma <- FALSE
         },
         "MML"={
           if(!is.null(fit.ecpc)){
             if(is.null(lambdasinit)) lambdasinit <- fit.ecpc$sigmahat/(fit.ecpc$gamma*fit.ecpc$tauglobal)
             if(is.null(lambdaglobal)) lambdaglobal <- fit.ecpc$sigmahat/fit.ecpc$tauglobal
             if(is.null(sigmasq)) sigmasq <- fit.ecpc$sigmahat
           } 
           if(is.null(standardise_Y)) standardise_Y <- FALSE
           if(is.null(opt.sigma)) opt.sigma <- TRUE
           if(is.null(reCV)){
             reCV <- FALSE; if(model!="linear" | !opt.sigma) reCV <- TRUE
           } 
           
         },
         "MML.noDeriv"={
           if(!is.null(fit.ecpc)){
             if(is.null(lambdasinit)) lambdasinit <- fit.ecpc$sigmahat/(fit.ecpc$gamma*fit.ecpc$tauglobal)
             if(is.null(lambdaglobal)) lambdaglobal <- fit.ecpc$sigmahat/fit.ecpc$tauglobal
             if(is.null(sigmasq)) sigmasq <- fit.ecpc$sigmahat
           } 
           if(is.null(standardise_Y)) standardise_Y <- FALSE
           if(is.null(opt.sigma)) opt.sigma <- TRUE
           if(is.null(reCV)){
             reCV <- FALSE; if(!opt.sigma) reCV <- TRUE
           } 
         },
         "CV"={
           if(!is.null(fit.ecpc)){
             if(is.null(lambdasinit)) lambdasinit <- fit.ecpc$sigmahat/(fit.ecpc$gamma*fit.ecpc$tauglobal)
             if(is.null(lambdaglobal)) lambdaglobal <- fit.ecpc$sigmahat/fit.ecpc$tauglobal
             if(is.null(sigmasq)) sigmasq <- fit.ecpc$sigmahat
           } 
           if(is.null(standardise_Y)) standardise_Y <- FALSE
           if(is.null(opt.sigma)) opt.sigma <- FALSE
           if(is.null(reCV)) reCV <- TRUE
         }
  )
  

  switch(model,
         'linear'={
           fml <- 'gaussian'
           sd_y <- sqrt(var(Y)*(n-1)/n)[1]
           if(standardise_Y){
             Y <- Y/sd_y
             sd_y_former <- sd_y
             sd_y <- 1
             if(!is.null(sigmasq)) sigmasq <- sigmasq/sd_y_former
           }
           if(method=="MML") minlam <- max(minlam,10^-4*var(Y))
         },
         'logistic'={
           fml <- 'binomial'
           opt.sigma <- FALSE
           standardise_Y <- FALSE
           sd_y <- 1 #do not standardise y in logistic setting
           sd_y_former <- sd_y
           
           #set response to numeric 0,1
           levelsY<-cbind(c(0,1),c(0,1))
           if(!all(is.element(Y,c(0,1)))){
             oldLevelsY<-levels(Y)
             levels(Y)<-c("0","1")
             Y<-as.numeric(Y)-1
             levelsY<-cbind(oldLevelsY,c(0,1))
             colnames(levelsY)<-c("Old level names","New level names")
             if(!is.null(Y2)){
               levels(Y2)<-c("0","1")
               Y2<-as.numeric(Y2)-1
             }
             if(!silent) print("Y is put in 0/1 format, see levelsY in output for new names")
           }
         },
         'cox'={
           fml <- 'cox'
           opt.sigma <- FALSE
           standardise_Y <- FALSE
           sd_y <- 1 #do not standardise y in cox regression setting
           sd_y_former <- sd_y
           intrcpt <- FALSE #Cox does not use intercept
         }
  )
  # check whether unpenalised covariates are not in partition
  # and set penalty.factor of unpenalised covariates to 0 for glmnet
  penfctr <- rep(1,p) #factor=1 for penalised covariates
  if(length(unpen)>0){
    penfctr[unpen] <- 0 #factor=0 for unpenalised covariates
    if(any(unlist(groupsets)%in%unpen)){
      warning("Unpenalised covariates removed from group set")
      for(i in 1:length(groupsets)){
        for(j in length(groupsets[[i]]):1){ #iterate in reverse: removing an element shifts later indices
          if(all(groupsets[[i]][[j]]%in%unpen)){
            groupsets[[i]][[j]] <- NULL #remove whole group if all covariates unpenalised
          }else{
            groupsets[[i]][[j]] <- groupsets[[i]][[j]][!(groupsets[[i]][[j]]%in%unpen)]
          }
        }
      }
    }
  }
  
  G <- sapply(groupsets,length) #1xm vector with G_i, number of groups in partition i
  m <- length(G) #number of partitions
  
  indGrpsGlobal <- list(1:G[1]) #global group index in case we have multiple partitions
  if(m>1){
    for(i in 2:m){
      indGrpsGlobal[[i]] <- (sum(G[1:(i-1)])+1):sum(G[1:i])
    }
  }
  Kg <- lapply(groupsets,function(x)(sapply(x,length))) #m-list with G_i vector of group sizes in partition i
  #ind1<-ind
  
  #ind <- (matrix(1,G,1)%*%ind)==(1:G)#sparse matrix with ij element TRUE if jth element in group i, otherwise FALSE
  i<-unlist(sapply(1:sum(G),function(x){rep(x,unlist(Kg)[x])}))
  j<-unlist(unlist(groupsets))
  ind <- sparseMatrix(i,j,x=1) #sparse matrix with ij element 1 if jth element in group i (global index), otherwise 0
  
  Ik <- lapply(1:m,function(i){
    x<-rep(0,sum(G))
    x[(sum(G[1:i-1])+1):sum(G[1:i])]<-1
    as.vector(x%*%ind)}) #list for each partition with px1 vector with number of groups beta_k is in
  #sparse matrix with ij element 1/Ij if beta_j in group i
  
  #make co-data matrix Z (Zt transpose of Z as in paper, with co-data matrices stacked for multiple groupsets)
  Zt<-ind; 
  if(G[1]>1){
    Zt[1:G[1],]<-t(t(ind[1:G[1],])/apply(ind[1:G[1],],2,sum))
  }
  if(m>1){
    for(i in 2:m){
      if(G[i]>1){
        Zt[indGrpsGlobal[[i]],]<-t(t(ind[indGrpsGlobal[[i]],])/
                                     apply(ind[indGrpsGlobal[[i]],],2,sum))
      }
    }
  }
  
  if(dim(Zt)[2]<p) Zt <- cbind(Zt,matrix(NaN,nrow=sum(G),ncol=p-dim(Zt)[2]))
  
  
  #Extend data to make artificial non-overlapping groups----
  Xxtnd <- do.call(cbind,lapply(groupsets[[1]],function(group){t(t(X[,group])/sqrt(Ik[[1]][group]))}))
  #create new group indices for Xxtnd
  Kg2 <- c(1,Kg[[1]]) 
  G2 <- length(Kg2)-1
  groupxtnd <- lapply(2:length(Kg2),function(i){sum(Kg2[1:(i-1)]):(sum(Kg2[1:i])-1)}) #list of indices in each group
  groupxtnd2 <- unlist(sapply(1:G2,function(x){rep(x,Kg2[x+1])})) #vector with group number
 
  Xunpen <- NULL
  if(sum((1:p)%in%unpen)>0) Xunpen<-X[,(1:p)%in%unpen] #separate matrix for unpenalised variables
  
  #datablockNo <- groupxtnd2
  #datablocks <- groupxtnd (or groupset if no overlapping groups)
  
  #datablocks: list with each element a data type containing indices of covariates with that data type 
  Xbl <- createXblocks(lapply(groupxtnd,function(ind) Xxtnd[,ind,drop=FALSE]))
  XXbl <- createXXblocks(lapply(groupxtnd,function(ind) Xxtnd[,ind,drop=FALSE]))
  
  #Find global lambda if not given for initial penalty and/or for AIC comparison----
  if(is.null(lambdaglobal)){
    #find rough estimate of initial global lambda
    # cvperblock <- fastCV2(list(Xxtnd),Y=Y,kfold=fold,fixedfolds = FALSE,X1=Xunpen,intercept=intrcpt)
    # lambda <- cvperblock$lambdas
    # lambda[lambda==Inf] <- 10^6
    
    lambdaseq <- 10^c(-10:10)
    XXbl1 <- list(apply(simplify2array(XXbl),c(1,2),sum))
    ML <- sapply(log(lambdaseq),function(lam){
      temp <- try(minML.LA.ridgeGLM(loglambdas = lam,XXblocks = XXbl1,sigmasq=sd_y^2,
                                    Y=Y,Xunpen=Xunpen,intrcpt = intrcpt,model=model),
                  silent=TRUE)
      if(class(temp)[1]!="try-error"){
        return(temp)
      }else return(NaN)
    })
    
    
    if(all(is.nan(ML))) stop("Error in estimating global lambda, try standardising data")
    lambdaseq <- lambdaseq[!is.nan(ML)&(ML!=Inf)]
    ML <- ML[!is.nan(ML)&(ML!=Inf)]
    #if multiple lambdas give (nearly) the same ML, take the least extreme lambda,
    #i.e. one whose ML lies within 1 percent of the ML difference between no and full penalty
    se <- abs(ML[1]-rev(ML)[1])/100
    if(ML[1]<rev(ML)[1]){
      lambda <- rev(lambdaseq)[which.min(rev(ML))]
      if(lambda==lambdaseq[1]){
        lambda <- max(lambdaseq[ML<=(min(ML)+se)])
      }
    }else{
      lambda <- lambdaseq[which.min(ML)]
      if(lambda==rev(lambdaseq)[1]){
        lambda <- min(lambdaseq[ML<=(min(ML)+se)])
      }
    } 
  }else{
    lambda <- lambdaglobal
  }
    
  #Further optimise global lambda with same method as multi-group for AIC comparison
  if(selectAIC | compareMR){
    if(method=="CV"){
      leftout <- CVfolds(Y=Y,kfold=fold,nrepeat=3,fixedfolds = FALSE) #Create (repeated) CV-splits of the data
      lambda1groupfit <- optLambdasWrap(penaltiesinit=lambda, 
                                     XXblocks=list(apply(simplify2array(XXbl),c(1,2),sum)),
                                     Y=Y,folds=leftout,
                                     X1=Xunpen,intercept=intrcpt,
                                     score=ifelse(model == "linear", "mse", "loglik"),model=model,
                                     maxItropt2 = 500,reltol = 10^-3)
      lambda <- lambda1groupfit$optpen
    }else if(method=="MML.noDeriv"){
      lambda1groupfit <- optLambdas_mgcv(penaltiesinit=lambda,Y=Y,
                                         XXblocks=list(apply(simplify2array(XXbl),c(1,2),sum)),
                                         model=model,reltol=1e-3,
                                         maxItropt=500,tracescore=FALSE,fixedseed =FALSE,
                                         optmethod = "Nelder-Mead",
                                         sigmasq=sigmasq) #TD: intercept?
      lambda <- lambda1groupfit$optpen
    }else if(method=="MML"){
      sigmahat<-sd_y 
      if(!is.null(sigmasq) & model=="linear") sigmahat <- sigmasq
      
      if(opt.sigma){
        lambda1groupfit <- optim(par=c(log(sigmahat),log(lambda)), fn=minML.LA.ridgeGLM,
                                 gr=dminML.LA.ridgeGLM, method="BFGS",minlam=minlam,
                                 XXblocks = list(apply(simplify2array(XXbl),c(1,2),sum)) ,
                                 Y=Y,sigmasq=sigmahat,model=model,
                                 intrcpt=intrcpt, Xunpen=Xunpen,opt.sigma=opt.sigma)
        sigmasq <- exp(lambda1groupfit$par[1])+minlam
        lambda <- exp(lambda1groupfit$par[-1])+minlam
      }else{
        lambda1groupfit <- optim(par=log(lambda), fn=minML.LA.ridgeGLM,
                                 gr=dminML.LA.ridgeGLM, method="BFGS",minlam=minlam,
                                 XXblocks = list(apply(simplify2array(XXbl),c(1,2),sum)) ,
                                 Y=Y,sigmasq=sigmahat,model=model,
                                 intrcpt=intrcpt, Xunpen=Xunpen,opt.sigma=opt.sigma)
        lambda <- exp(lambda1groupfit$par)+minlam
      }
      
    }
  }
  
  #find estimate for sigma for 1 group for AIC comparison
  sigmahat <- sd_y
  if(model=="linear"){
    if(!is.null(sigmasq)) sigmahat <- sigmasq
    else{
      XXT1grp <- SigmaFromBlocks(XXblocks = list(apply(simplify2array(XXbl),c(1,2),sum)),lambda)
      if(length(unpen)>0 | intrcpt){
        Xunpen2 <- Xunpen
        if(intrcpt) Xunpen2 <- cbind(Xunpen,rep(1,n))
        if(intrcpt && length(unpen)==0){
          betaunpenML1grp <- sum(Y)/n
        }else{
          temp <- solve(XXT1grp+diag(rep(1,n)),Xunpen2)
          betaunpenML1grp <- solve(t(Xunpen2)%*%temp , t(temp)%*%Y)
        }
        sigmahat <- c(t(Y-Xunpen2%*%betaunpenML1grp)%*%
                              solve(XXT1grp+diag(rep(1,n)),Y-Xunpen2%*%betaunpenML1grp)/n)
      }else{
        sigmahat <- c(t(Y)%*%solve(XXT1grp+diag(rep(1,n)),Y)/n)
      }
    }
  }
  if(selectAIC){
    sigmahat1group <- sigmahat
  }
  
  #Compute ridge penalties and corresponding group variance estimates----
  if(is.null(lambdas)){
    #If not given, set initial lambdas to global lambda
    if(is.null(lambdasinit)|!ecpcinit){
      lambdasinit <- rep(lambda,G)
    }
    if(any(lambdasinit==Inf)){
      lambdasinit[lambdasinit>2*lambda] <- 2*lambda #truncate at 2 times the global lambda
    }
    
    #Find joint lambdas:
    if(method=="CV"){
      leftout <- CVfolds(Y=Y,kfold=fold,nrepeat=3,fixedfolds = FALSE) #Create (repeated) CV-splits of the data
      jointlambdas <- optLambdasWrap(penaltiesinit=lambdasinit, XXblocks=XXbl,Y=Y,folds=leftout,
                                     X1=Xunpen,intercept=intrcpt,
                                     score=ifelse(model == "linear", "mse", "loglik"),model=model,
                                     maxItropt2 = 500,reltol = 10^-3,traceCV=FALSE)
      lambdas <- jointlambdas$optpen
    }else if(method=="MML.noDeriv"){
      #browser()
      if(ecpcinit){
        if(SANN){
          jointlambdas <- optLambdas_mgcvWrap(penaltiesinit=lambdasinit, XXblocks=XXbl,Y=Y,
                                              model=model,reltol=1e-4,
                                              maxItropt2=1000,tracescore=FALSE,fixedseed =FALSE,
                                              optmethod2 = "Nelder-Mead",
                                              sigmasq=sigmahat,opt.sigma=opt.sigma) #TD: intercept?
        }else{
          jointlambdas <- optLambdas_mgcv(penaltiesinit=lambdasinit, XXblocks=XXbl,Y=Y,
                                          model=model,reltol=1e-4,
                                          maxItropt=1000,tracescore=FALSE,fixedseed =FALSE,
                                          optmethod = "Nelder-Mead",
                                          sigmasq=sigmahat,opt.sigma=opt.sigma) #TD: intercept?
        }
        
      }else{
        if(SANN){
          jointlambdas <- optLambdas_mgcvWrap(penaltiesinit=rep(lambda,G), XXblocks=XXbl,Y=Y,
                                              model=model,reltol=1e-4,
                                              maxItropt2=1000,tracescore=FALSE,fixedseed =FALSE,
                                              optmethod2 = "Nelder-Mead",
                                              sigmasq=sigmahat,opt.sigma=opt.sigma) #TD: intercept?
        }else{
          jointlambdas <- optLambdas_mgcv(penaltiesinit=rep(lambda,G), XXblocks=XXbl,Y=Y,
                                          model=model,reltol=1e-4,
                                          maxItropt=1000,tracescore=FALSE,fixedseed =FALSE,
                                          optmethod = "Nelder-Mead",
                                          sigmasq=sigmahat,opt.sigma=opt.sigma) #TD: intercept?
        }
      }
      
      # print("-log(ML) for initial penalties")
      # if(ecpcinit){
      #   MLinit <- mgcv_lambda(penalties=lambdasinit, XXblocks=XXbl,Y=Y, model=model, sigmasq = sigmasq) #-log(ml) for ecpc
      # }else{
      #   MLinit <- mgcv_lambda(penalties=rep(lambda,G), XXblocks=XXbl,Y=Y, model=model, sigmasq = sigmasq) #-log(ml) for global
      # }

      if(opt.sigma){
        sigmasq <- jointlambdas$optpen[1]
        lambdas <- jointlambdas$optpen[-1]
      }else{
        lambdas <- jointlambdas$optpen
      }
      # print("-log(ML) for final penalties")
      # MLfinal <- mgcv_lambda(penalties=lambdas, XXblocks=XXbl,Y=Y, model=model, sigmasq = sigmasq) #-log(ml) for ecpc

      
    }else if(method=="MML"){
      if(opt.sigma){
        jointlambdas <- optim(par=c(log(sigmahat),log(lambdasinit)), 
                              fn=minML.LA.ridgeGLM, 
                              gr=dminML.LA.ridgeGLM, method="BFGS",minlam=minlam,
                              XXblocks = XXbl , Y=Y,opt.sigma=opt.sigma,model=model,
                              intrcpt=intrcpt, Xunpen=Xunpen)
        sigmasq <- exp(jointlambdas$par[1])+minlam
        lambdas <- exp(jointlambdas$par[-1])+minlam
        
        # MLinit <- minML.LA.ridgeGLM(loglambdas=c(log(sigmahat),log(lambdasinit)),opt.sigma = TRUE,
        #                             XXblocks = XXbl , Y=Y,model=model,intrcpt=intrcpt,minlam=0)
        # MLfinal <- minML.LA.ridgeGLM(loglambdas=c(log(sigmasq),log(lambdas)),
        #                              opt.sigma = TRUE,
        #                              XXblocks = XXbl , Y=Y,model=model,intrcpt=intrcpt,minlam=0)
        
      }else{
        jointlambdas <- optim(par=log(lambdasinit), fn=minML.LA.ridgeGLM, 
                              gr=dminML.LA.ridgeGLM, method="BFGS",minlam=minlam,
                              XXblocks = XXbl , Y=Y,sigmasq=sigmahat,model=model,
                              intrcpt=intrcpt, Xunpen=Xunpen)
        lambdas <- exp(jointlambdas$par)+minlam
        
        # MLinit <- minML.LA.ridgeGLM(loglambdas=c(log(lambdasinit)),opt.sigma = FALSE,
        #                             sigmasq=sigmahat,
        #                             XXblocks = XXbl , Y=Y,model=model,intrcpt=intrcpt,minlam=0)
        # MLfinal <- minML.LA.ridgeGLM(loglambdas=c(log(lambdas)),
        #                              opt.sigma = FALSE,sigmasq=sigmasq,
        #                              XXblocks = XXbl , Y=Y,model=model,intrcpt=intrcpt,minlam=0)
      }
        
      
    }
  }else{
    lambdasinit <- lambdas
  }
  
  sigmahat <- 1 #sigma not in model for logistic: set to 1
  if(model=="linear"){
    if(!is.null(sigmasq)) sigmahat <- sigmasq
    else{
      XXT <- SigmaFromBlocks(XXblocks = XXbl,lambdas)
      if(length(unpen)>0 | intrcpt){
        Xunpen2 <- Xunpen
        if(intrcpt) Xunpen2 <- cbind(Xunpen,rep(1,n))
        if(intrcpt && length(unpen)==0){
          betaunpenML <- sum(Y)/n
        }else{
          temp <- solve(XXT+diag(rep(1,n)),Xunpen2)
          betaunpenML <- solve(t(Xunpen2)%*%temp , t(temp)%*%Y)
        }
        sigmahat <- c(t(Y-Xunpen2%*%betaunpenML)%*%solve(XXT+diag(rep(1,n)),Y-Xunpen2%*%betaunpenML)/n)
      }else{
        sigmahat <- c(t(Y)%*%solve(XXT+diag(rep(1,n)),Y)/n)
      }
    }
  }
  
  MLinit <- minML.LA.ridgeGLM(loglambdas=log(lambdasinit),opt.sigma = FALSE,sigmasq=sigmahat,
                              XXblocks = XXbl , Y=Y,model=model,intrcpt=intrcpt,minlam=0)
  MLfinal <- minML.LA.ridgeGLM(loglambdas=log(lambdas),
                               opt.sigma = FALSE, sigmasq=sigmahat,
                               XXblocks = XXbl , Y=Y,model=model,intrcpt=intrcpt,minlam=0)
  # print("-log(ML) for initial penalties"); print(MLinit)
  # print("-log(ML) for final penalties"); print(MLfinal)
  
  #Optionally compare the single-group (ordinary ridge) model with the multi-group model using AIC
  if(selectAIC){
    # res1group <- squeezy(Y=Y,X=X,groupset=list(1:p),alpha=0,model=model,
    #                        unpen=unpen,intrcpt=intrcpt,
    #                        fold=fold,sigmasq=sigmasq,method=method,
    #                        compareMR = FALSE,selectAIC=FALSE)
    # 
    # lambda1group <- res1group$lambdaMR
    # sigmahat1group <- res1group$sigmahat
    lambda1group <- lambda
    if(sum((1:p)%in%unpen)>0){
      AIC1group <- mAIC.LA.ridgeGLM(log(lambda1group),XXblocks=list(apply(simplify2array(XXbl),c(1,2),sum)),
                                   Y=Y,sigmasq=sigmahat1group,Xunpen=Xunpen,intrcpt=intrcpt,model=model)
      AICmultigroup <- mAIC.LA.ridgeGLM(log(lambdas),XXblocks=XXbl,
                                       Y=Y,sigmasq=sigmahat,Xunpen=Xunpen,intrcpt=intrcpt,model=model)
    }else{
      AIC1group <- mAIC.LA.ridgeGLM(log(lambda1group),
                                   XXblocks=list(apply(simplify2array(XXbl),c(1,2),sum)),
                                   Y=Y,sigmasq=sigmahat1group,intrcpt=intrcpt,model=model)
      AICmultigroup <- mAIC.LA.ridgeGLM(log(lambdas),XXblocks=XXbl,
                                       Y=Y,sigmasq=sigmahat,intrcpt=intrcpt,model=model)
    }
    #If model with only one group has lower AIC, select that model
    if(AIC1group <= AICmultigroup){
      lambdasNotOptimalAIC <- lambdas
      sigmahatNotOptimalAIC <- sigmahat
      lambdas <- rep(lambda1group,G)
      sigmahat <- sigmahat1group
      modelbestAIC <- "onegroup"
    }else{
      lambdasNotOptimalAIC <- lambda1group
      sigmahatNotOptimalAIC <- sigmahat1group
      modelbestAIC <- "multigroup"
    }
  }
  
  tauglobal<- sigmahat/lambda #set global group variance
  gamma <- lambda/lambdas #= (sigmahat/lambdas)/tauglobal
  lambdap<- lambda/(as.vector(gamma%*%Zt)) #=sigmahat/(tauglobal*as.vector(gamma%*%Zt)) #specific ridge penalty for beta_k
  lambdap[lambdap<0]<-Inf 
  
  
  #Compute regression model----
  glmGR <- NA
  if(compareMR||alpha==0){
    #Compute ridge betas with multiridge
    XXT <- SigmaFromBlocks(XXbl,penalties=lambdas) #create nxn Sigma matrix = sum_b [lambda_b)^{-1} X_b %*% t(X_b)]
    if(sum((1:p)%in%unpen)>0){
      fit <- IWLSridge(XXT,Y=Y, model=model,intercept=intrcpt,X1=X[,(1:p)%in%unpen]) #Fit. fit$etas contains the n linear predictors
    }else{
      fit <- IWLSridge(XXT,Y=Y, model=model,intercept=intrcpt) #Fit. fit$etas contains the n linear predictors
    }
    betafit <- betasout(fit, Xblocks=Xbl, penalties=lambdas) #Find betas.
    a0MR <- 0
    if(intrcpt) a0MR <- c(betafit[[1]][1]) #intercept
    betaMR <- rep(0,p) 
    betaMR[(1:p)%in%unpen] <- betafit[[1]][-1] #unpenalised variables
    for(i in 1:length(groupsets[[1]])){
      betaMR[groupsets[[1]][[i]]] <- betaMR[groupsets[[1]][[i]]] + betafit[[1+i]]/sqrt(Ik[[1]][groupsets[[1]][[i]]])
    }  
    rm(betafit)
  }
  
  #Transform group variance estimates to desired group prior estimates----
  
  pen <- which(!((1:p)%in%unpen))
  if(any(is.nan(sqrt(lambdap[pen])))) stop("NaN found in covariate-specific penalties") #sanity check (was a debugging browser() call)
  
  # #Compute ridge betas with glmnet
  # 
  # penfctr2 <- penfctr
  # penfctr2[pen] <- lambdap[pen]
  # not0 <- which(penfctr2!=Inf)
  # lambdaoverall <- sum(penfctr2[not0])/length(penfctr2[not0]) #global penalty parameter
  # penfctr2 <- penfctr2/lambdaoverall #scale penalty factor such that sums to p
  # #all(penfctr2*lambdaoverall==lambdap)
  # 
  # Xacc <- X
  # Xacc[,pen] <- as.matrix(X[,pen] %*% sparseMatrix(i=1:length(pen),j=1:length(pen),
  #                                                  x=c(1/sqrt(penfctr2[pen]))))
  # if(model=="cox"){
  #   glmGR <- glmnet(Xacc[,not0],Y,alpha=0,#lambda = lambdaoverall/n*sd_y,
  #                   family=fml,standardize = FALSE,
  #                   penalty.factor=penfctr[not0], thresh=10^-10)
  #   temp <- coef(glmGR,s=lambdaoverall/n*sd_y,exact=TRUE,x=Xacc[,not0],y=Y,
  #                penalty.factor=penfctr[not0],family=fml,
  #                thresh=10^-10)
  # }else{
  #   glmGR <- glmnet(Xacc[,not0],Y,alpha=0,#lambda = lambdaoverall/n*sd_y,
  #                   family=fml, intercept = intrcpt, standardize = FALSE,
  #                   penalty.factor=penfctr[not0], thresh=10^-10)
  #   #betaGLM <- rep(0,p); betaGLM[not0] <- as.vector(glmGR$beta) 
  #   temp <- coef(glmGR,s=lambdaoverall/n*sd_y,exact=TRUE,x=Xacc[,not0],y=Y,
  #                penalty.factor=penfctr[not0],family=fml,
  #                intercept=intrcpt, thresh=10^-10)
  # }
  # betaGLM <- rep(0,p); betaGLM[not0] <- temp[-1]
  # betaGLM[pen] <- c(1/sqrt(penfctr2[pen])) * betaGLM[pen]
  # a0GLM <- temp[1]
  # 
  # plot(betaMR,betaGLM); abline(0,1)
  
  # #with penfctr and glmnet: does not always converge
  # penfctr2 <- penfctr
  # penfctr2[pen] <- lambdap[pen]
  # not0 <- which(penfctr2!=Inf)
  # lambdaoverall <- sum(penfctr2[not0])/length(penfctr2[not0]) #global penalty parameter
  # penfctr2 <- penfctr2/lambdaoverall #scale penalty factor such that sums to p
  # if(model=="cox"){
  #   glmGR <- glmnet(X[,not0],Y,alpha=0,family=fml,standardize = FALSE,
  #                   penalty.factor=penfctr2[not0], thresh=10^-10)
  #   temp <- coef(glmGR,s=lambdaoverall/n*sd_y,exact=TRUE,x=X[,not0],y=Y,
  #                penalty.factor=penfctr2[not0],family=fml,
  #                thresh=10^-10)
  # }else{
  #   glmGR <- glmnet(X[,not0],Y,alpha=0,family=fml,#lambda=lambdaoverall/n*sd_y,
  #                   intercept = intrcpt, standardize = FALSE,
  #                   penalty.factor=penfctr2[not0], thresh=10^-10)
  #   #betaGLM2 <- rep(0,p); betaGLM2[not0] <- as.vector(glmGR$beta)
  #   
  #   temp <- coef(glmGR,s=lambdaoverall/n*sd_y,exact=TRUE,x=X[,not0],y=Y,
  #                penalty.factor=penfctr2[not0],family=fml,
  #                intercept=intrcpt, thresh=10^-12)
  # }
  # betaGLM2 <- rep(0,p); betaGLM2[not0] <- temp[-1]
  # plot(betaGLM2,betaGLM); abline(0,1); 
  # points(betaGLM2[groupsets[[1]][[1]]],betaGLM[groupsets[[1]][[1]]],col="red")
  
  #Transform penalties and compute elastic net betas with glmnet
  if(alpha >= 0 && alpha <= 1){
    varFunc <- function(tauEN,alpha,tauR){
      #tauR: ridge prior variance (equals sigma^2/lambda_R)
      #tauEN: elastic net prior variance (equals sigma^2/lambda_{EN})
      #alpha: fixed elastic net mixing parameter
      #return variance of elastic net prior minus the ridge variance tauR;
      #p(\beta)\propto exp(-1/(2*tauEN)*(alpha*|\beta|+(1-\alpha)\beta^2))
      
      t2 <- - alpha/2/(1-alpha)^(3/2)*sqrt(tauEN)*
        exp(dnorm(alpha/2/sqrt(tauEN)/sqrt(1-alpha),log=TRUE) -
              pnorm(-alpha/2/sqrt(tauEN)/sqrt(1-alpha),log.p=TRUE))
      varBeta <- tauEN/(1-alpha) + alpha^2/4/(1-alpha)^2 + t2
      f <- varBeta - tauR
      return(f)
    }
    lamEN <- function(alpha,tauR){
      if(alpha==0){
        lamEN <- sigmahat/tauR
      }else if(alpha==1){
        lamEN <- sigmahat*sqrt(8/tauR)
      }else if(tauR/sigmahat>10^6){
        lamEN <- sigmahat/tauR
        if(alpha>0.2){
          ub2 <- 10^6*sigmahat
          lb2 <- sqrt(10^6*sigmahat/8)
          temp <- try(uniroot(varFunc,c(0.9*lb2,1.1*ub2),alpha=alpha,tauR=10^6*sigmahat,tol=10^-6))
          if(class(temp)[1]!="try-error"){
            if(temp$root<lamEN) lamEN <- sigmahat/temp$root
          }
        }
      }else if(tauR/sigmahat<10^-6){
        lamEN <- sigmahat*sqrt(8/tauR)
        if(alpha<0.8){
          ub2 <- sqrt(10^-7*sigmahat/8)
          lb2 <- 10^-7*sigmahat
          temp <- try(uniroot(varFunc,c(0.9*lb2,1.1*ub2),alpha=alpha,tauR=10^-7*sigmahat,tol=10^-6))
          if(class(temp)[1]!="try-error"){
            if(temp$root>lamEN) lamEN <- temp$root
          }
        }
      }else{
        lb <- min(tauR,sqrt(tauR/8))
        ub <- max(tauR,sqrt(tauR/8))
        #if(sign(varFunc(0.9*lb,alpha=alpha,tauR=tauR))==sign(varFunc(1.1*ub,alpha=alpha,tauR=tauR))) browser()
        temp <- try(uniroot(varFunc,c(0.9*lb,1.1*ub),alpha=alpha,tauR=tauR,tol=10^-6))
        if(class(temp)[1]=="try-error"){
          if(tauR<0.5) lamEN <- sigmahat*sqrt(8/tauR)
          if(tauR>0.5) lamEN <- sigmahat/tauR
        }else{
          lamEN <- sigmahat/temp$root
        }
      }
      return(lamEN)
    }
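    #Worked check of the alpha=1 boundary case used in lamEN above: for alpha=1 the
    #prior is exp(-|beta|/(2*tauEN)), i.e. Laplace(0, b=2*tauEN) with variance
    #2*b^2 = 8*tauEN^2; matching this to the ridge variance tauR gives
    #tauEN = sqrt(tauR/8), hence lamEN = sigmahat/tauEN = sigmahat*sqrt(8/tauR).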
    alphat <- 1/(1+2*sd_y*(1-alpha)/alpha) #alpha used in glmnet
    lambdasEN <- sapply(sigmahat/lambdas,function(tau) lamEN(tauR = tau,alpha=alpha))
    tauEN <- sigmahat/lambdasEN
    lambdasENhat <- lambdasEN/2*(alpha/sd_y+ 2*(1-alpha))
    
    uniqueTaus <- unique(as.vector(c(sigmahat/lambdas)%*%Zt))
    if(dim(X)[2]==dim(Xxtnd)[2]){
      lambdap<- (as.vector(lambdasENhat%*%Zt))
      lambdapApprox<- (as.vector(lambdasEN%*%Zt))
    }else{ #some groups are overlapping
      lambdasENunique <- sapply(uniqueTaus,function(tau) lamEN(tauR = tau,alpha=alpha))
      lambdasENuniquehat <- lambdasENunique/2*(alpha/sd_y+ 2*(1-alpha))
      indUnique <- sapply(as.vector(c(sigmahat/lambdas)%*%Zt),function(x)which(uniqueTaus==x))
      lambdap <- lambdasENuniquehat[indUnique]
      lambdapApprox <- lambdasENunique[indUnique]
    }
    
    if(alpha==0){
      beta <- betaMR
      a0 <- a0MR
      lambdaMR_reCV <- lambdas
    }else{
      penfctr2 <- penfctr
      penfctr2[pen] <- lambdap[pen] 
      not0 <- which(penfctr2!=Inf)
      lambdaEN <- sum(penfctr2[not0])/length(penfctr2[not0])
      penfctr2 <- penfctr2/lambdaEN
      if(model=="cox"){
        glmGR <- glmnet(X[,not0],Y,alpha=alphat,family=fml,standardize = FALSE,
                        penalty.factor=penfctr2[not0], thresh=10^-10)
      }else{
        glmGR <- glmnet(X[,not0],Y,alpha=alphat,family=fml,
                        intercept = intrcpt, standardize = FALSE,
                        penalty.factor=penfctr2[not0], thresh=10^-10)
      }
      
      if(reCV){
        glmGR.cv <- cv.glmnet(X[,not0],Y,alpha=alphat,family=fml,
                              intercept = intrcpt, standardize = FALSE,
                              penalty.factor=penfctr2[not0], thresh=10^-10)
        sopt <- glmGR.cv$lambda.min
        
        #if the null (intercept-only) model is returned, first try rerunning the CV
        tempItr <- 1
        while(glmGR.cv$lambda.min==glmGR.cv$lambda[1] & tempItr<=10){
          glmGR.cv <- cv.glmnet(X[,not0],Y,alpha=alphat,family=fml,
                                intercept = intrcpt, standardize = FALSE,
                                penalty.factor=penfctr2[not0], thresh=10^-10)
          sopt <- glmGR.cv$lambda.min
          tempItr <- tempItr+1
        }
        # #if still 0 model returned (for first lambda in the sequence as run by glmnet), 
        # #take smaller lambda with cvm within 1 se
        # if(glmGR.cv$lambda.min==glmGR.cv$lambda[1]){
        #   warning("Recalibrated global lambda returns intercept-only model. 
        #           Model for smaller global lambda is returned")
        #   #se <- sd(glmGR.cv$cvm) 
        #   se <- abs(glmGR.cv$cvm[1]-rev(glmGR.cv$cvm)[1])/100
        #   sopt <- min(glmGR.cv$lambda[glmGR.cv$cvm <= (glmGR.cv$lambda + se)])
        # }
        
        lambdasEN_reCV <- sopt/lambdaEN*n/sd_y * lambdasEN
        lambdaMR_reCV <- sigmahat/sapply(sigmahat/lambdasEN_reCV,varFunc,alpha=alpha,tauR=0)
        
      } else{
        sopt <- lambdaEN/n*sd_y
        lambdaMR_reCV <- lambdas
      } 
      
      temp <- coef(glmGR,s=sopt,exact=TRUE,x=X[,not0],y=Y,alpha=alphat,
                   penalty.factor=penfctr2[not0],family=fml,intercept=intrcpt) 
      beta <- rep(0,p); beta[not0] <- temp[-1]
      a0 <- temp[1]
      #plot(beta)
      #print(lambdaEN/n*sd_y)
    }
  }else{stop("alpha should be between 0 and 1")}
  
  if(standardise_Y){
    beta <- beta*sd_y_former
    a0 <- a0*sd_y_former
    if(compareMR){
      betaMR <- betaMR*sd_y_former
      a0MR <- a0MR*sd_y_former
    }
  }  
  
  #Compute predictions for independent data if given----
  if(!is.null(X2)){
    #Ypredridge <- predict(glmR,newx=X2)
    #browser()
    
    if(compareMR){
      if(model=="linear"){
        X2c <- cbind(X2,rep(1,n2))
        YpredMR <- X2c %*% c(betaMR,a0MR)
        MSEMR <- sum((YpredMR-Y2)^2)/n2
      } 
      if(model=='logistic'){
        X2c <- cbind(X2,rep(1,n2))
        YpredMR <- 1/(1+exp(-X2c %*% c(betaMR,a0MR)))
        MSEMR <- sum((YpredMR-Y2)^2)/n2
      }else if(model=="cox"){
        expXb<-exp(X %*% c(betaMR))
        h0 <- sapply(1:length(Y[,1]),function(i){Y[i,2]/sum(expXb[Y[,1]>=Y[i,1]])})#updated baseline hazard in censored times for left out samples
        H0 <- sapply(Y2[,1],function(Ti){sum(h0[Y[,1]<=Ti])})
        YpredMR <- H0*exp(X2 %*% betaMR)
        MSEMR<- sum((YpredMR-Y2[,2])^2)/n2
      }
    }
    
    #Compute test error for elastic net/lasso estimate
    if(model=="linear"){
      X2c <- cbind(X2,rep(1,n2))
      YpredApprox <- X2c %*% c(beta,a0)
      MSEApprox <- sum((YpredApprox-Y2)^2)/n2
    } 
    if(model=='logistic'){
      X2c <- cbind(X2,rep(1,n2))
      YpredApprox <- 1/(1+exp(-X2c %*% c(beta,a0)))
      MSEApprox <- sum((YpredApprox-Y2)^2)/n2
    }else if(model=="cox"){
      expXb<-exp(X %*% c(beta))
      h0 <- sapply(1:length(Y[,1]),function(i){Y[i,2]/sum(expXb[Y[,1]>=Y[i,1]])})#updated baseline hazard in censored times for left out samples
      H0 <- sapply(Y2[,1],function(Ti){sum(h0[Y[,1]<=Ti])})
      YpredApprox <- H0*exp(X2 %*% beta)
      MSEApprox<- sum((YpredApprox-Y2[,2])^2)/n2
    }
    
  }
  
  #Return output----
  output <- list(betaApprox = beta, #approximated elastic net (lasso if alpha=1) coefficients
                 a0Approx = a0,
                 #tauApprox = tauEN, #prior parameter estimates
                 lambdaApprox = lambdasEN, #elastic net group penalty
                 lambdapApprox = lambdapApprox, #p-dimensional vector with EN penalties
                 tauMR = sigmahat/lambdas, #prior parameter estimates
                 lambdaMR = lambdas,
                 lambdaglobal = lambda,
                 sigmahat = sigmahat,
                 MLinit=MLinit,
                 MLfinal=MLfinal,
                 alpha=alpha,#elastic net parameter alpha value used
                 glmnet.fit = glmGR, #glmnet fit for the coefficients 
                 lambdaMR_reCV=lambdaMR_reCV) 
                 #betaMR2 = betaGLM, #multiridge penalty with glmnet
                 #a0MR2 = a0GLM)
  if(compareMR){
    output$betaMR <- betaMR #multiridge package
    output$a0MR <- a0MR
  }
  if(!is.null(X2)){
    output$YpredApprox<-YpredApprox #predictions for test set
    output$MSEApprox <- MSEApprox #MSE on test set
    if(compareMR){
      output$YpredMR <-YpredMR #predictions for test set
      output$MSEMR <- MSEMR #MSE on test set
    }
  }
  if(selectAIC){
    output$AICmodels <- list(
      "multigroup"=list(
        "lambdas"=lambdas,
        "sigmahat"=sigmahat,
        "AIC"=AICmultigroup
      ),
      "onegroup"=list(
        "lambdas"=lambda1group,
        "sigmahat"=sigmahat1group,
        "AIC"=AIC1group
      )
    )
    if(modelbestAIC!="multigroup"){
      output$AICmodels$multigroup$lambdas <- lambdasNotOptimalAIC
      output$AICmodels$multigroup$sigmahat <- sigmahatNotOptimalAIC
    }
    if(resultsAICboth){
        output$AICmodels$onegroup$fit <- squeezy(Y,X,groupset,alpha=alpha,model=model,
                X2=X2,Y2=Y2,unpen=unpen,intrcpt=intrcpt,
                method="MML",fold=fold,
                compareMR = compareMR,selectAIC=FALSE,
                fit.ecpc=NULL,
                lambdas=rep(lambda1group,G),lambdaglobal=lambda1group,
                sigmasq=sigmahat1group,
                standardise_Y=standardise_Y,reCV=reCV,resultsAICboth=FALSE)
        output$AICmodels$multigroup$fit <- squeezy(Y,X,groupset,alpha=alpha,model=model,
                       X2=X2,Y2=Y2,unpen=unpen,intrcpt=intrcpt,
                       method="MML",fold=fold,
                       compareMR = compareMR,selectAIC=FALSE,
                       fit.ecpc=NULL,
                       lambdas=output$AICmodels$multigroup$lambdas,
                       lambdaglobal=lambda1group,
                       sigmasq=output$AICmodels$multigroup$sigmahat,
                       standardise_Y=standardise_Y,reCV=reCV,resultsAICboth=FALSE)
    }
    
    output$modelbestAIC <- modelbestAIC
  }
  
  return(output)
}
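#Example usage (illustrative sketch, not run; data and group sizes are made up):
# n <- 100; p <- 200
# X <- matrix(rnorm(n*p), n, p)
# betas <- c(rnorm(p/2, sd=0.2), rnorm(p/2, sd=0.01)) #first group more informative
# Y <- X %*% betas + rnorm(n)
# groupset <- list(1:(p/2), (p/2+1):p) #two co-data groups
# fit <- squeezy(Y, X, groupset=groupset, alpha=1, model="linear")
# fit$lambdaMR #estimated group ridge penalties
# fit$betaApprox #approximated elastic net (here lasso) coefficients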

#Other functions----
#Minus log marginal likelihood of the Laplace approximation for ridge penalised GLMs----
minML.LA.ridgeGLM<- function(loglambdas,XXblocks,Y,sigmasq=1,
                             Xunpen=NULL,intrcpt=TRUE,model,minlam=0,opt.sigma=FALSE){
  #compute the Laplace approximation (LA) of the minus log marginal likelihood of ridge penalised GLMs
  #(NOTE: for now only implemented for linear and logistic regression)
  #Input:
  #loglambdas: logarithm of the group penalties
  #XXblocks: list of nxn matrices X_b %*% t(X_b), one for each group
  #Y: response vector
  #sigmasq: noise level in linear regression (1 for logistic)
  #Xunpen: unpenalised variables
  #intrcpt: TRUE/FALSE to include an intercept
  #model: "linear" or "logistic"
  #minlam: minimum value added to the penalties (useful in optimisation settings)
  #opt.sigma: (linear case) TRUE/FALSE whether log(sigma^2) is given as the first
  #           element of loglambdas for optimisation purposes
  if(model!="linear"){
    opt.sigma <- FALSE
    sigmasq <- 1
  } 
  
  n<-length(Y)
  lambdas <- exp(loglambdas)+minlam
  if(opt.sigma){
    sigmasq <- lambdas[1]
    lambdas <- lambdas[-1]
  }
  Lam0inv <- .SigmaBlocks(XXblocks,lambdas) 
  if(!is.null(Xunpen)){
    fit <- IWLSridge(XXT=Lam0inv,Y=Y, model=model,intercept=intrcpt,
                     X1=Xunpen,maxItr = 500,eps=10^-12) #Fit. fit$etas contains the n linear predictors
  }else{
    fit <- IWLSridge(XXT=Lam0inv,Y=Y, model=model,intercept=intrcpt,
                     maxItr = 500,eps=10^-12) #Fit. fit$etas contains the n linear predictors
  }
  eta <- fit$etas + fit$eta0
  
  if(model=="linear"){
    mu <- eta
    W <- diag(rep(1,n))
    Hpen <- Lam0inv - Lam0inv %*% solve(solve(W)+Lam0inv, Lam0inv)
    
    t1 <- dmvnorm(c(Y),mean=eta,sigma=sigmasq*diag(rep(1,n)),log=TRUE) #log likelihood in linear predictor
  }else{
    mu <- 1/(1+exp(-eta))
    W <- diag(c(mu)*c(1-mu))
    
    #Hpen <- Lam0inv - Lam0inv %*% solve(diag(1/diag(W))+Lam0inv, Lam0inv)
    Hpen <- Lam0inv - Lam0inv %*% solve(diag(1,n)+W%*%Lam0inv, W%*%Lam0inv)
    
    t1 <- sum(Y*log(mu)+(1-Y)*log(1-mu)) #log likelihood in linear predictor
  }
  
  t2 <- -1/2/sigmasq*sum(c(Y-mu)*eta)
  t3 <- 1/2 * .logdet(diag(rep(1,n))-W%*%Hpen)
  #return(-t2)
  return(-(t1+t2+t3))
}
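#Example (illustrative sketch, assuming simulated data): evaluate the minus log
#marginal likelihood on a grid of common penalties for two groups; XXblocks as
#created by multiridge::createXXblocks.
# n <- 50; p <- 100
# X <- matrix(rnorm(n*p), n, p)
# Y <- rnorm(n)
# XXbl <- createXXblocks(list(X[,1:50], X[,51:100]))
# sapply(log(10^(0:4)), function(lam)
#   minML.LA.ridgeGLM(c(lam,lam), XXblocks=XXbl, Y=Y, sigmasq=1, model="linear"))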

#Derivative of minus log marginal likelihood of the Laplace approximation for ridge penalised GLMs----
dminML.LA.ridgeGLM<- function(loglambdas,XXblocks,Y,sigmasq=1,
                              Xunpen=NULL,intrcpt=TRUE,model,minlam=0,opt.sigma=FALSE){
  #compute the Laplace approximation (LA) of the first derivative of the minus log
  #  marginal likelihood of ridge penalised GLMs
  #(NOTE: for now only implemented for linear and logistic regression)
  #Input:
  #loglambdas: logarithm of the group penalties
  #XXblocks: list of nxn matrices X_b %*% t(X_b), one for each group
  #Y: response vector
  #sigmasq: noise level in linear regression (1 for logistic)
  #Xunpen: unpenalised variables
  #intrcpt: TRUE/FALSE to include an intercept
  #model: "linear" or "logistic"
  #minlam: minimum value added to the penalties (useful in optimisation settings)
  #opt.sigma: (linear case) TRUE/FALSE whether the derivative w.r.t. log(sigma^2) should
  #           be returned as well; log(sigma^2) is then the first element of loglambdas
  
  if(model!="linear") opt.sigma <- FALSE
  
  n<-length(Y)
  lambdas <- exp(loglambdas)+minlam
  if(opt.sigma){
    sigmasq <- lambdas[1]
    lambdas <- lambdas[-1]
  }
  rhos<-log(lambdas)
  Lam0inv <- .SigmaBlocks(XXblocks,lambdas) 
  if(!is.null(Xunpen)){
    fit <- IWLSridge(XXT=Lam0inv,Y=Y, model=model,intercept=intrcpt,
                     X1=Xunpen,maxItr = 500,eps=10^-12) #Fit. fit$etas contains the n linear predictors
  }else{
    fit <- IWLSridge(XXT=Lam0inv,Y=Y, model=model,intercept=intrcpt,
                     maxItr = 500,eps=10^-12) #Fit. fit$etas contains the n linear predictors
  }
  eta <- fit$etas + fit$eta0
  
  if(intrcpt) Xunpen <- cbind(Xunpen,rep(1,n))
  #eta0 <- fit$eta0 #TD: add unpenalised variables/intercept
  
  #GLM-type specific terms
  if(model=="linear"){
    mu <- eta
    W <- diag(rep(1,n))
    dwdeta <- rep(0,n)
  }else if(model=="logistic"){
    mu <- 1/(1+exp(-eta))
    W <- diag(c(mu)*c(1-mu))
    dwdeta <- c(mu*(1-mu)*(1-2*mu))
  }else{
    stop("Only model types linear and logistic are supported")
  }
  
  #Hpen <- Lam0inv - Lam0inv %*% solve(solve(W)+Lam0inv, Lam0inv)
  Hpen <- Lam0inv - Lam0inv %*% solve(diag(1,n)+W%*%Lam0inv, W%*%Lam0inv)
  
  if(!is.null(Xunpen)){
    WP1W <- diag(rep(1,n)) - Xunpen%*%solve(t(Xunpen)%*%W%*%Xunpen,t(Xunpen)%*%W)
  }else{
    WP1W <- diag(rep(1,n))
  }
  #L2 <- WP1W%*%(diag(rep(1,n))-t(solve(solve(W)+WP1W%*%Lam0inv,WP1W%*%Lam0inv)))
  L2 <- WP1W%*%(diag(rep(1,n))-t(solve(diag(1,n)+W%*%WP1W%*%Lam0inv,W%*%WP1W%*%Lam0inv)))
  Hj <- lapply(1:length(XXblocks),function(i){
    L2%*%XXblocks[[i]]/lambdas[i]
  })
  
  if(is.null(Xunpen)) {Hres <- .Hpen(diag(W),Lam0inv);Hmat <- Hres$Hmat} else {
    Hres <- .Hunpen(diag(W),Lam0inv,Xunpen); Hmat <- Hres$Hmat
  }
  # detadrho <- lapply(1:length(rhos),function(j){
  #   -(diag(rep(1,n))-L2%*%Lam0inv%*%W)%*%t(Hj[[j]])%*%c(Y-mu+W%*%eta)})
  detadrho <- lapply(1:length(rhos),function(j){
    -exp(rhos[j])*(diag(rep(1,n))-Hmat%*%W)%*%t(Hj[[j]]/lambdas[j])%*%c(Y-mu+W%*%eta)})
  dWdrho <- lapply(1:length(rhos),function(j){dwdeta*c(detadrho[[j]])})
  
  t1 <- sapply(1:length(rhos),function(j){-1/sigmasq*c((Y-mu))%*%detadrho[[j]]})
  t2 <- sapply(1:length(rhos),function(j){1/2/sigmasq*c(-W%*%eta+Y-mu)%*%detadrho[[j]]})
  #t3 <- sapply(1:length(rhos),function(j){-0.5*sum(diag(solve(solve(W)+Lam0inv,XXblocks[[j]]/sigmasq/lambdas[j])))})
  t3 <- sapply(1:length(rhos),function(j){-0.5*sum(diag(solve(diag(1,n)+W%*%Lam0inv,W%*%XXblocks[[j]]/lambdas[j])))})
  t4 <- sapply(1:length(rhos),function(j){0.5*sum(diag(Hpen)*dWdrho[[j]])})
  
  #return(t2)
  if(!opt.sigma) return((t1+t2+t3+t4)) #derivative to rho
  
  #for linear model, may want to return derivative to log(sigma^2) as well
  # browser()
  # invMat <- solve(diag(rep(sigmasq,n))+Lam0inv*sigmasq)
  # ts.1 <- -0.5*sum(diag(invMat))
  # ts.2 <- -0.5*t(Y-mu)%*%invMat%*%invMat%*%(Y-mu)
  
  ts.1 <- n/2/sigmasq
  ts.2 <- -1/2/sigmasq^2*t(Y-eta)%*%Y
  
  return(c(c(ts.1+ts.2)*sigmasq,(t1+t2+t3+t4))) #derivative to rho
}
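#Finite-difference check of the analytic gradient (illustrative sketch, reusing
#the simulated XXbl and Y from the example above):
# ll <- log(c(5,5)); eps <- 1e-5
# num <- sapply(1:2, function(j){
#   e <- rep(0,2); e[j] <- eps
#   (minML.LA.ridgeGLM(ll+e, XXblocks=XXbl, Y=Y, model="linear") -
#      minML.LA.ridgeGLM(ll-e, XXblocks=XXbl, Y=Y, model="linear"))/(2*eps)
# })
# cbind(numerical=num, analytic=dminML.LA.ridgeGLM(ll, XXblocks=XXbl, Y=Y, model="linear"))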

#Marginal AIC for ridge penalised GLMs----
mAIC.LA.ridgeGLM<- function(loglambdas,XXblocks,Y,sigmasq=1,
                            Xunpen=NULL,intrcpt=TRUE,model,minlam=0){
  
  minLL <- minML.LA.ridgeGLM(loglambdas,XXblocks,Y,sigmasq=sigmasq,
                             Xunpen=Xunpen,intrcpt=intrcpt,model=model,minlam=minlam)
  k <- length(XXblocks) + intrcpt #number of parameters
  if(model=="linear") k <- k+1 #sigma estimated in linear model as well
  if(is.matrix(Xunpen)) k <- k+dim(Xunpen)[2]
  if(is.vector(Xunpen)) k <- k + 1
  
  mAIC <- 2*minLL + 2*k
  return(mAIC)
}
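#Example (illustrative sketch, reusing XXbl and Y from above): compare the marginal
#AIC of a single shared penalty with that of group-specific penalties, as done in
#squeezy when selectAIC=TRUE.
# XX1 <- list(XXbl[[1]] + XXbl[[2]]) #merge the two groups into one
# mAIC.LA.ridgeGLM(log(10), XXblocks=XX1, Y=Y, model="linear")
# mAIC.LA.ridgeGLM(log(c(5,20)), XXblocks=XXbl, Y=Y, model="linear")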

#Normality check----
normalityCheckQQ <- function(X,groupset,fit.squeezy,nSim=500){
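  #X: observed data matrix (n samples x p covariates)
  #groupset: list with indices of covariates for each co-data group
  #fit.squeezy: fit object as returned by squeezy
  #nSim: number of simulated linear predictors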
  lambdapApprox <- fit.squeezy$lambdapApprox
  alpha <- fit.squeezy$alpha
  sigmahat <- fit.squeezy$sigmahat
  tauMR <- fit.squeezy$tauMR
  n <- dim(X)[1]
  
  #sample linear predictors according to elastic net prior distribution
  sampleEtas <- function(lambdapApprox,alpha,sigmahat,nSim){
    #X: data matrix, taken from the enclosing environment
    #lambdapApprox: p-dimensional vector with elastic net penalties
    #alpha: elastic net mixing parameter
    #sigmahat: (linear regression only) noise variance
    #nSim: number of simulated linear predictors
    p<-length(lambdapApprox)
    
    if(alpha==0){ 
      #gaussian distribution
      etas <- sapply(1:nSim,function(x){
        #simulate betas
        betas <- rnorm(p, mean=0, sd=sqrt(sigmahat/lambdapApprox))
        
        #compute linear predictor eta
        eta <- X%*%betas
        return(eta)
      })
    }else if(alpha==1){
      #A Laplace(0,b) variate can also be generated as the difference of two i.i.d. 
      #Exponential(1/b) random variables
      etas <- sapply(1:nSim,function(x){
        #simulate betas
        betas <- rexp(p, rate=lambdapApprox/2/sigmahat) - rexp(p, rate=lambdapApprox/2/sigmahat)
        
        #compute linear predictor eta
        eta <- X%*%betas
        return(eta)
      })
    }else if(alpha=="horseshoe"){
      #generate horseshoe prior for comparison
      etas <- sapply(1:nSim,function(x){
        #simulate betas
        tausLatent <- abs(rcauchy(p))
        betas <- rnorm(p,mean=0,sd=tausLatent*sqrt(sigmahat/lambdapApprox))
        
        #compute linear predictor eta
        eta <- X%*%betas
        return(eta)
      })
    }else{
      #bayesian elastic net parameters (given sigmasq)
      lam1 <- alpha*lambdapApprox
      lam2 <- (1-alpha)*lambdapApprox
      
      shape <- 0.5
      scale <- 8*lam2*sigmahat/lam1^2
      a <- 1
      b <- Inf
      
      etas <- sapply(1:nSim,function(x){
        #simulate betas
        tausLatent <- .rtgamma(p,shape = shape, scale= scale, a=a, b=b)
        varsBeta <- sigmahat*(tausLatent-1)/lam2/tausLatent
        varsBeta[tausLatent==Inf] <- sigmahat/lam2[tausLatent==Inf]
        betas <- rnorm(p,mean=0,sd=sqrt(varsBeta)) 
        
        #compute linear predictor eta
        eta <- X%*%betas
        return(eta)
      })
    }
    
    return(etas)
  }
  
  etas <- sampleEtas(lambdapApprox=lambdapApprox,alpha=alpha,sigmahat=sigmahat,nSim=nSim)
  
  #create covariance matrix
  covMat <- function(X,groupset,tauMR){
    p <- dim(X)[2]
    groupsets <- list(groupset)
    G <- sapply(groupsets,length) #1xm vector with G_i, number of groups in partition i
    m <- length(G) #number of partitions
    
    indGrpsGlobal <- list(1:G[1]) #global group index in case we have multiple partitions
    if(m>1){
      for(i in 2:m){
        indGrpsGlobal[[i]] <- (sum(G[1:(i-1)])+1):sum(G[1:i])
      }
    }
    Kg <- lapply(groupsets,function(x)(sapply(x,length))) #m-list with G_i vector of group sizes in partition i
    #ind1<-ind
    
    #ind <- (matrix(1,G,1)%*%ind)==(1:G)#sparse matrix with ij element TRUE if jth element in group i, otherwise FALSE
    i<-unlist(sapply(1:sum(G),function(x){rep(x,unlist(Kg)[x])}))
    j<-unlist(unlist(groupsets))
    ind <- sparseMatrix(i,j,x=1) #sparse matrix with ij element 1 if jth element in group i (global index), otherwise 0
    
    Ik <- lapply(1:m,function(i){
      x<-rep(0,sum(G))
      x[(sum(G[1:i-1])+1):sum(G[1:i])]<-1
      as.vector(x%*%ind)}) #list for each partition with px1 vector with number of groups beta_k is in
    #sparse matrix with ij element 1/Ij if beta_j in group i
    
    #make co-data matrix Z (Zt transpose of Z as in paper, with co-data matrices stacked for multiple groupsets)
    Zt<-ind; 
    if(G[1]>1){
      Zt[1:G[1],]<-t(t(ind[1:G[1],])/apply(ind[1:G[1],],2,sum))
    }
    if(m>1){
      for(i in 2:m){
        if(G[i]>1){
          Zt[indGrpsGlobal[[i]],]<-t(t(ind[indGrpsGlobal[[i]],])/
                                       apply(ind[indGrpsGlobal[[i]],],2,sum))
        }
      }
    }
    
    if(dim(Zt)[2]<p) Zt <- cbind(Zt,matrix(NaN,nrow=sum(G),ncol=p-dim(Zt)[2]))
    
    
    #Extend data to make artificial non-overlapping groups----
    Xxtnd <- do.call(cbind,lapply(groupsets[[1]],function(group){t(t(X[,group])/sqrt(Ik[[1]][group]))}))
    #create new group indices for Xxtnd
    Kg2 <- c(1,Kg[[1]]) 
    G2 <- length(Kg2)-1
    groupxtnd <- lapply(2:length(Kg2),function(i){sum(Kg2[1:(i-1)]):(sum(Kg2[1:i])-1)}) #list of indices in each group
    groupxtnd2 <- unlist(sapply(1:G2,function(x){rep(x,Kg2[x+1])})) #vector with group number
    
    #datablocks: list with each element a data type containing indices of covariates with that data type 
    Xbl <- createXblocks(lapply(groupxtnd,function(ind) Xxtnd[,ind]))
    XXbl <- createXXblocks(lapply(groupxtnd,function(ind) Xxtnd[,ind]))
    
    covMat <- SigmaFromBlocks(XXblocks=XXbl,penalties=1/tauMR)
    
    return(covMat)
  }
  
  covMatX <- covMat(X,groupset,tauMR)
  
  #Compute Mahalanobis distance
  mahala <- function(eta,covMat) {
    eta <- matrix(eta,nrow=1)
    dist <- try(eta %*% solve(covMat, t(eta)),silent = TRUE)
    if(class(dist)[1]=="try-error"){
      svdcovMat <- svd(covMat)
      svdd <- svdcovMat$d
      #reci <- 1/svdd[1:n]
      reci <- c(1/svdd[1:(n-1)],0)
      invforcovMat <- svdcovMat$v %*% (reci * t(svdcovMat$u))
      dist <- c(eta%*% invforcovMat %*% t(eta))
    }
    return(dist)
  }
  
  #qq-plot: under multivariate normality, the Mahalanobis distance of the
  #n-dimensional linear predictor is chi-squared distributed with n degrees of freedom
  dfs <- n
  mahaladists <- apply(etas,2,mahala,covMat=covMatX)
  
  if(requireNamespace("ggplot2")){
    Sample <- NULL
    df <- data.frame("Sample"=mahaladists)
    p <- ggplot2::ggplot(df)+ggplot2::aes(sample=Sample)+
      ggplot2::geom_qq(distribution=stats::qchisq, dparams = dfs)+
      ggplot2::geom_qq_line(distribution=stats::qchisq, dparams = dfs,line.p = c(0.25,0.75),col="red")+
      ggplot2::labs(x="Theoretical quantiles",y="Empirical quantiles")+
      ggplot2::theme_bw()
    
    return(p)
  }else{
    qqplot(qchisq(ppoints(500), df = dfs), mahaladists, ylab="Empirical quantiles", xlab="Theoretical quantiles")
    qqline(mahaladists, distribution = function(p) qchisq(p, df = dfs),
           probs = c(0.1, 0.6), col = 2)
    
    return(NULL)
  }
  
}
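#Example usage (illustrative sketch, assuming X, groupset and a squeezy fit as above):
# fit <- squeezy(Y, X, groupset=groupset, alpha=1, model="linear")
# normalityCheckQQ(X, groupset, fit.squeezy=fit, nSim=500)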

#Internal functions----
#Log-determinant; needed below----
.logdet <- function(mat) return(determinant(mat,logarithm=TRUE)$modulus[1])
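#e.g. .logdet(diag(2)) is 0; using determinant() avoids overflow of det() for large matrices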

.SigmaBlocks <- function(XXblocks,lambdas){ #computes X Laminv X^T from block cross-products
  nblocks <- length(XXblocks)
  if(nblocks != length(lambdas)){
    stop("Number of penalty parameters should equal the number of blocks")
  } else {
    Sigma<-Reduce('+', lapply(1:nblocks,function(i) XXblocks[[i]] * 1/lambdas[i]))
    return(Sigma)
  }
}
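#e.g. for two blocks, .SigmaBlocks(list(XX1,XX2), c(l1,l2)) returns XX1/l1 + XX2/l2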





#Computation of weighted Hat matrix, penalized variables only; needed in IWLS functions----
.Hpen <- function(WV,XXT){ #WV: weights as vector (n); XXT: (penalised) sample cross-product (nxn)
  n <-length(WV)
  Winv <- diag(1/WV)
  inv <- solve(Winv + XXT)
  Mmat <- diag(n) - inv %*% XXT
  Hmat <- XXT %*% Mmat
  return(list(Hmat=Hmat,Mmat=Mmat))
}

#Computation of weighted Hat matrix, possibly with unpenalized variables----
.Hunpen <- function(WV,XXT,X1){
  #Function from multiridge-package written by Mark van de Wiel
  #WV: weights as vector (n)
  #XXT: sample cross-product (nxn) of the penalised variables
  #X1: (nxp1) design matrix of the unpenalised variables
  n <- length(WV)
  WsqrtV <- sqrt(WV)
  WinvsqrtV <- 1/WsqrtV
  X1W <- WsqrtV * X1
  X1aux <- solve(t(X1W) %*% X1W) %*% t(X1W)
  X1Wproj <- X1W %*% X1aux
  P1W <- diag(n) - X1Wproj
  GammaW <- t(t(WsqrtV * XXT) * WsqrtV) #faster
  P1GammaW <- P1W %*% GammaW
  # cons <- mean(abs(P1GammaW))
  # solve(diag(n)/cons+P1GammaW/cons)
  invforH2<-try(solve(diag(n) + P1GammaW))
  if(class(invforH2)[1]=="try-error"){
    svdXXT <- svd(diag(n) + P1GammaW)
    svdd <- svdXXT$d
    #reci <- 1/svdd[1:n]
    reci <- c(1/svdd[1:(n-1)],0)
    invforH2 <- svdXXT$v %*% (reci * t(svdXXT$u))
  }
  #invforH2 <- solve(diag(n) + P1GammaW)
  #H2 <- Winvsqrt %*% GammaW %*% (diag(n) -invforH2 %*% P1GammaW) %*% P1W %*% Winvsqrt
  MW <- WsqrtV * t(t((diag(n) - invforH2 %*% P1GammaW) %*% P1W) * WinvsqrtV)
  H20 <- GammaW %*% (WinvsqrtV * MW)
  H2 <- t(t(WinvsqrtV * H20)) #faster
  #Hboth <- Winvsqrt %*% X1Wproj %*% (diag(n) - Wsqrt %*% H2 %*% Wsqrt) %*% Winvsqrt + H2
  KW <- t(t(X1aux %*% (diag(n) - t(t(WsqrtV * H2) * WsqrtV))) * WinvsqrtV)
  Hmat <-(WinvsqrtV * X1W) %*% KW  + H2
  return(list(Hmat=Hmat,MW=MW,KW=KW))
}


#Generate from truncated gamma distribution----
#Code from: https://rdrr.io/rforge/TruncatedDistributions/man/tgamma.html
.rtgamma <- function(n, shape, scale = 1, a = 0, b = Inf){
  stopifnot(n > 0 & all(shape > 0) & all(scale > 0))
  output <- rep(Inf,n)
  maxIt <- 50; It <- 1
  while(any(output==Inf) & It <= maxIt){
    x <- runif(n)
    Fa <- pgamma(a, shape, scale = scale)
    Fb <- pgamma(b, shape, scale = scale)
    y <- (1 - x) * Fa + x * Fb
    if(sum(output==Inf)!=length(y)) stop("internal error in .rtgamma: length mismatch") #sanity check (was a debugging browser() call)
    temp <- qgamma(y, shape, scale = scale)
    output[output==Inf] <- temp
    It <- It + 1
    if(length(shape)>1) shape <- shape[temp==Inf]
    if(length(scale)>1) scale <- scale[temp==Inf]
    n <- sum(output==Inf)
  }
  return(output)
}
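#Example (illustrative sketch): draw from a Gamma(shape=0.5, scale=2) distribution
#truncated to [1, Inf); all draws should lie above the truncation point a=1.
# set.seed(1)
# s <- .rtgamma(1000, shape=0.5, scale=2, a=1, b=Inf)
# range(s)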
