R/MRSP-fit-39.r
In MRSP: Multinomial Response Models with Structured Penalties

################################################################################
#####    An R Package for fitting (M)ultinomial (L)ogit models with a      #####
#####    (S)tructured (P)enalization for global and category-specific      #####
#####    predictors.                                                       #####
################################################################################
#####    Author: Wolfgang Pφίnecker                                        #####
#####    Last modified: 14.10.2014, 16:21                                  #####
################################################################################

## the main fitting function:
## -------------------------------------------------------------------------- ##
## Arguments: 
   ## dat: a list that contains the data. must have entries "y", "x" and "V".
   ##      "y" is a matrix with entries between 0 and 1 giving the response. in 
   ##      the case of repeated observations, the entries in y are the relative 
   ##      frequency of the response categories for a particular combination of
   ##      covariates. the amount of repeated measurements is then captured by 
   ##      weights. size nobs x K for K response categories.
   ##      "x" is a matrix with predictors that dont depend on the response 
   ##      category but whose coefficients are category-specific. size nobs x P.
   ##      "V" is a list (!) of length K whose entires are nobs x L matrices 
   ##      giving the category-specific predictors. 
   ## coef.init: a list of initial coefficients. the first entry is a K x P 
   ##      matrix corresponding to x, the second entry is a K x L matrix
   ##      belonging to V.
   ## coef.stand.init: same as coef.init, but corresponding to the standardized
   ##      predictors.
   ## offset: a vector of length nobs with offset values.
   ## weights: a vector of observation weights.
   ## grpindex: a list of two vectors that indicate which columns of the design
   ##      matrix form a group that has to be penalized jointly, e.g. the 
   ##      different dummies of a categorical predictor. the first element is 
   ##      the grouping vector for x, the second one for V. those columns with 
   ##      the same number belong to one group. the numbers must begin with 1!
   ##      MRSP always uses a group lasso for such parameter groups. if you dont
   ##      want this behaviour, simly dont specify grpindex.
   ## penindex: a list of two vectors, same size as grpindex, whose numbers 
   ##      indicate how each predictors shall be penalized. elements that share 
   ##      the same number must obviously also share the same number in penindex
   ##      possible values:
   ##      1:  global predictor ("x") whose coefficients shall be penalized with  
   ##          a group lasso penalty with grouping "across" categories.
   ##      10: global predictor, unpenalized.
   ##      11: global predictor, sparse group lasso.
   ##      12: global predictor, ordinary lasso.
   ##      13: global predictor, ridge penalty. does not support penweights.
   ##      14: global predictor, sparse group lasso + group fused lasso.
   ##      15: global predictor, "rowwise" ridge fusion on adjacent parameters.
   ##          (i.e. a discrete P-spline penalty.) only works for parameter
   ##          groups with more than one element.
   ##      16: global predictor, "columnwise" ridge fusion on adjacent para-
   ##          meters. only works if K > 1.
   ##      17: global predictor, sparge group lasso + L1 fused lasso.
   ##      18: global predictor, "rowwise" L1 fused lasso.
   ##      2:  category-specific predictor whose coefficients are penalized with
   ##          the ordinary (group-)lasso. (depending on grpindex.)
   ##      20: category-specific, unpenalized.
   ##      21: category-specific, ridge penalty.
   ##      3:  category-speficic predictor with category-specific coefficients
   ##          that are penalized by a group lasso like in "1". this variant is
   ##          not implemented yet!
   ##      30: category-specific with category-specific coefs, unpenalized.
   ##      31: category-specific with category-specific coefficients and sparse
   ##          group lasso penalty.
   ##      32: cat-cat-specific, with ordinary lasso.
   ##      33: cat-cat, with ridge. does not support penweights.
   ##      34-37: to come in the future, like the previous four versions, but
   ##             includes automatic distinction between category-specific and
   ##             global parameterizations for category-specific preditors.
   ##             in other words: these 4 will be able to shrink the "3"-series
   ##             to the "2"-series.
   ##      4:  ordinal, global effect, penalized. (cf. the "2"-series).
   ##      40: ordinal, global effect, unpenalized.
   ##      41: ordinal, global effect, ridge penalty. 
   ## tuning: a list of tuning parameters for the model. the first slot contains
   ##      the tuning parameters for the global predictors in "x", the second 
   ##      slot that for the category-specific predictors with global parameters
   ##      and the third slot those for the "3"-series from above. if a sparse 
   ##      group lasso is used in the "1"- or the "3"-series, the corresponding 
   ##      slot in tuning must contain 2 parameters instead of just one.
   ## model: an object of class MRSP.model.
   ## constr: either "symmetric" or "none", which mean that a symmetric side 
   ##         constraint or no side constraint are used. (if constr = "none",
   ##         mean-centering is used for coefficients of unpenalized predictors)
   ##         alternatively, constr can be numeric; then the corresponding 
   ##         category of y is the reference category.
   ## control: an object of class MRSP.control.
   ## fista.control: a list with control information for fista.
   ## Proximal.control: a list with control information for fistaProximal.
   ## Proximal.args: a list with information for the call to fistaProximal during
   ##      the fista algorithm. if missing, the required information will be 
   ##      computed within MRSP.fit.
   ## penweights: a list with 2 elements, one for the group lasso type penalties
   ##      and one for the ordinary lasso type penalties. each of these elements
   ##      gives the weight that is assigned to the various predictors. this is 
   ##      used for adaptive lasso estimation.
   ## adaptive: should an adaptive penalty be used? set to F to not use adaptive
   ##      penalization. set to T to use the same penalty for both initial and
   ##      the adaptively weighted final fit. set to "ML" to use the ML-fit as 
   ##      as the initial estimator. 
   ## refit: should a refit be performed?
   ## fusion: if fusion="adja", fusion-type penalties apply to all differences
   ##      between adjacent coefficients. if fusion="all", it applies to all
   ##      pairwise differences. fusion = FALSE means no fusion is considered.
   ## nonneg: if TRUE, a nonnegativity constraint is incorporated on all coef-
   ##      ficients that are subject to a grouped or ordinary lasso penalty.
## -------------------------------------------------------------------------- ##

## the main fitting method which fits the model for one conrete lambda:
setMethod("MRSP.fit",
          signature(lambda = "numeric"),
function(dat, coef.init, coef.stand.init, coef.pretres.init, offset = rep(0, nrow(dat$y)),
                     weights = rep(1, nrow(dat$y)), grpindex = NULL, penindex = NULL,
                     lambda, lambdaR = lambda, lambdaF = lambda, gamma = 1, psi = 1,
                     indg = NULL, indcs = NULL,
                     model = multinomlogit(), constr = NULL, control = MRSP.control(),
                     fista.control = NULL, Proximal.control = NULL,
                     Proximal.args = NULL, penweights = NULL, mlfit = NULL,
                     adaptive = FALSE, threshold = FALSE, refit = FALSE, fusion = FALSE, nonneg = FALSE,
                     ...)
{
 mycall <- match.call()
 ## if any objects that are not formals of MRSP.fit are passed via ... , we need
 ## to explicitly assign the corresponding objects:
 dotlist <- list(...)
 for(i in seq_len(length(dotlist))) assign(names(dotlist)[i], dotlist[[i]])
 if(is.expression(model)) model <- eval(model)

 
 ## replace all entries of mycall with their actual value. this is useful, for
 ## example, for bootstrapping or predict methods (and so on...)
 if(control@expandcall){
  excl <- c(1, which(names(mycall) %in% c("dat", "model")))
  if(is.null(mycall$model)){
   mycall$model <- expression(multinomlogit())
   excl <- c(excl, length(mycall))
  }else{
   mycall[[which(names(mycall) == "model")]] <- switch(model@name,
    "Multinomial Logit Model" = expression(multinomlogit()),
    "Sequential Logit Model" = expression(sequentiallogit()),
    "CUB Binomial Logit Model" = expression(CUBbinomiallogit()),
    "OLS Regression" = expression(OLSreg())
   )
  }
  mycall[-excl] <- mget(names(mycall[-excl]), envir = as.environment(-1))
 }
 
 ## if more than one lambda was supplied, fix this by going back to
 ## MRSP.fit(..., signature(lambda=list)):
 if(length(lambda) > 1){mycall$lambda <- list(lambda); out <- eval(mycall); return(out)}

 if(missing(constr) | is.null(constr))  constr <- model@constraint
 if(nonneg == T && constr == "symmetric"){
  constr <- 1
  warning()
  cat(" You cannot combine a nonnegativity constraint on the coefficients with a symmetric side constraint!", "\n", "The first response category is instead used as reference.","\n")
 }

 ## are there any category-specific predictors? they are stored in the 'V'-entry of dat
 hasV <- !is.null(dat$V)
 
 nobs <- nrow(dat$y)
 wnobs <- sum(weights)
 K <- ncol(dat$y)
 P <- ncol(dat$x)
 if(hasV & !is.matrix(dat$V[[1]])){dat$V <- lapply(dat$V, as.matrix)}
 if(hasV){L <- ncol(dat$V[[1]])}

 
 if(hasV){
  if(missing(penindex) | is.null(penindex)){
   if((missing(indg) | is.null(indg)) && (missing(indcs) | is.null(indcs))){
    indg <- integer(0)
    indcs <- seq(1, L)
   }else if((missing(indg) | is.null(indg)) && !(missing(indcs) | is.null(indcs))){
    indg <- seq(1, L)[-which(seq(1, L) %in% indcs)]
   }else if(!(missing(indg) | is.null(indg)) && (missing(indcs) | is.null(indcs))){
    indcs <- seq(1, L)[-which(seq(1, L) %in% indg)]
   }  
  }else{
   indg <- which(penindex[[2]] %in% c(2, 20, 21))
   indcs <- which(penindex[[2]] %in% c(3, 30, 31, 32, 33))
  } 
 }

 ## if the first response category is chosen as reference, this can cause 
 ## problems on many occasions. therefore, we rearrange stuff in this case.
 ## really? probably not! commented out for now!
 #if(constr == 1){
 # dat$y[,c(1, K)] <- dat$y[, c(K, 1)]
 # colnames(dat$y)[c(1, K)] <- colnames(dat$y)[c(K, 1)]
 # dat$V[c(1, K)] <- dat$V[c(K, 1)]
 # if(!(missing(mlfit) | is.null(mlfit)) | !(missing(coef.init) | is.null(coef.init)) |
 #    !(missing(coef.stand.init) | is.null(coef.stand.init)) | 
 #    !(missing(coef.pretres.init) | is.null(coef.pretres.init)) |
 #    !(missing(penweights) | is.null(penweights))  ){
 #     stop("something went completely wrong in the passing on of arguments for the case of 'constr == 1'. please use a response category different from the first one as reference.")
 # }  
 #} 

 if(!(missing(grpindex) | is.null(grpindex))){
  if(!is.list(grpindex)){grpindex <- list(grpindex)}
 }
 if(!(missing(penindex) | is.null(penindex))){
  if(!is.list(penindex)){penindex <- list(penindex)}
 }

 if((missing(mlfit) | is.null(mlfit)) & control@doMLfit == T){
  mlfit <- getmlfit(dat = dat, coef.init = coef.init, coef.stand.init = coef.stand.init,
                              offset = offset, weights = weights, grpindex = grpindex,
                              penindex = penindex, lambda = lambda, lambdaR = lambdaR,
                              lambdaF = lambdaF, gamma = gamma,
                              psi = psi, indg = indg, indcs = indcs, model = model,
                              control = control, fista.control = fista.control, constr = constr,
                              Proximal.control = Proximal.control, Proximal.args = Proximal.args,
                              adaptive = adaptive, threshold = threshold, refit = refit,
                              nonneg = nonneg, ...)
 }                             

 if(missing(penweights) | is.null(penweights)){
  penweights <- getpenweights(dat = dat, coef.init = coef.init, coef.stand.init = coef.stand.init,
                              offset = offset, weights = weights, grpindex = grpindex,
                              penindex = penindex, lambda = lambda, lambdaR = lambdaR,
                              lambdaF = lambdaF, gamma = gamma,
                              psi = psi, indg = indg, indcs = indcs, model = model, 
                              control = control, fista.control = fista.control, constr = constr,
                              Proximal.control = Proximal.control, Proximal.args = Proximal.args,
                              adaptive = adaptive, threshold = threshold, refit = refit,
                              fusion = fusion, mlfit = mlfit, nonneg = nonneg, ...)
 }
 

 if(is.list(lambda) | length(lambda) != 1 | lambda < 0)
   stop("something went wrong with lambda")
   
 tuning <- list(lambda = lambda,
                gamma  = gamma,
                psi    = psi,
                lambdaR = lambdaR,
                lambdaF = lambdaF)


 ## handling the intercept
 which.intercept <- which(apply(dat$x, 2, function(u) diff(range(u)) < .Machine$double.eps ^ 0.5))
 has.intercept <- !(length(which.intercept) == 0)
 if((length(which.intercept) == 1) && which.intercept != 1){
  dat$x[,c(1, which.intercept)] <- dat$x[,c(which.intercept, 1)] 
  penindex[[1]][c(1,which.intercept)] <- penindex[[1]][c(which.intercept,1)]
  grpindex[[1]][c(1,which.intercept)] <- grpindex[[1]][c(which.intercept,1)]
  if(!all(coef[[1]] == 0))
  coef[[1]][,c(1,which.intercept)] <- coef[[1]][,c(which.intercept,1)]
  which.intercept <- 1
 }
 if(length(which.intercept) > 1)
   stop("more than one intercept column specified")

 ## are we using an ordinal regression model?
 isordinal <- model@name %in% c("Cumulative Logit Model", "Sequential Logit Model")
 if(isordinal){
  if(!is.list(penindex)){
   if(is.null(penindex)) stop("penindex must be supplied in ordinal regression")
   penindex <- list(penindex)
  }
  sl.indg <- which(penindex[[1]] %in% c(4, 40, 41))
  sl.indcs <- which(penindex[[1]] %in% c(1, 10, 11, 12, 13, 14))
 }else{
  sl.indg <- NULL
  sl.indcs <- NULL
 } 
 if(isordinal && (constr != "none")) stop("You must use no constraint in ordinal models")

 ## now fill in the potentially missing arguments:
 if(missing(coef.init) | is.null(coef.init)){
  coef <- list()
  coef[[1]] <- matrix(0, nrow = K, ncol = P)
  ybar <- apply(dat$y, 2, function(u) weighted.mean(u, w = weights))
  if(any(ybar == 0)){
   print(ybar)
   stop("ill-conditioned data: at least one of the response categories was not observed")
  }    
  ## if there are ties in the relative frequencies of the categories, break them
  ## note that at least one column of coef[[1]] must contain no ties
  if(length(unique(ybar)) != length(ybar)){
   ybar <- ybar + runif(length(ybar), 0, 1e-2)
   lyb <- sum(ybar)
   ybar <- ybar / lyb
  }
  
  if(!isordinal){
   if(is.numeric(constr)){
    coef[[1]][-constr,1] <- log(ybar[-constr]/(1-sum(ybar[-constr])))
    coef[[1]][constr,1] <- 0
   }else{
    coef[[1]][,1] <- log(ybar / prod(ybar)^(1/length(ybar)))
    coef[[1]][,1] <- coef[[1]][,1] - mean(coef[[1]][,1])
   }  
  }else{
   coef[[1]][,1] <- model@link(mu = matrix(ybar, nrow = 1), constr = constr)
  }
    
  if(hasV){
   coef[[2]] <- matrix(0, nrow = K, ncol = L)
  }
  if(K == 1){
   coef[[1]][,1] <- weighted.mean(dat$y, w = weights)
  }
 }else{coef <- coef.init}

 if(missing(grpindex) | is.null(grpindex)){
  grpindex <- list()
  grpindex[[1]] <- seq_len(P)
  if(hasV){
   grpindex[[2]] <- seq(from = P+1, to = P+L)
  } 
 }

 if(missing(penindex) | is.null(penindex)){
  penindex <- list()
  penindex[[1]] <- rep(1, P)
  penindex[[1]][1] <- 10                                                               
  if(hasV){
   penindex[[2]][indg] <- rep(2, length(indg))                                  
   penindex[[2]][indcs] <- rep(3, length(indcs))
  }
 } 

 if(missing(coef.stand.init) | is.null(coef.stand.init)){
  coef.stand <- coef
 }else{coef.stand <- coef.stand.init}
 if(missing(coef.pretres.init) | is.null(coef.pretres.init)){
  coef.pretres <- coef.stand
 }else{coef.pretres <- coef.pretres.init}
 if(!is.list(coef.pretres)){coef.pretres <- list(coef.pretres)} 
 if(!is.list(coef.stand)){coef.stand <- list(coef.stand)}
 if(!is.list(coef)){coef <- list(coef)}

 ## set up the difference matrix 'R' for GFL-fusion-type penalties
 if(fusion == F){
  R <- NULL
 }else{
  if(is.numeric(constr)) Kf <- K-1 else Kf <- K         ## Kf = relevant size of the problem for fusion penalties
  #if(Kf < 2) stop("a fusion penalty was requested for a too small parameter group")
  if(Kf <= 2) R <- matrix(c(-1,1), nrow=1)
  if(Kf > 2){
   if(fusion == "adja"){
    R <- c(-1, 1, rep(0, Kf-2))
    for(i in 1:(Kf-2)){
     R <- rbind(R, c(rep(0,i),-1,1,rep(0,Kf-2-i)))
    }
    R <- matrix(R, ncol=Kf)
   }else if(fusion == "all"){
    R <- cbind(rep(1,Kf-1), -diag(Kf-1))
    for(i in 1:(Kf-1)){
     add <- cbind(matrix(0,nrow=Kf-1-i,ncol=i),rep(1,Kf-1-i),-diag(Kf-1-i))
     R <- rbind(R,add)
    }
   }
  }
 }#else stop("option 'fusion' in MRSP.fit must be either FALSE or ``adja`` or ``all``")
 

 ## error checking
 if(!(is.numeric(constr) | constr == "symmetric" | constr == "none"))
   stop("the identifiability constraint is misspecified")
   
 if(!all(weights == rep(1, nrow(dat$y))) && (control@standardize == T))
   warning(paste("the usage of weights for penalized parameter groups together with standardization does not yet produce exact results. maybe try to apply the standardization manually and set control@standardize = F"))
 
 if(!is.matrix(dat$x))
   stop("x has to be a matrix")

 if(any(is.na(dat$x)))
   stop("Missing values in x not allowed!") 
 
 if(!is.numeric(dat$y))
   stop("y must be 'numeric'")
   
 if(!is.matrix(dat$y))
   stop("y must be a matrix. (wide format)")
   
 if(!model@check(dat))
   stop("y values outside the appropriate range for the specified model class!")
   
 if(nrow(coef[[1]]) != K)
   stop("coef.init has wrong format")
   
 if(ncol(coef[[1]]) != P)
   stop("coef.init has wrong format")
   
 if(is.numeric(constr) && !all(coef[[1]][constr,] == 0))
   stop("coef.init does not comply with the chosen side constraint")  
   
 if(length(weights) != nobs)
   stop("weight vector has wrong length")
   
 if(any(weights <= 0))
   stop("weights must be strictly positive!")  

 if(is.matrix(offset)){
  if(nrow(offset) != nobs)
    stop("nrow(offset) not equal to the number of observations")
  if(ncol(offset) != K)
    stop("ncol(offset) not equal to the number response categoires")
 }
 if(is.vector(offset)){
  if(length(offset) != nobs)
    stop("length(offset) not equal to the number of observations")
 }
    
 if(length(grpindex[[1]]) != P)
   stop("grpindex[[1]] has wrong length")

 if(length(penindex[[1]]) != P)
   stop("penindex[[1]] has wrong length")
    
 if(length(tuning) != 5)
   stop("for the given data, the tuning object needs to be a list of length 4")
     
 if(hasV){
  if(!is.list(dat$V))
    stop("V has to be a list")
    
  if(length(dat$V) != K)
    stop("V has wrong length")
  
  if(nrow(dat$V[[1]]) != nobs)
    stop("the elements of V have wrong size")
   
  if(ncol(dat$V[[1]]) != L)
    stop("the elements of V have wrong size")    
    
  if(Reduce(any, lapply(dat$V, function(u){any(is.na(u))})))
    stop("Missing values in V not allowed!")
  
  if(!Reduce(all, lapply(dat$V, is.numeric)))
    stop("Values in V must be numeric")
   
  if(ncol(coef[[2]]) != L)
    stop("coef.init has wrong format")
    
  if(nrow(coef[[2]]) != K)
    stop("coef.init has wrong format")  
    
  if(length(grpindex[[2]]) != L)
    stop("grpindex[[2]] has wrong length")
    
  if(length(penindex[[2]]) != L)
    stop("penindex[[2]] has wrong length")
  
  if(length(indg) + length(indcs) != L)
    stop("indg and indcs objects have wrong length")  
 }


 ## creating objects that give the position, type, length etc. of the different
 ## penalization and groups. most of this is passed to Proximal.args
 
 ## global predictors
 ## structured group lasso penalty with groups across categories
 any.grouppen.x <- any(penindex[[1]] == 1)   
 which.grouppen.x <- which(penindex[[1]] == 1)
 groups.grouppen.x <- unique(grpindex[[1]][which.grouppen.x])
 
 ## unpenalized
 any.notpen.x <- any(penindex[[1]] == 10)
 which.notpen.x <- which(penindex[[1]] == 10)
 groups.notpen.x <- unique(grpindex[[1]][which.notpen.x])
 
 ## sparse group lasso
 any.spgrppen.x <- any(penindex[[1]] == 11)
 which.spgrppen.x <- which(penindex[[1]] == 11)
 groups.spgrppen.x <- unique(grpindex[[1]][which.spgrppen.x])
 
 ## ordinary lasso
 any.lassopen.x <- any(penindex[[1]] == 12)
 which.lassopen.x <- which(penindex[[1]] == 12)
 groups.lassopen.x <- unique(grpindex[[1]][which.lassopen.x])
 
 ## ridge penalty
 any.ridgepen.x <- any(penindex[[1]] == 13)
 which.ridgepen.x <- which(penindex[[1]] == 13)
 groups.ridgepen.x <- unique(grpindex[[1]][which.ridgepen.x])
 
 ## sparse group lasso + group fused lasso
 any.spGFLpen.x <- any(penindex[[1]] == 14)
 which.spGFLpen.x <- which(penindex[[1]] == 14)
 groups.spGFLpen.x <- unique(grpindex[[1]][which.spGFLpen.x])
 
 ## ridge fusion aka discrete P-splines
 any.Psplinepen.x <- any(penindex[[1]] == 15)
 which.Psplinepen.x <- which(penindex[[1]] == 15)
 groups.Psplinepen.x <- unique(grpindex[[1]][which.Psplinepen.x])
 
 ## columnwise ridge fusion
 any.colPsplinepen.x <- any(penindex[[1]] == 16)
 which.colPsplinepen.x <- which(penindex[[1]] == 16)
 groups.colPsplinepen.x <- unique(grpindex[[1]][which.colPsplinepen.x])
 
 ## fused sparge group lasso (SGL + L1-fusion)
 any.spFGLpen.x <- any(penindex[[1]] == 17)
 which.spFGLpen.x <- which(penindex[[1]] == 17)
 groups.spFGLpen.x <- unique(grpindex[[1]][which.spFGLpen.x])
 
 ## rowwise L1 fusion
 any.rowFLpen.x <- any(penindex[[1]] == 18)
 which.rowFLpen.x <- which(penindex[[1]] == 18)
 groups.rowFLpen.x <- unique(grpindex[[1]][which.rowFLpen.x])
 
 ## if spGFL is used, prepare some objects for it:
 if(any.spGFLpen.x && !isordinal) stop("GFL penalty is currently only supported for ordinal models")

 ## set up the difference matrix 'Omega' for P-spline-type fusion penalties
 if(any.Psplinepen.x || any.rowFLpen.x){
  Omega <- list() #; length(Omega) <- length(groups.Psplinepen.x)
  tD <- list()
  for(j in sort(c(groups.Psplinepen.x, groups.rowFLpen.x))){
   j.which <- which(grpindex[[1]] == j)
   kj <- length(j.which)
   if(kj < 2) stop("a fusion penalty was requested for a too small parameter group")
   if(kj == 2){
    tD[[j]] <- c(-1,1)
    Omega[[j]] <- crossprod(matrix(c(-1,1),nrow=1,ncol=2))
   }else if(kj > 2){
    if(fusion == "adja"){
     D <- c(-1, 1, rep(0, kj-2))
     for(i in 1:(kj-2)){
      D <- rbind(D, c(rep(0,i),-1,1,rep(0,kj-2-i)))
     }
     D <- matrix(D, ncol=kj)
    }else if(fusion == "all"){
     D <- cbind(rep(1, kj-1), -diag(kj-1))
     for(i in 1:(kj-1)){
      add <- cbind(matrix(0, nrow=kj-1-i, ncol=i), rep(1, kj-1-i), -diag(kj-1-i))
      D <- rbind(D, add)
     }
    }
    tD[[j]] <- t(D)
    Omega[[j]] <- crossprod(D)
   }
  }
 }

 if(hasV){
  ## category-specific predictors with global coefficients
  ## ordinary (group-)lasso penalty
  any.globalpen.V <- any(penindex[[2]] == 2)
  which.globalpen.V <- which(penindex[[2]] == 2)
  groups.globalpen.V <- unique(grpindex[[2]][which.globalpen.V])
  
  ## unpenalized
  any.globalunpen.V <- any(penindex[[2]] == 20)
  which.globalunpen.V <- which(penindex[[2]] == 20)
  groups.globalunpen.V <- unique(grpindex[[2]][which.globalunpen.V])
  
  ## ridge
  any.globalridgepen.V <- any(penindex[[2]] == 21)
  which.globalridgepen.V <- which(penindex[[2]] == 21)
  groups.globalridgepen.V <- unique(grpindex[[2]][which.globalridgepen.V])
  
  ## category-specific predictors with category-specific coefficients 
  ## structured group lasso like in "1"
  any.catpen.V <- any(penindex[[2]] == 3)
  which.catpen.V <- which(penindex[[2]] == 3)
  groups.catpen.V <- unique(grpindex[[2]][which.catpen.V])
  
  ## unpenalized
  any.catunpen.V <- any(penindex[[2]] == 30)
  which.catunpen.V <- which(penindex[[2]] == 30)
  groups.catunpen.V <- unique(grpindex[[2]][which.catunpen.V])
  
  ## sparse group lasso
  any.catspgrppen.V <- any(penindex[[2]] == 31)
  which.catspgrppen.V <- which(penindex[[2]] == 31)
  groups.catspgrppen.V <- unique(grpindex[[2]][which.catspgrppen.V])
  
  ## ordinary lasso
  any.catlassopen.V <- any(penindex[[2]] == 32)
  which.catlassopen.V <- which(penindex[[2]] == 32)
  groups.catlassopen.V <- unique(grpindex[[2]][which.catlassopen.V])
  
  ## ridge
  any.catridgepen.V <- any(penindex[[2]] == 33)
  which.catridgepen.V <- which(penindex[[2]] == 33)
  groups.catridgepen.V <- unique(grpindex[[2]][which.catridgepen.V])
 }
 
 if(isordinal){
  ## ordinal model with global effects
  ## ordinary (group-)lasso penalty
  any.globalpen.o <- any(penindex[[1]] == 4)
  which.globalpen.o <- which(penindex[[1]] == 4)
  groups.globalpen.o <- unique(grpindex[[1]][which.globalpen.o])
  
  ## unpenalized
  any.globalunpen.o <- any(penindex[[1]] == 40)
  which.globalunpen.o <- which(penindex[[1]] == 40)
  groups.globalunpen.o <- unique(grpindex[[1]][which.globalunpen.o])
  
  ## ridge
  any.globalridgepen.o <- any(penindex[[1]] == 41)
  which.globalridgepen.o <- which(penindex[[1]] == 41)
  groups.globalridgepen.o <- unique(grpindex[[1]][which.globalridgepen.o])
 } 
    

 ## degrees of freedom, groups and so on...
 if(isordinal){
  pen.which.x <- sort(c(which.grouppen.x, which.spgrppen.x, which.lassopen.x, which.ridgepen.x,
                        which.spGFLpen.x, which.spFGLpen.x, which.rowFLpen.x,
                        which.globalpen.o, which.globalridgepen.o))
  notpen.which.x <- sort(c(which.notpen.x, which.globalunpen.o))
  Pspline.which.x <- sort(c(which.Psplinepen.x, which.colPsplinepen.x))
 }else{
  pen.which.x <- sort(c(which.grouppen.x, which.spgrppen.x, which.lassopen.x, which.ridgepen.x,
                        which.spGFLpen.x, which.spFGLpen.x, which.rowFLpen.x))
  notpen.which.x <- sort(c(which.notpen.x))
  Pspline.which.x <- sort(c(which.Psplinepen.x, which.colPsplinepen.x))
 }    
 nrpen.x <- length(pen.which.x)
 nrnotpen.x <- length(notpen.which.x)
 nrPspline.x <- length(Pspline.which.x)
 
 grpindex.pen.x <- grpindex[[1]][pen.which.x]
 nrgrp.pen.x <- length(unique(grpindex.pen.x))
 grpindex.notpen.x <- grpindex[[1]][notpen.which.x]
 nrgrp.notpen.x <- length(unique(grpindex.notpen.x))
 grpindex.Pspline.x <- grpindex[[1]][Pspline.which.x]
 nrgrp.Pspline.x <- length(unique(grpindex.Pspline.x))
 
 dict.pen.x <- sort(unique(grpindex.pen.x))
 pen.tab.x <- table(grpindex.pen.x)[as.character(dict.pen.x)]
  
 if(hasV){
  pen.which.V <- sort(c(which.globalpen.V, which.globalridgepen.V, which.catpen.V, which.catspgrppen.V, which.catlassopen.V, which.catridgepen.V))
  nrpen.V <- length(pen.which.V)
  notpen.which.V <- sort(c(which.globalunpen.V, which.catunpen.V))
  nrnotpen.V <- length(notpen.which.V)

  grpindex.pen.V <- grpindex[[2]][pen.which.V]
  nrgrp.pen.V <- length(unique(grpindex.pen.V))
  grpindex.notpen.V <- grpindex[[2]][notpen.which.V]
  nrgrp.notpen.V <- length(unique(grpindex.notpen.V))

  dict.pen.V <- sort(unique(grpindex.pen.V))
  pen.tab.V <- table(grpindex.pen.V)[as.character(dict.pen.V)]
 }  

 ## extract control information
 max.iter <- control@max.iter
 rel.tol <- control@rel.tol
 standardize <- control@standardize
 keepdat <- control@keepdat
 ridgestabil <- control@ridgestabil
 ridgestabilrf <- control@ridgestabilrf
 lambdastabil <- control@lambdastabil
 trace <- control@trace
 cut.to.rel <- control@cut.to.rel
 SPG.iter.max <- control@SPG.iter.max
 SPG.accel <- control@SPG.accel
 SPG.eps <- control@SPG.eps
 
 ## extract the model information
 link <- model@link
 invlink <- model@invlink
 loglik <- model@loglik
 gradient <- model@gradient
 fisher <- model@fisher
 name <- model@name

 #####################################################################################
 ## orthonormalization of the original observations. should be done in the wrap-around
 ## call to MRSP.fit, which applies MRSP.fit to the sequence of lambda-
 ## values. 
 ##### note: a correct backtransformation from the coefficients belonging to
 ## standardized predictors to the corresponding coefficients belonging to the
 ## original predictors is only possible if an intercept column is present!
 
 warnold <- options("warn")
 if(warnold < 2)
  options(warn = 1)
 if(!has.intercept & standardize){
  warning()
  cat(" Standardization of predictors was requested while no intercept column could be found.",'\n',
  "For this case, backtransformation of coefficients to the original scale is not possible, so that output for the standardized predictors is returned instead.",'\n',
  "Be careful!",'\n')
 }
 if(warnold < 2)
  options(warn = as.numeric(warnold))
 
 ## orthonormalization of observations
 x.original <- dat$x
 if(hasV) V.original <- dat$V
    
 if(standardize){
  sweights <- weights / sum(weights)
  x.centered <- dat$x
  mean.x <- apply(dat$x[,,drop=F], 2, function(u) weighted.mean(u, weights))
  if(has.intercept){
   x.centered[,-1] <- sweep(dat$x[,-1, drop=F], 2, mean.x[-1])
  }else{x.centered <- sweep(dat$x[,, drop=F], 2, mean.x)} 

  xw <- x.centered
  scale.pen.x <- list()
  scale.notpen.x <- NULL
  ## if there are no unpenalized groups, skip the following:

  if(length(notpen.which.x) > 0){
   if(has.intercept){
    scale.notpen.x <- sqrt(drop(sweights %*% (x.centered[, notpen.which.x[-1]]^2)))
    xw[,notpen.which.x[-1]] <- scale(x.centered[, notpen.which.x[-1]], FALSE, scale.notpen.x)
   }else{
    scale.notpen.x <- sqrt(drop(sweights %*% (x.centered[, notpen.which.x]^2)))
    xw[,notpen.which.x] <- scale(x.centered[, notpen.which.x], FALSE, scale.notpen.x)     
   }
  }
  
  for(j in dict.pen.x){
   j.which <- which(grpindex[[1]] == j)
   decomp.x <- qr(sqrt(weights) * x.centered[, j.which])
   if(decomp.x$rank < length(j.which)) ## warn if block doesnt have full rank
     stop("Block belonging to columns ", paste(j.which), "does not have full rank!")
   scale.pen.x[[j]] <- qr.R(decomp.x) * 1/sqrt(wnobs)
   xw[, j.which] <- qr.Q(decomp.x) * sqrt(wnobs)
  }
  
  dat$x <- xw  
  
  if(hasV){
   eweights <- rep(weights, times=K)
   seweights <- rep(sweights, times=K)
   ## extract V into a matrix of size (nobs * K) x L:
   Vs <- do.call(rbind, dat$V)
   mean.V <- apply(Vs, 2, function(u) weighted.mean(u, eweights))
   V.centered <- sweep(Vs, 2, mean.V)

   Vw <- V.centered
   scale.pen.V <- list()
   scale.notpen.V <- NULL
   if(length(notpen.which.V) > 0){
    scale.notpen.V <- sqrt(drop(seweights %*% (V.centered[, notpen.which.V]^2)))
    Vw[, notpen.which.V] <- scale(V.centered[, notpen.which.V], FALSE, scale.notpen.V)
   }
   
   for(j in dict.pen.V){
    j.which <- which(grpindex[[2]] == j)
    decomp.V <- qr(sqrt(eweights) * V.centered[, j.which])
    if(decomp.V$rank < length(j.which))
      stop("Block belonging to columns ", paste(j.which), "in V does not have full rank!")
    scale.pen.V[[j]] <- qr.R(decomp.V) * 1/sqrt(wnobs * K)
    Vw[, j.which] <- qr.Q(decomp.V) * sqrt(wnobs * K)
   }
   
   for(i in seq_len(K)){dat$V[[i]] <- Vw[seq(nobs*i-nobs+1,nobs*i),, drop=F]}                                      
  }
  
  if(threshold != F){
   coef <- coef.pretres
  }else{  
   coef <- coef.stand
  }
 }     

 ## assign the proxy class to the coef object
 class(coef) <- "MRSP.coef"

 eta <- updateEta(dat=dat, coef=coef, offset=offset)
 mu <- invlink(eta)
                    
 ## prepare the tuning object for the call to fista. note that only true "lambda"
 ## parameters may be left in tuning when fista calls it. gamma is for spgrplasso
 ## psi is for global vs cat-specific
 ## tuning[[1]] is for the group lasso penalty,
 ## tuning[[2]] for the ordinary lasso
 ## tuning[[3]] is for the category-specific predictors with global effects or
 ## for cat-spec predictors with cat-spec effects and group lasso penalty
 ## tuning[[6]] is for the category-specific predictors with cat-spec and 
 ## ordinary lasso penalty
 ## tuning[[4]] is for ridge penalties
 ## tuning[[5]] is for a ridge penalty that may be used for all parameters which
 ## otherwise would be unpenalized (including the intercept), which can be tried
 ## in order to stabilize the model if divergence of parameters occurs.
 ## tuning[[7]] is for the GFL penalty and, generally speaking, fusion penalties.
 tuning.old <- tuning
 lambda <- tuning$lambda
 gamma <- tuning$gamma
 psi <- tuning$psi
 lambdaR <- tuning$lambdaR
 lambdaF <- tuning$lambdaF
 tuning <- list()
 tuning[[1]] <- lambda *  psi
 tuning[[2]] <- lambda * gamma *psi
 tuning[[4]] <- lambdaR
 if(hasV){
  tuning[[3]] <- lambda * (2 - psi)
  tuning[[6]] <- lambda * (2 - psi) * gamma
 }
 tuning[[7]] <- lambdaF
 lmod <- K*P
 if(hasV) lmod <- lmod + L
 if(nobs <= lmod) lambdastabil <- 1*lambdastabil   
 tuning[[5]] <- 4*sqrt(2*log(lmod)/nobs) * lambdastabil

 ## list with arguments for the calls to fistaProximal and the other fistaGenerics
 if(missing(Proximal.args) | is.null(Proximal.args)){
  if(hasV){Proximal.args <- list(
                       any.grouppen.x = any.grouppen.x,
                       any.notpen.x = any.notpen.x,   
                       any.spgrppen.x = any.spgrppen.x,
                       any.lassopen.x = any.lassopen.x,
                       any.ridgepen.x = any.ridgepen.x,
                       any.spGFLpen.x = any.spGFLpen.x,
                       any.Psplinepen.x = any.Psplinepen.x,
                       any.globalpen.V = any.globalpen.V,
                       any.globalunpen.V = any.globalunpen.V,
                       any.globalridgepen.V = any.globalridgepen.V,
                       any.catpen.V = any.catpen.V,
                       any.catunpen.V = any.catunpen.V,
                       any.catspgrppen.V = any.catspgrppen.V,
                       any.catlassopen.V = any.catlassopen.V,
                       any.catridgepen.V = any.catridgepen.V,
                       which.grouppen.x = which.grouppen.x,
                       which.notpen.x = which.notpen.x,   
                       which.spgrppen.x = which.spgrppen.x,
                       which.lassopen.x = which.lassopen.x,
                       which.ridgepen.x = which.ridgepen.x,
                       which.spGFLpen.x = which.spGFLpen.x,
                       which.Psplinepen.x = which.Psplinepen.x,
                       which.globalpen.V = which.globalpen.V,
                       which.globalunpen.V = which.globalunpen.V,
                       which.globalridgepen.V = which.globalridgepen.V,
                       which.catpen.V = which.catpen.V,
                       which.catunpen.V = which.catunpen.V,
                       which.catspgrppen.V = which.catspgrppen.V,
                       which.catlassopen.V = which.catlassopen.V,
                       which.catridgepen.V = which.catridgepen.V,
                       groups.grouppen.x = groups.grouppen.x,
                       groups.notpen.x = groups.notpen.x,
                       groups.spgrppen.x = groups.spgrppen.x,
                       groups.lassopen.x = groups.lassopen.x,
                       groups.ridgepen.x = groups.ridgepen.x,
                       groups.spGFLpen.x = groups.spGFLpen.x,
                       groups.Psplinepen.x = groups.Psplinepen.x,
                       groups.globalpen.V = groups.globalpen.V,
                       groups.globalunpen.V = groups.globalunpen.V,
                       groups.globalridgepen.V = groups.globalridgepen.V,
                       groups.catpen.V = groups.catpen.V,
                       groups.catunpen.V = groups.catunpen.V,
                       groups.catspgrppen.V = groups.catspgrppen.V,
                       groups.catlassopen.V = groups.catlassopen.V,
                       groups.catridgepen.V = groups.catridgepen.V,
                       any.colPsplinepen.x = any.colPsplinepen.x,
                       which.colPsplinepen.x = which.colPsplinepen.x,
                       groups.colPsplinepen.x = groups.colPsplinepen.x,
                       any.spFGLpen.x = any.spFGLpen.x,
                       which.spFGLpen.x = which.spFGLpen.x,
                       groups.spFGLpen.x = groups.spFGLpen.x,
                       any.rowFLpen.x = any.rowFLpen.x,
                       which.rowFLpen.x = which.rowFLpen.x,
                       groups.rowFLpen.x = groups.rowFLpen.x,
                       dict.pen.x = dict.pen.x,
                       dict.pen.V = dict.pen.V,
                       gamma = gamma,
                       psi = psi,
                       hasV = hasV,
                       isordinal = isordinal,
                       ridgestabil = ridgestabil,
                       ridgestabilrf = ridgestabilrf,
                       lambdastabil = lambdastabil,
                       extrastabil = control@extrastabil,
                       sancount = 0,
                       n = nobs,
                       constraint = constr,
                       indg = indg,
                       indcs = indcs,
                       sl.indg = sl.indg,
                       sl.indcs = sl.indcs,
                       R = R,
                       SPG.iter.max = SPG.iter.max,
                       SPG.accel = SPG.accel,
                       SPG.eps = SPG.eps,
                       modelname = model@name,
                       nonneg = nonneg)
  }else{Proximal.args <- list(
                       any.grouppen.x = any.grouppen.x,
                       any.notpen.x = any.notpen.x,   
                       any.spgrppen.x = any.spgrppen.x,
                       any.lassopen.x = any.lassopen.x,
                       any.ridgepen.x = any.ridgepen.x,
                       any.spGFLpen.x = any.spGFLpen.x,
                       any.Psplinepen.x = any.Psplinepen.x,
                       which.grouppen.x = which.grouppen.x,
                       which.notpen.x = which.notpen.x,   
                       which.spgrppen.x = which.spgrppen.x,
                       which.lassopen.x = which.lassopen.x,
                       which.ridgepen.x = which.ridgepen.x,
                       which.spGFLpen.x = which.spGFLpen.x,
                       which.Psplinepen.x = which.Psplinepen.x,
                       groups.grouppen.x = groups.grouppen.x,
                       groups.notpen.x = groups.notpen.x,
                       groups.spgrppen.x = groups.spgrppen.x,
                       groups.lassopen.x = groups.lassopen.x,
                       groups.ridgepen.x = groups.ridgepen.x,
                       groups.spGFLpen.x = groups.spGFLpen.x,
                       groups.Psplinepen.x = groups.Psplinepen.x,
                       any.colPsplinepen.x = any.colPsplinepen.x,
                       which.colPsplinepen.x = which.colPsplinepen.x,
                       groups.colPsplinepen.x = groups.colPsplinepen.x,
                       any.spFGLpen.x = any.spFGLpen.x,
                       which.spFGLpen.x = which.spFGLpen.x,
                       groups.spFGLpen.x = groups.spFGLpen.x,
                       any.rowFLpen.x = any.rowFLpen.x,
                       which.rowFLpen.x = which.rowFLpen.x,
                       groups.rowFLpen.x = groups.rowFLpen.x,
                       dict.pen.x = dict.pen.x,
                       gamma = gamma,
                       psi = psi,
                       hasV = hasV,
                       isordinal = isordinal,
                       ridgestabil = ridgestabil,
                       ridgestabilrf = ridgestabilrf,
                       lambdastabil = lambdastabil,
                       extrastabil = control@extrastabil,
                       sancount = 0,
                       n = nobs,
                       constraint = constr,
                       indg = indg,
                       indcs = indcs,
                       sl.indg = sl.indg,
                       sl.indcs = sl.indcs,
                       R = R,
                       SPG.iter.max = SPG.iter.max,
                       SPG.accel = SPG.accel,
                       SPG.eps = SPG.eps,
                       modelname = model@name,
                       nonneg = nonneg)
  }
  if(isordinal){
   Proximal.args$any.globalpen.o <- any.globalpen.o
   Proximal.args$any.globalunpen.o <- any.globalunpen.o
   Proximal.args$any.globalridgepen.o <- any.globalridgepen.o
   Proximal.args$which.globalpen.o <- which.globalpen.o
   Proximal.args$which.globalunpen.o <- which.globalunpen.o
   Proximal.args$which.globalridgepen.o <- which.globalridgepen.o
   Proximal.args$groups.globalpen.o <- groups.globalpen.o
   Proximal.args$groups.globalunpen.o <- groups.globalunpen.o
   Proximal.args$groups.globalridgepen.o <- groups.globalridgepen.o
  }
  if(any.Psplinepen.x || any.rowFLpen.x){
   Proximal.args$Omega <- Omega
   Proximal.args$tD <- tD
  }
 }
 
 logl.old <- loglik(y=dat$y, mu=mu, weights=weights)
 pen.old <- penalty(coef=coef, tuning=tuning, penweights=penweights,
                    penindex=penindex, grpindex=grpindex, Proximal.args=Proximal.args)
 fn.val.old <- -logl.old + pen.old
 coef.old <- coef

 ## create the fista.control argument
 if(missing(fista.control) | is.null(fista.control)){
  fista.control <- fista.control(rel.tol = rel.tol, max.iter = max.iter)
 }else{fista.control@rel.tol <- rel.tol; fista.control@max.iter <- max.iter} 

 ## now the call to fista where the core fitting takes place.
 updated <- fista(coef.init=coef, dat=dat, offset=offset, eta.init=eta,
                  mu.init=mu, weights=weights, model=model, control=fista.control,
                  Proximal.args=Proximal.args, Proximal.control=Proximal.control,
                  grpindex=grpindex, penindex=penindex, tuning=tuning,
                  penweights.init=penweights)
                 
 coef <- updated$coef
 penweights <- updated$penweights
 eta <- updated$eta
 mu <- updated$mu
 logl <- updated$loglikval
 logl.approx <- 0
 pen <- updated$penval
 iter.count <- updated$iter.count
 best.iter <- updated$best.iter
 d.fn.updated <- updated$d.fn
 fn.val <- updated$fn.val
 ridgestabil <- updated$ridgestabil
 control@ridgestabilrf <- updated$ridgestabilrf
 
 ## some checks that nothing has gone completely wrong
 if(is.na(pen) | is.na(logl) | is.na(logl.approx) | is.na(fn.val) | is.na(d.fn.updated))
   warning(paste("pen, logl, logl.approx, fn.val, d.fn or several of them were NA"))
 if(class(coef) != "MRSP.coef")
   stop("the coef object returned by fista did not retain the correct class")
 
 ## convergence checks
 d.fn <- (fn.val.old - fn.val) / (0.001 + abs(fn.val))

 if(sign(d.fn) != sign(d.fn.updated)){
  if(trace){
   warning(paste("the value of the penalized loglikelihood has worsened"))
  }
 } 
 if(abs(d.fn.updated) > sqrt(rel.tol)){
  if(trace){
   print(paste("d.fn.updated:"))
   print(d.fn.updated)
   print(paste("rel.tol:"))
   print(rel.tol)
   warning(paste("the fista algorithm might have converged earlier than wanted"))
  }
 }  
 d.par <- max(mapply(function(x,y){max(abs(x-y)/(0.01+abs(y)))}, coef.old, coef))
 
 ## now prepare for the output
 #############################
 ## cut values of the standardized coefficients which are smaller than
 ## control@cut.to.rel. if it's set to 0, there is no cutting.
 ## note that this has to be done before the retransformation. otherwise, the 
 ## coefs of predictors with large variance might get cut although they are
 ## highly significant!
 coef <- lapply(coef, function(u){u[which(abs(u) <= cut.to.rel)] <- 0; return(u)})

 ## save the coefficients before thresholding so that the next model in the list
 ## of lambdas gets computed as fast as possible.
 coef.pretres <- coef
 
 ## perform thresholding
 if(is.function(control@numerictresh) & threshold == F){
  threshold <- control@numerictresh(nobs)
  if(refit == T | refit == "L") threshold <- threshold / 2
 } 
 if(threshold == T){ 
  if(hasV) dimmod <- P + L else dimmod <- P
  threshold <- 0.5*sqrt(2*log(dimmod)/nrow(dat$y))
 } 
 if(is.numeric(threshold) & length(threshold) == 1){
  for(j in dict.pen.x){
   j.which <- which(grpindex[[1]] == j)
   if(penindex[[1]][j.which[1]] %in% c(1, 13)){
    if(constr == "none"){
     thresholdval <- l2norminv(coef[[1]][, j.which])
    }else{
     thresholdval <- l2norminv.x(coef[[1]][, j.which])
    }       
    if(thresholdval < threshold) coef[[1]][, j.which] <- 0
   }else if(penindex[[1]][j.which[1]] %in% c(12)){
    if(constr == "none"){
     thresholdval <- apply(coef[[1]][, j.which, drop=F], 1, l2norminv)
    }else{
     thresholdval <- apply(coef[[1]][, j.which, drop=F], 1, l2norminv.x)
    } 
    whichnull <- which(thresholdval < threshold)
    coef[[1]][whichnull, j.which] <- 0
   }else if(penindex[[1]][j.which[1]] %in% c(11)){
    if(constr == "none"){
     thresholdval1 <- l2norminv(coef[[1]][, j.which])
     thresholdval2 <- apply(coef[[1]][, j.which, drop=F], 1, l2norminv)
    }else{
     thresholdval1 <- l2norminv.x(coef[[1]][, j.which])
     thresholdval2 <- apply(coef[[1]][, j.which, drop=F], 1, l2norminv.x)
    }                        
    if(thresholdval1 < threshold) coef[[1]][, j.which] <- 0
    whichnull <- which(thresholdval2 < threshold)
    coef[[1]][whichnull, j.which] <- 0
   }else if(penindex[[1]][j.which[1]] %in% c(14)){
    if((constr == "none" | constr == "symmetric")){
     thresholdval1 <- l2norminv(coef[[1]][, j.which])
     thresholdval2 <- apply(coef[[1]][, j.which, drop=F], 1, l2norminv)
     thresholdval3 <- l2norminv(R%*%coef[[1]][, j.which])
    }else{ stop("GFL penalty can only be used in models with no or a symmetric side constraint for now") }
    if(thresholdval1 < threshold) coef[[1]][, j.which] <- 0
    if(thresholdval3 < threshold){
     coef[[1]][, j.which] <- matrix(rep(colMeans(coef[[1]][, j.which]), each=K), ncol=length(j.which))
    }
    whichnull <- which(thresholdval2 < threshold)
    coef[[1]][whichnull, j.which] <- 0
   }else if(penindex[[1]][j.which[1]] %in% c(17)){
    if(constr == "none" | constr == "symmetric"){
     thresholdval1 <- l2norminv(coef[[1]][, j.which])
     thresholdval2 <- apply(coef[[1]][, j.which, drop=F], 1, l2norminv)
    }else{ stop("Sparse group lasso with L1-fusion can only be used in models without side constraint for now") }
    if(thresholdval1 < threshold) coef[[1]][, j.which] <- 0
    clusterj <- sapply(seq(nrow(coef[[1]][, j.which])), list)
    for(s in seq(nrow(coef[[1]][, j.which]))){
     for(l in seq(j, nrow(coef[[1]][, j.which]))){
      if(l2norminv(coef[[1]][s, j.which] - coef[[1]][l, j.which]) < threshold){
       s.ind <- which(clusterj %in% coef[[1]][s, j.which])
       l.ind <- which(clusterj %in% coef[[1]][l, j.which])
       clusterj[[s.ind]] <- c(clusterj[[s.ind]], clusterj[[l.ind]])
       clusterj[[l.ind]] <- NULL
       coef[[1]][clusterj[[s.ind]], j.which] <- matrix(colMeans(coef[[1]][clusterj[[s.ind]], j.which]), nrow=length(clusterj[[s.ind]]), ncol=length(j.which), byrow=T)
      }
     }
    }
    whichnull <- which(thresholdval2 < threshold)
    coef[[1]][whichnull, j.which] <- 0
   }else if(penindex[[1]][j.which[1]] %in% c(18)){
    if(constr == "none"){
     for(i in seq(nrow(coef[[1]][, j.which]))){
      clusteri <- sapply(seq(ncol(coef[[1]][, j.which])), list)                 ## first every coefficient forms its own cluster
      for(s in seq(ncol(coef[[1]][, j.which])-1)){
       for(l in seq(s+1, ncol(coef[[1]][, j.which]))){
        if(abs(coef[[1]][i, j.which][,s] - coef[[1]][i, j.which][,l]) < threshold){
         s.ind <- which(sapply(clusteri, function(x) s %in% x))   #which(coef[[1]][i, j.which] %in% coef[[1]][i, j.which][,s])
         l.ind <- which(sapply(clusteri, function(x) l %in% x))   #which(coef[[1]][i, j.which] %in% coef[[1]][i, j.which][,l])
         clusteri[[s.ind]] <- unique(c(clusteri[[s.ind]], clusteri[[l.ind]]))
         if(s.ind != l.ind){
          clusteri[[l.ind]] <- NULL
          if(s.ind > l.ind) s.ind <- s.ind - 1
         }
         coef[[1]][i, j.which][, clusteri[[s.ind]]] <- mean(coef[[1]][i, j.which][, clusteri[[s.ind]]])
        }
       }
      }
     }
    }else{
     stop("Rowwise L1-fusion can only be used in models without side constraint for now")
    }
   }
   if(penindex[[1]][j.which[1]] %in% c(4, 41)){
    thresholdval <- l2norminv(coef[[1]][1, j.which])                                        
    if(thresholdval < threshold) coef[[1]][, j.which] <- 0
   }      
  }
  if(hasV){
   for(j in dict.pen.V){
    j.which <- which(grpindex[[2]] == j) 
    if(penindex[[2]][j.which[1]] %in% c(2, 21)){
     thresholdval <- l2norminv(coef[[2]][1, j.which])                                        
     if(thresholdval < threshold) coef[[2]][, j.which] <- 0
    }else if(penindex[[2]][j.which[1]] %in% c(3, 33)){
     thresholdval <- l2norminv(coef[[2]][, j.which])
     if(thresholdval < threshold) coef[[2]][, j.which] <- 0
    }else if(penindex[[2]][j.which[1]] %in% c(32)){
     thresholdval <- apply(coef[[2]][, j.which, drop=F], 1, l2norminv)
     whichnull <- which(thresholdval < threshold)
     coef[[2]][whichnull, j.which] <- 0
    }else if(penindex[[2]][j.which[1]] %in% c(31)){
     thresholdval1 <- l2norminv(coef[[2]][, j.which])
     thresholdval2 <- apply(coef[[2]][, j.which, drop=F], 1, l2norminv)
     if(thresholdval1 < threshold) coef[[2]][, j.which] <- 0
     whichnull <- which(thresholdval2 < threshold)
     coef[[2]][whichnull, j.which] <- 0
    }                            
   }
  }
 }


 ##### degrees of freedom
 ## note that the numeric thresholding performed in the previous lines means
 ## that the symmetric side constraint, if used, might not be fully satisfied
 ## for the thresholded coefs. therefore, subtracting from the edf in order to 
 ## account for the side constraint might in rare instances make the following
 ## edf formulas return negative values - thus, we use a max(edf, 0)-construct.
 ## for the general edf formula, see Tutz, Pφίnecker, Uhlmann (2014) or
 ## Yuan & Lin (2006). 
 df <- 0
 
 if(control@computeDF){
  if(any.notpen.x){
   df <- df + max(length(c(coef[[1]][,which.notpen.x])) - ifelse(constr == "none", 0, length(which.notpen.x)), 0)
  }

  if(any.ridgepen.x){
   #add <- sum(diag(dat$x%*%solve(crossprod(dat$x) + lambdaR * diag(ncol(dat$x)))%*%t(dat$x)))   ## <- that only works for linear models..
   add <- max(length((coef[[1]][,which.ridgepen.x])) - ifelse(constr == "none", 0, length(which.ridgepen.x)), 0) / (1 + lambdaR)
   df <- df + add #max(length((coef[[1]][,which.ridgepen.x])) - ifelse(constr == "none", 0, length(which.ridgepen.x)), 0)
  }
  if(any.grouppen.x){
   for(j in groups.grouppen.x){
    j.which <- which(grpindex[[1]] == j)
    coefj <- coef[[1]][,j.which]
    df <- df + max(ifelse(l2normraw(coefj) > 0, 1 +
    (length(c(coefj)) - 1 - ifelse(constr == "none", 0, ncol(coefj))) *
    l2normraw(coefj) / l2normraw(mlfit$coef.stand[[1]][,j.which]), 0), 0)
   }
  }
 
  if(any.spgrppen.x){
   for(j in groups.spgrppen.x){
    j.which <- which(grpindex[[1]] == j)
    coefj <- coef[[1]][,j.which]
    df <- df + max((0.5 * ifelse(l2normraw(coefj) > 0, 1 +
    (length(c(coefj)) - 1 - ifelse(constr == "none", 0, ncol(coefj))) *
    l2normraw(coefj) / l2normraw(mlfit$coef.stand[[1]][,j.which]), 0)   +
    0.5 * gamma * sum(sapply(1:nrow(coefj), function(i){ifelse(l2normraw(coefj[i,]) > 0, 1 +
    ifelse(length(c(coefj[i,])) == 1, 0, (length(c(coefj[i,])) - 1) * l2normraw(coefj[i,]) / l2normraw(mlfit$coef.stand[[1]][i,j.which])), 0)}))
    - ifelse(constr == "symmetric", ncol(coefj), 0)) / (0.5 + 0.5 * gamma), 0)
   }
  }
 
  if(any.lassopen.x){
   for(j in groups.lassopen.x){
    j.which <- which(grpindex[[1]] == j)
    coefj <- coef[[1]][,j.which]
    df <- df + max(sum(sapply(1:nrow(coefj), function(i){ifelse(l2normraw(coefj[i,]) > 0, 1 +
    ifelse(length(c(coefj[i,])) == 1, 0, (length(c(coefj[i,])) - 1) * l2normraw(coefj[i,]) / l2normraw(mlfit$coef.stand[[1]][i,j.which])), 0)}))
    - ifelse(constr == "symmetric", ncol(coefj), 0), 0)
   }
  }

  if(any.spGFLpen.x){
   for(j in groups.spGFLpen.x){
    j.which <- which(grpindex[[1]] == j)
    coefj <- coef[[1]][,j.which]
    #if(all(apply(coefj, 2, function(u) all(diff(range(u)) < .Machine$double.eps ^ 0.5)))){
    # coefj <- coef[[1]][1,j.which]
    # add <- ifelse(l2normraw(coefj) > 0, 1 + ifelse(length(c(coefj)) == 1, 0, (length(c(coefj)) - 1) * l2normraw(coefj) / l2normraw(mlfit$coef.stand[[1]][1,j.which])), 0)
    #}else{
     diffnormratio <- ifelse(lambdaF > 0, (ifelse(l2normraw(R%*%coefj) > 0, 1, 0) + (K-2) * ifelse(l2normraw(coefj) > 0, l2normraw(mlfit$coef.stand[[1]][,j.which]) / l2normraw(coefj), 1) *
                                           l2normraw(R%*%coefj) / l2normraw(R%*%mlfit$coef.stand[[1]][,j.which])) / (K-1), 1)

     lassoadd <-  0.5 * gamma * sum(sapply(1:nrow(coefj), function(i){ifelse(l2normraw(coefj[i,]) > 0, 1 +
      ifelse(length(c(coefj[i,])) == 1, 0, (length(c(coefj[i,])) - 1) * l2normraw(coefj[i,]) / l2normraw(mlfit$coef.stand[[1]][i,j.which])), 0)}))

     grplassoadd <- 0.5 * ifelse(l2normraw(coefj) > 0, 1 +
      (length(c(coefj)) - 1 - ifelse(constr == "none", 0, ncol(coefj)))*
      l2normraw(coefj) / l2normraw(mlfit$coef.stand[[1]][,j.which])*diffnormratio, 0)
    
     add <- max(( grplassoadd + 1/K * lassoadd + (K-1)/K * diffnormratio * lassoadd
      - ifelse(constr == "symmetric", ncol(coefj), 0)) / (0.5 + 0.5 * gamma), 0)
    #}
    df <- df + add
   }
  }

  if(any.Psplinepen.x){
   for(j in groups.Psplinepen.x){
    j.which <- which(grpindex[[1]] == j)
    coefj <- coef[[1]][,j.which]
    add <- 0
    for(i in seq(K)){
     add <- add + ifelse(constr != i, 1 + (nrow(coefj) - 1) * crossprod(t(tD[[j]])%*%coefj[i,,drop=T]) / crossprod(t(tD[[j]])%*%mlfit$coef.stand[[1]][i,j.which,drop=T]), 0)
    }
    df <- df + add
   }
  }
  
  if(any.colPsplinepen.x){
   for(j in groups.colPsplinepen.x){
    j.which <- which(grpindex[[1]] == j)
    coefj <- coef[[1]][,j.which]
    df <- df + ncol(coefj) + ncol(coefj)*(nrow(coefj) - 1 - ifelse(constr == "none", 0, 1)) * crossprod(R%*%coefj)/crossprod(R%*%mlfit$coef.stand[[1]][,j.which])
   }
  }
  
  if(any.spFGLpen.x){
   for(j in groups.spFGLpen.x){
    j.which <- which(grpindex[[1]] == j)
    coefj <- coef[[1]][,j.which]
    diffnormratio <- ifelse(lambdaF > 0, (ifelse(l2normraw(R%*%coefj) > 0, 1, 0) + (K-2) * ifelse(l2normraw(coefj) > 0, l2normraw(mlfit$coef.stand[[1]][,j.which]) / l2normraw(coefj), 1) *
                                          l2normraw(R%*%coefj) / l2normraw(R%*%mlfit$coef.stand[[1]][,j.which])) / (K-1), 1)

    coefju <- unique(coefj)
    clustertable <- matrix(c(seq(nrow(coefj)), rep(0, nrow(coefj))), ncol=2)
    for(i in seq(nrow(coefj))){
     for(l in seq(nrow(coefju))){
      if(all(coefj[i,] == coefju[l,])) clustertable[i,2] <- l
     }
    }
      
    lassoadd <-  0.5 * gamma * sum(sapply(1:nrow(coefj), function(i){ifelse(l2normraw(coefj[i,]) > 0, 1 +
     ifelse(length(c(coefj[i,])) == 1, 0, (length(c(coefj[i,])) - 1) * l2normraw(coefj[i,]) / l2normraw(mlfit$coef.stand[[1]][i,j.which])), 0) / sum(clustertable[,2] == clustertable[i,2])}))

    grplassoadd <- 0.5 * ifelse(l2normraw(coefj) > 0, 1 +
     (length(c(coefj)) - 1 - ifelse(constr == "none", 0, ncol(coefj)))*
     l2normraw(coefj) / l2normraw(mlfit$coef.stand[[1]][,j.which])*diffnormratio, 0)

    add <- max(( grplassoadd + lassoadd
     - ifelse(constr == "symmetric", ncol(coefj), 0)) / (0.5 + 0.5 * gamma), 0)
    df <- df + add
   }
  }

  if(any.rowFLpen.x){
   for(j in groups.rowFLpen.x){
    j.which <- which(grpindex[[1]] == j)
    coefj <- coef[[1]][,j.which]
    df <- df + sum(apply(coefj, 1, function(u){length(unique(u[u!=0]))}))
   }
  }
 
  if(hasV){
   if(any.globalunpen.V){
    df <- df + length(c(coef[[2]][1,which.globalunpen.V]))
   }

   if(any.globalridgepen.V){
    df <- df + length(c(coef[[2]][1,which.globalridgepen.V]))
   }
    
   if(any.globalpen.V){
    for(j in groups.globalpen.V){
     j.which <- which(grpindex[[2]] == j)
     coefj <- coef[[2]][1,j.which]
     df <- df + ifelse(l2normraw(coefj) > 0, 1 +
     ifelse(length(c(coefj)) == 1, 0, (length(c(coefj)) - 1) * l2normraw(coefj) / l2normraw(mlfit$coef.stand[[2]][1,j.which])), 0)
    }
   }
  
   if(any.catpen.V){
    for(j in groups.catpen.V){
     j.which <- which(grpindex[[2]] == j)
     coefj <- coef[[2]][,j.which]
     df <- df + ifelse(l2normraw(coefj) > 0, 1 +
     (length(c(coefj)) - 1) * l2normraw(coefj) / l2normraw(mlfit$coef.stand[[2]][,j.which]), 0)
    }
   }

   if(any.catunpen.V){
    df <- df + sum(coef[[2]][,which.catunpen.V] != 0)
   }

   if(any.catridgepen.V){
    df <- df + sum(coef[[2]][,which.catridgepen.V] != 0)
   }
  
   if(any.catspgrppen.V){
    for(j in groups.catspgrppen.V){
     j.which <- which(grpindex[[2]] == j)
     coefj <- coef[[2]][,j.which]
     df <- df + (0.5 * ifelse(l2normraw(coefj) > 0, 1 +
     (length(c(coefj)) - 1) *
     l2normraw(coefj) / l2normraw(mlfit$coef.stand[[2]][,j.which]), 0)   +
     0.5 * gamma * sum(sapply(1:nrow(coefj), function(i){ifelse(l2normraw(coefj[i,]) > 0, 1 +
     ifelse(length(c(coefj[i,])) == 1, 0, (length(c(coefj[i,])) - 1) * l2normraw(coefj[i,]) / l2normraw(mlfit$coef.stand[[2]][i,j.which])), 0)}))
     ) / (0.5 + 0.5 * gamma)
    }
   }
  
   if(any.catlassopen.V){
    for(j in groups.catlassopen.V){
     j.which <- which(grpindex[[2]] == j)
     coefj <- coef[[2]][,j.which]
     df <- df + sum(sapply(1:nrow(coefj), function(i){ifelse(l2normraw(coefj[i,]) > 0, 1 +
     ifelse(length(c(coefj[i,])) == 1, 0, (length(c(coefj[i,])) - 1) * l2normraw(coefj[i,]) / l2normraw(mlfit$coef.stand[[2]][i,j.which])), 0)}))
    }
   }
  } # end 'hasV'
 
  if(isordinal){
   if(any.globalunpen.o){
    df <- df + length(c(coef[[1]][1,which.globalunpen.o]))
   }

   if(any.globalridgepen.o){
    df <- df + length(c(coef[[1]][1,which.globalridgepen.o]))
   }
    
   if(any.globalpen.o){
    for(j in groups.globalpen.o){
     j.which <- which(grpindex[[1]] == j)
     coefj <- coef[[1]][1,j.which]
     df <- df + ifelse(l2normraw(coefj) > 0, 1 +
    ifelse(length(c(coefj)) == 1, 0, (length(c(coefj)) - 1) * l2normraw(coefj) / l2normraw(mlfit$coef.stand[[1]][1,j.which])), 0)
    }
   }
  } # end 'isordinal'
 }else{
  df <- sum(v.allequal(Reduce("c", coef), 0))
 }                                               # end if(control@computeDF)

 ## now retransform the coefficients back to the original scale if 
 ## standardization and centering were used.
 coef.stand <- coef

 if(control@backtransf & has.intercept){
  if(length(notpen.which.x) > 1 && (which.intercept %in% notpen.which.x)){
   coef[[1]][, notpen.which.x[-1]] <- scale(coef[[1]][, notpen.which.x[-1]], FALSE, scale.notpen.x)
  }else if(length(notpen.which.x) > 0 && !(which.intercept %in% notpen.which.x)){
   coef[[1]][, notpen.which.x] <- scale(coef[[1]][, notpen.which.x], FALSE, scale.notpen.x)
  }


  for(j in dict.pen.x){
   j.which <- which(grpindex[[1]] == j)
   if(any(coef[[1]][,j.which] != 0)){
    coef[[1]][, j.which] <- t(apply(coef[[1]][, j.which, drop=F], 1, function(u) solve(scale.pen.x[[j]], u)))
   }
  }
  
  if(hasV){
   if(length(notpen.which.V) > 0){
    coef[[2]][,notpen.which.V] <- scale(coef[[2]][,notpen.which.V], FALSE, scale.notpen.V)
   }
  
   for(j in dict.pen.V){
    j.which <- which(grpindex[[2]] == j)
    if(any(coef[[2]][,j.which] != 0)){                                    ## fixme: for cs-g variables with more than one column, the retransformation might break up the "equal coefficients in all rows of coef[[2]]"-constraint
     coef[[2]][,j.which] <- t(apply(coef[[2]][,j.which, drop=F], 1, function(u) solve(scale.pen.V[[j]], u)))
    }
   }  
  }   
  
  coefsum.x <- rowSums(sweep(coef[[1]][,-1, drop=F], 2, mean.x[-1], FUN="*"))
  coef[[1]][,1] <- coef[[1]][,1] - coefsum.x
  if(hasV){
   coefsum.V <- rowSums(sweep(coef[[2]], 2, mean.V, FUN="*"))                                            
   coef[[1]][,1] <- coef[[1]][,1] - coefsum.V
  }
  ## sometimes, there are spill-over effects on the intercept for the reference
  ## category, making it unequal to zero.
  if(is.numeric(constr)){
   if(abs(coef[[1]][constr,1]) > 1e-4){
    if(penindex[[1]][1] %in% c(10, 13)){
     coef[[1]][,1] <- coef[[1]][,1] - rep(coef[[1]][constr,1], length(coef[[1]][,1]))
    }else{
     coef[[1]][constr,1] <- 0
    }
   }else{
    coef[[1]][constr,1] <- 0
   }    
  }
 } 
 ## ensure the symmetric side constraint. (note for future self and curious 
 ## readers of this source code: the following should not work in theory, but
 ## extensive tests have shown that it does work in practice. dont ask me why..)
 if(constr == "symmetric"){
  coef[[1]] <- apply(coef[[1]], 2, function(u){u[u!=0] <- u[u!=0] - mean(u[u!=0]); u})
 } 

 ## now, we have to check the signs. sometimes the qr decomposition turns around
 ## the signs of estimated coefficient groups. the backtransformed coefficients
 ## belonging to the original observations always have the correct sign, but 
 ## coef.stand and coef.pretres sometimes dont. therefore:
 if(standardize | control@backtransf){
  coef.stand <- Map(function(x,y){x*(sign(x)*sign(y))}, coef.stand, coef)
  coef.pretres <- Map(function(x,y){x*(sign(x)*sign(y))}, coef.pretres, coef)
 }       

 ## prepare the rest of the output
 if(standardize){
  x.stand <- dat$x
  dat$x <- x.original
  if(hasV){
   V.stand <- dat$V
   dat$V <- V.original
  } 
 }
 
 ## list of active parameter groups/predictors
 guessed.active <- list()
 guessed.active[[1]] <- numeric(length(unique(grpindex[[1]])))
 guessed.active.coef <- list()
 guessed.active.coef[[1]] <- matrix(nrow = K, ncol = length(unique(grpindex[[1]])))
 guessed.active.groupdiff <- list()
 guessed.active.groupdiff[[1]] <- numeric(length(unique(grpindex[[1]])))
 guessed.active.diff <- list()
 if(fusion != F){
  guessed.active.diff[[1]] <- matrix(nrow = nrow(R), ncol = P)
 }
 if(hasV){
  guessed.active[[2]] <- numeric(length(unique(grpindex[[2]])))
  guessed.active.coef[[2]] <- matrix(nrow = K, ncol = length(unique(grpindex[[2]])))
 }

 for(j in unique(grpindex[[1]])){
  j.which <- which(grpindex[[1]] == j)
  guessed.active[[1]][j] <- any(coef[[1]][, j.which] != 0)
  guessed.active.coef[[1]][,j] <- apply(coef[[1]][, j.which, drop=F], 1, function(u) any(u != 0))
  if(fusion != F && ( (!is.numeric(constr) & K > 1) || (is.numeric(constr) & K > 2) )){
   guessed.active.groupdiff[[1]][j] <- any(R%*%coef[[1]][, j.which] != 0)
   guessed.active.diff[[1]][,j.which] <- apply(R%*%coef[[1]][, j.which, drop=F], 1, function(u) any(u != 0))
  }
 }
 if(hasV){
  lp <- length(unique(grpindex[[1]]))
  for(j in unique(grpindex[[2]])){
   j.which <- which(grpindex[[2]] == j)
   guessed.active[[2]][j - lp] <- any(coef[[2]][, j.which] != 0)
   guessed.active.coef[[2]][,j - lp] <- apply(coef[[2]][, j.which, drop=F], 1, function(u) any(u != 0))
  }
 }   

 ## residual matrix or whatever. still to come
 res <- NULL
 
 AICfit <- -2 * logl + 2 * df
 if(all(round(weights) == weights)){tnobs <- wnobs}else{tnobs <- nobs}
 BICfit <- -2 * logl + log(tnobs) * df
 if(name %in% c("Sequential Logit Model")){
  brier <- Brier(y = cbind(dat$y, 1-rowSums(dat$y)), mu = cbind(mu, 1-rowSums(mu)), weights = weights)
 }else{
  brier <- Brier(y = dat$y, mu = mu, weights = weights)
 }
 
 ## fisher matrix of the model
 if(control@fisher){
  fish <- fisher(dat=dat, mu=mu, weights=weights, Proximal.args=Proximal.args)
 }else{
  fish <- NULL
 }
  
 ## names of the data:
 colnames(coef[[1]]) <- colnames(dat$x)
 rownames(coef[[1]]) <- colnames(dat$y)
 if(hasV){
  colnames(coef[[2]]) <- colnames(dat$V[[1]])
  rownames(coef[[2]]) <- colnames(dat$y)
 } 
 colnames(coef.stand[[1]]) <- colnames(dat$x)
 rownames(coef.stand[[1]]) <- colnames(dat$y)
 if(hasV){
  colnames(coef.stand[[2]]) <- colnames(dat$V[[1]])
  rownames(coef.stand[[2]]) <- colnames(dat$y)
 } 
 colnames(coef.pretres[[1]]) <- colnames(dat$x)
 rownames(coef.pretres[[1]]) <- colnames(dat$y)
 if(hasV){
  colnames(coef.pretres[[2]]) <- colnames(dat$V[[1]])
  rownames(coef.pretres[[2]]) <- colnames(dat$y)
 }
 
 ## dont attach the data to the object if keepdat is FALSE.
 y <- dat$y
 if(!keepdat){
  dat <- NULL
  y <- NULL
 }

 if(!keepdat | !standardize){
  x.stand <- NULL
  x.original <- NULL
 }
 if(!keepdat | !standardize | !hasV){ 
  V.stand <- NULL
  V.original <- NULL
 }

 ## set the class of the coef objects to MRSP.coef
 class(coef) <- "MRSP.coef"
 class(coef.stand) <- "MRSP.coef"
 class(coef.pretres) <- "MRSP.coef"

 ## list with all arguments present in the current environment. this is needed
 ## to compute the refits for a list of lambda values more efficiently.
 if(refit == "L" | control@keeparglist == T){
  arglist <- list()
  helplist <- ls()[-which(ls() == "arglist" | ls() == "ybar")]
  if(control@noarglistdat == T){
   helplist <- helplist[-which(helplist %in% c("dat", "eta", "mu", "V.original", "V.stand", "Vs", "Vw",
                                               "V.centered", "decomp.V", "scale.pen.V", "scale.notpen.V",
                                               "x.centered", "x.original", "x.stand", "xw", "decomp.x",
                                               "scale.pen.x", "scale.notpen.x", "y", "Proximal.args",
                                               "mycall", "fish", "fisher", "eweights", "seweights",
                                               "sweights", "updated"))]
  }
  arglist <- mget(helplist, envir = as.environment(-1))
  names(arglist) <- helplist
 }else{arglist <- NULL}

 ## the final output, an object of class "MRSP"
 out <- new("MRSP",
             coef = coef,
             coef.stand = coef.stand,
             coef.pretres = coef.pretres,
             dat = dat,
             x.original = x.original,
             x.stand = x.stand,
             V.original = V.original,
             V.stand = V.stand,
             y = y,
             weights = weights,
             penindex = penindex,
             grpindex = grpindex,
             penweights = penweights,
             guessed.active = guessed.active,
             guessed.active.coef = guessed.active.coef,
             guessed.active.groupdiff = guessed.active.groupdiff,
             guessed.active.diff = guessed.active.diff,
             df = df,
             tuning = tuning.old,
             lambda = lambda,
             lambdaR = lambdaR,
             lambdaF = lambdaF,
             fusion = fusion,
             gamma = gamma,
             psi = psi,
             eta = eta,
             mu = mu,
             indg = indg,
             indcs = indcs,
             offset = offset,
             residuals = res,
             loglik = logl,
             penalty = pen,
             fn.val = fn.val,
             model = model,
             constr = constr,
             control = control,
             iter.count = iter.count,
             best.iter = best.iter,
             ridgestabil = ridgestabil,
             name = name,
             AIC = AICfit,
             BIC = BICfit,
             Brier = brier,
             threshold = threshold,
             refit = refit,
             fisher = fish,
             arglist = arglist,
             call = mycall)
             
 if(refit == T) out <- refit(out)
 return(out)
})


## this applies MRSP.fit to a sequence of lambda-values:
setMethod("MRSP.fit",
          signature(lambda = "list"),
function(dat, coef.init, coef.stand.init, coef.pretres.init, offset = rep(0, nrow(dat$y)),
                     weights = rep(1, nrow(dat$y)), grpindex = NULL, penindex = NULL,
                     lambda, lambdaR = lambda, lambdaF = lambda, gamma = 1, psi = 1,
                     indg = NULL, indcs = NULL,
                     model = multinomlogit(), constr = NULL, control = MRSP.control(),
                     fista.control = NULL, Proximal.control = NULL, Proximal.args = NULL,
                     penweights = NULL, mlfit = NULL, adaptive = FALSE, threshold = FALSE,
                     refit = FALSE, fusion = FALSE, nonneg = FALSE, ...)
{
 mycall <- match.call()
 ## if any objects that are not formals of MRSP.fit are passed via ... , we need
 ## to explicitly assign the corresponding objects:
 dotlist <- list(...)
 for(i in seq_len(length(dotlist))) assign(names(dotlist)[i], dotlist[[i]])
 if(is.expression(model)) model <- eval(model)
 
 if(control@expandcall){
  excl <- c(1, which(names(mycall) %in% c("dat", "model")))
  if(is.null(mycall$model)){
   mycall$model <- expression(multinomlogit())
   excl <- c(excl, length(mycall))
  }else{
   mycall[[which(names(mycall) == "model")]] <- switch(model@name,
    "Multinomial Logit Model" = expression(multinomlogit()),
    "Sequential Logit Model" = expression(sequentiallogit()),
    "CUB Binomial Logit Model" = expression(CUBbinomiallogit()),
    "OLS Regression" = expression(OLSreg())
   )
  }
  mycall[-excl] <- mget(names(mycall[-excl]), envir = as.environment(-1))
 }

 if(missing(constr) | is.null(constr))  constr <- model@constraint
 if(nonneg == T && constr == "symmetric"){
  constr <- 1
  warning()
  cat(" You cannot combine a nonnegativity constraint on the coefficients with a symmetric side constraint!", "\n", "The first response category is instead used as reference.","\n")
 }
 ## are there any category-specific predictors? they are stored in the 'V'-entry of dat
 hasV <- !is.null(dat$V)
 
 nobs <- nrow(dat$y)
 wnobs <- sum(weights)
 K <- ncol(dat$y)
 P <- ncol(dat$x)
 if(hasV & !is.matrix(dat$V[[1]])){dat$V <- lapply(dat$V, as.matrix)}
 if(hasV){L <- ncol(dat$V[[1]])}

 
 if(hasV){
  if(missing(penindex) | is.null(penindex)){
   if((missing(indg) | is.null(indg)) && (missing(indcs) | is.null(indcs))){
    indg <- integer(0)
    indcs <- seq(1, L)
   }else if((missing(indg) | is.null(indg)) && !(missing(indcs) | is.null(indcs))){
    indg <- seq(1, L)[-which(seq(1, L) %in% indcs)]
   }else if(!(missing(indg) | is.null(indg)) && (missing(indcs) | is.null(indcs))){
    indcs <- seq(1, L)[-which(seq(1, L) %in% indg)]
   }  
  }else{
   indg <- which(penindex[[2]] %in% c(2, 20, 21))
   indcs <- which(penindex[[2]] %in% c(3, 30, 31, 32, 33))
  } 
 }

 ## handling the intercept
 which.intercept <- which(apply(dat$x, 2, function(u) diff(range(u)) < .Machine$double.eps ^ 0.5))
 has.intercept <- !(length(which.intercept) == 0)
 if((length(which.intercept) == 1) && which.intercept != 1){
  dat$x[,c(1, which.intercept)] <- dat$x[,c(which.intercept, 1)]
  penindex[[1]][c(1,which.intercept)] <- penindex[[1]][c(which.intercept,1)]
  grpindex[[1]][c(1,which.intercept)] <- grpindex[[1]][c(which.intercept,1)]
  if(!all(coef[[1]] == 0))
  coef[[1]][,c(1,which.intercept)] <- coef[[1]][,c(which.intercept,1)]
  which.intercept <- 1
 }
 if(length(which.intercept) > 1)
   stop("more than one intercept column specified")

 if(missing(penindex) | is.null(penindex)){
  penindex <- list()
  penindex[[1]] <- rep(1, P)
  penindex[[1]][1] <- 10
  if(hasV){
   penindex[[2]][indg] <- rep(2, length(indg))
   penindex[[2]][indcs] <- rep(3, length(indcs))
  }
 }
 
 #K <- ncol(dat$y)
 ## if the first response category is chosen as reference, this can cause 
 ## problems on many occasions. therefore, we rearrange stuff in this case.
 ## really? probably not! commented out for now!
 #if(constr == 1){
 # dat$y[,c(1, K)] <- dat$y[, c(K, 1)]
 # colnames(dat$y)[c(1, K)] <- colnames(dat$y)[c(K, 1)]
 # dat$V[c(1, K)] <- dat$V[c(K, 1)]
 # if(!(missing(mlfit) | is.null(mlfit)) | !(missing(coef.init) | is.null(coef.init)) |
 #    !(missing(coef.stand.init) | is.null(coef.stand.init)) | 
 #    !(missing(coef.pretres.init) | is.null(coef.pretres.init)) |
 #    !(missing(penweights) | is.null(penweights))  ){
 #     stop("something went completely wrong in the passing on of arguments for the case of 'constr == 1'. please use a response category different from the first one as reference.")
 # }  
 #}
  
 if(!(missing(grpindex) | is.null(grpindex))){
  if(!is.list(grpindex)){grpindex <- list(grpindex)}
 }
 if(!(missing(penindex) | is.null(penindex))){
  if(!is.list(penindex)){penindex <- list(penindex)}
 }


 if(!is.list(lambda)) stop("the supplied lambda was not a list although it should have been")
 if(length(lambda) == 1){lambda <- lambda[[1]]}else{lambda <- unlist(lambda)}
 if(!is.numeric(lambda)) lambda <- as.numeric(lambda)

 if(is.list(lambdaR)){
  if(length(lambdaR) == 1 ){lambdaR <- lambdaR[[1]]}else{lambdaR <- unlist(lambdaR)}
 }
 if(is.numeric(lambdaR) & (length(lambdaR) == 1)){
  lambdaR <- rep(lambdaR, length(lambda))
 }
 if(length(lambdaR) != length(lambda)) stop("lambdaR not of appropriate length")
 if(!is.numeric(lambdaR)) lambdaR <- as.numeric(lambdaR)

 lambdaR <- sort(lambdaR, decreasing = T)
 lambda <- sort(lambda, decreasing = T)
 nrlambda <- length(lambda)

 if(is.list(lambdaF)){
  if(length(lambdaF) == 1 ){lambdaF <- lambdaF[[1]]}else{lambdaF <- unlist(lambdaF)}
 }
 if(is.numeric(lambdaF) & (length(lambdaF) == 1)){
  lambdaF <- rep(lambdaF, length(lambda))
 }
 if(length(lambdaF) != length(lambda)) stop("lambdaF not of appropriate length")
 if(!is.numeric(lambdaF)) lambdaF <- as.numeric(lambdaF)

 lambdaF <- sort(lambdaF, decreasing = T)

 if((missing(mlfit) | is.null(mlfit)) & control@doMLfit == T){
  mlfit <- getmlfit(dat = dat, coef.init = coef.init, coef.stand.init = coef.stand.init,
                              offset = offset, weights = weights, grpindex = grpindex,
                              penindex = penindex, lambda = lambda, lambdaR = lambdaR,
                              lambdaF = lambdaF, gamma = gamma,
                              psi = psi, indg = indg, indcs = indcs, model = model, constr = constr,
                              control = control, fista.control = fista.control,
                              Proximal.control = Proximal.control, Proximal.args = Proximal.args,
                              adaptive = adaptive, threshold = threshold, refit = refit,
                              nonneg = nonneg, ...)
  control@doMLfit <- F
 }

 if(missing(penweights) | is.null(penweights)){
  penweights <- getpenweights(dat = dat, coef.init = coef.init, coef.stand.init = coef.stand.init,
                              coef.pretres.init = coef.pretres.init, offset = offset, weights = weights,
                              grpindex = grpindex, penindex = penindex, lambda = lambda, lambdaR = lambdaR,
                              lambdaF = lambdaF, gamma = gamma, psi = psi, indg = indg, indcs = indcs,
                              model = model, constr = constr, control = control, fista.control = fista.control,
                              Proximal.control = Proximal.control, Proximal.args = Proximal.args,
                              adaptive = adaptive, threshold = threshold, refit = refit, mlfit = mlfit,
                              fusion = fusion, nonneg = nonneg, ...)
 }

 refitold <- refit
 if(refit == T) refit <- "L"
 
 fitlist <- list()

 fitlist[[1]] <- MRSP.fit(dat = dat, coef.init = coef.init, coef.stand.init = coef.stand.init,
                          coef.pretres.init = coef.pretres.init, offset = offset, weights = weights,
                          grpindex = grpindex, penindex = penindex, lambda = lambda[1], lambdaR = lambdaR[1],
                          lambdaF = lambdaF[1], gamma = gamma, psi = psi, indg = indg, indcs = indcs,
                          model = model, constr = constr, control = control, 
                          fista.control = fista.control, Proximal.control = Proximal.control,
                          Proximal.args = Proximal.args, penweights = penweights, mlfit = mlfit,
                          adaptive = adaptive, threshold = threshold, refit = refit, fusion = fusion,
                          nonneg = nonneg, ...)
                          
 if(nrlambda > 1){
  control@keepdat <- F
  control@noarglistdat <- T
  for(pos in seq(2, nrlambda)){
   if((fitlist[[pos-1]]@control)@ridgestabilrf == T) control@ridgestabilrf <- T  
   
   fitlist[[pos]] <- MRSP.fit(dat = dat, coef.init = fitlist[[pos-1]]@coef,
                              coef.stand.init = fitlist[[pos-1]]@coef.stand,
                              coef.pretres.init = fitlist[[pos-1]]@coef.pretres,
                              offset = offset, weights = weights, grpindex = grpindex,
                              penindex = penindex, lambda = lambda[pos], lambdaR = lambdaR[pos],
                              lambdaF = lambdaF[pos], gamma = gamma, psi = psi, indg = indg,
                              indcs = indcs, model = model,
                              constr = constr, control = control, fista.control = fista.control,
                              Proximal.control = Proximal.control, Proximal.args = Proximal.args, 
                              penweights = penweights, mlfit = mlfit, adaptive = adaptive, 
                              threshold = threshold, refit = refit, fusion = fusion,
                              nonneg = nonneg, ...)
                              
  }
 }
 
 if(nrlambda == 1){fit <- fitlist[[1]]}
 if(nrlambda > 1){
  fit <- fitlist
  #fit[[nrlambda + 1]] <- mycall
  class(fit) <- "MRSP.list"
  fit <- structure(fit, call = mycall, dat = dat)
 }
 
 if(refitold == T){
  argl1 <- fit[[1]]@arglist
  if(!control@keeparglist) fit[[1]]@arglist <- NULL
  argl1$coef.init <- argl1$coef
  argl1$coef.stand.init <- argl1$coef.stand
  argl1$coef.pretres.init <- argl1$coef.pretres
  
  fit[[1]] <- refit(fit[[1]], argl1)
  rm(argl1)
  
  if(length(fit) > 1){
   for(pos in seq(2, length(fit)-1)){
    argl <- fit[[pos]]@arglist
    if(!control@keeparglist) fit[[pos]]@arglist <- NULL
    argl$dat <- fit[[1]]@dat
    argl$coef <- argl$coef.init <- fit[[pos-1]]@coef
    argl$coef.stand <- argl$coef.stand.init <- fit[[pos-1]]@coef.stand
    argl$coef.pretres <- argl$coef.pretres.init <- fit[[pos-1]]@coef.pretres
  
    fit[[pos]] <- refit(fit[[pos]], argl)
    rm(argl)
   }
  }
 }
 
 if(!control@keeparglist){
  for(pos in seq_len(length(fit)-1)){
   fit[[pos]]@arglist <- NULL
  }
 }
 
 fit <- asS4(fit)   
 return(fit)
})


## standard method for missing lambda
setMethod("MRSP.fit",
          signature(lambda = "missing"),
function(dat, coef.init, coef.stand.init, coef.pretres.init, offset = rep(0, nrow(dat$y)),
                     weights = rep(1, nrow(dat$y)), grpindex = NULL, penindex = NULL,
                     lambda, lambdaR = NULL, lambdaF = 0, gamma = 1, psi = 1, indg = NULL, indcs = NULL,
                     model = multinomlogit(), constr = NULL, control = MRSP.control(),
                     fista.control = NULL, Proximal.control = NULL, Proximal.args = NULL,
                     penweights = NULL, mlfit = NULL, adaptive = FALSE, threshold = FALSE,
                     refit = FALSE, fusion = FALSE, nonneg = FALSE, ...)
{
 mycall <- match.call()
 ## if any objects that are not formals of MRSP.fit are passed via ... , we need
 ## to explicitly assign the corresponding objects:
 dotlist <- list(...)
 for(i in seq_len(length(dotlist))) assign(names(dotlist)[i], dotlist[[i]])
 if(is.expression(model)) model <- eval(model)
 
 if(control@expandcall){
  excl <- c(1, which(names(mycall) %in% c("dat", "model")))
  if(is.null(mycall$model)){
   mycall$model <- expression(multinomlogit())
   excl <- c(excl, length(mycall))
  }else{
   mycall[[which(names(mycall) == "model")]] <- switch(model@name,
    "Multinomial Logit Model" = expression(multinomlogit()),
    "Sequential Logit Model" = expression(sequentiallogit()),
    "CUB Binomial Logit Model" = expression(CUBbinomiallogit()),
    "OLS Regression" = expression(OLSreg())
   )
  }
  mycall[-excl] <- mget(names(mycall[-excl]), envir = as.environment(-1))
 }

 if(missing(constr) | is.null(constr))  constr <- model@constraint
 if(nonneg == T && constr == "symmetric"){
  constr <- 1
  warning()
  cat(" You cannot combine a nonnegativity constraint on the coefficients with a symmetric side constraint!", "\n", "The first response category is instead used as reference.","\n")
 }
 ## are there any category-specific predictors? they are stored in the 'V'-entry of dat
 hasV <- !is.null(dat$V)
 
 nobs <- nrow(dat$y)
 wnobs <- sum(weights)
 K <- ncol(dat$y)
 P <- ncol(dat$x)
 if(hasV & !is.matrix(dat$V[[1]])){dat$V <- lapply(dat$V, as.matrix)}
 if(hasV){L <- ncol(dat$V[[1]])}

 if(hasV){
  if(missing(penindex) | is.null(penindex)){
   if((missing(indg) | is.null(indg)) && (missing(indcs) | is.null(indcs))){
    indg <- integer(0)
    indcs <- seq(1, L)
   }else if((missing(indg) | is.null(indg)) && !(missing(indcs) | is.null(indcs))){
    indg <- seq(1, L)[-which(seq(1, L) %in% indcs)]
   }else if(!(missing(indg) | is.null(indg)) && (missing(indcs) | is.null(indcs))){
    indcs <- seq(1, L)[-which(seq(1, L) %in% indg)]
   }  
  }else{
   indg <- which(penindex[[2]] %in% c(2, 20, 21))
   indcs <- which(penindex[[2]] %in% c(3, 30, 31, 32, 33))
  } 
 }

 ## handling the intercept
 which.intercept <- which(apply(dat$x, 2, function(u) diff(range(u)) < .Machine$double.eps ^ 0.5))
 has.intercept <- !(length(which.intercept) == 0)
 if((length(which.intercept) == 1) && which.intercept != 1){
  dat$x[,c(1, which.intercept)] <- dat$x[,c(which.intercept, 1)]
  penindex[[1]][c(1,which.intercept)] <- penindex[[1]][c(which.intercept,1)]
  grpindex[[1]][c(1,which.intercept)] <- grpindex[[1]][c(which.intercept,1)]
  if(!all(coef[[1]] == 0))
  coef[[1]][,c(1,which.intercept)] <- coef[[1]][,c(which.intercept,1)]
  which.intercept <- 1
 }
 if(length(which.intercept) > 1)
   stop("more than one intercept column specified")

 if(missing(penindex) | is.null(penindex)){
  penindex <- list()
  penindex[[1]] <- rep(1, P)
  penindex[[1]][1] <- 10
  if(hasV){
   penindex[[2]][indg] <- rep(2, length(indg))
   penindex[[2]][indcs] <- rep(3, length(indcs))
  }
 }
 
 #K <- ncol(dat$y)
 ## if the first response category is chosen as reference, this can cause 
 ## problems on many occasions. therefore, we rearrange stuff in this case.
 ## really? probably not! commented out for now!
 #if(constr == 1){
 # dat$y[,c(1, K)] <- dat$y[, c(K, 1)]
 # colnames(dat$y)[c(1, K)] <- colnames(dat$y)[c(K, 1)]
 # dat$V[c(1, K)] <- dat$V[c(K, 1)]
 # if(!(missing(mlfit) | is.null(mlfit)) | !(missing(coef.init) | is.null(coef.init)) |
 #   !(missing(coef.stand.init) | is.null(coef.stand.init)) | 
 #   !(missing(coef.pretres.init) | is.null(coef.pretres.init)) |
 #   !(missing(penweights) | is.null(penweights))  ){
 #    stop("something went completely wrong in the passing on of arguments for the case of 'constr == 1'. please use a response category different from the first one as reference.")
 #}  
 #} 

 if(!(missing(grpindex) | is.null(grpindex))){
  if(!is.list(grpindex)){grpindex <- list(grpindex)}
 }
 if(!(missing(penindex) | is.null(penindex))){
  if(!is.list(penindex)){penindex <- list(penindex)}
 }

 if((missing(mlfit) | is.null(mlfit)) & control@doMLfit == T){
  mlfit <- getmlfit(dat = dat, coef.init = coef.init, coef.stand.init = coef.stand.init,
                              offset = offset, weights = weights, grpindex = grpindex,
                              penindex = penindex, lambda = 123, lambdaR = lambdaR,
                              lambdaF = lambdaF, gamma = gamma,
                              psi = psi, indg = indg, indcs = indcs, model = model, constr = constr,
                              control = control, fista.control = fista.control,
                              Proximal.control = Proximal.control, Proximal.args = Proximal.args,
                              adaptive = adaptive, threshold = threshold, refit = refit,
                              nonneg = nonneg, ...)
  control@doMLfit <- F
 }

 if(!missing(lambda))
   stop("something went wrong with the passing on of lambda in the calls to MRSP.fit")
 
 if(missing(penweights) | is.null(penweights)){
  penweights <- getpenweights(dat = dat, coef.init = coef.init, coef.stand.init = coef.stand.init,
                              coef.pretres.init = coef.pretres.init, offset = offset, weights = weights,
                              grpindex = grpindex, penindex = penindex, lambda = list(123), lambdaR = lambdaR,
                              lambdaF = lambdaF, gamma = gamma, psi = psi, indg = indg, indcs = indcs,
                              model = model, constr = constr, control = control, fista.control = fista.control,
                              Proximal.control = Proximal.control, Proximal.args = Proximal.args,
                              adaptive = adaptive, threshold = threshold, refit = refit, mlfit = mlfit,
                              fusion = fusion, nonneg = nonneg, ...)
 }
  
 nrlambda <- control@nrlambda
 lambdabase <- control@lambdabase
 lambdamax <- control@lambdamax
 if(is.null(lambdamax)){
  lambdamax <- getlambdamax(dat = dat, offset = offset, weights = weights,
                            grpindex = grpindex, penindex = penindex, psi = psi,
                            gamma = gamma, model = model, constr = constr,
                            control = control, penweights = penweights,
                            indg = indg, indcs = indcs, adaptive = adaptive,
                            threshold = threshold, refit = refit, fusion = fusion,
                            lambdaF = lambdaF, nonneg = nonneg)
 }
 lambdamin <- min(control@lambdamin, lambdamax / 100)
 lambdalogvals <- seq(log(lambdamin / lambdabase), log(lambdamax / lambdabase), length.out = nrlambda)
 lambda <- lambdabase * exp(lambdalogvals)
 if(missing(lambdaR) | is.null(lambdaR)) lambdaR <- lambda ## attention, lambdaR may not be a list yet here!
 if(missing(lambdaF) | is.null(lambdaF)) lambdaF <- lambda
                                         
 fit <- MRSP.fit(dat = dat, coef.init = coef.init, coef.stand.init = coef.stand.init,
                          coef.pretres.init = coef.pretres.init, offset = offset, weights = weights,
                          grpindex = grpindex, penindex = penindex, lambda = list(lambda),
                          lambdaR = list(lambdaR), lambdaF = list(lambdaF), gamma = gamma, psi = psi,
                          indg = indg, indcs = indcs, model = model, constr = constr, control = control,
                          fista.control = fista.control, Proximal.control = Proximal.control,
                          Proximal.args = Proximal.args, penweights = penweights, adaptive = adaptive,
                          threshold = threshold, refit = refit, mlfit = mlfit, fusion = fusion,
                          nonneg = nonneg, ...)

 if(length(fit) > 1){
  fit <- structure(fit, call = mycall, dat = dat)
 }
 fit <- asS4(fit)
 return(fit)
})