#' @title Group subset selection
#'
#' @author Ryan Thompson <ryan.thompson-1@uts.edu.au>
#'
#' @description Fits the regularisation surface for a regression model with a group subset selection
#' penalty. The group subset penalty can be combined with either a group lasso or ridge penalty
#' for shrinkage. The group subset parameter is \code{lambda} and the group lasso/ridge parameter is
#' \code{gamma}.
#'
#' @param x a predictor matrix
#' @param y a response vector
#' @param group a vector of length \code{ncol(x)} with the jth element identifying the group that
#' the jth predictor belongs to; alternatively, a list of vectors with the kth vector identifying
#' the predictors that belong to the kth group (useful for overlapping groups)
#' @param penalty the type of penalty to apply; one of 'grSubset', 'grSubset+grLasso', or
#' 'grSubset+Ridge'
#' @param loss the type of loss function to use; 'square' for linear regression or 'logistic' for
#' logistic regression
#' @param local.search a logical indicating whether to perform local search after coordinate
#' descent; typically leads to higher quality solutions
#' @param orthogonalise a logical indicating whether to orthogonalise within groups
#' @param nlambda the number of group subset selection parameters to evaluate when \code{lambda} is
#' computed automatically; may evaluate fewer parameters if \code{pmax} or \code{gmax} is reached
#' first
#' @param lambda.step the step size taken when computing \code{lambda} from the data; should be a
#' value strictly between 0 and 1; larger values typically lead to a finer grid of subset sizes
#' @param lambda an optional list of decreasing sequences of group subset selection parameters; the
#' list should contain a vector for each value of \code{gamma}
#' @param lambda.factor a vector of penalty factors applied to the group subset selection penalty;
#' equal to the group sizes by default
#' @param ngamma the number of group lasso or ridge parameters to evaluate when \code{gamma} is
#' computed automatically
#' @param gamma.max the maximum value for \code{gamma} when \code{penalty='grSubset+Ridge'}; when
#' \code{penalty='grSubset+grLasso'} \code{gamma.max} is computed automatically from the data
#' @param gamma.min the minimum value for \code{gamma} when \code{penalty='grSubset+Ridge'} and the
#' minimum value for \code{gamma} as a fraction of \code{gamma.max} when
#' \code{penalty='grSubset+grLasso'}
#' @param gamma an optional decreasing sequence of group lasso or ridge parameters
#' @param gamma.factor a vector of penalty factors applied to the shrinkage penalty; by default,
#' equal to the square root of the group sizes when \code{penalty='grSubset+grLasso'} or a vector of
#' ones when \code{penalty='grSubset+Ridge'}
#' @param pmax the maximum number of predictors ever allowed to be active; ignored if \code{lambda}
#' is supplied
#' @param gmax the maximum number of groups ever allowed to be active; ignored if \code{lambda} is
#' supplied
#' @param eps the convergence tolerance; convergence is declared when the relative maximum
#' difference in consecutive coefficients is less than \code{eps}
#' @param max.cd.iter the maximum number of coordinate descent iterations allowed per value of
#' \code{lambda} and \code{gamma}
#' @param max.ls.iter the maximum number of local search iterations allowed per value of
#' \code{lambda} and \code{gamma}
#' @param active.set a logical indicating whether to use active set updates; typically lowers the
#' run time
#' @param active.set.count the number of consecutive coordinate descent iterations in which a
#' subset should appear before running active set updates
#' @param sort a logical indicating whether to sort the coordinates before running coordinate
#' descent; required for gradient screening; typically leads to higher quality solutions
#' @param screen the number of groups to keep after gradient screening; smaller values typically
#' lower the run time
#' @param warn a logical indicating whether to print a warning if the algorithms fail to converge
#'
#' @details For linear regression (\code{loss='square'}) the response and predictors are centred
#' about zero and scaled to unit l2-norm. For logistic regression (\code{loss='logistic'}) only the
#' predictors are centred and scaled and an intercept is fit during the course of the algorithm.
#'
#' @return An object of class \code{grpsel}; a list with the following components:
#' \item{beta}{a list of matrices whose columns contain fitted coefficients for a given value of
#' \code{lambda}; an individual matrix in the list for each value of \code{gamma}}
#' \item{gamma}{a vector containing the values of \code{gamma} used in the fit}
#' \item{lambda}{a list of vectors containing the values of \code{lambda} used in the fit; an
#' individual vector in the list for each value of \code{gamma}}
#' \item{np}{a list of vectors containing the number of active predictors per value of
#' \code{lambda}; an individual vector in the list for each value of \code{gamma}}
#' \item{ng}{a list of vectors containing the number of active groups per value of
#' \code{lambda}; an individual vector in the list for each value of \code{gamma}}
#' \item{iter.cd}{a list of vectors containing the number of coordinate descent iterations per value
#' of \code{lambda}; an individual vector in the list for each value of \code{gamma}}
#' \item{iter.ls}{a list of vectors containing the number of local search iterations per value
#' of \code{lambda}; an individual vector in the list for each value of \code{gamma}}
#' \item{loss}{a list of vectors containing the evaluated loss function per value of
#' \code{lambda}; an individual vector in the list for each value of \code{gamma}}
#'
#' @references Thompson, R. and Vahid, F. (2024). 'Group selection and shrinkage: Structured
#' sparsity for semiparametric additive models'. Journal of Computational and Graphical Statistics
#' 33.4, pp. 1286–1297.
#'
#' @example R/examples/example-grpsel.R
#'
#' @export
grpsel <- \(x, y, group = seq_len(ncol(x)),
penalty = c('grSubset', 'grSubset+grLasso', 'grSubset+Ridge'),
loss = c('square', 'logistic'), local.search = FALSE, orthogonalise = FALSE,
nlambda = 100, lambda.step = 0.99, lambda = NULL, lambda.factor = NULL, ngamma = 10,
gamma.max = 100, gamma.min = 1e-4, gamma = NULL, gamma.factor = NULL, pmax = ncol(x),
gmax = length(unique(group)), eps = 1e-4, max.cd.iter = 1e4, max.ls.iter = 100,
active.set = TRUE, active.set.count = 3, sort = TRUE, screen = 500, warn = TRUE) {
penalty <- match.arg(penalty)
loss <- match.arg(loss)
# Check data is valid
if (!is.matrix(x)) x <- as.matrix(x)
if (!is.matrix(y)) y <- as.matrix(y)
if (anyNA(y)) stop('y contains NAs; remove or impute rows with missing values')
if (anyNA(x)) stop('x contains NAs; remove or impute rows with missing values')
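# Strip all attributes except dimensions so x and y are plain unnamed matrices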
attributes(y) <- list(dim = attributes(y)$dim)
attributes(x) <- list(dim = attributes(x)$dim)
if (loss == 'logistic' & !all(y %in% c(0, 1))) stop('y must take values in {0,1}')
# Check arguments are valid
if (nrow(x) != nrow(y)) stop('x and y must have same number of observations')
overlap <- is.list(group)
if (overlap & length(unique(unlist(group))) != ncol(x)) {
stop('each column of x must be assigned to a group')
}
if (!overlap & length(group) != ncol(x)) {
stop('each column of x must be assigned to a group')
}
if (nlambda < 1) stop('nlambda must be at least one')
if (ngamma < 1) stop('ngamma must be at least one')
if (gamma.max <= 0) stop('gamma.max must be positive')
if (gamma.min <= 0) stop('gamma.min must be positive')
if (penalty != 'grSubset' & !is.null(lambda)) {
if (is.null(gamma) & length(lambda) != ngamma) {
stop('lambda must be a list with length ngamma')
}
if (!is.null(gamma) & length(lambda) != length(gamma)) {
stop('lambda must be a list with same length as gamma')
}
}
if (overlap) g <- length(group) else g <- length(unique(group))
if (!is.null(lambda.factor) & length(lambda.factor) != g) {
stop('length of lambda.factor must equal number of groups')
}
if (!is.null(gamma.factor) & length(gamma.factor) != g) {
stop('length of gamma.factor must equal number of groups')
}
if (lambda.step >= 1 | lambda.step <= 0) stop('lambda.step must be strictly between 0 and 1')
# Expand predictor matrix if groups overlap and orthogonality is required
if (overlap & orthogonalise) {
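# coef.id records the expanded (replicated) column ordering; coefficients on
# replicated columns are summed back to the original predictors after fitting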
coef.id <- sort(unlist(group))
x <- x[, coef.id]
group <- rep(seq_along(group), times = lengths(group))[order(unlist(group))]
}
# Cap pmax if necessary
n <- nrow(x)
p <- ncol(x)
pmax <- min(pmax, n - 1, p)
# Standardise data
x <- standardise(x)
if (loss == 'square') y <- standardise(y)
x.c <- attributes(x)$`scaled:center`
x.s <- attributes(x)$`scaled:scale`
y.c <- attributes(y)$`scaled:center`
y.s <- attributes(y)$`scaled:scale`
# Set up groups
if (!is.list(group)) group <- stats::aggregate(1:p ~ group, FUN = c, simplify = FALSE)[, 2]
pk <- vapply(group, length, integer(1))
if (orthogonalise & any(pk > n)) {
stop('maximum group size cannot exceed sample size; set orthogonalise = FALSE')
}
group0 <- lapply(group, '-', 1) # For C++
if (!overlap | (overlap & orthogonalise)) {
group.ind <- group
group0.ind <- group0
} else {
group.ind <- stats::aggregate(1:sum(pk) ~ rep(1:g, pk), FUN = c, simplify = FALSE)[, 2]
group0.ind <- lapply(group.ind, '-', 1) # For C++
}
# Set up penalty factors
if (is.null(lambda.factor)) lambda.factor <- pk
if (is.null(gamma.factor)) {
if (penalty == 'grSubset+grLasso') gamma.factor <- sqrt(pk) else gamma.factor <- rep(1, g)
}
penalty.factor <- cbind(lambda.factor, gamma.factor, gamma.factor)
# Orthogonalise groups
if (orthogonalise) {
x <- orthogonalise(x, group0)
z <- x$z
x <- x$x
}
# Compute Lipschitz constants
if (orthogonalise | all(pk == 1)) {
lips.const <- rep(1, g)
} else {
lips.const <- lipschitz(x, group0)
}
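# The logistic loss has second derivative p * (1 - p) <= 1 / 4, so the
# squared-loss Lipschitz constants are scaled down by a factor of four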
if (loss == 'logistic') lips.const <- lips.const / 4
# Set up regularisation sequences
if (is.null(gamma)) {
if (penalty == 'grSubset') {
ngamma <- 1
gamma <- 0
} else if (penalty == 'grSubset+grLasso') {
ind <- unlist(group[gamma.factor == 0])
if (is.null(ind)) {
if (loss == 'square') {
r <- y
} else if (loss == 'logistic') {
r <- stats::residuals(stats::glm(y ~ 1, family = 'binomial'), type = 'response')
}
} else {
if (loss == 'square') {
r <- stats::residuals(stats::glm(y ~ x[, ind], family = 'gaussian'), type = 'response')
} else if (loss == 'logistic') {
r <- stats::residuals(stats::glm(y ~ x[, ind], family = 'binomial'), type = 'response')
}
}
xr <- crossprod(x, r)
gamma.max <- max(vapply(which(gamma.factor != 0), \(l) sqrt(sum(xr[group[[l]]] ^ 2)) /
gamma.factor[l], numeric(1)))
gamma <- exp(seq(log(gamma.max), log(gamma.max * gamma.min), length.out = ngamma))
gamma[1] <- gamma[1] * 1.00001 # Ensures first solution is zero when lambda=0
} else if (penalty == 'grSubset+Ridge') {
gamma <- exp(seq(log(gamma.max), log(gamma.min), length.out = ngamma))
}
} else {
ngamma <- length(gamma)
}
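# Negative values act as placeholders indicating that each lambda sequence should
# be computed from the data (via lambda.step) during the fit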
if (is.null(lambda)) lambda <- replicate(ngamma, rep(- 1, nlambda), simplify = FALSE)
# Fit regularisation surface
result <- fitsurface(x, y, group0, group0.ind, local.search, penalty.factor, lambda, gamma,
which(penalty == c('grSubset', 'grSubset+grLasso', 'grSubset+Ridge')),
lambda.step, pmax, gmax, active.set, active.set.count, sort, screen, eps,
max.cd.iter, max.ls.iter, lips.const,
which(loss == c('square', 'logistic')))
# Unwind orthogonalisation
if (orthogonalise) result$beta <- lapply(result$beta, unorthogonalise, groups = group0, z = z)
# Aggregate coefficients if groups overlap
if (overlap & !orthogonalise) {
result$beta <- lapply(result$beta,
\(beta) stats::aggregate(beta ~ unlist(group), FUN = sum)[, - 1])
result$beta <- lapply(result$beta, \(beta) {colnames(beta) <- NULL; as.matrix(beta)})
}
# Unwind standardisation
result$beta <- mapply(unstandardise, beta = result$beta, intercept = result$intercept,
MoreArgs = list(x.c = x.c, x.s = x.s, y.c = y.c, y.s = y.s, loss = loss),
SIMPLIFY = FALSE)
result$intercept <- NULL
# Aggregate coefficients if groups overlap
if (overlap & orthogonalise) {
result$beta <- lapply(result$beta,
\(beta) stats::aggregate(beta ~ c(0, coef.id), FUN = sum)[, - 1])
result$beta <- lapply(result$beta, \(beta) {colnames(beta) <- NULL; as.matrix(beta)})
}
# Warn if maximum iterations reached
if (warn & any(unlist(result$iter.cd) == max.cd.iter)) {
warning('coordinate descent did not converge for at least one set of regularisation parameters')
}
if (warn & any(unlist(result$iter.ls) == max.ls.iter)) {
warning('local search did not converge for at least one set of regularisation parameters')
}
# Return result
class(result) <- 'grpsel'
return(result)
}
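# A minimal usage sketch for grpsel() (kept in comments so it does not execute on
# package load); the simulated data and group structure below are illustrative only:
# set.seed(123)
# n <- 100
# x <- matrix(rnorm(n * 10), n, 10)
# y <- x[, 1:4] %*% rep(1, 4) + rnorm(n)
# group <- rep(1:5, each = 2) # ten predictors in five non-overlapping groups
# fit <- grpsel(x, y, group)
# # Overlapping groups are specified with the list form of `group`; every column
# # of x must belong to at least one group
# fit.overlap <- grpsel(x, y, group = list(1:6, 5:10))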
#==================================================================================================#
# Coefficient function
#==================================================================================================#
#' @title Coefficient function for grpsel object
#'
#' @author Ryan Thompson <ryan.thompson-1@uts.edu.au>
#'
#' @description Extracts coefficients for specified values of the tuning parameters.
#'
#' @param object an object of class \code{grpsel}
#' @param lambda the value of \code{lambda} indexing the desired fit
#' @param gamma the value of \code{gamma} indexing the desired fit
#' @param ... any other arguments
#'
#' @return A matrix of coefficients.
#'
#' @method coef grpsel
#'
#' @export
#'
#' @importFrom stats "coef"
coef.grpsel <- \(object, lambda = NULL, gamma = NULL, ...) {
if (is.null(gamma) & is.null(lambda)) {
do.call(cbind, object$beta)
} else if (!is.null(gamma) & !is.null(lambda)) {
index1 <- which.min(abs(gamma - object$gamma))
index2 <- which.min(abs(lambda - object$lambda[[index1]]))
object$beta[[index1]][, index2, drop = FALSE]
} else if (!is.null(gamma) & is.null(lambda)) {
index <- which.min(abs(gamma - object$gamma))
object$beta[[index]]
} else if (is.null(gamma) & !is.null(lambda)) {
index <- vapply(object$lambda, \(x) which.min(abs(lambda - x)), integer(1))
vapply(seq_along(object$gamma), \(x) object$beta[[x]][, index[x], drop = FALSE],
numeric(nrow(object$beta[[1]])))
}
}
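# A usage sketch (assumes `fit` from the grpsel() sketch above); requested values
# of lambda and gamma are matched to the closest values used in the fit:
# coef(fit)                # coefficients across the whole regularisation surface
# coef(fit, lambda = 0.05) # coefficients at the lambda closest to 0.05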
#==================================================================================================#
# Predict function
#==================================================================================================#
#' @title Predict function for grpsel object
#'
#' @author Ryan Thompson <ryan.thompson-1@uts.edu.au>
#'
#' @description Generate predictions for new data using specified values of the tuning parameters.
#'
#' @param object an object of class \code{grpsel}
#' @param x.new a matrix of new values for the predictors
#' @param lambda the value of \code{lambda} indexing the desired fit
#' @param gamma the value of \code{gamma} indexing the desired fit
#' @param ... any other arguments
#'
#' @return A matrix of predictions.
#'
#' @method predict grpsel
#'
#' @export
#'
#' @importFrom stats "predict"
predict.grpsel <- \(object, x.new, lambda = NULL, gamma = NULL, ...) {
beta <- coef.grpsel(object, lambda, gamma, ...)
if (!is.matrix(x.new)) x.new <- as.matrix(x.new)
cbind(1, x.new) %*% beta
}
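# A usage sketch (assumes `fit` from the grpsel() sketch above; x.new is
# illustrative simulated data):
# x.new <- matrix(rnorm(5 * 10), 5, 10)
# predict(fit, x.new, lambda = 0.05) # predictions at the lambda closest to 0.05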
#==================================================================================================#
# Plot function
#==================================================================================================#
utils::globalVariables(c('ng', 'beta', 'predictor'))
#' @title Plot function for grpsel object
#'
#' @author Ryan Thompson <ryan.thompson-1@uts.edu.au>
#'
#' @description Plot the coefficient profiles from group subset selection for a specified value of
#' \code{gamma}.
#'
#' @param x an object of class \code{grpsel}
#' @param gamma the value of \code{gamma} indexing the desired fit
#' @param ... any other arguments
#'
#' @return A plot of the coefficient profiles.
#'
#' @method plot grpsel
#'
#' @export
#'
#' @importFrom graphics "plot"
plot.grpsel <- \(x, gamma = 0, ...) {
index <- which.min(abs(gamma - x$gamma))
beta <- x$beta[[index]]
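# Drop the intercept (first row) so only predictor coefficients are plotted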
beta <- beta[- 1, , drop = FALSE]
df <- data.frame(beta = as.vector(beta), predictor = as.factor(seq_along(beta[, 1])),
ng = rep(x$ng[[index]], each = nrow(beta)))
df <- df[df$beta != 0, ]
p <- ggplot2::ggplot(df, ggplot2::aes(ng, beta, col = predictor)) +
ggplot2::geom_point() +
ggplot2::xlab('number of groups')
p
}
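# A usage sketch (assumes `fit` from the grpsel() sketch above; ggplot2 must be
# installed):
# plot(fit, gamma = 0) # coefficient profiles against the number of active groups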