rare: Linear Model with Tree-Based Lasso Regularization for Rare Features

Documented in find.leaves plot.rarefit print.rarefit rarefit tree.matrix

#' Find all descendant leaves of a node in an hclust tree
#'
#' The function recursively finds all leaves that are descendants of a
#' node in an \code{hclust} tree.
#'
#' @param ind Index of the tree node. For an \code{hclust} tree
#' of \code{p} leaves, -\code{j} denotes the \code{j}th leaf and \code{k}
#' denotes the interior node formed at the \code{k}th merging in constructing
#' the tree. The range of \code{ind} is
#' \{-1, ..., -p, 1,..., p-1\} where \code{p-1} is the number of interior nodes.
#' @param merge A (\code{p-1})-by-2 matrix that encodes the order of
#' mergings in constructing the tree. \code{merge} uses the same notation for
#' nodes and mergings in an \code{hclust} object.
#' See \code{\link[stats]{hclust}} for details.
#'
#' @return Returns a sequence of indices for descendant leaves
#' in the leaf set \{1, ..., p\}. Unlike the notation used in
#' \code{ind}, we use positive integers to denote leaves here.
#'
#' @examples
#' \dontrun{
#' hc <- hclust(dist(USArrests), "ave")
#' # Descendant leaves of the 10th leaf (should be iteself)
#' find.leaves(-10, hc$merge)
#'
#' # Descendant leaves of the 10th interior node
#' find.leaves(10, hc$merge)
#'
#' # Descendant leaves of the root (should be all leaves)
#' ind_root <- nrow(hc$merge)
#' all.equal(find.leaves(ind_root, hc$merge), hc$order)
#' }
#'
#' @export
find.leaves <- function(ind, merge) {
  n_interior <- nrow(merge)
  stopifnot(ind >= -n_interior - 1 & ind <= n_interior)
  c(find_leaves(ind, merge)) # return as vector
}


#' Generate matrix A encoding ancestor-descendant relationships
#' in an hclust tree
#'
#' The function generates the binary matrix \code{A} defined in Yan
#' and Bien (2018). The matrix encodes ancestor-descendant relationships between leaves
#' and tree nodes in an \code{hclust} tree.
#'
#' @param hc An \code{hclust} object.
#'
#' @return Returns a \code{nvars}-by-\code{nnodes} binary matrix \code{A}
#' where \code{nvars} is the number of leaves (we associate covariate with leaf),
#' and \code{nnodes} is the number of tree nodes (including both leaves and interior nodes).
#' For an \code{hclust} tree, \code{nnodes} = \code{2*nvars-1}. \code{A[i,j]} is 1 if the \code{i}th leaf
#' is a descendant of the \code{j}th node in the tree, and 0 otherwise. \emph{By default, we
#' let the first \code{nvars} columns correspond to leaves and the remaining
#' \code{nvars-1} columns correspond to interior nodes.}
#' \code{A} is in sparse matrix format (inherit from class
#' \code{\link[Matrix]{sparseMatrix}} as in package \code{Matrix}).
#'
#' @seealso \code{\link{find.leaves}} for finding descendant leaves of a node.
#'
#' @examples
#' \dontrun{
#' # For a perfect binary tree of depth 2 below
#' #
#' #      3
#' #      /\
#' #    1    2
#' #   /\    /\
#' # -1 -2 -3 -4
#' #
#' # A can expressed as the following:
#' A_true <- cbind(diag(4),
#'                 as.matrix(c(1, 1, 0, 0)),
#'                 as.matrix(c(0, 0, 1, 1)),
#'                 as.matrix(c(1, 1, 1, 1)))
#' # Now use tree.matrix to generate A
#' tree0 <- list()
#' tree0$merge <- matrix(c(-1, -2, -3, -4, 1, 2),
#'                       ncol = 2, byrow = TRUE)
#' tree0$labels <- c("leaf1", "leaf2", "leaf3", "leaf4")
#' A <- tree.matrix(tree0)
#' all(A_true == as.matrix(A))
#'
#' # Another example
#' hc <- hclust(dist(USArrests), "ave")
#' A <- tree.matrix(hc)
#' }
#'
#' @references Yan, X. and Bien, J. (2018) \emph{Rare Feature Selection in High Dimensions}, \url{https://arxiv.org/abs/1803.06675}.
#'
#' @export
tree.matrix <- function(hc) {
  p <- nrow(hc$merge) + 1 # number of leaves
  n_interior <- nrow(hc$merge) # number of interior nodes
  A_i <- c(as.list(seq(p)), sapply(seq(n_interior), function(x) find.leaves(x, hc$merge)))
  A_j <- sapply(seq(length(A_i)), function(x) rep(x, len=length(A_i[[x]])))
  A <- sparseMatrix(i = unlist(A_i), j = unlist(A_j), x = rep(1, len=length(unlist(A_i))))
  A
}


#' Fit the rare feature selection model
#'
#' Fit the rare feature selection model proposed in Yan and Bien (2018):
#' \deqn{min_{\beta, \gamma} 0.5 * ||y - X\beta - \beta_01_n||_2^2 +
#' \lambda * (\alpha * ||\gamma_{-root}||_1 + (1-\alpha) * ||\beta||_1)}
#' using an alternating direction method of multipliers (ADMM) algorithm
#' described in Algorithm 1 of the same paper.
#' The regularization path is computed over a two-dimensional grid of
#' regularization parameters: \code{lambda} and \code{alpha}. Of the two,
#' \code{lambda} controls the overall amount of regularization, and \code{alpha}
#' controls the tradeoff between sparsity and fusion of \eqn{\beta} (larger \code{alpha}
#' induces more fusion in \eqn{\beta}).
#'
#' The function splits model fitting path by \code{alpha}. At each \code{alpha} value,
#' the model is fit on the entire sequence of \code{lambda} with warm start. We recommend
#' including an intercept (by setting \code{intercept=T}) unless the input data have been
#' centered.
#'
#' @param y Length-\code{nobs} response variable.
#' @param X \code{nobs}-by-\code{nvars} input matrix:
#' each row is an observation vector and each column stores a count covariate.
#' @param A \code{nvars}-by-\code{nnodes} binary matrix encoding ancestor-descendant relationships
#' between leaves and tree nodes, where \code{nnodes} is the total number of tree nodes.
#' \code{A[i,j]} is 1 if the \code{i}th leaf is a descendant of the \code{j}th
#' node in the tree, and 0 otherwise. \code{A} should be in sparse matrix format
#' (inherit from class \code{\link[Matrix]{sparseMatrix}} as in package \code{Matrix}).
#' When \code{A} is \code{NULL}, the function will learn \code{A} from \code{hc}.
#' @param Q \code{(nvars+nnodes)}-by-\code{nnodes} matrix with columns forming an orthonormal
#' basis for the null space of \eqn{[I_nvars:-A]}. When \code{Q} is \code{NULL}, the function will learn
#' \code{Q} using the singular value decomposition.
#' @param hc An \code{hclust} tree of \code{nvars} leaves where each leaf corresponds
#' to a covariate. If the tree is not an \code{hclust} object, user needs to provide the matrix \code{A} instead.
#' @param intercept Whether intercept be fitted (default = TRUE) or set to zero (FALSE).
#' @param lambda A user-supplied \code{lambda} sequence. Typical usage is to
#' have the program compute its own \code{lambda} sequence based on
#' \code{nlam} and \code{lam.min.ratio}.
#' @param alpha A user-supplied \code{alpha} sequence. If letting the program
#' compute its own \code{alpha} sequence, a length-\code{nalpha} sequence of
#' equally-spaced \code{alpha} values between 0 and 1 will be used. In practice,
#' user may want to provide a more fine \code{alpha} sequence to tune
#' the model to its best performance (e.g., \code{alpha = c(1-exp(seq(0, log(1e-2), len = nalpha - 1)), 1)}).
#' @param nlam Number of \code{lambda} values (default = 50).
#' @param lam.min.ratio Smallest value for \code{lambda}, as a fraction of
#' \code{lambda.max} (i.e., the smallest value for which all coefficients are
#' zero). The default value is \code{1e-4}.
#' @param nalpha Number of \code{alpha} values (default = 10).
#' @param rho Penalty parameter for the quadratic penalty in the ADMM algorithm.
#' The default value is \code{1e-2}.
#' @param eps1 Convergence threshold in terms of the absolute tolerance level
#' for the ADMMM algorithm. The default value is \code{1e-6}.
#' @param eps2 Convergence threshold in terms of the relative tolerance level
#' for the ADMM algorithm. The default value is \code{1e-5}.
#' @param maxite Maximum number of passes over the data for every pair of
#' (\code{lambda}, \code{alpha}). The default value is \code{1e6}.
#'
#' @return Returns regression coefficients for \code{beta} and \code{gamma} and
#' intercept \code{beta0}. We use a \emph{matrix-nested-within-list} structure to store the coefficients: each list
#' item corresponds to an \code{alpha} value; matrix (or vector) in that list item stores
#' coefficients at various \code{lambda} values by columns (or entries).
#'
#' \item{beta0}{Length-\code{nalpha} list with each item storing
#' intercept across various \code{lambda} in a vector: \code{beta0[[j]][i]}
#' is intercept fitted at (\code{lambda[i]}, \code{alpha[j]}).
#' If \code{intercept = FALSE}, \code{beta0} is \code{NULL}.}
#' \item{beta}{Length-\code{nalpha} list with each item storing
#' \code{beta} coefficient at various \code{lambda} in columns of a \code{nvars}-by-\code{nlam} matrix:
#' \code{beta[[j]][, i]} is \code{beta} coeffcient fitted at (\code{lambda[i]}, \code{alpha[j]}).}
#' \item{gamma}{Length-\code{nalpha} list with each item storing
#' \code{gamma} coefficient at various \code{lambda} in columns of a \code{nnodes}-by-\code{nlam} matrix:
#' \code{gamma[[j]][, i]} is \code{gamma} coeffcient vector fitted at (\code{lambda[i]}, \code{alpha[j]}).
#' If \code{alpha[j] = 0}, the problem becomes the lasso on \code{beta} and is solved
#' with \code{\link[glmnet]{glmnet}} on \code{beta}, in which case \code{gamma[[j]] = NA}.}
#' \item{lambda}{Sequence of \code{lambda} values used in model fit.}
#' \item{alpha}{Sequence of \code{alpha} values used in model fit.}
#' \item{A}{Binary matrix encoding ancestor-descendant relationship between leaves and nodes in the tree.}
#' \item{Q}{Matrix with columns forming an orthonormal basis for the null space of \eqn{[I_nvars:-A]}.}
#' \item{intercept}{Whether an intercept is included in model fit.}
#'
#' @examples
#' \dontrun{
#' # See vignette for more details.
#' set.seed(100)
#' ts <- sample(1:length(data.rating), 400) # Train set indices
#' # Fit the model on train set
#' ourfit <- rarefit(y = data.rating[ts], X = data.dtm[ts, ], hc = data.hc, lam.min.ratio = 1e-6,
#'                   nlam = 20, nalpha = 10, rho = 0.01, eps1 = 1e-5, eps2 = 1e-5, maxite = 1e4)
#' }
#'
#' @seealso \code{\link{rarefit.cv}}, \code{\link{rarefit.predict}}
#'
#' @references Yan, X. and Bien, J. (2018) \emph{Rare Feature Selection in High Dimensions}, \url{https://arxiv.org/abs/1803.06675}.
#'
#' @export
rarefit <- function(y, X, A = NULL, Q = NULL, hc, intercept = T, lambda = NULL, alpha = NULL,
                    nlam = 50, lam.min.ratio = 1e-4, nalpha = 10,
                    rho = 1e-2, eps1 = 1e-6, eps2 = 1e-5, maxite = 1e6) {
  if (is.null(A)) {
    if (class(hc) != "hclust") stop("The tree needs to be an hclust object.")
    A <- tree.matrix(hc)
  } else {
    if (class(A) != "dgCMatrix") stop("A is not in sparse matrix format.")
  }
  n <- nrow(X)
  p <- ncol(X)
  nnodes <- ncol(A)

  X_use <- X <- as.matrix(X) # X to-be-used for model fit
  y_use <- as.vector(y) # y to-be-used for model fit
  # Center X and y and intercept is to be included
  if (intercept) {
    X_use <- X - matrix(rep(1, times = n), ncol = 1) %*% matrix(colMeans(X), nrow = 1)
    y_use <- y - mean(y)
  }

  # Generate alpha sequence
  if (is.null(alpha)) {
    alpha <- seq(0, 1, len = nalpha)
  } else {
    if (min(alpha) < 0 || max(alpha) > 1) stop("alpha range is out of [0, 1].")
    #alpha <- sort(alpha)
    nalpha <- length(alpha)
  }
  # Generate lambda sequence
  if(is.null(lambda)) {
    lambda <- max(abs(t(X_use) %*% y_use))/n * exp(seq(0, log(lam.min.ratio), len = nlam))
  } else {
    if (min(lambda) < 0) stop("lambda cannot be negative.")
    #lambda <- sort(lambda)
    nlam <- length(lambda)
  }

  # SVD of (I:-A)
  if (is.null(Q)) Q <- svdA(A)
  # SVD of X
  E <- svdX(X_use, rho)
  # Two implications up to this point: 1. Q and E will be stored in memory and be
  # passed in the solver as arguments (which can be problematic when n or p is large).
  # 2. ADMM iterates at a fixed rho. If varying rho is adopted, E should be updated as well.

  # Iterate solution paths along alpha
  beta0 <- beta <- gamma <- list()
  for (i in seq(nalpha)) {
    if (alpha[i] == 0) {
      # lasso on beta
      ret <- glmnet(X_use, y_use, family = "gaussian", lambda = lambda, standardize = F,
                    intercept = F, thresh = min(eps1, eps2), maxit = maxite)
      beta[[i]] <- as.matrix(ret$beta)
      gamma[[i]] <- NA
    } else if (alpha[i] == 1) {
      # lasso on non-root gamma
      ret <- glmnet(X_use %*% A, y_use, family = "gaussian", lambda = lambda, standardize = F,
                    intercept = F, penalty.factor = c(rep(1, nnodes-1), 0),
                    thresh = min(eps1, eps2), maxit = maxite)
      beta[[i]] <- as.matrix(A %*% ret$beta)
      gamma[[i]] <- as.matrix(ret$beta)
    } else {
      # general case when 0 < alpha < 1
      ret <- our_solver(X_use, as.matrix(y_use), Q, E, lambda, alpha[i], rho, eps1, eps2, maxite)
      beta[[i]] <- ret$beta
      gamma[[i]] <- ret$gamma
    }
    cat(sprintf("Finished model fits for alpha[%s].\n", i))
  }
  # Take care of intercept
  if (intercept) {
    beta0 <- lapply(beta, function(b) (sum(y) - c(matrix(colSums(X), nrow = 1) %*% b))/n)
  }
  out <- list(beta0 = beta0, beta = beta, gamma = gamma, lambda = lambda, alpha = alpha, A = A, Q = Q, intercept = intercept)
  class(out) <- "rarefit"
  return(out)
}

#' Print information about a rarefit object
#'
#' @param x rarefit object
#' @param ... additional parameters (not used)
#' @export
#' @method print rarefit
print.rarefit <- function(x, ...) {
  cat("Number of groups (for each alpha/lambda pair):", fill = TRUE)
  count_groups <- function(gvec) length(unique(group.recover(gvec, x$A)))
  num_groups <- matrix(NA, length(x$lambda), length(x$alpha))
  for (i in seq_along(x$gamma)) {
    if (length(x$gamma[[i]]) == 1) next
    num_groups[, i] <- apply(x$gamma[[i]], 2, count_groups)
  }
  rownames(num_groups) <- paste0("lambda", 1:length(x$lambda))
  colnames(num_groups) <- paste0("alpha", 1:length(x$alpha))
  print(num_groups)
}

#' Plot a rarefit object
#'
#' @param x rarefit object
#' @param y unused argument
#' @param alpha_index specify which alpha value to plot
#' @param type "beta" or "gamma" determines which path is to be plotted
#' @param ... additional parameters passed to `matplot`
#' @export
#' @method plot rarefit
plot.rarefit <- function(x, y, alpha_index = NULL, type = c("beta", "gamma"),
                         ...) {
  if (is.null(alpha_index)) stop("Please specify an alpha_index.")
  type <- match.arg(type)
  if (length(x$gamma[[alpha_index]]) == 1 & type == "gamma")
    stop("Only beta available for this alpha_index.")
  matplot(x$lambda,
          t(x[[type]][[alpha_index]]), type = "l", log="x",
          xlab = "lambda",
          ylab = type,
          main = paste0("Regularization path for alpha = ",
                        round(x$alpha[alpha_index], 4)),
          ...)
}