R/generate.matrix.R
In abess: Fast Best Subset Selection

Documented in generate.matrix

#' @title Generate matrix composed of a sparse matrix and low-rank matrix
#'
#' @description Generate simulated matrix that is the superposition of
#' a low-rank component and a sparse component.
#'
#' @param n The number of observations.
#' @param p The number of predictors of interest.
#' @param rank The rank of low-rank matrix.
#' @param support.size The number of nonzero coefficients in the underlying regression
#' model. Can be omitted if \code{beta} is supplied.
#' @param beta The coefficient values in the underlying regression model.
#' If it is supplied, \code{support.size} would be omitted.
#' @param snr A positive value controlling the signal-to-noise ratio (SNR).
#' A larger SNR implies the identification of sparse matrix is much easier.
#' Default \code{snr = Inf} enforces no noise exists. 
#' @param sigma A numerical value supplied the variance of the gaussian noise. 
#' Default \code{sigma = NULL} implies it is determined by \code{snr}.
#' @param seed random seed. Default: \code{seed = 1}.
#'
#'
#' @return A \code{list} object comprising:
#' \item{x}{An \eqn{n}-by-\eqn{p} matrix.}
#' \item{L}{The latent low rank matrix.}
#' \item{S}{The latent sparse matrix.}
#'
#' @details
#' The low rank matrix \eqn{L} is generated by \eqn{L = UV}, where
#' \eqn{U} is an \eqn{n}-by-\eqn{rank} matrix and
#' \eqn{V} is a \eqn{rank}-by-\eqn{p} matrix.
#' Each element in \eqn{U} (or \eqn{V}) are i.i.d. drawn from \eqn{N(0, 1/n)}.
#'
#' The sparse matrix \eqn{S} is an \eqn{n}-by-\eqn{rank} matrix.
#' It is generated by choosing a support set of size
#' \code{support.size} uniformly at random.
#' The non-zero entries in \eqn{S} are independent Bernoulli (-1, +1) entries.
#'
#' The noise matrix \eqn{N} is an \eqn{n}-by-\eqn{rank} matrix,
#' the elements in \eqn{N} are i.i.d. gaussian random variable
#' with standard deviation \eqn{\sigma}.
#'
#' The SNR is defined as
#' as the variance of vectorized matrix \eqn{L + S} divided
#' by \eqn{\sigma^2}.
#'
#' The matrix \eqn{x} is the superposition of \eqn{L}, \eqn{S}, \eqn{N}:
#' \deqn{x = L + S + N.}
#'
#' @author Jin Zhu
#'
#' @export
#'
#' @examples
#' # Generate simulated data
#' n <- 30
#' p <- 20
#' dataset <- generate.matrix(n, p)
#' \donttest{
#' stats::heatmap(as.matrix(dataset[["S"]]),
#'   Rowv = NA,
#'   Colv = NA,
#'   scale = "none",
#'   col = grDevices::cm.colors(256),
#'   frame.plot = TRUE,
#'   margins = c(2.4, 2.4)
#' )
#' }
generate.matrix <- function(n,
                            p,
                            rank = NULL,
                            support.size = NULL,
                            beta = NULL,
                            snr = Inf,
                            sigma = NULL,
                            seed = 1) {
  set.seed(seed)

  stopifnot(length(n) == 1)
  stopifnot(is.numeric(n))
  check_integer_warning_variable(n, "n")
  n <- as.integer(n)

  stopifnot(length(p) == 1)
  stopifnot(is.numeric(p))
  check_integer_warning_variable(p, "p")
  p <- as.integer(p)

  if (is.null(rank)) {
    L_rank <- round(0.05 * n)
  } else {
    stopifnot(rank <= min(n, p))
    check_integer_warning_variable(rank, "rank")
    L_rank <- as.integer(rank)
  }
  x <- matrix(rnorm(n * L_rank, sd = 1 / sqrt(n)), nrow = n)
  y <- matrix(rnorm(p * L_rank, sd = 1 / sqrt(p)), ncol = p)
  L <- x %*% y

  if (is.null(support.size)) {
    k <- round(0.05 * n * p)
  } else {
    check_integer_warning_variable(support.size, "support.size")
    k <- as.integer(support.size)
  }
  index <- sample(0:(n * p - 1), size = k, replace = FALSE)
  row_index <- floor(index / p)
  col_index <- index %% p
  value <- sample(c(-1, 1), size = k, replace = TRUE)
  S <- Matrix::sparseMatrix(
    dims = c(n, p),
    i = row_index,
    j = col_index,
    x = value,
    index1 = FALSE
  )
  x <- L + S

  if (is.null(sigma)) {
    signal_var <- stats::var(as.vector(x))
    noise_var <- sqrt(signal_var / snr)
  } else {
    noise_var <- sigma
  }
  noise <-
    matrix(rnorm(n = n * p, sd = noise_var),
      nrow = n,
      ncol = p
    )
  x <- x + noise
  x <- Matrix::as.matrix(x)

  set.seed(NULL)
  colnames(x) <- paste0("x", 1:p)
  return(list(
    x = x,
    L = L,
    S = S,
    rank = L_rank,
    support.size = k
  ))
}