R/simdata-guo.r
In sortinghat: sortinghat

Documented in simdata_guo

#' Generates data from \code{K} multivariate normal data populations having the
#' covariance structure from Guo et al. (2007).
#'
#' We generate \eqn{n_k} observations \eqn{(k = 1, \ldots, K)} from each of
#' \eqn{K} multivariate normal distributions. Let the \eqn{k}th population have
#' a \eqn{p}-dimensional multivariate normal distribution, \eqn{N_p(\mu_k,
#' \Sigma_k)} with mean vector \eqn{\mu_k} and positive-definite covariance
#' matrix \eqn{\Sigma_k}. Each covariance matrix \eqn{\Sigma_k} consists of
#' block-diagonal autocorrelation matrices.
#' 
#' The \eqn{k}th class covariance matrix is defined as
#' \deqn{\Sigma_k = \Sigma^{(\rho)} \oplus \Sigma^{(-\rho)} \oplus \ldots
#' \oplus \Sigma^{(\rho)},} where \eqn{\oplus} denotes the direct sum and the
#' \eqn{(i,j)}th entry of \eqn{\Sigma^{(\rho)}} is
#' \deqn{\Sigma_{ij}^{(\rho)} = \{ \rho^{|i - j|} \}.}
#'
#' The matrix \eqn{\Sigma^{(\rho)}} is referred to as a block. Its dimensions
#' are provided in the \code{block_size} argument, and the number of blocks are
#' specified in the \code{num_blocks} argument.
#'
#' Each matrix \eqn{\Sigma_k} is generated by the
#' \code{\link{cov_block_autocorrelation}} function.
#' 
#' The number of populations, \code{K}, is determined from the length of the
#' vector of sample sizes, code{n}. The mean vectors can be given in a list of
#' length \code{K}. If one mean is given (as a vector or a list having 1
#' element), then all populations share this common mean.
#'
#' The block sizes can be given as a numeric vector or a single value, in which
#' case the degrees of freedom is replicated \code{K} times. The same logic
#' applies to \code{num_blocks}, \code{rho}, and \code{sigma2}.
#' 
#' For each class, the number of features, \code{p}, is computed as
#' \code{block_size * num_blocks}. The values for \code{p} must agree for each
#' class.
#'
#' The block-diagonal covariance matrix with autocorrelated blocks was
#' popularized by Guo et al. (2007) for studying classification of
#' high-dimensional data.
#'
#' @references Guo, Y., Hastie, T., & Tibshirani, R. (2007). "Regularized linear
#' discriminant analysis and its application in microarrays," Biostatistics, 8,
#' 1, 86-100.
#' @importFrom mvtnorm rmvnorm
#' @export
#' @param n a vector (of length K) of the sample sizes for each population
#' @param mean a vector or a list (of length K) of mean vectors
#' @param block_size a vector (of length K) of the sizes of the square block
#' matrices for each population. See details.
#' @param num_blocks a vector (of length K) giving the number of block matrices
#' for each population. See details.
#' @param rho a vector (of length K) of the values of the autocorrelation
#' parameter for each class covariance matrix
#' @param sigma2 a vector (of length K) of the variance coefficients for each
#' class covariance matrix
#' @param seed seed for random number generation (If \code{NULL}, does not set
#' seed)
#' @return named list containing:
#' \describe{
#'   \item{x:}{A matrix whose rows are the observations generated and whose
#'   columns are the \code{p} features (variables)}
#'   \item{y:}{A vector denoting the population from which the observation in
#'   each row was generated.}
#' }
#' @examples
#' # Generates 10 observations from two multivariate normal populations having
#' # a block-diagonal autocorrelation structure.
#' block_size <- 3
#' num_blocks <- 3
#' p <- block_size * num_blocks
#' means_list <- list(seq_len(p), -seq_len(p))
#' data <- simdata_guo(n = c(10, 10), mean = means_list, block_size = block_size,
#'                     num_blocks = num_blocks, rho = 0.9, seed = 42)
#' dim(data$x)
#' table(data$y)
#'
#' # Generates 15 observations from each of three multivariate normal
#' # populations having block-diagonal autocorrelation structures. The
#' # covariance matrices are unequal.
#' p <- 16
#' block_size <- c(2, 4, 8)
#' num_blocks <- p / block_size
#' rho <- c(0.1, 0.5, 0.9)
#' sigma2 <- 1:3
#' mean_list <- list(rep.int(-5, p), rep.int(0, p), rep.int(5, p))
#'
#' set.seed(42)
#' data2 <- simdata_guo(n = c(15, 15, 15), mean = mean_list,
#'                     block_size = block_size, num_blocks = num_blocks,
#'                     rho = rho, sigma2 = sigma2)
#' dim(data2$x)
#' table(data2$y)
simdata_guo <- function(n, mean, block_size, num_blocks, rho, sigma2 = 1,
                        seed = NULL) {

  n <- as.integer(n)
  block_size <- as.integer(block_size)
  num_blocks <- as.integer(num_blocks)
  rho <- as.numeric(rho)

  # The number of populations
  K <- length(n)

  # If only one block size is given, replicate K times
  if (length(block_size) == 1) {
    block_size <- rep.int(block_size, times = K)
  } else if (length(block_size) != K) {
    stop("The length of 'block_size' must equal K.")
  }

  # If only one num_blocks is given, replicate K times
  if (length(num_blocks) == 1) {
    num_blocks <- rep.int(num_blocks, times = K)
  } else if (length(num_blocks) != K) {
    stop("The length of 'num_blocks' must equal K.")
  }

  # If only one rho is given, replicate K times
  if (length(rho) == 1) {
    rho <- rep.int(rho, times = K)
  } else if (length(rho) != K) {
    stop("The length of 'rho' must equal K.")
  }

  # If only one sigma2 is given, replicate K times
  if (length(sigma2) == 1) {
    sigma2 <- rep.int(sigma2, times = K)
  } else if (length(sigma2) != K) {
    stop("The length of 'sigma2' must equal K.")
  }

  # The number of features for each class. These should be equal.
  p <- block_size * num_blocks

  if (!all_equal(p)) {
    stop("The number of features 'p' must be the same for all classes.")
  }

  if (!is.list(mean)) {
    mean <- list(mean)
  }
  # If only one mean is given, replicate K times
  if (length(mean) == 1) {
    mean <- replicate(K, mean)
  } else if (length(mean) != K) {
    stop("The number of 'mean' vectors must equal 'K'.")
  }
  if (!all(sapply(mean, length) == p)) {
    stop("The length of the 'mean' vectors must equal 'p'.")
  }
  
  # Construct a list of blockdiagonal autocorrelated covariance matrices
  cov_list <- lapply(seq_len(K), function(k) {
      cov_block_autocorrelation(num_blocks = num_blocks[k],
                                block_size = block_size[k],
                                rho = rho[k],
                                sigma2 = sigma2[k])
  })

  # Generate data via simdata_normal
  simdata_normal(n = n, mean = mean, cov = cov_list, seed = seed)
}