R/generateSampleDataCat.R
In VICatMix: Variational Mixture Models for Clustering Categorical Data

Documented in generateSampleDataCat

#' generateSampleDataCat
#'
#' Generate sample clustered categorical data with cluster labels. The
#' probability of a '1' in each cluster for each variable is randomly generated
#' via a Dirichlet (1, ..., cat) distribution, where cat is the number of
#' categories for each variable. For noisy variables, the probability of a '1'
#' is also generated by a Dirichlet (1, ..., cat) distribution but this
#' probability is the same regardless of the cluster membership of the
#' observation. An outcome variable associated with the clustering structure can
#' be generated with a different number of categories, also generated with a
#' Dirichlet distribution. Package 'gtools' must be installed for this function.
#'
#' @param n Number of observations in dataset.
#' @param K Number of clusters desired.
#' @param w A vector of mixture weights (proportion of population in each
#'   cluster).
#' @param p Number of clustering variables/covariates in dataset.
#' @param Irrp Number of irrelevant/noisy variables/covariates in dataset. Note
#'   that these variables will be the final Irrp columns in the simulated
#'   dataset. Total data dimension is p + Irrp.
#' @param yout Default FALSE. Indicate whether an outcome associated with
#'   clustering is required.
#' @param cat Number of categories in each covariate. Default is 2.
#' @param ycat Number of categories for the outcome varaible. Default is 2.
#'
#'
#' @returns A list with the following components: 
#'   \item{data}{A matrix consisting of the simulated data.} 
#'   \item{trueClusters}{A vector with the simulated cluster assignments.}
#'   \item{outcome}{If yout = TRUE, this will be a vector with the outcome
#'   variable.}
#'
#' @examples
#' # example code
#' generatedData <- generateSampleDataCat(1000, 4, c(0.1, 0.2, 0.3, 0.4), 100, 0, cat = 3)
#'
#' @importFrom gtools rdirichlet
#' @export

generateSampleDataCat <- function(n, K, w, p, Irrp, yout = FALSE, cat = 2, ycat = 2){
  
  # Variable called "clusterLabels" to store the cluster labels, 
  
  clusterLabels <- sample(1:K, n, replace = T, prob = w)
  
  clusterParameters <- array(0, dim = c(p, cat, K))
  for(i in 1:K){
    for(j in 1:p)
    {
      for(c in 1:cat){
        clusterParameters[, ,i] <- gtools::rdirichlet(p, c(1:cat))
      }
    }
  }
  
  if(Irrp != 0){
    unclusteredParameters <- gtools::rdirichlet(Irrp, c(1:cat))
  }
  
  # Use cluster labels and cluster parameters to generate sample data
  
  dataMatrix    <- matrix(nrow = n, ncol = p + Irrp)
  for(i in 1:n)
  {
    currentClusterLabel <- clusterLabels[i]  # Get the cluster label for the i-th person
    for(j in 1:p)
    {
      # Simulate data according to the parameters for the current cluster
      dataMatrix[i,j] <- sample(1:cat, 1, replace = TRUE, prob = clusterParameters[j, , currentClusterLabel])
    }
    if (Irrp != 0){
      for (j in 1:Irrp){
        #Simulate data according to random probability, no clustering structure
        dataMatrix[i,p + j] <- sample(1:cat, 1, replace = TRUE, prob = unclusteredParameters[j,])
      }
    }
  }
  
  if (yout == TRUE){
    y <- vector(mode = "numeric", length = n)
    yParams <- matrix(0, nrow = K, ncol = ycat) 
    
    yParams <- gtools::rdirichlet(K, c(1:ycat)) 
    
    for (i in 1:n){
      currentClusterLabel <- clusterLabels[i]
      y[i] <- sample(1:ycat, 1, replace = TRUE, prob = yParams[currentClusterLabel,])
    }
    
  }
  
  #Create an "annotation row" to show the cluster label for each person
  annotationRow <- data.frame(
    Cluster = factor(clusterLabels)
  )
  #Create data frame for outcome for each person if needed
  if (yout == TRUE){
    y <- data.frame(
      outcome = factor(y)
    )
  }
  
  #We require the data, outcome and annotation row to have the same rownames:
  rownames(annotationRow) <- rownames(dataMatrix) <- paste0("Person", seq(1,n))
  if(yout == TRUE){
    rownames(y) <- rownames(dataMatrix)
  }
  
  if(yout == TRUE){
    return(list(data = dataMatrix, trueClusters = annotationRow, outcome = y))
  } else{
    return(list(data = dataMatrix, trueClusters = annotationRow))
  }
}