R/generateSampleDataBin.R
In VICatMix: Variational Mixture Models for Clustering Categorical Data

Documented in generateSampleDataBin

#' generateSampleDataBin
#'
#' Generate sample clustered binary data with cluster labels. The probability of
#' a '1' in each cluster for each variable is randomly generated via a Beta(1,
#' 5) distribution, encouraging sparse probabilities which vary across clusters.
#' For noisy variables, the probability of a '1' is also generated by a Beta(1,
#' 5) distribution but this probability is the same regardless of the cluster
#' membership of the observation.
#'
#' @param n Number of observations in dataset.
#' @param K Number of clusters desired.
#' @param w A vector of mixture weights (proportion of population in each
#'   cluster).
#' @param p Number of clustering variables/covariates in dataset.
#' @param Irrp Number of irrelevant/noisy variables/covariates in dataset. Note
#'   that these variables will be the final Irrp columns in the simulated
#'   dataset. Total data dimension is p + Irrp.
#' @param yout Default FALSE. Indicate whether a binary outcome associated with
#'   clustering is required.
#'
#'
#' @returns A list with the following components: 
#'   \item{data}{A matrix consisting of the simulated data.} 
#'   \item{trueClusters}{A vector with the simulated cluster assignments.}
#'   \item{outcome}{If yout = TRUE, this will be a vector with the outcome
#'   variable.}
#'
#' @examples
#' # example code
#' generatedData <- generateSampleDataBin(1000, 4, c(0.1, 0.2, 0.3, 0.4), 100, 0)
#'
#' @importFrom stats rbeta
#' @importFrom stats rbinom
#' @export

generateSampleDataBin <- function(n, K, w, p, Irrp, yout = FALSE){
  # Variable called "clusterLabels" to store the cluster labels, 

  clusterLabels <- sample(1:K, n, replace = T, prob = w)
  
  clusterParameters <- matrix(nrow = K, ncol = p)
  for(i in 1:K)
  {
    for(j in 1:p)
    {
      clusterParameters[i,j] <- stats::rbeta(1, 1, 5) 
      #ie. generate one parameter for each variable in each cluster via Beta dist
    }
  }
  
  if(Irrp != 0){
    unclusteredParameters <- stats::rbeta(Irrp, 1, 5)
  }
  
  # Use cluster labels and cluster parameters to generate sample data

  dataMatrix    <- matrix(nrow = n, ncol = p + Irrp)
  for(i in 1:n)
  {
    currentClusterLabel <- clusterLabels[i]  # Get the cluster label for the i-th person
    for(j in 1:p)
    {
      # Simulate data according to the parameters for the current cluster
      dataMatrix[i,j] <- stats::rbinom(1,1, clusterParameters[currentClusterLabel,j]) 
      #Generated via Bernoulli trial and uses beta probability
    }
    if (Irrp != 0){
      for (j in 1:Irrp){
        #Simulate data according to random probability, no clustering structure
        dataMatrix[i,p + j] <- stats::rbinom(1, 1, unclusteredParameters[j])
      }
    }
  }
  
  if (yout == TRUE){
    y <- vector(mode = "numeric", length = n)
    yParams <- vector(mode = "numeric", length = K) 
    
    for (k in 1:K){
      yParams[k] <- stats::rbeta(1, 1, 5)
    }
    
    for (i in 1:n){
      currentClusterLabel <- clusterLabels[i]
      y[i] <- stats::rbinom(1, 1, yParams[currentClusterLabel])
    }
    
  }
  
  #Create an "annotation row" to show the cluster label for each person
  annotationRow <- data.frame(
    Cluster = factor(clusterLabels)
  )
  #Create data frame for outcome for each person if needed
  if (yout == TRUE){
    y <- data.frame(
      outcome = factor(y)
    )
  }
  
  #We require the data, outcome and annotation row to have the same rownames:
  rownames(annotationRow) <- rownames(dataMatrix) <- paste0("Person", seq(1,n))
  if(yout == TRUE){
    rownames(y) <- rownames(dataMatrix)
  }
  
  if(yout == TRUE){
    return(list(data = dataMatrix, trueClusters = annotationRow, outcome = y))
  } else{
    return(list(data = dataMatrix, trueClusters = annotationRow))
  }
  }