Nothing
#' generateSampleDataCat
#'
#' Generate sample clustered categorical data with cluster labels. The
#' probability of a '1' in each cluster for each variable is randomly generated
#' via a Dirichlet (1, ..., cat) distribution, where cat is the number of
#' categories for each variable. For noisy variables, the probability of a '1'
#' is also generated by a Dirichlet (1, ..., cat) distribution but this
#' probability is the same regardless of the cluster membership of the
#' observation. An outcome variable associated with the clustering structure can
#' be generated with a different number of categories, also generated with a
#' Dirichlet distribution. Package 'gtools' must be installed for this function.
#'
#' @param n Number of observations in dataset.
#' @param K Number of clusters desired.
#' @param w A vector of mixture weights (proportion of population in each
#' cluster).
#' @param p Number of clustering variables/covariates in dataset.
#' @param Irrp Number of irrelevant/noisy variables/covariates in dataset. Note
#' that these variables will be the final Irrp columns in the simulated
#' dataset. Total data dimension is p + Irrp.
#' @param yout Default FALSE. Indicate whether an outcome associated with
#' clustering is required.
#' @param cat Number of categories in each covariate. Default is 2.
#' @param ycat Number of categories for the outcome varaible. Default is 2.
#'
#'
#' @returns A list with the following components:
#' \item{data}{A matrix consisting of the simulated data.}
#' \item{trueClusters}{A vector with the simulated cluster assignments.}
#' \item{outcome}{If yout = TRUE, this will be a vector with the outcome
#' variable.}
#'
#' @examples
#' # example code
#' generatedData <- generateSampleDataCat(1000, 4, c(0.1, 0.2, 0.3, 0.4), 100, 0, cat = 3)
#'
#' @importFrom gtools rdirichlet
#' @export
generateSampleDataCat <- function(n, K, w, p, Irrp, yout = FALSE, cat = 2, ycat = 2){
# Variable called "clusterLabels" to store the cluster labels,
clusterLabels <- sample(1:K, n, replace = T, prob = w)
clusterParameters <- array(0, dim = c(p, cat, K))
for(i in 1:K){
for(j in 1:p)
{
for(c in 1:cat){
clusterParameters[, ,i] <- gtools::rdirichlet(p, c(1:cat))
}
}
}
if(Irrp != 0){
unclusteredParameters <- gtools::rdirichlet(Irrp, c(1:cat))
}
# Use cluster labels and cluster parameters to generate sample data
dataMatrix <- matrix(nrow = n, ncol = p + Irrp)
for(i in 1:n)
{
currentClusterLabel <- clusterLabels[i] # Get the cluster label for the i-th person
for(j in 1:p)
{
# Simulate data according to the parameters for the current cluster
dataMatrix[i,j] <- sample(1:cat, 1, replace = TRUE, prob = clusterParameters[j, , currentClusterLabel])
}
if (Irrp != 0){
for (j in 1:Irrp){
#Simulate data according to random probability, no clustering structure
dataMatrix[i,p + j] <- sample(1:cat, 1, replace = TRUE, prob = unclusteredParameters[j,])
}
}
}
if (yout == TRUE){
y <- vector(mode = "numeric", length = n)
yParams <- matrix(0, nrow = K, ncol = ycat)
yParams <- gtools::rdirichlet(K, c(1:ycat))
for (i in 1:n){
currentClusterLabel <- clusterLabels[i]
y[i] <- sample(1:ycat, 1, replace = TRUE, prob = yParams[currentClusterLabel,])
}
}
#Create an "annotation row" to show the cluster label for each person
annotationRow <- data.frame(
Cluster = factor(clusterLabels)
)
#Create data frame for outcome for each person if needed
if (yout == TRUE){
y <- data.frame(
outcome = factor(y)
)
}
#We require the data, outcome and annotation row to have the same rownames:
rownames(annotationRow) <- rownames(dataMatrix) <- paste0("Person", seq(1,n))
if(yout == TRUE){
rownames(y) <- rownames(dataMatrix)
}
if(yout == TRUE){
return(list(data = dataMatrix, trueClusters = annotationRow, outcome = y))
} else{
return(list(data = dataMatrix, trueClusters = annotationRow))
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.