Nothing
#' generateSampleDataBin
#'
#' Generate sample clustered binary data with cluster labels. The probability of
#' a '1' in each cluster for each variable is randomly generated via a Beta(1,
#' 5) distribution, encouraging sparse probabilities which vary across clusters.
#' For noisy variables, the probability of a '1' is also generated by a Beta(1,
#' 5) distribution but this probability is the same regardless of the cluster
#' membership of the observation.
#'
#' @param n Number of observations in dataset.
#' @param K Number of clusters desired.
#' @param w A vector of mixture weights (proportion of population in each
#' cluster).
#' @param p Number of clustering variables/covariates in dataset.
#' @param Irrp Number of irrelevant/noisy variables/covariates in dataset. Note
#' that these variables will be the final Irrp columns in the simulated
#' dataset. Total data dimension is p + Irrp.
#' @param yout Default FALSE. Indicate whether a binary outcome associated with
#' clustering is required.
#'
#'
#' @returns A list with the following components:
#' \item{data}{A matrix consisting of the simulated data.}
#' \item{trueClusters}{A vector with the simulated cluster assignments.}
#' \item{outcome}{If yout = TRUE, this will be a vector with the outcome
#' variable.}
#'
#' @examples
#' # example code
#' generatedData <- generateSampleDataBin(1000, 4, c(0.1, 0.2, 0.3, 0.4), 100, 0)
#'
#' @importFrom stats rbeta
#' @importFrom stats rbinom
#' @export
generateSampleDataBin <- function(n, K, w, p, Irrp, yout = FALSE){
# Variable called "clusterLabels" to store the cluster labels,
clusterLabels <- sample(1:K, n, replace = T, prob = w)
clusterParameters <- matrix(nrow = K, ncol = p)
for(i in 1:K)
{
for(j in 1:p)
{
clusterParameters[i,j] <- stats::rbeta(1, 1, 5)
#ie. generate one parameter for each variable in each cluster via Beta dist
}
}
if(Irrp != 0){
unclusteredParameters <- stats::rbeta(Irrp, 1, 5)
}
# Use cluster labels and cluster parameters to generate sample data
dataMatrix <- matrix(nrow = n, ncol = p + Irrp)
for(i in 1:n)
{
currentClusterLabel <- clusterLabels[i] # Get the cluster label for the i-th person
for(j in 1:p)
{
# Simulate data according to the parameters for the current cluster
dataMatrix[i,j] <- stats::rbinom(1,1, clusterParameters[currentClusterLabel,j])
#Generated via Bernoulli trial and uses beta probability
}
if (Irrp != 0){
for (j in 1:Irrp){
#Simulate data according to random probability, no clustering structure
dataMatrix[i,p + j] <- stats::rbinom(1, 1, unclusteredParameters[j])
}
}
}
if (yout == TRUE){
y <- vector(mode = "numeric", length = n)
yParams <- vector(mode = "numeric", length = K)
for (k in 1:K){
yParams[k] <- stats::rbeta(1, 1, 5)
}
for (i in 1:n){
currentClusterLabel <- clusterLabels[i]
y[i] <- stats::rbinom(1, 1, yParams[currentClusterLabel])
}
}
#Create an "annotation row" to show the cluster label for each person
annotationRow <- data.frame(
Cluster = factor(clusterLabels)
)
#Create data frame for outcome for each person if needed
if (yout == TRUE){
y <- data.frame(
outcome = factor(y)
)
}
#We require the data, outcome and annotation row to have the same rownames:
rownames(annotationRow) <- rownames(dataMatrix) <- paste0("Person", seq(1,n))
if(yout == TRUE){
rownames(y) <- rownames(dataMatrix)
}
if(yout == TRUE){
return(list(data = dataMatrix, trueClusters = annotationRow, outcome = y))
} else{
return(list(data = dataMatrix, trueClusters = annotationRow))
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.