#'
#' @title Needs editing
#' @description Needs editing
#' @details Needs editing
#' @param df is a string character of the data set
#' @param num.clust specifies the number of clusters for the computation
#' @param vbleSelec specifies the max. number of iterations allowed
#' @param crit.varsel relates to the number of random sets if clusters is a number and not a set of initial cluster centers
#' @param initModel refers to the algorithm of calculating the kmeans and can be either 'Hartigan-Wong', 'Lloyd', 'Forgy' or 'MacQueen'
#' @param nbcores is the name of the new object which is created with this function
#' @param nbSmall is a logical or integer specifying whether tracing information on the progress of the algorithm is procuded for the Hartigan-Wong algorithm
#' @param iterSmall is a logical or integer specifying whether tracing information on the progress of the algorithm is procuded for the Hartigan-Wong algorithm
#' @param nbKeep is a logical or integer specifying whether tracing information on the progress of the algorithm is procuded for the Hartigan-Wong algorithm
#' @param iterKeep is a logical or integer specifying whether tracing information on the progress of the algorithm is procuded for the Hartigan-Wong algorithm
#' @param tolKeep represents the number at which point two successive models are defined to be converged; default is 1e-7
#' @import VarSelLCM
#' @import dplyr
#' @export
#'
varSelLcmSingleDS1 <- function(df, num.clust, vbleSelec, crit.varsel, initModel, nbcores, nbSmall, iterSmall, nbKeep, iterKeep, tolKeep){
df <- eval(parse(text=df), envir = parent.frame())
# Cuts the tree
results <- VarSelLCM::VarSelCluster(x = df,
gvals = num.clust,
vbleSelec = vbleSelec,
crit.varsel = crit.varsel,
initModel = initModel,
nbcores = nbcores,
nbSmall = nbSmall,
iterSmall = iterSmall,
nbKeep = nbKeep,
iterKeep = iterKeep,
tolKeep = tolKeep)
categories <- sapply(df, is.factor)
colNum <- ncol(df)
results_values <- fitted(results)
modelResults <- data.frame(df, results_values)
initialResults_Mean <- modelResults %>%
group_by(results_values) %>%
mutate(across(-any_of(colnames(df[ ,categories])), as.numeric)) %>%
summarise(across(-any_of(colnames(df[ ,categories])), ~mean(.x, na.rm = TRUE))) %>%
rename_with(~ paste0("Mean_X_", .), -results_values)
initialResults_SD <- modelResults %>%
group_by(results_values) %>%
mutate(across(-any_of(colnames(df[ ,categories])), as.numeric)) %>%
summarise(across(-any_of(colnames(df[ ,categories])), ~sd(.x, na.rm = TRUE))) %>%
rename_with(~ paste0("SD_X_", .), -results_values)
observations_clusters <- modelResults %>%
group_by(results_values) %>%
summarise(Observations = n())
disclosure_risk_numeric <- observations_clusters
initialResults <- data.frame(initialResults_Mean, initialResults_SD, observations_clusters)
cols <- colnames(df[, categories])
cols_levels <- list()
for (i in seq_along(cols)){
cols_levels[[i]] <- levels(df[[cols[i]]])
}
cat_names <- c()
count <- 1
for (p in seq_along(cols_levels)){
for (k in 1:length(cols_levels[[p]])){
cat_names[count] <- paste0("CAT_X_", cols[p], "_X_", cols_levels[[p]][k])
count <- count + 1
}
}
#### using smart round to account for NAs in dataset for factor lengths below
smart.round <- function(x) {
y <- floor(x)
indices <- tail(order(x-y), round(sum(x)) - sum(y))
y[indices] <- y[indices] + 1
y
}
for (i in 1:length(initialResults$results_values)){
for (p in 1:length(cat_names)){
initialResults[[cat_names[p]]][i] <- length(modelResults[ , strsplit(cat_names[p], "_X_")[[1]][2]][which(modelResults$results_values == initialResults$results_values[i] & modelResults[ , strsplit(cat_names[p], "_X_")[[1]][2]] == as.numeric(strsplit(cat_names[p], "_X_")[[1]][3]))])
}
}
disclosure_risk_factors <- initialResults[, cat_names]
probabilities <- initialResults[, cat_names]
a <- strsplit(cat_names, "_X_")
storing_length <- c()
missings <- c()
for (cc in 1:dim(probabilities)[1]){
for (yy in 1:length(cols)){
sum_factor <- 0
for (i in 1:length(a)){
if(a[[i]][2] == cols[yy]){
sum_factor <- initialResults[[cat_names[i]]][cc] + sum_factor
}
}
storing_length[yy] <- sum_factor
missings[yy] <- initialResults$Observations[cc] - sum_factor
indices_vector <- c()
length_indices_vector <- 0
for (k in 1:length(a)){
if(a[[k]][2] == cols[yy]){
probabilities[[cat_names[k]]][cc] <- (probabilities[[cat_names[k]]][cc] / storing_length[yy]) * missings[yy]
indices_vector[length_indices_vector+1] <- k
length_indices_vector <- length(indices_vector)
}
}
probabilities[indices_vector][cc,] <- t(smart.round(t(probabilities[indices_vector][cc,])))
}
}
initialResults[cat_names] <- initialResults[cat_names] + probabilities[cat_names]
outcome <- initialResults
#### Disclosure Risk Testing for clusters in general & factor levels
cell_count_threshold <- dsBase::listDisclosureSettingsDS()
nfilter.tab <- as.numeric(cell_count_threshold$nfilter.tab)
invalid_clusters <- (sum(disclosure_risk_numeric$Observations < nfilter.tab &
disclosure_risk_numeric$Observations > 0)>=1)
if(invalid_clusters){
stop(paste0("Cluster creation caused one cluster to have between 1 and ", nfilter.tab-1, " observations."))
}
invalid_clusters_factors <- (sum(disclosure_risk_factors < nfilter.tab &
disclosure_risk_factors > 0)>=1)
if(invalid_clusters_factors){
stop(paste0("Cluster creation caused one categorical variable to have between 1 and ", nfilter.tab-1, " observations in one cluster."))
}
# Assigns the resulting vector with the cluster numbers to the server side
#### should be initial Results but for testing purposes is modelResults
return(results_values)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.