#'
#' @title Needs editing
#' @description Needs editing
#' @details Needs editing
#' @param df is a string character of the data set
#' @import VarSelLCM
#' @import dplyr
#' @import truncnorm
#' @import clusterSim
#' @import dsBase
#' @export
#'
varSelLcmDS3 <- function(df){
df <- eval(parse(text=df), envir = parent.frame())
cluster_pre <- eval(parse(text="cluster_pre"), envir = parent.frame())[[1]]
FinalResults <- eval(parse(text="cluster_pre"), envir = parent.frame())[[2]]
results_values_final <- cluster_pre
results_values <- results_values_final[1:nrow(df)]
categories <- sapply(df, is.factor)
colNum <- ncol(df)
modelResults <- data.frame(df, results_values)
initialResults_Mean <- modelResults %>%
group_by(results_values) %>%
mutate(across(-any_of(colnames(df[ ,categories])), as.numeric)) %>%
summarise(across(-any_of(colnames(df[ ,categories])), ~mean(.x, na.rm = TRUE))) %>%
rename_with(~ paste0("Mean_X_", .), -results_values)
observations_clusters <- modelResults %>%
group_by(results_values) %>%
summarise(Observations = n())
initialResults <- data.frame(initialResults_Mean,
observations_clusters)
disclosure_risk_numeric <- observations_clusters
# stop("after init")
cols <- colnames(df[, categories])
cols_levels <- list()
for (i in seq_along(cols)){
cols_levels[[i]] <- levels(df[[cols[i]]])
}
cat_names <- c()
count <- 1
for (p in seq_along(cols_levels)){
for (k in 1:length(cols_levels[[p]])){
cat_names[count] <- paste0("CAT_X_", cols[p], "_X_", cols_levels[[p]][k])
count <- count + 1
}
}
#### using smart round to account for NAs in dataset for factor lengths below
smart.round <- function(x) {
y <- floor(x)
indices <- tail(order(x-y), round(sum(x)) - sum(y))
y[indices] <- y[indices] + 1
y
}
for (i in 1:length(initialResults$results_values)){
for (p in 1:length(cat_names)){
initialResults[[cat_names[p]]][i] <- length(modelResults[ , strsplit(cat_names[p], "_X_")[[1]][2]][which(modelResults$results_values == initialResults$results_values[i] & modelResults[ , strsplit(cat_names[p], "_X_")[[1]][2]] == as.numeric(strsplit(cat_names[p], "_X_")[[1]][3]))])
}
}
disclosure_risk_factors <- initialResults[, cat_names]
probabilities <- initialResults[, cat_names]
a <- strsplit(cat_names, "_X_")
storing_length <- c()
missings <- c()
for (cc in 1:dim(probabilities)[1]){
for (yy in 1:length(cols)){
sum_factor <- 0
for (i in 1:length(a)){
if(a[[i]][2] == cols[yy]){
sum_factor <- initialResults[[cat_names[i]]][cc] + sum_factor
}
}
storing_length[yy] <- sum_factor
missings[yy] <- initialResults$Observations[cc] - sum_factor
indices_vector <- c()
length_indices_vector <- 0
for (k in 1:length(a)){
if(a[[k]][2] == cols[yy]){
probabilities[[cat_names[k]]][cc] <- (probabilities[[cat_names[k]]][cc] / storing_length[yy]) * missings[yy]
indices_vector[length_indices_vector+1] <- k
length_indices_vector <- length(indices_vector)
}
}
probabilities[indices_vector][cc,] <- t(smart.round(t(probabilities[indices_vector][cc,])))
}
}
initialResults[cat_names] <- initialResults[cat_names] + probabilities[cat_names]
#### test for number of entries at this point for nfilter tab
initialResults <- initialResults %>%
mutate(across(all_of(cat_names), ~.x / Observations)) %>%
select(-all_of(c("results_values.1", "Observations")))
#stop("before dplyr")
# data_db <- df %>%
# mutate(across(where(is.factor), as.character)) %>%
# mutate(across(where(is.character), as.numeric))
#
# #stop("before DB")
#
# value_DB <- clusterSim::index.DB(x = data_db,
# cl = cluster_pre)[[1]]
outcome <- list(initialResults,
FinalResults@model@names.irrelevant,
FinalResults@criteria@discrim,
FinalResults@criteria@loglikelihood,
FinalResults@criteria@AIC,
FinalResults@criteria@BIC,
FinalResults@criteria@ICL)
#### Disclosure Risk Testing for clusters in general & factor levels
cell_count_threshold <- dsBase::listDisclosureSettingsDS()
nfilter.tab <- as.numeric(cell_count_threshold$nfilter.tab)
invalid_clusters <- (sum(disclosure_risk_numeric$Observations < nfilter.tab &
disclosure_risk_numeric$Observations > 0)>=1)
if(invalid_clusters){
stop(paste0("Final cluster creation caused one cluster to have between 1 and ", nfilter.tab-1, " observations."))
}
invalid_clusters_factors <- (sum(disclosure_risk_factors < nfilter.tab &
disclosure_risk_factors > 0)>=1)
if(invalid_clusters_factors){
stop(paste0("Final cluster creation caused one categorical variable to have between 1 and ", nfilter.tab-1, " observations in one cluster."))
}
return(outcome)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.