devtools::load_all()
path <- "C:/Users/pcuser/OneDrive/Students/VARIOUS/Georgia Charalampous/Data/employee_satisfaction_data.xlsx"
path <- "SampleData/sample_spss.sav"
temp <- read_file(path)
# nms <- names(temp)
#
# new_names <- nms %>% stringr::str_trunc(10)
#
# names(temp) %>%
# stringr::str_trunc(20)
# nchar()
#
# names(temp) <- make.unique(new_names)
#
# temp %>%
# #select(all_of(vars_cluster())) %>%
# purrr::keep(is.numeric) %>%
# mutate(across(where(is.numeric), scale2, na.rm = TRUE))
# mutate_if(is.numeric, ~ scale2(., na.rm = TRUE)) %>%
# as.data.frame()
#
#
#
#
# make.names(new_names, unique = TRUE)
#
# stringr::str_
# readxl::read_xlsx(path, .name_repair = "universal")
#
# safe_read <- purrr::safely(read_file)
#
# safe_read("SampleData/sample_tbl_silhouette.rds")
lab_list <- get_var_labels(temp)
vars_for_cluster <- c("bill_length_mm", "bill_depth_mm", "flipper_length_mm")
# vars_for_cluster <- c("bill_length_mm", "bill_depth_mm", "flipper_length_mm")
#
# labelled::var_label(temp) <- NULL
#
# purrr::map(names(temp), function(x){
#
# attr(temp[[x]], "label") <<- lab_list[[x]]
#
# })
dta <-
temp %>%
slice_sample(n = 30) %>%
# add a row with missing values
add_row(species = "Adelie") %>%
arrange(species) %>%
tibble::rowid_to_column(".rowid") %>%
as.data.frame()
#purrr::keep(is.numeric) %>%
#na.omit()
dta_clust <-
dta %>%
select(all_of(vars_for_cluster)) %>%
mutate_if(is.numeric, ~ scale2(., na.rm = TRUE)) %>%
as.data.frame() %>%
na.omit()
# dissimilarity ----
diss_matrix <- calc_diss_matrix( dta_clust )
class(diss_matrix)
str(diss_matrix)
sum(is.na(diss_matrix))
# k means ----
res_kmeans <- kmeans(dta_clust, 3, nstart = 25)
class(res_kmeans)
res_kmeans$centers %>% dimnames() %>% .[[2]]
# k meds ----
res_pam <- cluster::pam(diss_matrix, k = 3)
class(res_pam)
res_pam$clustering %>% length()
plot(res_pam)
get_pam_cluster_indx(
dta,
res_pam$clustering
)
# Hierarchical ----
res_agnes <- cluster::agnes(diss_matrix)
class(res_agnes)
# add a cluster- a named vector
ids <- attr(diss_matrix, "Labels")
res_agnes$cluster <- stats::cutree(res_agnes, 3) %>% setNames(ids)
nrow(dta_clust)
cluster_stats <- fpc::cluster.stats(
d = diss_matrix,
clustering = res_kmeans$cluster)
as_tbl_sep_matrix(cluster_stats$separation.matrix)
gg_separation_matrix(cluster_stats$separation.matrix)
length(unique(dim(cluster_stats$separation.matrix))) == 1
cluster_stats$separation.matrix %>% gg_separation_matrix()
matrix(
c(0, 1.06, 2.4,
1.06, 0, 2.03,
2.4, 2.03, 0),
nrow = 3, ncol = 3
) %>%
gg_separation_matrix()
# Silhouette are defined for 2 <= k <= n-1; k-cluster, n subjects
# cluster::silhouette(res_agnes$cluster, diss_matrix)
#
# get_sil_widths(res_agnes, diss_matrix)
## silhouette widths ---
res_kmeans$silhouette <- get_sil_widths(res_kmeans, diss_matrix)
tbl_sil <- get_sil_widths(res_kmeans, diss_matrix)
dta %>%
left_join(tbl_sil, by = ".rowid", suffix = c("_varOfDF", ""))
dta_updt <-
dta %>%
add_silhouette(tbl_sil)
glimpse(dta_updt)
library(ggplot2)
dta_updt %>%
select(vars_for_cluster, cluster) %>%
plot_density()
# Aggregated Tbl silhouette
# a list of statistics for silhouette summaries
silhouette_summary = list(
size = ~n(),
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
median = ~median(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE)
)
res_sil <- get_sil_widths(res_pam, diss_matrix)
by_group <-
res_sil %>%
group_by(cluster = as.character(cluster)) %>%
summarise(
across(sil_width, silhouette_summary, .names = "{.fn}")
) %>%
ungroup() %>%
mutate(prop = size/sum(size), .after = size)
overall <-
res_sil %>%
summarise(
across(sil_width, silhouette_summary, .names = "{.fn}")
) %>%
mutate(cluster = "ALL", prop = 1)
bind_rows(by_group, overall) %>%
xtable::xtable() %>% print(hline.after = c(3), type = "html")
flextable::flextable()
res_pam$clustering %>%
as.character() %>%
forcats::fct_count(prop = TRUE) %>%
transmute("cluster" = as.character(f), "size" = n, "prop" = p) %>%
left_join(by_group, by = "cluster") %>%
bind_rows(overall)
# Dendrogram
library(dendextend)
dendro <- as.dendrogram(res_agnes)
plot(dendro)
summary(dendro)
labels(dendro)
ord <- res_agnes$order
lab <- res_agnes$order.lab %>% as.numeric()
dta[ord,]
dta[lab,]
new_labels <- dta$species[as.numeric(res_agnes$order.lab)]
dendro %>%
# set("leaves_pch", c(19, 19, NA)) %>%
# set("leaves_cex", c(1, 2)) %>%
set("labels_colors", value = c("skyblue", "orange", "grey"), k=3) %>%
set("branches_k_color", value = c("skyblue", "orange", "grey"), k = 3) %>%
set("leaves_pch", 19) %>%
set("nodes_cex", 0.7) %>%
set("labels", new_labels) %>%
plot(horiz=TRUE, axes=FALSE)
#' Check the selected variables for k means clustering
#'
#' This function does a validation check if all the variables
#' the user selected for K-means, conform to some rules
#' i.e. at least one is numeric, and if there are any
#'
check_vars_for_kmeans <- function(dta, vars){
stopifnot(inherits(dta, "data.frame"))
stopifnot(is.character(vars))
stopifnot(length(vars) > 0)
stopifnot(all(vars %in% names(dta)))
list(
number <- any(sapply(dta[, vars, drop = FALSE], is.numeric))
)
}
sapply(mtcars, is.numeric)
mtcars %>% get_vars_of_type("factor")
names(iris)[vapply(iris, is.numeric, logical(1))]
check_vars_for_kmeans(penguins, c("island", "bill_length_mm") )
penguins[, c("island", "bill_length_mm")]
mtcars[, c("hp"), drop = FALSE]
dta <-
mtcars %>%
as_tibble() %>%
select(cyl, disp, wt) %>%
slice_head(n = 15)
dta_clst <- dta
dta_clst$cyl[sample(15,2)] <- NA
dta_clst$disp[sample(15,2)] <- NA
res <- kmeans(na.omit(dta_clst), 2)
res$cluster
# complete <- complete.cases(dta_clst)
#
# dta_clst$cluster <- NA_integer_
#
# dta_clst$cluster[which(complete)] <- res$cluster
#
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.