#' Score a Kmeans Clustering Based on TF-IDF scores
#'
#' Generate a kmeans clustering for each of n_topics and
#' score those clusterings using the scores in
#' \code{clues::adjustedRand} function
#'
#' @param A list of integers for each value k to cluster
#' @param param from_cache Logical : Return cached values if true
#' @return A dataframe with columns k, score, metric
#' @export
score_kmeans <- function(n_topics, from_cache = FALSE) {
if(from_cache) {
readRDS(here::here("inst", "data", "kmeans_scores.RDS"))
} else {
if(!exists("dtm") | !exists("total_words")) {
cat("Loading table 'set_colors'")
load_csv(sample_data = FALSE)
load_csv(sample_data = FALSE)
}
tf_idf <- tm::weightTfIdf(m = dtm, normalize = TRUE)
tf_idf_mat <- as.matrix(tf_idf)
tf_idf_norm <- tf_idf_mat / apply(tf_idf_mat, 1, function(x) sum(x ^ 2) ^ 0.5)
kmclust <- n_topics %>%
parallel::mclapply(.,
function(x) kmeans(
tf_idf_norm,
centers = x,
iter.max = 50
),
mc.cores = (parallel::detectCores() - 1)
)
kmdf <- tibble(k = list(n_topics), model = kmclust)
kmtidy <- kmdf %>%
mutate(km = purrr:::map(model, broom::tidy))
km_rand <- apply(
kmtidy, 1,
function(x) clues::adjustedRand(x$model$cluster, total_words$root_id)
) %>%
t() %>%
data.frame()
km_rand$k <- n_topics
km_rand <- km_rand %>%
tidyr::gather(key = metric, value = score, -k)
saveRDS(km_rand, here::here("inst", "data", "kmeans_scores.RDS"))
km_rand
} # end else
}
#' Retrieve Topic Numbers
#'
#' Returns a list of topic numbers from LDA models
#'
#' @param models A list of \code{topicmodels::LDA} models
#' @return A list integers
#' @export
get_topic_numbers <- function(models){
purrr::map_int(models, ~.x@k)
}
#' Cluster LDA Models
#'
#' @param A list of LDA models
#' @return A tibble with k, score and metric used
#' @export
get_lda_clusters <- function(lda_models){
# Assign sets topic which has maximum value for that set
top_cluster <- function(x){
apply(topicmodels::posterior(x)$topics, 1, which.max)
}
tibble(clust = purrr::map(lda_models, top_cluster),
k = get_topic_numbers(lda_models))
}
#' Score LDA Clusters
#'
#' Scores LDA clusters based on \code{clues::adjustedRand}
#'
#' @param lda_clusters : A tibble generated by \code{get_lda_clusters}
#' @param total_words : A dataframe generated by \code{load_color_tables}
#' @return return
#' @export
score_lda <- function(lda_clusters, total_words) {
lda_rand <- lda_clust %>%
mutate(score = map(clust, ~clues::adjustedRand(.x, total_words$root_id))) %>%
mutate(metric = map(score, names))
tidyr::unnest(lda_rand, score, metric)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.