R/cluster-scores.R

#' Score a Kmeans Clustering Based on TF-IDF scores
#'
#' Generate a kmeans clustering for each of n_topics and
#' score those clusterings using the scores in 
#' \code{clues::adjustedRand} function
#'
#' @param A list of integers for each value k to cluster
#' @param param from_cache Logical : Return cached values if true
#' @return A dataframe with columns k, score, metric
#' @export
score_kmeans <- function(n_topics, from_cache = FALSE) {
  if(from_cache) {
    readRDS(here::here("inst", "data", "kmeans_scores.RDS"))
  } else {
    if(!exists("dtm") | !exists("total_words")) {
      cat("Loading table 'set_colors'")
      load_csv(sample_data = FALSE)
      load_csv(sample_data = FALSE)
    }

    tf_idf <- tm::weightTfIdf(m = dtm, normalize = TRUE)
    tf_idf_mat <- as.matrix(tf_idf)

    tf_idf_norm <- tf_idf_mat / apply(tf_idf_mat, 1, function(x) sum(x ^ 2) ^ 0.5)

    kmclust <- n_topics %>%
      parallel::mclapply(., 
        function(x) kmeans(
          tf_idf_norm,
          centers = x,
          iter.max = 50
        ),
        mc.cores = (parallel::detectCores() - 1)
      ) 

    kmdf <- tibble(k = list(n_topics), model = kmclust)
    kmtidy <- kmdf %>%
      mutate(km = purrr:::map(model, broom::tidy))

    km_rand <- apply(
      kmtidy, 1,
      function(x) clues::adjustedRand(x$model$cluster, total_words$root_id)
    ) %>%
      t() %>%
      data.frame()

    km_rand$k <- n_topics

    km_rand <- km_rand %>%
      tidyr::gather(key = metric, value = score, -k)
    saveRDS(km_rand, here::here("inst", "data", "kmeans_scores.RDS"))
    km_rand
  } # end else
} 



#' Retrieve Topic Numbers
#'
#' Returns a list of topic numbers from LDA models
#'
#' @param models A list of \code{topicmodels::LDA} models
#' @return A list integers
#' @export
get_topic_numbers <- function(models){
  purrr::map_int(models, ~.x@k)
}

#' Cluster LDA Models
#'
#' @param A list of LDA models 
#' @return A tibble with k, score and metric used
#' @export
get_lda_clusters <- function(lda_models){
  # Assign sets topic which has maximum value for that set 
  top_cluster <- function(x){ 
    apply(topicmodels::posterior(x)$topics, 1, which.max)
  }
  tibble(clust = purrr::map(lda_models, top_cluster), 
    k = get_topic_numbers(lda_models))
}

 
#' Score LDA Clusters
#'
#' Scores LDA clusters based on \code{clues::adjustedRand}
#'
#' @param lda_clusters : A tibble generated by \code{get_lda_clusters}
#' @param total_words : A dataframe generated by \code{load_color_tables}
#' @return return
#' @export
score_lda <- function(lda_clusters, total_words) {
  lda_rand <- lda_clust %>% 
    mutate(score = map(clust, ~clues::adjustedRand(.x, total_words$root_id))) %>% 
    mutate(metric = map(score, names)) 

  tidyr::unnest(lda_rand, score, metric)
}
nateaff/legolda documentation built on May 18, 2019, 10:15 a.m.