R/kmeans.R

Defines functions exp_kmeans iterate_kmeans

# internal function to iterate number of centers (k) from 1 to max_centers for elbow method to find optimal k.
iterate_kmeans <- function(df, max_centers = 10,
                           iter.max = 10,
                           nstart = 1,
                           algorithm = "Hartigan-Wong",
                           trace = FALSE,
                           normalize_data = TRUE,
                           seed = NULL
                           ) {
  # Limit the numbers of centers to search up to nrow(df) - 1.
  # Otherwise we will get "Centers should be less than rows" error.
  n_centers <- seq(min(max_centers, nrow(df) - 1))
  ret <- data.frame(center = n_centers)
  ret <- ret %>% dplyr::mutate(model = purrr::map(center, function(x) {
    model_df <- df %>% build_kmeans.cols(everything(),
                                         centers=x,
                                         iter.max = iter.max,
                                         nstart = nstart,
                                         algorithm = algorithm,
                                         trace = trace,
                                         normalize_data = normalize_data,
                                         seed=seed,
                                         keep.source=FALSE,
                                         augment=FALSE,
                                         na.rm = FALSE) # NA filtering is already done. Skip it to save time. 
    ret <- model_df$model[[1]]
    ret
  }))
  ret %>% rowwise(center) %>% glance_rowwise(model)
}

#' analytics function for K-means view
#' @export
exp_kmeans <- function(df, ...,
                       centers=3, # build_kmeans.cols arguments.
                       iter.max = 10,
                       nstart = 1,
                       algorithm = "Hartigan-Wong",
                       trace = FALSE,
                       normalize_data = TRUE,
                       max_nrow = NULL,
                       seed = 1,
                       elbow_method_mode=FALSE,
                       max_centers = 10
                       ) {
  # this evaluates select arguments like starts_with
  selected_cols <- tidyselect::vars_select(names(df), !!! rlang::quos(...))

  grouped_cols <- grouped_by(df)

  # Set seed just once.
  if(!is.null(seed)) { # Set seed before starting to call sample_n.
    set.seed(seed)
  }

  # list and difftime etc. causes error in tidy_rowwise(model, type="biplot").
  # For now, we are removing them upfront.
  df <- df %>% dplyr::select(-where(is.list),
                             -where(lubridate::is.difftime),
                             -where(lubridate::is.duration),
                             -where(lubridate::is.interval),
                             -where(lubridate::is.period))

  sampled_nrow <- NULL
  if (!is.null(max_nrow) && nrow(df) > max_nrow) {
    # Record that sampling happened.
    sampled_nrow <- max_nrow
    df <- df %>% sample_rows(max_nrow)
  }

  # As the name suggests, this preprocessing function was originally designed to be done
  # before sampling, but we found that for this k-means function, that makes the
  # process as a whole slower in the cases we tried. So, we are doing this after sampling.
  nrow_before_filter <- nrow(df)
  filtered_df <- preprocess_factanal_data_before_sample(df, selected_cols)
  excluded_nrow <- nrow_before_filter - nrow(filtered_df)
  selected_cols <- attr(filtered_df, 'predictors') # predictors are updated (removed) in preprocess_factanal_data_before_sample. Sync with it.
  df <- filtered_df

  if (!elbow_method_mode) {
    kmeans_model_df <- df %>% build_kmeans.cols(!!!rlang::syms(selected_cols),
                                                centers = centers,
                                                iter.max = iter.max,
                                                nstart = nstart,
                                                algorithm = algorithm,
                                                trace = trace,
                                                normalize_data = normalize_data,
                                                keep.source = FALSE,
                                                augment = FALSE,
                                                seed = NULL, # Seed is already done. Skip it.
                                                na.rm = FALSE) # NA filtering is already done. Skip it to save time. 
  }

  if (!elbow_method_mode) {
    # This is about how UI-side is done, but it can handle single column case, only if it is single column from the beginnig.
    # Check that and pass that info to do_prcomp() as allow_single_column.
    allow_single_column <- length(selected_cols) == 1
    ret <- do_prcomp(df, normalize_data = normalize_data, allow_single_column = allow_single_column, seed = NULL,
                     na.rm = FALSE, # Skip NA filtering since it is already done.
                     !!!rlang::syms(selected_cols))
    ret <- dplyr::ungroup(ret) # ungroup once so that the following mutate with purrr::map2 works.
    ret <- ret %>% dplyr::mutate(model = purrr::map2(model, !!kmeans_model_df$model, function(x, y) {
      x$kmeans <- y # Might need to be more careful on guaranteeing x and y are from same group, but we are not supporting group_by on UI at this point.
      x$sampled_nrow <- sampled_nrow
      x$excluded_nrow <- excluded_nrow
      x
    }))
  }
  else {
    kmeans_df <- df %>% dplyr::select(!!!rlang::syms(selected_cols))
    ret <- iterate_kmeans(kmeans_df,
                          max_centers = max_centers,
                          iter.max = iter.max,
                          nstart = nstart,
                          algorithm = algorithm,
                          trace = trace,
                          normalize_data = normalize_data,
                          seed=NULL) # Seed is already done in do_prcomp. Skip it.
    ret <- tibble::tibble(model = list(ret)) # Follow current output format for now. TODO: Revisit and support group_by.
  }
  # Rowwise grouping has to be redone with original grouped_cols, so that summarize(tidy(model)) later can add back the group column.
  if (length(grouped_cols) > 0) {
    ret <- ret %>% dplyr::rowwise(grouped_cols)
  } else {
    ret <- ret %>% dplyr::rowwise()
  }
  ret
}
exploratory-io/exploratory_func documentation built on April 23, 2024, 9:15 p.m.