R/kmean-funcs.R

Defines functions hai_kmeans_scree_plt hai_kmeans_scree_data_tbl hai_kmeans_mapped_tbl hai_kmeans_tidy_tbl hai_kmeans_obj hai_kmeans_user_item_tbl

Documented in hai_kmeans_mapped_tbl hai_kmeans_obj hai_kmeans_scree_data_tbl hai_kmeans_scree_plt hai_kmeans_tidy_tbl hai_kmeans_user_item_tbl

#' K-Means User Item Tibble
#'
#' @family Kmeans
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @description
#' Takes in a data.frame/tibble and transforms it into an aggregated/normalized
#' user-item tibble of proportions. The user will need to input the parameters
#' for the rows/user and the columns/items.
#'
#' @details
#' This function should be used before using a k-mean model. This is
#' commonly referred to as a user-item matrix because "users" tend to be on the
#' rows and "items" (e.g. orders) on the columns. You must supply a column that
#' can be summed for the aggregation and normalization process to occur.
#'
#' @param .data The data that you want to transform
#' @param .row_input The column that is going to be the row (user)
#' @param .col_input The column that is going to be the column (item)
#' @param .record_input The column that is going to be summed up for the aggregation
#' and normalization process.
#'
#' @examples
#' library(healthyR.data)
#' library(dplyr)
#'
#' data_tbl <- healthyR_data %>%
#'   filter(ip_op_flag == "I") %>%
#'   filter(payer_grouping != "Medicare B") %>%
#'   filter(payer_grouping != "?") %>%
#'   select(service_line, payer_grouping) %>%
#'   mutate(record = 1) %>%
#'   as_tibble()
#'
#' hai_kmeans_user_item_tbl(
#'   .data = data_tbl,
#'   .row_input = service_line,
#'   .col_input = payer_grouping,
#'   .record_input = record
#' )
#'
#' @return
#' A aggregated/normalized user item tibble
#'
#' @export
#'

hai_kmeans_user_item_tbl <- function(.data, .row_input, .col_input, .record_input) {

  # * Tidyeval ----
  row_input_var_expr <- rlang::enquo(.row_input)
  col_input_var_expr <- rlang::enquo(.col_input)
  rec_input_var_expr <- rlang::enquo(.record_input)

  # * Checks ----
  if (!is.data.frame(.data)) {
    stop(call. = FALSE, "(.data) is not a data.frame/tibble. Please supply.")
  }

  if (rlang::quo_is_missing(row_input_var_expr)) {
    stop(call. = FALSE, "You must supply a user/row input.")
  }

  if (rlang::quo_is_missing(col_input_var_expr)) {
    stop(call. = FALSE, "You must supply an item/column input")
  }

  if (rlang::quo_is_missing(rec_input_var_expr)) {
    stop(call. = FALSE, "You must supply a record column that can be summed up
             for the aggregation and normalization process.")
  }

  # * Data ----
  data_tbl <- tibble::as_tibble(.data)

  # * Manipulate ----
  data_summarized_tbl <- data_tbl %>%
    dplyr::group_by({{ row_input_var_expr }}, {{ col_input_var_expr }}) %>%
    dplyr::summarise(total_records = sum({{ rec_input_var_expr }}, na.rm = TRUE)) %>%
    dplyr::ungroup() %>%
    # Normalize proportions
    dplyr::group_by({{ row_input_var_expr }}) %>%
    dplyr::mutate(prop_of_total = total_records / sum(total_records)) %>%
    dplyr::ungroup()

  # User/Item format
  user_item_tbl <- data_summarized_tbl %>%
    dplyr::select({{ row_input_var_expr }}, {{ col_input_var_expr }}, prop_of_total) %>%
    dplyr::mutate(prop_of_total = base::ifelse(
      base::is.na(prop_of_total), 0, prop_of_total
    )) %>%
    tidyr::pivot_wider(
      names_from = {{ col_input_var_expr }},
      values_from = prop_of_total,
      values_fill = list(prop_of_total = 0)
    )

  # * Return ----
  return(user_item_tbl)
}

#' @rdname hai_kmeans_user_item_tbl
#' @export
kmeans_user_item_tbl <- hai_kmeans_user_item_tbl


#' K-Means Object
#'
#' @family Kmeans
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @description
#' Takes the output of the [hai_kmeans_user_item_tbl()] function and applies the
#' k-means algorithm to it using [stats::kmeans()]
#'
#' @details Uses the [stats::kmeans()] function and creates a wrapper around it.
#'
#' @param .data The data that gets passed from [hai_kmeans_user_item_tbl()]
#' @param .centers How many initial centers to start with
#'
#' @examples
#' library(healthyR.data)
#' library(dplyr)
#'
#' data_tbl <- healthyR_data %>%
#'   filter(ip_op_flag == "I") %>%
#'   filter(payer_grouping != "Medicare B") %>%
#'   filter(payer_grouping != "?") %>%
#'   select(service_line, payer_grouping) %>%
#'   mutate(record = 1) %>%
#'   as_tibble()
#'
#' hai_kmeans_user_item_tbl(
#'   .data = data_tbl,
#'   .row_input = service_line,
#'   .col_input = payer_grouping,
#'   .record_input = record
#' ) %>%
#'   hai_kmeans_obj()
#'
#' @return
#' A stats k-means object
#'
#' @export
#'

hai_kmeans_obj <- function(.data, .centers = 5) {

  # * Tidyeval ----
  centers_var_expr <- .centers

  # * Checks ----
  if (!is.data.frame(.data)) {
    stop(call. = FALSE("(.data) is missing. Please supply."))
  }

  # Default to 5
  if (is.null(centers_var_expr)) {
    centers_var_expr <- 5
  }

  # * Data ----
  data <- tibble::as_tibble(.data)

  # * k-means ----
  kmeans_tbl <- data %>%
    dplyr::select(-1)

  kmeans_obj <- kmeans_tbl %>%
    stats::kmeans(
      centers = centers_var_expr,
      nstart = 100
    )

  return(kmeans_obj)
}

#' @rdname hai_kmeans_obj
#' @export
kmeans_obj <- hai_kmeans_obj

#' K-Means Object Tidy Functions
#'
#' @family Kmeans
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @description
#' K-Means tidy functions
#'
#' @details
#' Takes in a k-means object and its associated user item tibble and then
#' returns one of the items asked for. Either: [broom::tidy()], [broom::glance()]
#' or [broom::augment()]. The function defaults to [broom::tidy()].
#'
#' @param .kmeans_obj A [stats::kmeans()] object
#' @param .tidy_type "tidy","glance", or "augment"
#' @param .data The user item tibble created from [hai_kmeans_user_item_tbl()]
#'
#' @examples
#' library(healthyR.data)
#' library(dplyr)
#' library(broom)
#'
#' data_tbl <- healthyR_data %>%
#'   filter(ip_op_flag == "I") %>%
#'   filter(payer_grouping != "Medicare B") %>%
#'   filter(payer_grouping != "?") %>%
#'   select(service_line, payer_grouping) %>%
#'   mutate(record = 1) %>%
#'   as_tibble()
#'
#' uit_tbl <- hai_kmeans_user_item_tbl(
#'   .data = data_tbl,
#'   .row_input = service_line,
#'   .col_input = payer_grouping,
#'   .record_input = record
#' )
#'
#' km_obj <- hai_kmeans_obj(uit_tbl)
#'
#' hai_kmeans_tidy_tbl(
#'   .kmeans_obj = km_obj,
#'   .data = uit_tbl,
#'   .tidy_type = "augment"
#' )
#'
#' hai_kmeans_tidy_tbl(
#'   .kmeans_obj = km_obj,
#'   .data = uit_tbl,
#'   .tidy_type = "glance"
#' )
#'
#' hai_kmeans_tidy_tbl(
#'   .kmeans_obj = km_obj,
#'   .data = uit_tbl,
#'   .tidy_type = "tidy"
#' ) %>%
#'   glimpse()
#'
#' @return
#' A tibble
#'
#' @export
#'

hai_kmeans_tidy_tbl <- function(.kmeans_obj, .data, .tidy_type = "tidy") {

  # * Tidyeval ----
  kmeans_obj <- .kmeans_obj
  tidy_type <- .tidy_type

  # * Checks ----
  if (!is.data.frame(.data)) {
    stop(call. = FALSE, "(.user_item_data) is not a data.frame/tibble, please supply original user item tibble.")
  }

  if (!inherits(x = kmeans_obj, what = "kmeans")) {
    stop(call. = FALSE, "(.kmeans_obj) is not of class 'kmeans'")
  }

  if (!tidy_type %in% c("tidy", "augment", "glance")) {
    stop(
      call. = FALSE,
      "(.tidy_type) must be either tidy, glance, or augment"
    )
  }

  # * Manipulate ----
  uit_tbl <- tibble::as_tibble(.data)
  row_col <- colnames(uit_tbl[1])

  if (tidy_type == "tidy") {
    km_tbl <- kmeans_obj %>% broom::tidy()
  } else if (tidy_type == "glance") {
    km_tbl <- kmeans_obj %>% broom::glance()
  } else if (tidy_type == "augment") {
    km_tbl <- kmeans_obj %>%
      broom::augment(uit_tbl) %>%
      dplyr::select(row_col, .cluster) %>%
      dplyr::rename("cluster" = .cluster)
  }

  # * Return ----
  return(km_tbl)
}

#' @rdname hai_kmeans_tidy_tbl
#' @export
kmeans_tidy_tbl <- hai_kmeans_tidy_tbl

#' K-Means Mapping Function
#'
#' @family Kmeans
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @description Create a tibble that maps the [hai_kmeans_obj()] using [purrr::map()]
#' to create a nested data.frame/tibble that holds n centers. This tibble will be
#' used to help create a scree plot.
#'
#' @seealso
#' \url{https://en.wikipedia.org/wiki/Scree_plot}
#'
#' @details Takes in a single parameter of .centers. This is used to create the tibble
#' and map the [hai_kmeans_obj()] function down the list creating a nested tibble.
#'
#' @param .centers How many different centers do you want to try
#' @param .data You must have a tibble in the working environment from the
#' [hai_kmeans_user_item_tbl()]
#'
#' @examples
#' library(healthyR.data)
#' library(dplyr)
#'
#' data_tbl <- healthyR_data %>%
#'   filter(ip_op_flag == "I") %>%
#'   filter(payer_grouping != "Medicare B") %>%
#'   filter(payer_grouping != "?") %>%
#'   select(service_line, payer_grouping) %>%
#'   mutate(record = 1) %>%
#'   as_tibble()
#'
#' ui_tbl <- hai_kmeans_user_item_tbl(
#'   .data = data_tbl,
#'   .row_input = service_line,
#'   .col_input = payer_grouping,
#'   .record_input = record
#' )
#'
#' hai_kmeans_mapped_tbl(ui_tbl)
#'
#' @return
#' A nested tibble
#'
#' @export
#'

hai_kmeans_mapped_tbl <- function(.data, .centers = 15) {

  # * Tidyeval ----
  centers_var_expr <- .centers

  # * Checks ----
  if (!is.data.frame(.data)) {
    stop(call. = FALSE, "(.data) is not a data.frame/tibble. Please supply.")
  }

  input_data <- tibble::as_tibble(.data)

  km_mapper <- function(centers = 3) {
    input_data %>%
      dplyr::select(-1) %>%
      stats::kmeans(
        centers = centers,
        nstart = 100
      )
  }

  # * Manipulate ----
  data_tbl <- tibble::tibble(centers = 1:centers_var_expr) %>%
    dplyr::mutate(k_means = centers %>%
      purrr::map(km_mapper)) %>%
    dplyr::mutate(glance = k_means %>%
      purrr::map(broom::glance))

  # * Return ----
  return(data_tbl)
}

#' @rdname hai_kmeans_mapped_tbl
#' @export
kmeans_mapped_tbl <- hai_kmeans_mapped_tbl

#' K-Means Scree Plot Data Table
#'
#' @family Kmeans
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @description Take data from the [hai_kmeans_mapped_tbl()] and unnest it into a
#' tibble for inspection and for use in the [hai_kmeans_scree_plt()] function.
#'
#' @details Takes in a single parameter of .data from [hai_kmeans_mapped_tbl()] and
#' transforms it into a tibble that is used for [hai_kmeans_scree_plt()]. It will
#' show the values (tot.withinss) at each center.
#'
#' @param .data You must have a tibble in the working environment from the
#' [hai_kmeans_mapped_tbl()]
#'
#' @examples
#' library(healthyR.data)
#' library(dplyr)
#'
#' data_tbl <- healthyR_data %>%
#'   filter(ip_op_flag == "I") %>%
#'   filter(payer_grouping != "Medicare B") %>%
#'   filter(payer_grouping != "?") %>%
#'   select(service_line, payer_grouping) %>%
#'   mutate(record = 1) %>%
#'   as_tibble()
#'
#' ui_tbl <- hai_kmeans_user_item_tbl(
#'   .data = data_tbl,
#'   .row_input = service_line,
#'   .col_input = payer_grouping,
#'   .record_input = record
#' )
#'
#' kmm_tbl <- hai_kmeans_mapped_tbl(ui_tbl)
#'
#' hai_kmeans_scree_data_tbl(kmm_tbl)
#'
#' @return
#' A nested tibble
#'
#' @export
#'

hai_kmeans_scree_data_tbl <- function(.data) {

  # * Checks ----
  if (!is.data.frame(.data)) {
    stop(call. = FALSE, "(.data) is not a data.frame/tibble. Please supply.")
  }

  # * Manipulate ----
  data_tbl <- tibble::as_tibble(.data)

  data_tbl <- data_tbl %>%
    tidyr::unnest(glance) %>%
    dplyr::select(centers, tot.withinss)

  # * Return ----
  return(data_tbl)
}

#' @rdname hai_kmeans_scree_data_tbl
#' @export
kmeans_scree_data_tbl <- hai_kmeans_scree_data_tbl

#' K-Means Scree Plot
#'
#' @family Kmeans
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @description Create a scree-plot from the [hai_kmeans_mapped_tbl()] function.
#'
#' @details Outputs a scree-plot
#'
#' @seealso
#' \url{https://en.wikipedia.org/wiki/Scree_plot}
#'
#' @param .data The data from the [hai_kmeans_mapped_tbl()] function
#'
#' @examples
#' library(healthyR.data)
#' library(dplyr)
#'
#' data_tbl <- healthyR_data %>%
#'   filter(ip_op_flag == "I") %>%
#'   filter(payer_grouping != "Medicare B") %>%
#'   filter(payer_grouping != "?") %>%
#'   select(service_line, payer_grouping) %>%
#'   mutate(record = 1) %>%
#'   as_tibble()
#'
#' ui_tbl <- hai_kmeans_user_item_tbl(
#'   .data = data_tbl,
#'   .row_input = service_line,
#'   .col_input = payer_grouping,
#'   .record_input = record
#' )
#'
#' kmm_tbl <- hai_kmeans_mapped_tbl(ui_tbl)
#'
#' hai_kmeans_scree_plt(.data = kmm_tbl)
#'
#' @return
#' A ggplot2 plot
#'
#' @export
#'

hai_kmeans_scree_plt <- function(.data) {

  # * Checks ----
  if (!is.data.frame(.data)) {
    stop(call. = FALSE, "(.data) is not a data.frame/tibble. Please supply.")
  }

  # * Manipulate ----
  data_tbl <- tibble::as_tibble(.data)

  data_tbl <- data_tbl %>%
    tidyr::unnest(glance) %>%
    dplyr::select(centers, tot.withinss)

  # * Plot
  p <- data_tbl %>%
    ggplot2::ggplot(
      mapping = ggplot2::aes(
        x = centers,
        y = tot.withinss
      )
    ) +
    ggplot2::geom_point() +
    ggplot2::geom_line() +
    # Kaiser Line
    ggplot2::geom_hline(yintercept = 1, color = "red", linetype = "dashed") +
    ggrepel::geom_label_repel(
      mapping = ggplot2::aes(
        label = centers
      )
    ) +
    ggplot2::theme_minimal() +
    ggplot2::labs(
      title = "Scree Plot",
      subtitle = "Measures the distance each of the users are from the closest k-means cluster",
      y = "Total Within Sum of Squares",
      x = "Centers"
    )

  # * Return ----
  return(p)
}

#' @rdname hai_kmeans_scree_plt
#' @export
kmeans_scree_plt <- hai_kmeans_scree_plt

#' @rdname hai_kmeans_scree_plt
#' @export
hai_kmeans_scree_plot <- hai_kmeans_scree_plt

Try the healthyR.ai package in your browser

Any scripts or data that you put into this service are public.

healthyR.ai documentation built on April 3, 2023, 5:24 p.m.