R/opt_bin.R

Defines functions opt_bin

Documented in opt_bin

#' Get the optimal bin width for a histogram given a data set, its value and
#' the desired amount of bins
#'
#' Modified from Hideaki Shimazaki
#' Department of Physics, Kyoto University
#' shimazaki at ton.scphys.kyoto-u.ac.jp
#' Feel free to modify/distribute this program.
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @description
#' Gives the optimal bindwidth for a histogram given a data set, it's value and
#' the desired amoutn of bins
#'
#' @param .data The data set in question
#' @param .value_col The column that holds the values
#' @param .iters How many times the cost function loop should run
#'
#' @details
#' - Supply a data.frame/tibble with a value column. from this an optimal binwidth
#'   will be computed for the amount of binds desired
#'
#' @examples
#' library(dplyr)
#' library(tibble)
#' library(purrr)
#'
#' df_tbl <- rnorm(n = 1000, mean = 0, sd = 1)
#' df_tbl <- df_tbl %>%
#'   as_tibble() %>%
#'   set_names("value")
#'
#' df_tbl %>%
#'   opt_bin(
#'     .value_col = value
#'     , .iters = 100
#'   )
#'
#' @return
#' A tibble of histogram breakpoints
#'
#' @export
#'

opt_bin <- function(
  .data
  , .value_col
  , .iters = 30
) {

  # Tidyeval
  value_var_expr  <- rlang::enquo(.value_col)
  iters           <- .iters

  # Checks
  if(!is.data.frame(.data)) {
    stop(call. = FALSE, "(.data) is not a data.frame/tibble. Please supply.")
  }

  if(rlang::quo_is_missing(value_var_expr)) {
    stop(call. = FALSE, "(.value_col) is missing. Please supply.")
  }

  if(is.null(iters)) {
    iters = 30
  }

  # Data Column Pull
  data <- tibble::as_tibble(.data) %>%
    dplyr::select( {{value_var_expr}} ) %>%
    dplyr::pull( {{value_var_expr}} )


  # Get n sequence
  n <- 2:iters
  c <- base::numeric(base::length(n))
  d <- c

  # For Loop
  for (i in 1:length(n)) {

    d[i] <- diff(range( data ) ) / n[i]

    edges = seq(min( data ), max( data ), length = n[i])
    hp <- graphics::hist(data, breaks = edges, plot = FALSE)
    ki <- hp$counts

    k <- mean(ki)
    v <- sum((ki-k)^2/n[i])

    # Cost function
    c[i] <- (2*k - v)/d[i]^2

  }

  idx   <- which.min(c)
  opt_d <- d[idx]

  edges <- seq(min(data), max(data), length = n[idx])
  edges <- tibble::as_tibble(edges)

  return(edges)

}
spsanderson/LICHospitalR documentation built on Jan. 6, 2022, 12:32 a.m.