fabricated: Tools to Audit Survey Data Quality

Documented in unif_digits

#' Goodness of fit for uniform digits
#'
#' The 'unif_digits()' function utilizes the 'count_digits()' function to count the number
#' of times that each digit occurs and the 'chisq.test()' function from the stats package to
#' test the assumption that the digits are uniformly distributed. Several other measures of
#' deviation from the uniform distribution are also available.
#'
#' The maximum deviation measure follows Leemis, Schmeiser, and Evans (2000);
#' the Euclidean distance measure follows Cho and Gaines (2007).
#'
#' @param data A data frame
#' @param variable A numeric variable that includes the first decimal place.
#' @param group A second variable used to group the primary variable
#' @param decimal_place The decimal place for which digits are counted. The
#'    default is set to one but any place may be specified by numeric rank,
#'    i.e. "1" for the 1st decimal (tenths), "2" for the 2nd decimal (hundredths),
#'    etc.
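#' @param measures A character vector naming the measures of the degree to
#'     which the data deviates from the uniform distribution. Any of "mad"
#'     (returned in column "mad"), "max_dev" (returned in column "dev_max"),
#'     and "euclidean" (returned in column "dev_m") may be given. The default
#'     is "mad".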
#' @param counts The default is set as FALSE. If this is changed to TRUE,
#'     then results from count_digits will also be included in the output.
#'
#' @return A tibble that includes a column called "chisq" that holds the value
#'     of the chi-square statistic and a column called "chisq_p" that holds
#'     the p-value derived from the chi-square statistic.
#'
#' @export
#'
#' @examples
#' unif_digits(bodyweight, obs, group)
#'
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data


unif_digits <- function(data,
                        variable,
                        group = NULL,
                        decimal_place = 1,
                        measures = c("mad"),
                        counts = FALSE) {

  # Tests whether the digits counted by count_digits() are uniformly
  # distributed and adds the requested deviation measures.
  #
  # Arguments:
  #   data, variable, group, decimal_place: forwarded to count_digits().
  #   measures: character vector; any of "mad", "max_dev", "euclidean".
  #     Values other than these are ignored.
  #   counts: if TRUE the per-digit count columns n_0..n_9 are kept in the
  #     result, otherwise they are dropped.
  #
  # Returns an ungrouped tibble with `total`, `chisq`, `chisq_p` and one
  # column per requested measure (`mad`, `dev_max`, `dev_m`).

  # The ten per-digit count columns expected in the count_digits() output.
  # They are named explicitly (rather than matched with starts_with("n_"))
  # so that an unrelated column whose name begins with "n_" cannot leak
  # into the totals or deviations.
  digit_cols <- paste0("n_", 0:9)

  output <- count_digits(data, {{ variable }}, {{ group }}, {{ decimal_place }}) %>%
    # Every statistic below is computed within a single row of counts.
    dplyr::rowwise() %>%
    dplyr::mutate(
      total = sum(dplyr::c_across(tidyselect::all_of(digit_cols))),
      # chisq.test() on a plain vector is a goodness-of-fit test against
      # equal probabilities, i.e. the uniform distribution.
      chisq = stats::chisq.test(
        dplyr::c_across(tidyselect::all_of(digit_cols))
      )$statistic,
      chisq_p = stats::chisq.test(
        dplyr::c_across(tidyselect::all_of(digit_cols))
      )$p.value
    ) %>%
    dplyr::mutate(
      # Expected count per digit under uniformity (ten digits).
      expected = .data$total * 0.1,
      # Sum of absolute differences between expected and observed counts.
      deviation = sum(abs(
        .data$expected - dplyr::c_across(tidyselect::all_of(digit_cols))
      ))
    )

  if ("mad" %in% measures) {
    # Dividing by `total` puts the deviation on the proportion scale;
    # dividing by 10 averages it over the ten digits.
    output <- output %>%
      dplyr::mutate(mad = ((.data$deviation / .data$total) / 10))
  }

  if ("max_dev" %in% measures) {
    # Largest absolute difference between expected and observed counts.
    output <- output %>%
      dplyr::mutate(dev_max = max(abs(
        .data$expected - dplyr::c_across(tidyselect::all_of(digit_cols))
      )))
  }

  if ("euclidean" %in% measures) {
    # Euclidean distance between expected and observed counts: the
    # difference is squared, not the observed counts themselves.
    output <- output %>%
      dplyr::mutate(dev_m = sqrt(sum(
        (.data$expected - dplyr::c_across(tidyselect::all_of(digit_cols)))^2
      )))
  }

  # Drop the row-wise grouping so callers receive an ordinary tibble.
  output <- dplyr::ungroup(output)

  # `deviation` and `expected` are intermediates; the digit counts are only
  # returned on request. Columns are removed by name with all_of() because
  # the `.data` pronoun is deprecated inside tidyselect selections.
  drop_cols <- c("deviation", "expected")
  if (!counts) {
    drop_cols <- c(digit_cols, drop_cols)
  }

  dplyr::select(output, -tidyselect::all_of(drop_cols))
}