R/digit_test.R
In jfa: Statistical Methods for Auditing

Documented in digit_test

# Copyright (C) 2020-2023 Koen Derks

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

#' Data Auditing: Digit Distribution Test
#'
#' @description This function extracts and performs a test of the distribution
#' of (leading) digits in a vector against a reference distribution. By default,
#' the distribution of leading digits is checked against Benford's law.
#'
#' @usage digit_test(
#'   x,
#'   check = c("first", "last", "firsttwo", "lasttwo"),
#'   reference = "benford",
#'   conf.level = 0.95,
#'   prior = FALSE
#' )
#'
#' @param x          a numeric vector.
#' @param check      location of the digits to analyze. Can be \code{first},
#'   \code{last}, \code{firsttwo}, or \code{lasttwo}.
#' @param reference  which character string given the reference distribution for
#'   the digits, or a vector of probabilities for each digit. Can be
#'   \code{benford} for Benford's law, \code{uniform} for the uniform
#'   distribution. An error is given if any entry of \code{reference} is
#'   negative. Probabilities that do not sum to one are normalized.
#' @param conf.level a numeric value between 0 and 1 specifying the
#'   confidence level (i.e., 1 - audit risk / detection risk).
#' @param prior      a logical specifying whether to use a prior distribution,
#'   or a numeric value equal to or larger than 1 specifying the prior
#'   concentration parameter, or a numeric vector containing the prior
#'   parameters for the Dirichlet distribution on the digit categories.
#'
#' @details Benford's law is defined as \eqn{p(d) = log10(1/d)}. The uniform
#'   distribution is defined as \eqn{p(d) = 1/d}.
#'
#' @return An object of class \code{jfaDistr} containing:
#'
#' \item{data}{the specified data.}
#' \item{conf.level}{a numeric value between 0 and 1 giving the confidence
#'   level.}
#' \item{observed}{the observed counts.}
#' \item{expected}{the expected counts under the null hypothesis.}
#' \item{n}{the number of observations in \code{x}.}
#' \item{statistic}{the value the chi-squared test statistic.}
#' \item{parameter}{the degrees of freedom of the approximate chi-squared
#'   distribution of the test statistic.}
#' \item{p.value}{the p-value for the test.}
#' \item{check}{checked digits.}
#' \item{digits}{vector of digits.}
#' \item{reference}{reference distribution}
#' \item{match}{a list containing the row numbers corresponding to the
#'   observations matching each digit.}
#' \item{deviation}{a vector indicating which digits deviate from their
#'   expected relative frequency under the reference distribution.}
#' \item{prior}{a logical indicating whether a prior distribution was used.}
#' \item{data.name}{a character string giving the name(s) of the data.}
#'
#' @author Koen Derks, \email{k.derks@nyenrode.nl}
#'
#' @references Benford, F. (1938). The law of anomalous numbers.
#'   \emph{In Proceedings of the American Philosophical Society}, 551-572.
#'
#' @seealso \code{\link{repeated_test}}
#'
#' @keywords audit Bayesian Benford digits distribution
#'
#' @examples
#' set.seed(1)
#' x <- rnorm(100)
#'
#' # First digit analysis against Benford's law
#' digit_test(x, check = "first", reference = "benford")
#'
#' # Bayesian first digit analysis against Benford's law
#' digit_test(x, check = "first", reference = "benford", prior = TRUE)
#'
#' # Last digit analysis against the uniform distribution
#' digit_test(x, check = "last", reference = "uniform")
#'
#' # Bayesian last digit analysis against the uniform distribution
#' digit_test(x, check = "last", reference = "uniform", prior = TRUE)
#'
#' # First digit analysis against a custom distribution
#' digit_test(x, check = "last", reference = 1:9)
#'
#' # Bayesian first digit analysis against a custom distribution
#' digit_test(x, check = "last", reference = 1:9, prior = TRUE)
#' @export

digit_test <- function(x,
                       check = c("first", "last", "firsttwo", "lasttwo"),
                       reference = "benford",
                       conf.level = 0.95,
                       prior = FALSE) {
  stopifnot("'x' must be a vector" = (NCOL(x) == 1) && !is.data.frame(x))
  check <- match.arg(check)
  bayesian <- prior[1] != FALSE
  dname <- deparse(substitute(x))
  x <- x[!is.na(x)]
  x <- x[!is.infinite(x)]
  x <- x[x != 0]
  d <- .extract_digits(x, check = check, include.zero = FALSE)
  d <- d[!is.na(d)]
  n <- length(d)
  d_tab <- table(d)
  dig <- if (check %in% c("firsttwo", "lasttwo")) 10:99 else seq_len(9)
  obs <- rep(0, length(dig))
  d_included <- as.numeric(names(d_tab))
  index <- if (check %in% c("firsttwo", "lasttwo")) d_included - 9 else d_included
  obs[index] <- as.numeric(d_tab)
  if (is.numeric(reference)) {
    stopifnot(
      "number of elements in 'reference' must be equal to the number of digits" = length(reference) == length(dig),
      "all elements in 'reference' must be >= 0" = all(reference >= 0)
    )
    p_exp <- reference / sum(reference)
  } else if (reference == "benford") {
    p_exp <- log10(1 + 1 / dig)
  } else if (reference == "uniform") {
    p_exp <- rep(1 / length(dig), length(dig))
  } else {
    stop("specify a valid input for the 'reference' argument")
  }
  exp <- n * p_exp
  if (!bayesian) {
    statistic <- sum((obs - exp)^2 / exp)
    parameter <- length(dig) - 1
    if (any(exp < 5)) {
      warning("Some expected counts < 5, Chi-squared approximation may be incorrect")
    }
    pval <- stats::pchisq(q = statistic, df = parameter, lower.tail = FALSE)
    names(statistic) <- "X-squared"
    names(parameter) <- "df"
  } else {
    if (isTRUE(prior)) {
      prior_vec <- rep(1, length(dig))
    } else if (is.numeric(prior) && length(prior) == 1) {
      stopifnot("'prior' must be a numeric value >= 1" = prior >= 1)
      prior_vec <- rep(prior, length(dig))
    } else if (is.numeric(prior) && length(prior) != length(dig)) {
      stop("number of elements in 'prior' must be equal to number of digits")
    } else {
      prior_vec <- prior
    }
    bf <- .multinomialBf(obs, p_exp, prior_vec)
    names(bf) <- "BF10"
  }
  mad <- mean(abs((obs / n) - (exp / n)))
  names(mad) <- "MAD"
  names(n) <- "n"
  names(obs) <- dig
  names(exp) <- dig
  result <- list()
  result[["data"]] <- x
  result[["conf.level"]] <- conf.level
  result[["observed"]] <- obs
  result[["expected"]] <- exp
  result[["n"]] <- n
  if (!bayesian) {
    result[["statistic"]] <- statistic
    result[["parameter"]] <- parameter
    result[["p.value"]] <- pval
  } else {
    result[["bf"]] <- bf
  }
  result[["mad"]] <- mad
  result[["check"]] <- check
  result[["digits"]] <- dig
  result[["reference"]] <- reference
  result[["match"]] <- split(x = data.frame(row = seq_along(d), value = x), f = d)
  result[["estimates"]] <- data.frame(d = dig, n = obs, p.exp = p_exp, p.obs = obs / n)
  if (isFALSE(prior)) {
    result[["estimates"]]$lb <- stats::qbeta((1 - conf.level) / 2, obs, 1 + n - obs)
    result[["estimates"]]$ub <- stats::qbeta(conf.level + (1 - conf.level) / 2, 1 + obs, n - obs)
    result[["estimates"]]$p.value <- apply(result[["estimates"]], 1, function(x, n) stats::binom.test(x[2], n, x[3], alternative = "two.sided")$p.value, n = n)
  } else {
    result[["estimates"]]$lb <- stats::qbeta((1 - conf.level) / 2, 1 + obs, 1 + n - obs)
    result[["estimates"]]$ub <- stats::qbeta(conf.level + (1 - conf.level) / 2, 1 + obs, 1 + n - obs)
    result[["estimates"]]$bf10 <- 1 / (stats::dbeta(p_exp, 1 + obs, 1 + n - obs) / stats::dbeta(p_exp, 1, 1))
  }
  deviation <- result[["estimates"]]$p.exp <= result[["estimates"]]$lb | result[["estimates"]]$p.exp >= result[["estimates"]]$ub
  names(deviation) <- dig
  result[["deviation"]] <- deviation
  result[["prior"]] <- prior
  result[["data.name"]] <- dname
  class(result) <- c("jfaDistr", "list")
  return(result)
}