R/internal.functions-new-code-2.R
In benford.analysis: Benford Analysis for Data Validation and Forensic Analytics

#### Benford ####

DF <- function(data){
  data <- data[data >= 10]
  collapsed <- 10 * (data) / 10^trunc(log10(data))
  DF <- (mean(collapsed) - 39.0865) / (39.0685)
  DF * 100
}

mantissa.arc.test <- function(data, data.name){ #data must be the mantissa of the log10
  x.coord <- cos(2*pi*data)
  y.coord <- sin(2*pi*data)
  
  L2 <- (mean(x.coord))^2 + (mean(y.coord))^2
  names(L2) <- "L2"
  
  p.value <- exp(-(L2)*length(data))
  mantissa.df <- 2
  names(mantissa.df) <- "df"
  
  mat.bfd <- list(statistic = L2,
                  parameter = mantissa.df,
                  p.value = p.value,
                  method = "Mantissa Arc Test",
                  data.name = data.name)
  
  class(mat.bfd) <- "htest"
  
  return(mat.bfd)
}


chisq.test.bfd <- function(squared.diff, data.name){
  
  chisq <- sum(squared.diff)
  names(chisq) <- "X-squared"
  
  df <- length(squared.diff) - 1
  names(df) <- "df"
  chisq.p.value <- pchisq(chisq, df, lower.tail = FALSE)
  
  
  chisq.bfd <- list(statistic = chisq,
                    parameter = df,
                    p.value = chisq.p.value,
                    methods = "Pearson's Chi-squared test",
                    data.name = data.name)
  
  class(chisq.bfd) <- "htest"
  
  return(chisq.bfd)
}


##' MAD conformity to Benford's Law using the MAD
##' 
##' This function checks the MAD agains the confomity criteria proposed by Nigrini (2012).
##' 
##' @param MAD The mean absolute deviation, as computed in the function \code{\link{benford}}
##' @param digits.used How many digits used in the analysis.
##' 
##' @return A list with the MAD, digits.used and the conformity level.
##' 
##' @references Nigrini, M. J. (2012). Benford's Law: Application for Forensic Accounting, Auditing and Fraud Detection. Wiley and Sons: New Jersey.
MAD.conformity <- 	function(MAD = NULL,
                            digits.used = c("First Digit",
                                            "Second Digit",
                                            "First-Two Digits",
                                            "First-Three Digits")){
  Conformity.Levels <- c("Close conformity", 
                         "Acceptable conformity", 
                         "Marginally acceptable conformity", 
                         "Nonconformity")
  mad.intervals <- switch(digits.used,
    "First Digit" = c(0.000, 0.006, 0.012, 0.015),
    "Second Digit" = c(0.000, 0.008, 0.010, 0.011),
    "First-Two Digits" = c(0.000, 0.0012, 0.0018, 0.0022),
    "First-Three Digits" = c(0.000, 0.00036, 0.00044, 0.00050)
    )
  
  conformity <- Conformity.Levels[findInterval(MAD, mad.intervals)]
  
  out <- list(MAD = MAD, digits.used = digits.used, conformity = conformity)
  
  return(out)
}

#' @title Extracts the leading digits from the data
#' @description It extracts the leading digits from the data.
#' 
#'This function is used by the main function of the package \code{\link{benford}} to extract the 
#'leading digits of the data.
#' @usage
#' 
#' extract.digits(data, number.of.digits = 2, 
#'                sign="positive", second.order = FALSE, discrete=TRUE, round=3)
#' @param data a numeric vector. 
#' @param number.of.digits how many first digits to analyse .
#' @param sign  The default value for sign is "positive" and it analyzes only data greater than zero. 
#' There are also the options "negative" and "both" that will analyze only negative values or both positive and negative values of the data,
#' respectively. For large datasets with both positive and negative numbers, 
#' it is usually recommended to perform a separate analysis for each group,
#' for the incentives to manipulate the numbers are usually different.
#' @param second.order If TRUE, the function will extract the first digits of the second order distribution.
#' @param discrete Most real data - like population numbers or accounting data - are discrete, so 
#' the default is TRUE. This paramater sets rounding to the differences of the ordered data to avoid floating point number
#' errors in the second order distribution, that usually occurs when data is discrete
#' and the ordered numbers are very close to each other. If your data is continuous
#' (like a simulated lognormal) you should run with discrete = FALSE. 
#' @param round it defines the number of digits that the rounding will use if discrete = TRUE and second.order = TRUE.
#' @return A data.frame with the data and the first digits.
#' @export
extract.digits <- function(data, number.of.digits = 2, sign="positive", second.order = FALSE, discrete = TRUE, round = 3) {
  
  if (!is.numeric(data)) stop("Data must be a numeric vector")
  
  ## cleaning data for analysis - only > 0 and either only positive or only negative
  if (sign == "positive")  positives <- data[data > 0 & !is.na(data)]
  if (sign == "negative")  positives <- data[data < 0 & !is.na(data)]*(-1)
  if (sign == "both")      positives <- abs(data[data != 0 & !is.na(data)]) 
  
  if (second.order) {
    
    if (number.of.digits > 4) {
      warning("There might be some floating point precision issues on the Second Order distribution")
    }
    
    n <- length(positives)
    first <- sort(positives)[1:(n - 1)]
    second <- sort(positives)[2:n]
    
    positives <-  if (discrete) {
      round(second - first, number.of.digits + round)
    } else {
      round(second - first, 8)
    } 
    
    positives <- positives[positives > 0]
  }
  
  results <- data.frame(data = positives,
                        data.digits = trunc((10^((floor(log10(positives))*-1) + 
                                                   number.of.digits - 1))*positives))
  return(results)
}


#' @title Probability of a digit sequence
#' @description It calculates the probability of a digit sequence "d".
#' @usage
#' 
#' p.these.digits(d)
#' @param d a digit sequence, like 1234 ou 999999.
#' @return The probability of the sequence d.
#' @examples
#' p.these.digits(1) # 0.30103
#' p.these.digits(11) # 0.03778856
#' p.these.digits(999999) # 4.342947e-07
#' @export 
p.these.digits <- function(d){
  if (!is.numeric(d)) stop("d must be numeric or integer")
  d <- trunc(d)
  if (d < 0) d <- d*(-1)
  prob <- log10(1 + 1/d)
  return(prob)
}


#' @title Probability of a digit at the nth position
#' @description It calculates the probability of digit "d" at the "n"th position.
#' @usage
#' 
#' p.this.digit.at.n(d,n)
#' @param d a digit from 0 to 9 (except at position n=1, where d cannot be 0, it wil give you NA).
#' @param n the nth position.
#' @return The probability of d at position n.
#' @examples
#' p.this.digit.at.n(1,1) # 0.30103
#' p.this.digit.at.n(1,2) # 0.1138901
#' p.this.digit.at.n(9,3) # 0.09826716
#' matrix <- as.data.frame(round(sapply(1:4, function(x) sapply(0:9,p.this.digit.at.n,n=x)),5))
#' names(matrix) <- paste0("n=",1:4)
#' rownames(matrix) <- paste0("d=",0:9)
#' matrix # a table with the probabilities of digits 0 to 9 in positions 1 to 4.
#' @export 
p.this.digit.at.n <- function(d,n){
  if (d < 0) d <- d*(-1)
  if (d == 0 & n == 1) return(NA)
  n1 <- strsplit(as.character(d), "")[[1]]
  n1 <- length(n1)
  if (n1 > 1) stop("d must have only 1 digit. This function evaluates 1 digit at position n")
  if (!is.numeric(d)) stop("d must be numeric or integer")
  if (!is.numeric(n)) stop("n must be numeric or integer")
  if (n < 1) stop("n must be greater than 1")
  if (n == 1) return(log10(1 + 1/d))
  if (n >= 9) return(0.1)
  k <- 10^(n - 2)
  j <- (10^(n - 1)) - 1
  i <- k:j
  sum <- sum(log10(1 + 1/(10*i + d)))
  return(sum)
}

generate.benford.digits <- function(number.of.digits) {
  number.of.digits <- as.integer(number.of.digits)
  begin <- 10^(number.of.digits - 1)
  ends <- 10^(number.of.digits) - 1
  benford.digits <- begin:ends
  return(benford.digits)
}

generate.benford.distribution <- function(benford.digits) {
  benford.dist <- sapply(benford.digits, p.these.digits)
  return(benford.dist)
}

generate.empirical.distribution <- function(data, number.of.digits,sign, second.order = FALSE, benford.digits, discrete = TRUE, round = 3){
   x <- NULL
   v <- NULL
   data.frame <- extract.digits(data, number.of.digits, sign, second.order, discrete = discrete, round = round)
   n <- length(data.frame$data.digits)
   DF <- data.table(x = c(data.frame$data.digits, benford.digits),
                    v = c(data.frame$data.digits, benford.digits) )
   DFcount <- DF[ ,length(x) - 1, by = v][order(v)]
   dist.freq <- DFcount$V1
   dist <- dist.freq/n
   results <- list(data = data.frame$data, 
                   data.digits = data.frame$data.digits, 
                   dist = dist, 
                   dist.freq = dist.freq)
   return(results)
}

extract.mantissa <- function(positives) {
  log <- log10(positives)
  log[log < 0] <- log[log < 0] + as.integer(log[log < 0])*(-1) + 1
  mantissa <- log - trunc(log)
  return(mantissa)
}

generate.summation <- function(benford.digits, data, data.digits) {
  x <- NULL
  v <- NULL
  table <- data.table(x = data.digits, v = data)
  table <- table[, sum(v), by = x][order(x)]
  setnames(table,c("x", "V1"), c("digits", "value"))
  
  if (length(which(!benford.digits %in% table$digits)) != 0) {
    add <- data.frame(digits = which(!benford.digits %in% table$digits), value = 0)
    table <- rbind(table, add)
    table <- table[order(table$digits),]
  }
  
  summation <- table$value
  return(summation)
}



#### Basic Calculations ####

excess.kurtosis <- function(x) 
  (mean((x - mean(x))^4)/(mean((x - mean(x))^2)^2)) - 3
  

skewness <- function(x)
  (mean((x - mean(x))^3)/(mean((x - mean(x))^2)^(3/2)))

#### plot ####

plotting.data.vs.benford <- function(x, ...) {
  
  xmarks <- barplot(x[["bfd"]]$data.dist.freq, 
                    col = "lightblue", 
                    main = "Digits Distribution",
                    xlab = "Digits", ylab = "Freq",
                    ylim = c(0,max(c(x[["bfd"]]$data.dist.freq, x[["bfd"]]$benford.dist.freq))*1.1))
  
  axis(1, at = xmarks, labels = x[["bfd"]]$digits)
  
  lines(xmarks, x[["bfd"]]$benford.dist.freq, lty = 2, lwd = 2, col = "red")
  
  #   legend("topright", lty=c(1,2), lwd=2, col=c("lightblue", "red"), 
  #          legend=c("Data", "Benford"))
}

plotting.second.order <- function(x, ...) {
  y <- x[["bfd"]]$benford.so.dist.freq
  xmarks <- barplot(x[["bfd"]]$data.second.order.dist.freq, 
                    col = "lightblue", 
                    main = "Digits Distribution \nSecond Order Test",
                    xlab = "Digits", ylab = "Freq",
                    ylim = c(0,max(c(x[["bfd"]]$data.second.order.dist.freq, y))*1.1)) 
  
  axis(1, at = xmarks, labels = x[["bfd"]]$digits)
  
  lines(xmarks, y, lty = 2, lwd = 2, col = "red")
  
  #   legend("topright", lty=c(1,2), lwd=2, col=c("lightblue", "red"), 
  #          legend=c("Data", "Benford"))
}

plotting.summation <- function(x, ...) {
  xmarks <- barplot(x[["bfd"]]$data.summation, 
                    col = "lightblue", 
                    main = "Summation Distribution by digits",
                    xlab = "Digits", ylab = "Summation",
                    ylim = c(0,max(x[["bfd"]]$data.summation))*1.1)
  
  axis(1, at = xmarks, labels = x[["bfd"]]$digits)
  
  lines(x = xmarks, y = rep(mean(x[["bfd"]]$data.summation), length(xmarks)), col = "red", lty = 2)
}

plotting.ordered.mantissa <- function(x, ...) {
  
  plot(sort(x[["data"]]$data.mantissa), 
       pch = ".",
       col = "blue", 
       main = "Ordered Mantissa",
       xlab = "Ordered Observation",
       ylab = "Mantissa")
  
  abline(a = 0, b = 1/length(x[["data"]]$data.mantissa), col = "red", lty = 2)
}

plotting.chi_squared <- function(x, ...) {
  
  plot(x[["bfd"]]$digits, x[["bfd"]]$squared.diff, 
       pch = "x", col = "blue", 
       xlab = "Digits",
       ylab = "Chi-squared", 
       main = "Chi-Squared Difference",
       xaxt = "n")
  
  axis(1, at = x[["bfd"]]$digits)
}

plotting.abs.diff <- function(x, ...) {
  plot(x[["bfd"]]$digits, x[["bfd"]]$absolute.diff, 
       pch = "x", 
       col = "blue", 
       xlab = "Digits",
       ylab = "Absolute Difference", 
       main = "Absolute Difference",
       xaxt = "n")
  
  axis(1, at = x[["bfd"]]$digits)
}

plotting.ex.summation <- function(x, ...) {
  plot(x[["bfd"]]$digits, x[["bfd"]]$abs.excess.summation, 
       pch = "x", 
       col = "blue", 
       xlab = "Digits",
       ylab = "Absolute Excess Summation", 
       main = "Summation Difference",
       xaxt = "n")
  axis(1, at = x[["bfd"]]$digits)
}

plotting.legend <- function(x) {
  plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", main = paste("Legend \n Dataset:", x[["info"]]$data.name))
  plot_colors <- c("lightblue","blue","red")
  legend(x = "top",
         inset = 0,
         legend = c("data", "data", "benford"), 
         col = plot_colors, 
         lwd = 2, 
         lty = c(1,1,2))
}
Any scripts or data that you put into this service are public.
benford.analysis documentation built on May 2, 2019, 10 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
benford.analysis
Benford Analysis for Data Validation and Forensic Analytics

R/internal.functions-new-code-2.R
In benford.analysis: Benford Analysis for Data Validation and Forensic Analytics

Try the benford.analysis package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

benford.analysis Benford Analysis for Data Validation and Forensic Analytics

R/internal.functions-new-code-2.R In benford.analysis: Benford Analysis for Data Validation and Forensic Analytics

Try the benford.analysis package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

benford.analysis
Benford Analysis for Data Validation and Forensic Analytics

R/internal.functions-new-code-2.R
In benford.analysis: Benford Analysis for Data Validation and Forensic Analytics