dq: Tools for Data Quality

Documented in dq_univariate format.dq_univariate summary.dq_univariate

calc_outlier <- function(x, cutoff)
{
  if(is.numericish(x))
  {
    x <- as.numeric(x)
    mu <- mean(x, na.rm = TRUE)
    std <- stats::sd(x, na.rm = TRUE)
    tmp <- stats::pnorm(x, mean = mu, sd = std)
    sum(tmp < cutoff/2 | tmp > 1-cutoff/2, na.rm = TRUE)
  } else
  {
    sum(table(x)[table(x)/length(x) < cutoff])
  }
}

#' Compute univariate quality metrics
#'
#' @param dat The input data set
#' @param cutoff The cutoff to determine outliers.
#' @param digits,digits.pct,digits.pval How many digits to print
#' @param x,object An R object
#' @param n Number of rows to print
#' @param ... Other arguments. For \code{summary()}, these are passed to \code{format()}.
#' @return An object of class "dq_univariate".
#' @details Make sure you set.seed before you run this function to get consistent results.
#' @name dq_univariate
NULL
#> NULL

#' @rdname dq_univariate
#' @export
dq_univariate <- function(dat, cutoff = 0.05)
{
  ## Calculate missings (counts and percents)
  nmiss <- colSums(is.na(dat))
  pct.miss <- 100*colMeans(is.na(dat))

  ## Kurtosis
  kurt <- vapply(dat, function(x) if(is.numericish(x)) e1071::kurtosis(as.numeric(x), na.rm = TRUE) - 3 else NA_real_, NA_real_)
  skew <- vapply(dat, function(x) if(is.numericish(x)) e1071::skewness(as.numeric(x), na.rm = TRUE) else NA_real_, NA_real_)

  ## outliers
  outliers <- vapply(dat, calc_outlier, NA_real_, cutoff = cutoff)
  pct_outliers <- 100*outliers/nrow(dat)

  out <- data.frame(
    variable = names(dat),
    missings = nmiss,
    pct.miss = pct.miss,
    skewness = skew,
    excess.kurt = kurt,
    outliers = outliers,
    pct.outliers = pct_outliers
  )
  out$trend.test <- trend.test(dat)

  structure(out, class = c("dq_univariate", "data.frame"))
}

#' @rdname dq_univariate
#' @export
format.dq_univariate <- function(x, digits = 3, digits.pct = 1, digits.pval = 4, ...)
{
  sn <- function(y) stats::setNames(y, x$variable)
  ## Calculate missings (counts and percents)
  nmiss <- sort(sn(x$missings), decreasing = TRUE)
  pct.miss <- sort(sn(x$pct.miss), decreasing = TRUE)

  ## Kurtosis
  kurt <- sn(x$excess.kurt)[order(x$excess.kurt, decreasing = TRUE, na.last = TRUE)]
  skew <- sn(x$skewness)[order(abs(x$skewness), decreasing = TRUE, na.last = TRUE)]

  ## outliers
  o <- order(x$outliers, decreasing = TRUE)
  outliers <- sn(x$outliers)[o]
  pct_outliers <-sn(x$pct.outliers)[o]

  trend.p <- vapply(x$trend.test, function(y) y$pval, NA_real_)
  trend <- sn(x$trend.test[order(trend.p)])

  data.frame(
    missings = paste0(names(nmiss), " (", nmiss, ", ", formatC(pct.miss, digits = digits.pct, format = "f"), "%)"),
    skewness = paste0(names(skew), " (", trimws(formatC(skew, digits = digits, format = "f")), ")"),
    excess.kurt = paste0(names(kurt), " (", trimws(formatC(kurt, digits = digits, format = "f")), ")"),
    outliers = paste0(names(outliers), " (", outliers, ", ", formatC(pct_outliers, digits = digits.pct, format = "f"), "%)"),
    trend.test = paste0(
      names(trend), " (Observation=",
      vapply(trend, function(y) y$ind.max, NA_real_), ", p-value=",
      formatC(vapply(trend, function(y) y$pval, NA_real_), digits = digits.pval, format = "f"), ")"
    ),
    stringsAsFactors = FALSE
  )
}

#' @rdname dq_univariate
#' @export
summary.dq_univariate <- function(object, n = 10, ...)
{
  out <- utils::head(format(object, ...), n)
  colnames(out) <- c("Missings (count, %)", "Skewness", "Excess Kurtosis", "Outliers (count, %)", "Trend Test")
  out
}