tests/testthat/test-describe.R

# create a sample dataset to describe
set.seed(41)
data <- data.frame(
  # numeric
  a = as.numeric(seq(20)),
  b = rnorm(20),
  # integer
  c = as.integer(seq(20)),
  d = sample.int(20, 10, replace = TRUE),
  # character
  e = sample(letters, 20, replace = TRUE),
  f = sample(c(letters, paste0(letters, letters), paste0(letters, letters, letters)),
             20, replace = TRUE),
  # factor
  g = factor(sample(letters, 20, replace = TRUE)),
  # posixct
  h = as.POSIXct(
    sample.int(365 * 60 * 60 * 24, 10, replace = TRUE),
    origin = "2020-01-01",
    tz = "UTC"
  )
)

for (n in names(data)) data[sample.int(nrow(data), 3), n] <- NA

normalize_mf <- function(x) {
  vapply(strsplit(x, ", ", fixed = TRUE), function(parts) {
    paste(sort(parts), collapse = ", ")
  }, character(1))
}


test_that("describe data.frame", {
  local_mocked_bindings(has_pkg = function(p) p %in% pkgs, .package = "dataverifyr")
  pkgs <- NULL

  expect_equal(detect_backend(data), "base-r")
  d <- describe(data, skip_ones = FALSE)
  expect_equal(class(d), "data.frame")
  exp <- d

  skip_if_not(
    requireNamespace("dplyr", quietly = TRUE) &&
      requireNamespace("data.table", quietly = TRUE),
    "dplyr and data.table must be installed to run these tests"
  )

  # use data.table
  pkgs <- "data.table"
  expect_equal(detect_backend(data), "data.table")
  d <- describe(data, skip_ones = FALSE)
  expect_equal(class(d), c("data.table", "data.frame"))
  expect_equal(as.data.frame(d)[setdiff(names(d), "most_frequent")],
               exp[setdiff(names(exp), "most_frequent")])
  expect_equal(normalize_mf(as.data.frame(d)$most_frequent), normalize_mf(exp$most_frequent))

  # use dplyr
  pkgs <- "dplyr"
  expect_equal(detect_backend(data), "dplyr")
  d <- describe(data, skip_ones = FALSE)
  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
  expect_equal(as.data.frame(d)[setdiff(names(d), "most_frequent")],
               exp[setdiff(names(exp), "most_frequent")])
  expect_equal(normalize_mf(as.data.frame(d)$most_frequent), normalize_mf(exp$most_frequent))
})


test_that("describe sqlite", {
  skip_if_not(
    requireNamespace("DBI", quietly = TRUE) &&
      requireNamespace("dplyr", quietly = TRUE) &&
      requireNamespace("dbplyr", quietly = TRUE) &&
      requireNamespace("RSQLite", quietly = TRUE),
    "DBI, dplyr, dbplyr, and RSQLite must be installed to test the functionality"
  )

  con <- DBI::dbConnect(RSQLite::SQLite())
  DBI::dbWriteTable(con, "data", data)
  x <- dplyr::tbl(con, "data")
  expect_equal(detect_backend(x), "collectibles")

  d <- describe(x, skip_ones = FALSE)

  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
  expect_equal(nrow(d), ncol(data))
  expect_true(all(c("var", "type", "n", "n_distinct", "n_na", "most_frequent") %in% names(d)))

  DBI::dbDisconnect(con)
})


test_that("describe duckdb", {
  skip_if_not_installed("duckdb", "1.5.1.9002")
  skip_if_not(
    requireNamespace("DBI", quietly = TRUE) &&
      requireNamespace("dplyr", quietly = TRUE) &&
      requireNamespace("dbplyr", quietly = TRUE) &&
      requireNamespace("duckdb", quietly = TRUE),
    "DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality"
  )

  con <- DBI::dbConnect(duckdb::duckdb())
  DBI::dbWriteTable(con, "data", data)
  x <- dplyr::tbl(con, "data")
  expect_equal(detect_backend(x), "collectibles")

  d <- describe(x, skip_ones = FALSE)

  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
  expect_equal(nrow(d), ncol(data))
  expect_true(all(c("var", "type", "n", "n_distinct", "n_na", "most_frequent") %in% names(d)))
  DBI::dbDisconnect(con)

})


test_that("describe duckdb supports skip_ones/digits without full-vector semantics changes", {
  skip_if_not_installed("duckdb", "1.5.1.9002")
  skip_if_not(
    requireNamespace("DBI", quietly = TRUE) &&
      requireNamespace("dplyr", quietly = TRUE) &&
      requireNamespace("dbplyr", quietly = TRUE) &&
      requireNamespace("duckdb", quietly = TRUE),
    "DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality"
  )

  x <- data.frame(
    num = c(1.234567, 1.234567, 9.876543, 9.876543, NA_real_),
    chr = c("a", "a", "b", "c", NA_character_)
  )

  con <- DBI::dbConnect(duckdb::duckdb())
  on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE)
  DBI::dbWriteTable(con, "data_mf", x, overwrite = TRUE)
  tbl <- dplyr::tbl(con, "data_mf")

  d <- describe(tbl, skip_ones = FALSE, digits = 2)
  expect_equal(normalize_mf(d$most_frequent[d$var == "num"]), "1.23 (2), 9.88 (2), NA (1)")
  expect_match(d$most_frequent[d$var == "chr"], "^a \\(2\\)")

  d2 <- describe(tbl, skip_ones = TRUE, digits = 2)
  expect_equal(normalize_mf(d2$most_frequent[d2$var == "num"]), "1.23 (2), 9.88 (2)")
  expect_equal(d2$most_frequent[d2$var == "chr"], "a (2)")
})


test_that("describe arrow", {
  skip_if_not(
    requireNamespace("dbplyr", quietly = TRUE) &&
      requireNamespace("arrow", quietly = TRUE),
    "dplyr and arrow must be installed to test the functionality"
  )

  tmp <- tempfile()
  arrow::write_parquet(data, tmp)
  x <- arrow::open_dataset(tmp)
  expect_equal(detect_backend(x), "collectibles")

  d <- describe(x, skip_ones = FALSE)

  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
  expect_equal(nrow(d), ncol(data))
  expect_true(all(c("var", "type", "n", "n_distinct", "n_na", "most_frequent") %in% names(d)))
})

test_that("describe_collectibles_stats works on arrow numeric columns in fast mode", {
  skip_if_not(
    requireNamespace("dplyr", quietly = TRUE) &&
      requireNamespace("dbplyr", quietly = TRUE) &&
      requireNamespace("arrow", quietly = TRUE),
    "dplyr, dbplyr, and arrow must be installed to test the functionality"
  )

  x <- data.frame(num = c(1, 2, NA_real_, 4))
  tmp <- tempfile()
  arrow::write_parquet(x, tmp)
  ds <- arrow::open_dataset(tmp)

  stats <- dataverifyr:::describe_collectibles_stats(ds, "num", is_num = TRUE, fast = TRUE)

  expect_equal(stats$n, 4L)
  expect_equal(stats$n_na, 1L)
  expect_equal(stats$min, 1)
  expect_equal(stats$max, 4)
  expect_equal(round(stats$mean, 6), round(mean(x$num, na.rm = TRUE), 6))
  expect_equal(round(stats$sd, 6), round(stats::sd(x$num, na.rm = TRUE), 6))
})

test_that("describe arrow fast mode handles POSIXct columns", {
  skip_if_not(
    requireNamespace("dplyr", quietly = TRUE) &&
      requireNamespace("dbplyr", quietly = TRUE) &&
      requireNamespace("arrow", quietly = TRUE),
    "dplyr, dbplyr, and arrow must be installed to test the functionality"
  )

  x <- data.frame(
    ts = as.POSIXct(c("2020-01-01 00:00:00", "2020-01-01 01:00:00", NA), tz = "UTC"),
    val = c(1, 2, 3)
  )
  tmp <- tempfile()
  arrow::write_parquet(x, tmp)
  ds <- arrow::open_dataset(tmp)

  d <- describe(ds, fast = TRUE, skip_ones = FALSE)
  ts_row <- d[d$var == "ts", , drop = FALSE]

  expect_true(nrow(ts_row) == 1)
  expect_false(is.na(ts_row$mean))
  expect_false(is.na(ts_row$sd))
})


test_that("describe most_frequent rounds numeric values and suppresses <=1 distinct", {
  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")

  x <- data.frame(
    one_value = rep(3.1415926535, 5),
    all_na = rep(NA_real_, 5),
    mixed = c(1.23456789, 1.23456789, 9.87654321, NA_real_, 9.87654321)
  )

  d <- suppressWarnings(describe(x, skip_ones = FALSE))

  expect_equal(d$most_frequent[d$var == "one_value"], "")
  expect_equal(d$most_frequent[d$var == "all_na"], "")
  expect_equal(
    d$most_frequent[d$var == "mixed"],
    "1.2346 (2), 9.8765 (2), NA (1)"
  )
})


test_that("describe skips one-count values in most_frequent by default", {
  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")

  x <- data.frame(
    mostly_unique = c(1, 2, 3, 4, 5),
    repeated = c(1, 1, 2, 3, NA_real_)
  )

  d <- describe(x)
  expect_equal(d$most_frequent[d$var == "mostly_unique"], "")
  expect_equal(d$most_frequent[d$var == "repeated"], "1 (2)")

  d_keep_ones <- describe(x, skip_ones = FALSE)
  expect_equal(d_keep_ones$most_frequent[d_keep_ones$var == "repeated"], "1 (2), 2 (1), 3 (1)")
})


test_that("describe supports configurable digits for most_frequent numeric rounding", {
  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")

  x <- data.frame(
    mixed = c(1.23456789, 1.23456789, 9.87654321, 9.87654321, NA_real_)
  )

  d <- describe(x, skip_ones = FALSE, digits = 2)
  expect_equal(d$most_frequent[d$var == "mixed"], "1.23 (2), 9.88 (2), NA (1)")
})

test_that("describe supports configurable top_n for most_frequent", {
  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")

  x <- data.frame(
    vals = c("a", "a", "b", "b", "c", "d")
  )

  d3 <- describe(x, skip_ones = FALSE, top_n = 3)
  expect_equal(d3$most_frequent[d3$var == "vals"], "a (2), b (2), c (1)")

  d2 <- describe(x, skip_ones = FALSE, top_n = 2)
  expect_equal(d2$most_frequent[d2$var == "vals"], "a (2), b (2)")
})

test_that("describe supports top_n = 0 to skip most_frequent values", {
  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")

  x <- data.frame(
    vals = c("a", "a", "b", "b", "c", "d")
  )

  d0 <- describe(x, skip_ones = FALSE, top_n = 0)
  expect_equal(d0$most_frequent[d0$var == "vals"], "")
})

test_that("describe fast mode skips expensive fields", {
  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")

  x <- data.frame(
    num = c(1, 1, 2, 3, NA_real_),
    chr = c("a", "a", "b", "c", NA_character_)
  )

  d <- describe(x, fast = TRUE, skip_ones = FALSE, top_n = 2)
  expect_true(all(is.na(d$n_distinct)))
  expect_true(all(is.na(d$median)))
  expect_true(all(is.na(d$most_frequent)))
  expect_false(all(is.na(d$mean)))
})

test_that("describe duckdb fast mode skips expensive fields", {
  skip_if_not_installed("duckdb", "1.5.1.9002")
  skip_if_not(
    requireNamespace("DBI", quietly = TRUE) &&
      requireNamespace("dplyr", quietly = TRUE) &&
      requireNamespace("dbplyr", quietly = TRUE) &&
      requireNamespace("duckdb", quietly = TRUE),
    "DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality"
  )

  x <- data.frame(
    num = c(1, 1, 2, 3, NA_real_),
    chr = c("a", "a", "b", "c", NA_character_)
  )

  con <- DBI::dbConnect(duckdb::duckdb())
  on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE)
  DBI::dbWriteTable(con, "data_fast", x, overwrite = TRUE)
  tbl <- dplyr::tbl(con, "data_fast")

  d <- describe(tbl, fast = TRUE, skip_ones = FALSE, top_n = 2)
  expect_true(all(is.na(d$n_distinct)))
  expect_true(all(is.na(d$median)))
  expect_true(all(is.na(d$most_frequent)))
  expect_false(all(is.na(d$mean)))
})

test_that("describe dplyr backend supports skip_ones and digits", {
  skip_if_not(requireNamespace("dplyr", quietly = TRUE),
              "dplyr must be installed to run this test")
  local_mocked_bindings(has_pkg = function(p) p == "dplyr", .package = "dataverifyr")

  x <- data.frame(
    num = c(1.234567, 1.234567, 9.876543, 9.876543, NA_real_),
    chr = c("a", "a", "b", "c", NA_character_)
  )

  d <- describe(x, skip_ones = FALSE, digits = 2)
  expect_equal(detect_backend(x), "dplyr")
  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
  expect_equal(d$most_frequent[d$var == "num"], "1.23 (2), 9.88 (2), NA (1)")
  expect_equal(d$most_frequent[d$var == "chr"], "a (2), b (1), c (1)")

  d2 <- describe(x, skip_ones = TRUE, digits = 2)
  expect_equal(d2$most_frequent[d2$var == "num"], "1.23 (2), 9.88 (2)")
  expect_equal(d2$most_frequent[d2$var == "chr"], "a (2)")
})

test_that("describe data.table backend supports skip_ones and digits", {
  skip_if_not(requireNamespace("data.table", quietly = TRUE),
              "data.table must be installed to run this test")
  local_mocked_bindings(has_pkg = function(p) p == "data.table", .package = "dataverifyr")

  x <- data.frame(
    num = c(1.234567, 1.234567, 9.876543, 9.876543, NA_real_),
    chr = c("a", "a", "b", "c", NA_character_)
  )

  d <- describe(x, skip_ones = FALSE, digits = 2)
  expect_equal(detect_backend(x), "data.table")
  expect_equal(class(d), c("data.table", "data.frame"))
  expect_equal(d$most_frequent[d$var == "num"], "1.23 (2), 9.88 (2), NA (1)")
  expect_equal(d$most_frequent[d$var == "chr"], "a (2), b (1), c (1)")

  d2 <- describe(x, skip_ones = TRUE, digits = 2)
  expect_equal(d2$most_frequent[d2$var == "num"], "1.23 (2), 9.88 (2)")
  expect_equal(d2$most_frequent[d2$var == "chr"], "a (2)")
})

Try the dataverifyr package in your browser

Any scripts or data that you put into this service are public.

dataverifyr documentation built on April 11, 2026, 1:06 a.m.