Nothing
# create a sample dataset to describe
set.seed(41)
data <- data.frame(
# numeric
a = as.numeric(seq(20)),
b = rnorm(20),
# integer
c = as.integer(seq(20)),
d = sample.int(20, 10, replace = TRUE),
# character
e = sample(letters, 20, replace = TRUE),
f = sample(c(letters, paste0(letters, letters), paste0(letters, letters, letters)),
20, replace = TRUE),
# factor
g = factor(sample(letters, 20, replace = TRUE)),
# posixct
h = as.POSIXct(
sample.int(365 * 60 * 60 * 24, 10, replace = TRUE),
origin = "2020-01-01",
tz = "UTC"
)
)
for (n in names(data)) data[sample.int(nrow(data), 3), n] <- NA
normalize_mf <- function(x) {
vapply(strsplit(x, ", ", fixed = TRUE), function(parts) {
paste(sort(parts), collapse = ", ")
}, character(1))
}
test_that("describe data.frame", {
local_mocked_bindings(has_pkg = function(p) p %in% pkgs, .package = "dataverifyr")
pkgs <- NULL
expect_equal(detect_backend(data), "base-r")
d <- describe(data, skip_ones = FALSE)
expect_equal(class(d), "data.frame")
exp <- d
skip_if_not(
requireNamespace("dplyr", quietly = TRUE) &&
requireNamespace("data.table", quietly = TRUE),
"dplyr and data.table must be installed to run these tests"
)
# use data.table
pkgs <- "data.table"
expect_equal(detect_backend(data), "data.table")
d <- describe(data, skip_ones = FALSE)
expect_equal(class(d), c("data.table", "data.frame"))
expect_equal(as.data.frame(d)[setdiff(names(d), "most_frequent")],
exp[setdiff(names(exp), "most_frequent")])
expect_equal(normalize_mf(as.data.frame(d)$most_frequent), normalize_mf(exp$most_frequent))
# use dplyr
pkgs <- "dplyr"
expect_equal(detect_backend(data), "dplyr")
d <- describe(data, skip_ones = FALSE)
expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
expect_equal(as.data.frame(d)[setdiff(names(d), "most_frequent")],
exp[setdiff(names(exp), "most_frequent")])
expect_equal(normalize_mf(as.data.frame(d)$most_frequent), normalize_mf(exp$most_frequent))
})
test_that("describe sqlite", {
skip_if_not(
requireNamespace("DBI", quietly = TRUE) &&
requireNamespace("dplyr", quietly = TRUE) &&
requireNamespace("dbplyr", quietly = TRUE) &&
requireNamespace("RSQLite", quietly = TRUE),
"DBI, dplyr, dbplyr, and RSQLite must be installed to test the functionality"
)
con <- DBI::dbConnect(RSQLite::SQLite())
DBI::dbWriteTable(con, "data", data)
x <- dplyr::tbl(con, "data")
expect_equal(detect_backend(x), "collectibles")
d <- describe(x, skip_ones = FALSE)
expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
expect_equal(nrow(d), ncol(data))
expect_true(all(c("var", "type", "n", "n_distinct", "n_na", "most_frequent") %in% names(d)))
DBI::dbDisconnect(con)
})
test_that("describe duckdb", {
skip_if_not_installed("duckdb", "1.5.1.9002")
skip_if_not(
requireNamespace("DBI", quietly = TRUE) &&
requireNamespace("dplyr", quietly = TRUE) &&
requireNamespace("dbplyr", quietly = TRUE) &&
requireNamespace("duckdb", quietly = TRUE),
"DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality"
)
con <- DBI::dbConnect(duckdb::duckdb())
DBI::dbWriteTable(con, "data", data)
x <- dplyr::tbl(con, "data")
expect_equal(detect_backend(x), "collectibles")
d <- describe(x, skip_ones = FALSE)
expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
expect_equal(nrow(d), ncol(data))
expect_true(all(c("var", "type", "n", "n_distinct", "n_na", "most_frequent") %in% names(d)))
DBI::dbDisconnect(con)
})
test_that("describe duckdb supports skip_ones/digits without full-vector semantics changes", {
skip_if_not_installed("duckdb", "1.5.1.9002")
skip_if_not(
requireNamespace("DBI", quietly = TRUE) &&
requireNamespace("dplyr", quietly = TRUE) &&
requireNamespace("dbplyr", quietly = TRUE) &&
requireNamespace("duckdb", quietly = TRUE),
"DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality"
)
x <- data.frame(
num = c(1.234567, 1.234567, 9.876543, 9.876543, NA_real_),
chr = c("a", "a", "b", "c", NA_character_)
)
con <- DBI::dbConnect(duckdb::duckdb())
on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE)
DBI::dbWriteTable(con, "data_mf", x, overwrite = TRUE)
tbl <- dplyr::tbl(con, "data_mf")
d <- describe(tbl, skip_ones = FALSE, digits = 2)
expect_equal(normalize_mf(d$most_frequent[d$var == "num"]), "1.23 (2), 9.88 (2), NA (1)")
expect_match(d$most_frequent[d$var == "chr"], "^a \\(2\\)")
d2 <- describe(tbl, skip_ones = TRUE, digits = 2)
expect_equal(normalize_mf(d2$most_frequent[d2$var == "num"]), "1.23 (2), 9.88 (2)")
expect_equal(d2$most_frequent[d2$var == "chr"], "a (2)")
})
test_that("describe arrow", {
skip_if_not(
requireNamespace("dbplyr", quietly = TRUE) &&
requireNamespace("arrow", quietly = TRUE),
"dplyr and arrow must be installed to test the functionality"
)
tmp <- tempfile()
arrow::write_parquet(data, tmp)
x <- arrow::open_dataset(tmp)
expect_equal(detect_backend(x), "collectibles")
d <- describe(x, skip_ones = FALSE)
expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
expect_equal(nrow(d), ncol(data))
expect_true(all(c("var", "type", "n", "n_distinct", "n_na", "most_frequent") %in% names(d)))
})
test_that("describe_collectibles_stats works on arrow numeric columns in fast mode", {
skip_if_not(
requireNamespace("dplyr", quietly = TRUE) &&
requireNamespace("dbplyr", quietly = TRUE) &&
requireNamespace("arrow", quietly = TRUE),
"dplyr, dbplyr, and arrow must be installed to test the functionality"
)
x <- data.frame(num = c(1, 2, NA_real_, 4))
tmp <- tempfile()
arrow::write_parquet(x, tmp)
ds <- arrow::open_dataset(tmp)
stats <- dataverifyr:::describe_collectibles_stats(ds, "num", is_num = TRUE, fast = TRUE)
expect_equal(stats$n, 4L)
expect_equal(stats$n_na, 1L)
expect_equal(stats$min, 1)
expect_equal(stats$max, 4)
expect_equal(round(stats$mean, 6), round(mean(x$num, na.rm = TRUE), 6))
expect_equal(round(stats$sd, 6), round(stats::sd(x$num, na.rm = TRUE), 6))
})
test_that("describe arrow fast mode handles POSIXct columns", {
skip_if_not(
requireNamespace("dplyr", quietly = TRUE) &&
requireNamespace("dbplyr", quietly = TRUE) &&
requireNamespace("arrow", quietly = TRUE),
"dplyr, dbplyr, and arrow must be installed to test the functionality"
)
x <- data.frame(
ts = as.POSIXct(c("2020-01-01 00:00:00", "2020-01-01 01:00:00", NA), tz = "UTC"),
val = c(1, 2, 3)
)
tmp <- tempfile()
arrow::write_parquet(x, tmp)
ds <- arrow::open_dataset(tmp)
d <- describe(ds, fast = TRUE, skip_ones = FALSE)
ts_row <- d[d$var == "ts", , drop = FALSE]
expect_true(nrow(ts_row) == 1)
expect_false(is.na(ts_row$mean))
expect_false(is.na(ts_row$sd))
})
test_that("describe most_frequent rounds numeric values and suppresses <=1 distinct", {
local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
x <- data.frame(
one_value = rep(3.1415926535, 5),
all_na = rep(NA_real_, 5),
mixed = c(1.23456789, 1.23456789, 9.87654321, NA_real_, 9.87654321)
)
d <- suppressWarnings(describe(x, skip_ones = FALSE))
expect_equal(d$most_frequent[d$var == "one_value"], "")
expect_equal(d$most_frequent[d$var == "all_na"], "")
expect_equal(
d$most_frequent[d$var == "mixed"],
"1.2346 (2), 9.8765 (2), NA (1)"
)
})
test_that("describe skips one-count values in most_frequent by default", {
local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
x <- data.frame(
mostly_unique = c(1, 2, 3, 4, 5),
repeated = c(1, 1, 2, 3, NA_real_)
)
d <- describe(x)
expect_equal(d$most_frequent[d$var == "mostly_unique"], "")
expect_equal(d$most_frequent[d$var == "repeated"], "1 (2)")
d_keep_ones <- describe(x, skip_ones = FALSE)
expect_equal(d_keep_ones$most_frequent[d_keep_ones$var == "repeated"], "1 (2), 2 (1), 3 (1)")
})
test_that("describe supports configurable digits for most_frequent numeric rounding", {
local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
x <- data.frame(
mixed = c(1.23456789, 1.23456789, 9.87654321, 9.87654321, NA_real_)
)
d <- describe(x, skip_ones = FALSE, digits = 2)
expect_equal(d$most_frequent[d$var == "mixed"], "1.23 (2), 9.88 (2), NA (1)")
})
test_that("describe supports configurable top_n for most_frequent", {
local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
x <- data.frame(
vals = c("a", "a", "b", "b", "c", "d")
)
d3 <- describe(x, skip_ones = FALSE, top_n = 3)
expect_equal(d3$most_frequent[d3$var == "vals"], "a (2), b (2), c (1)")
d2 <- describe(x, skip_ones = FALSE, top_n = 2)
expect_equal(d2$most_frequent[d2$var == "vals"], "a (2), b (2)")
})
test_that("describe supports top_n = 0 to skip most_frequent values", {
local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
x <- data.frame(
vals = c("a", "a", "b", "b", "c", "d")
)
d0 <- describe(x, skip_ones = FALSE, top_n = 0)
expect_equal(d0$most_frequent[d0$var == "vals"], "")
})
test_that("describe fast mode skips expensive fields", {
local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
x <- data.frame(
num = c(1, 1, 2, 3, NA_real_),
chr = c("a", "a", "b", "c", NA_character_)
)
d <- describe(x, fast = TRUE, skip_ones = FALSE, top_n = 2)
expect_true(all(is.na(d$n_distinct)))
expect_true(all(is.na(d$median)))
expect_true(all(is.na(d$most_frequent)))
expect_false(all(is.na(d$mean)))
})
test_that("describe duckdb fast mode skips expensive fields", {
skip_if_not_installed("duckdb", "1.5.1.9002")
skip_if_not(
requireNamespace("DBI", quietly = TRUE) &&
requireNamespace("dplyr", quietly = TRUE) &&
requireNamespace("dbplyr", quietly = TRUE) &&
requireNamespace("duckdb", quietly = TRUE),
"DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality"
)
x <- data.frame(
num = c(1, 1, 2, 3, NA_real_),
chr = c("a", "a", "b", "c", NA_character_)
)
con <- DBI::dbConnect(duckdb::duckdb())
on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE)
DBI::dbWriteTable(con, "data_fast", x, overwrite = TRUE)
tbl <- dplyr::tbl(con, "data_fast")
d <- describe(tbl, fast = TRUE, skip_ones = FALSE, top_n = 2)
expect_true(all(is.na(d$n_distinct)))
expect_true(all(is.na(d$median)))
expect_true(all(is.na(d$most_frequent)))
expect_false(all(is.na(d$mean)))
})
test_that("describe dplyr backend supports skip_ones and digits", {
skip_if_not(requireNamespace("dplyr", quietly = TRUE),
"dplyr must be installed to run this test")
local_mocked_bindings(has_pkg = function(p) p == "dplyr", .package = "dataverifyr")
x <- data.frame(
num = c(1.234567, 1.234567, 9.876543, 9.876543, NA_real_),
chr = c("a", "a", "b", "c", NA_character_)
)
d <- describe(x, skip_ones = FALSE, digits = 2)
expect_equal(detect_backend(x), "dplyr")
expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
expect_equal(d$most_frequent[d$var == "num"], "1.23 (2), 9.88 (2), NA (1)")
expect_equal(d$most_frequent[d$var == "chr"], "a (2), b (1), c (1)")
d2 <- describe(x, skip_ones = TRUE, digits = 2)
expect_equal(d2$most_frequent[d2$var == "num"], "1.23 (2), 9.88 (2)")
expect_equal(d2$most_frequent[d2$var == "chr"], "a (2)")
})
test_that("describe data.table backend supports skip_ones and digits", {
skip_if_not(requireNamespace("data.table", quietly = TRUE),
"data.table must be installed to run this test")
local_mocked_bindings(has_pkg = function(p) p == "data.table", .package = "dataverifyr")
x <- data.frame(
num = c(1.234567, 1.234567, 9.876543, 9.876543, NA_real_),
chr = c("a", "a", "b", "c", NA_character_)
)
d <- describe(x, skip_ones = FALSE, digits = 2)
expect_equal(detect_backend(x), "data.table")
expect_equal(class(d), c("data.table", "data.frame"))
expect_equal(d$most_frequent[d$var == "num"], "1.23 (2), 9.88 (2), NA (1)")
expect_equal(d$most_frequent[d$var == "chr"], "a (2), b (1), c (1)")
d2 <- describe(x, skip_ones = TRUE, digits = 2)
expect_equal(d2$most_frequent[d2$var == "num"], "1.23 (2), 9.88 (2)")
expect_equal(d2$most_frequent[d2$var == "chr"], "a (2)")
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.