# tests/testthat/test-groupby.R

test_that("group_by + summarise with count", {
  # Five rows over two keys: "a" occurs three times, "b" twice.
  input <- data.frame(
    g = c("a", "b", "a", "b", "a"),
    x = 1:5,
    stringsAsFactors = FALSE
  )

  # Round-trip through a temporary .vtr file that is removed on exit.
  path <- tempfile(fileext = ".vtr")
  on.exit(unlink(path))
  write_vtr(input, path)

  counts <- tbl(path) |>
    group_by(g) |>
    summarise(cnt = n()) |>
    collect()

  # Groups are expected in first-seen order: "a" first, "b" second.
  expect_equal(counts$g, c("a", "b"))
  expect_equal(counts$cnt, c(3, 2))
})

test_that("group_by + summarise with sum", {
  # Two keys with two values each: a -> 1 + 2, b -> 3 + 4.
  input <- data.frame(
    g = c("a", "a", "b", "b"),
    x = c(1.0, 2.0, 3.0, 4.0),
    stringsAsFactors = FALSE
  )

  # Round-trip through a temporary .vtr file that is removed on exit.
  path <- tempfile(fileext = ".vtr")
  on.exit(unlink(path))
  write_vtr(input, path)

  sums <- tbl(path) |>
    group_by(g) |>
    summarise(total = sum(x)) |>
    collect()

  expect_equal(sums$g, c("a", "b"))
  expect_equal(sums$total, c(3, 7))
})

test_that("group_by + summarise with mean", {
  # Two keys with two values each: a -> mean(10, 20), b -> mean(30, 40).
  df <- data.frame(g = c("a", "a", "b", "b"), x = c(10.0, 20.0, 30.0, 40.0),
                   stringsAsFactors = FALSE)
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f), add = TRUE)
  write_vtr(df, f)
  result <- tbl(f) |> group_by(g) |> summarise(avg = mean(x)) |> collect()
  # Pin the key order first so the positional check on `avg` below is tied to
  # specific groups (same shape of assertion as the count and sum tests).
  expect_equal(result$g, c("a", "b"))
  expect_equal(result$avg, c(15, 35))
})

test_that("group_by + summarise with min and max", {
  # Values are deliberately not sorted within each key so that min and max
  # differ from the first and last row of the group.
  df <- data.frame(g = c("a", "a", "b", "b"), x = c(5.0, 1.0, 8.0, 3.0),
                   stringsAsFactors = FALSE)
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f), add = TRUE)
  write_vtr(df, f)
  result <- tbl(f) |>
    group_by(g) |>
    summarise(lo = min(x), hi = max(x)) |>
    collect()
  # Pin the key order first so the positional checks on `lo` and `hi` are tied
  # to specific groups (same shape of assertion as the count and sum tests).
  expect_equal(result$g, c("a", "b"))
  expect_equal(result$lo, c(1, 3))
  expect_equal(result$hi, c(5, 8))
})

test_that("group_by with NA key values", {
  # Half of the keys are missing; they are expected to form a group of their
  # own alongside the "a" group.
  input <- data.frame(
    g = c("a", NA, "a", NA),
    x = c(1.0, 2.0, 3.0, 4.0),
    stringsAsFactors = FALSE
  )

  path <- tempfile(fileext = ".vtr")
  on.exit(unlink(path))
  write_vtr(input, path)

  sums <- tbl(path) |>
    group_by(g) |>
    summarise(total = sum(x)) |>
    collect()

  expect_equal(nrow(sums), 2)

  # Build explicit row masks; the NA test guards the `==` comparison so the
  # "a" mask never contains NA.
  key_missing <- is.na(sums$g)
  key_is_a <- !key_missing & sums$g == "a"

  expect_equal(sums$total[key_is_a], 4)
  expect_equal(sums$total[key_missing], 6)
})

test_that("multiple grouping columns", {
  # Every (a, b) pair occurs exactly once, so grouping by both columns must
  # yield four groups whose sum equals the single `v` of that pair.
  df <- data.frame(
    a = c("x", "x", "y", "y"),
    b = c(1L, 2L, 1L, 2L),
    v = c(10.0, 20.0, 30.0, 40.0),
    stringsAsFactors = FALSE
  )
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f), add = TRUE)
  write_vtr(df, f)
  result <- tbl(f) |> group_by(a, b) |> summarise(s = sum(v)) |> collect()
  expect_equal(nrow(result), 4)

  # Check the aggregated values as well as the row count. Rows are matched on
  # a combined "a b" key so the check does not depend on output order.
  idx <- match(paste(df$a, df$b), paste(result$a, result$b))
  expect_false(anyNA(idx))
  expect_equal(result$s[idx], df$v)
})

test_that("summarise with na.rm", {
  # Group "a" holds one real value and one NA; group "b" holds a single value.
  input <- data.frame(
    g = c("a", "a", "b"),
    x = c(1.0, NA, 3.0),
    stringsAsFactors = FALSE
  )

  path <- tempfile(fileext = ".vtr")
  on.exit(unlink(path))
  write_vtr(input, path)

  sums <- tbl(path) |> group_by(g) |> summarise(total = sum(x, na.rm = TRUE)) |> collect()

  # Look each group up by key rather than by position.
  total_a <- sums$total[sums$g == "a"]
  total_b <- sums$total[sums$g == "b"]
  expect_equal(total_a, 1)
  expect_equal(total_b, 3)
})

test_that("summarise without na.rm gives NA for all-NA group", {
  # A single group whose values are all missing.
  input <- data.frame(
    g = c("a", "a"),
    x = c(NA_real_, NA_real_),
    stringsAsFactors = FALSE
  )

  path <- tempfile(fileext = ".vtr")
  on.exit(unlink(path))
  write_vtr(input, path)

  sums <- tbl(path) |> group_by(g) |> summarise(total = sum(x)) |> collect()

  # sum() is called without na.rm, so the group's total is expected to be NA.
  total_is_na <- is.na(sums$total)
  expect_true(total_is_na)
})

test_that("string key arena survives multiple resizes", {
  # Initial arena capacity is 64. 200 unique groups forces resizes at 65 and 129.
  # Regression test for UAF in arena_ensure when string data was aliased.
  set.seed(1)
  # Named `n_rows` rather than `n` so it cannot be confused with the summarised
  # column `n` that the filter() call further down refers to.
  n_rows <- 2000
  n_groups <- 200

  # Guarantee that every label occurs at least once: take each label once,
  # top up with random draws, then shuffle. The group-count assertion below
  # therefore holds by construction instead of relying on the random draw
  # happening to cover all labels.
  labels <- paste0("grp_", seq_len(n_groups))
  keys <- sample(c(labels, sample(labels, n_rows - n_groups, replace = TRUE)))
  df <- data.frame(
    g = keys,
    x = rnorm(n_rows),
    stringsAsFactors = FALSE
  )
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f), add = TRUE)
  write_vtr(df, f)
  result <- tbl(f) |>
    group_by(g) |>
    summarise(sx = sum(x), n = n()) |>
    collect()
  expect_equal(nrow(result), n_groups)
  expect_equal(sum(result$n), n_rows)
  # Verify against R base; reorder the reference to the result's key order.
  ref <- aggregate(x ~ g, data = df, FUN = sum)
  ref <- ref[match(result$g, ref$g), ]
  expect_equal(result$sx, ref$x, tolerance = 1e-10)

  # Chain a downstream operation on the grouped result to exercise
  # post-resize hash probing with the result keys still intact.
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(f2), add = TRUE)
  write_vtr(result, f2)
  result2 <- tbl(f2) |> filter(n > 5) |> collect()
  expect_true(all(result2$n > 5))
  expect_gt(nrow(result2), 0)
})

test_that("filter then group_by then summarise", {
  # After keeping x > 2 the remaining rows are a: 3, 5 and b: 4.
  input <- data.frame(g = c("a", "b", "a", "b", "a"),
                      x = c(1.0, 2.0, 3.0, 4.0, 5.0),
                      stringsAsFactors = FALSE)

  path <- tempfile(fileext = ".vtr")
  on.exit(unlink(path))
  write_vtr(input, path)

  sums <- tbl(path) |> filter(x > 2) |> group_by(g) |> summarise(s = sum(x)) |> collect()

  # Look each group up by key rather than by position.
  sum_a <- sums$s[sums$g == "a"]
  sum_b <- sums$s[sums$g == "b"]
  expect_equal(sum_a, 8)
  expect_equal(sum_b, 4)
})

# Try the vectra package in your browser
#
# Any scripts or data that you put into this service are public.
#
# vectra documentation built on May 8, 2026, 9:06 a.m.