tests/testthat/test-compression.R

# -- Phase 1: compression round-trips ------------------------------------------

test_that("compress='fast' round-trips doubles", {
  df <- data.frame(x = rnorm(500), y = runif(500))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast")
  result <- tbl(f) |> collect()
  expect_equal(result$x, df$x)
  expect_equal(result$y, df$y)
})

test_that("compress='small' round-trips doubles", {
  df <- data.frame(x = rnorm(500), y = runif(500))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "small")
  result <- tbl(f) |> collect()
  expect_equal(result$x, df$x)
  expect_equal(result$y, df$y)
})

test_that("compress='small' round-trips integers", {
  df <- data.frame(a = 1:1000, b = sample(1e6, 1000))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "small")
  result <- tbl(f) |> collect()
  expect_equal(result$a, as.double(df$a))
  expect_equal(result$b, as.double(df$b))
})

test_that("compress='small' round-trips strings", {
  df <- data.frame(s = sample(letters, 500, replace = TRUE),
                   stringsAsFactors = FALSE)
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "small")
  result <- tbl(f) |> collect()
  expect_equal(result$s, df$s)
})

test_that("compress='small' round-trips booleans with NA", {
  df <- data.frame(flag = c(TRUE, FALSE, NA, TRUE, FALSE, rep(TRUE, 495)))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "small")
  result <- tbl(f) |> collect()
  expect_equal(result$flag, df$flag)
})

test_that("compress='small' round-trips all types together", {
  df <- data.frame(
    i = c(1L, 2L, NA, 4L, seq(5L, 500L)),
    d = c(1.1, NA, 3.3, 4.4, rnorm(496)),
    b = c(TRUE, NA, FALSE, TRUE, sample(c(TRUE, FALSE), 496, replace = TRUE)),
    s = c("a", NA, "c", "d", sample(letters, 496, replace = TRUE)),
    stringsAsFactors = FALSE
  )
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "small")
  result <- tbl(f) |> collect()
  expect_equal(result$d, df$d)
  expect_equal(result$b, df$b)
  expect_equal(result$s, df$s)
  expect_equal(result$i, as.double(df$i))
})

test_that("small files are no larger than fast files on structured data", {
  df <- data.frame(x = rep(1:100, 50), y = rep(rnorm(100), 50))
  f_fast  <- tempfile(fileext = ".vtr")
  f_ratio <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f_fast, f_ratio)))
  write_vtr(df, f_fast,  compress = "fast")
  write_vtr(df, f_ratio, compress = "small")
  expect_lte(file.size(f_ratio), file.size(f_fast))
})

test_that("compress='small' with multiple row groups", {
  df <- data.frame(x = rnorm(1000), y = 1:1000)
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "small", batch_size = 200)
  result <- tbl(f) |> collect()
  expect_equal(nrow(result), 1000L)
  expect_equal(result$x, df$x)
  expect_equal(result$y, as.double(df$y))
})

test_that("small with quantize round-trips", {
  set.seed(42)
  df <- data.frame(temp = rnorm(1000, 15, 5))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "small",
            quantize = list(temp = list(precision = 0.01, type = "int16")))
  result <- tbl(f) |> collect()
  max_err <- max(abs(result$temp - df$temp))
  expect_lt(max_err, 0.01 + 1e-10)
})

test_that("small with spatial round-trips", {
  nx <- 50; ny <- 50
  set.seed(1)
  temp <- 10 + 0.05 * rep(1:nx, ny) + 0.03 * rep(1:ny, each = nx) +
          rnorm(nx * ny, 0, 0.5)
  df <- data.frame(temp = temp)
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "small",
            quantize = list(temp = list(precision = 0.001, type = "int16")),
            spatial = list(nx = nx, ny = ny))
  result <- tbl(f) |> collect()
  max_err <- max(abs(result$temp - df$temp))
  expect_lt(max_err, 0.001 + 1e-10)
})

test_that("small with narrow int round-trips", {
  df <- data.frame(a = sample(-100L:100L, 500, replace = TRUE),
                   b = sample(-30000L:30000L, 500, replace = TRUE))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "small",
            col_types = c(a = "int8", b = "int16"))
  result <- tbl(f) |> collect()
  expect_equal(result$a, as.double(df$a))
  expect_equal(result$b, as.double(df$b))
})

test_that("compress='none' round-trips doubles", {
  df <- data.frame(x = rnorm(500), y = runif(500))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "none")
  result <- tbl(f) |> collect()
  expect_equal(result$x, df$x)
  expect_equal(result$y, df$y)
})

test_that("compress='fast' round-trips integers", {
  df <- data.frame(a = 1:1000, b = sample(1e6, 1000))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast")
  result <- tbl(f) |> collect()
  expect_equal(result$a, as.double(df$a))
  expect_equal(result$b, as.double(df$b))
})

test_that("compress='fast' round-trips strings", {
  df <- data.frame(s = sample(letters, 500, replace = TRUE),
                   stringsAsFactors = FALSE)
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast")
  result <- tbl(f) |> collect()
  expect_equal(result$s, df$s)
})

test_that("compress='fast' round-trips booleans with NA", {
  df <- data.frame(flag = c(TRUE, FALSE, NA, TRUE, FALSE, rep(TRUE, 495)))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast")
  result <- tbl(f) |> collect()
  expect_equal(result$flag, df$flag)
})

test_that("compress='fast' round-trips all types together", {
  df <- data.frame(
    i = c(1L, 2L, NA, 4L, seq(5L, 500L)),
    d = c(1.1, NA, 3.3, 4.4, rnorm(496)),
    b = c(TRUE, NA, FALSE, TRUE, sample(c(TRUE, FALSE), 496, replace = TRUE)),
    s = c("a", NA, "c", "d", sample(letters, 496, replace = TRUE)),
    stringsAsFactors = FALSE
  )
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast")
  result <- tbl(f) |> collect()
  expect_equal(result$d, df$d)
  expect_equal(result$b, df$b)
  expect_equal(result$s, df$s)
  expect_equal(result$i, as.double(df$i))
})

test_that("compressed files are smaller than uncompressed", {
  df <- data.frame(x = rep(1:100, 50), y = rep(rnorm(100), 50))
  f_none <- tempfile(fileext = ".vtr")
  f_fast <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f_none, f_fast)))

  write_vtr(df, f_none, compress = "none")
  write_vtr(df, f_fast, compress = "fast")

  expect_lt(file.size(f_fast), file.size(f_none))
})

test_that("compress='fast' with multiple row groups", {
  df <- data.frame(x = rnorm(1000), y = 1:1000)
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast", batch_size = 200)
  result <- tbl(f) |> collect()
  expect_equal(nrow(result), 1000L)
  expect_equal(result$x, df$x)
  expect_equal(result$y, as.double(df$y))
})

test_that("uncompressed file reads after rewrite with compression", {
  df <- data.frame(x = 1:100, y = rnorm(100))
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))

  write_vtr(df, f1, compress = "none")
  write_vtr(df, f2, compress = "fast")

  r1 <- tbl(f1) |> collect()
  r2 <- tbl(f2) |> collect()
  expect_equal(r1$x, r2$x)
  expect_equal(r1$y, r2$y)
})

# -- Phase 2a: narrow integer types -------------------------------------------

test_that("col_types int8 round-trips", {
  df <- data.frame(val = c(-100L, 0L, 50L, 127L, NA))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, col_types = c(val = "int8"))
  result <- tbl(f) |> collect()
  expect_equal(result$val, as.double(df$val))
})

test_that("col_types int16 round-trips", {
  df <- data.frame(val = c(-30000L, 0L, 15000L, 32767L, NA))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, col_types = c(val = "int16"))
  result <- tbl(f) |> collect()
  expect_equal(result$val, as.double(df$val))
})

test_that("col_types int32 round-trips", {
  df <- data.frame(val = c(-2000000L, 0L, 1000000L, NA))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, col_types = c(val = "int32"))
  result <- tbl(f) |> collect()
  expect_equal(result$val, as.double(df$val))
})

test_that("narrow int with compression round-trips", {
  df <- data.frame(a = sample(-100L:100L, 500, replace = TRUE),
                   b = sample(-30000L:30000L, 500, replace = TRUE))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast",
            col_types = c(a = "int8", b = "int16"))
  result <- tbl(f) |> collect()
  expect_equal(result$a, as.double(df$a))
  expect_equal(result$b, as.double(df$b))
})

test_that("narrow int files are smaller", {
  df <- data.frame(val = sample(-100L:100L, 1000, replace = TRUE))
  f64 <- tempfile(fileext = ".vtr")
  f8  <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f64, f8)))
  write_vtr(df, f64, compress = "none")
  write_vtr(df, f8, compress = "none", col_types = c(val = "int8"))
  expect_lt(file.size(f8), file.size(f64))
})

test_that("filter works on narrow int columns", {
  df <- data.frame(id = 1:200, val = sample(-100L:100L, 200, replace = TRUE))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, col_types = c(val = "int8"))
  result <- tbl(f) |> filter(val > 0) |> collect()
  expected <- df[df$val > 0, ]
  expect_equal(nrow(result), nrow(expected))
  expect_equal(result$val, as.double(expected$val))
})

test_that("mutate works on narrow int columns", {
  df <- data.frame(val = c(10L, 20L, 30L))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, col_types = c(val = "int8"))
  result <- tbl(f) |> mutate(doubled = val * 2) |> collect()
  expect_equal(result$doubled, c(20, 40, 60))
})

# -- Phase 2b: lossy quantization ---------------------------------------------

test_that("quantize round-trips within precision", {
  set.seed(42)
  df <- data.frame(temp = runif(500, -20, 20))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, quantize = list(temp = list(precision = 0.001,
                                                type = "int16")))
  result <- tbl(f) |> collect()
  expect_equal(length(result$temp), 500L)
  max_err <- max(abs(result$temp - df$temp), na.rm = TRUE)
  expect_lt(max_err, 0.001 + 1e-10)
})

test_that("quantize with scale parameter", {
  df <- data.frame(val = c(0.0, 0.5, 1.0, -1.0))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, quantize = list(val = list(scale = 1000, type = "int16")))
  result <- tbl(f) |> collect()
  max_err <- max(abs(result$val - df$val))
  expect_lt(max_err, 0.001 + 1e-10)
})

test_that("quantize preserves NA", {
  df <- data.frame(temp = c(1.5, NA, -3.2, NA, 10.0))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, quantize = list(temp = list(precision = 0.01,
                                                type = "int16")))
  result <- tbl(f) |> collect()
  expect_true(is.na(result$temp[2]))
  expect_true(is.na(result$temp[4]))
  non_na <- c(1, 3, 5)
  max_err <- max(abs(result$temp[non_na] - df$temp[non_na]))
  expect_lt(max_err, 0.01 + 1e-10)
})

test_that("quantize with compression round-trips", {
  set.seed(123)
  df <- data.frame(temp = rnorm(1000, 15, 5))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast",
            quantize = list(temp = list(precision = 0.01, type = "int16")))
  result <- tbl(f) |> collect()
  max_err <- max(abs(result$temp - df$temp))
  expect_lt(max_err, 0.01 + 1e-10)
})

test_that("quantize to int8", {
  df <- data.frame(val = c(-5.0, 0.0, 5.0, 12.0))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, quantize = list(val = list(precision = 0.1,
                                               type = "int8")))
  result <- tbl(f) |> collect()
  max_err <- max(abs(result$val - df$val))
  expect_lt(max_err, 0.1 + 1e-10)
})

test_that("quantize produces smaller files", {
  set.seed(99)
  df <- data.frame(temp = rnorm(5000, 15, 5))
  f_raw <- tempfile(fileext = ".vtr")
  f_q   <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f_raw, f_q)))
  write_vtr(df, f_raw, compress = "fast")
  write_vtr(df, f_q, compress = "fast",
            quantize = list(temp = list(precision = 0.01, type = "int16")))
  expect_lt(file.size(f_q), file.size(f_raw))
})

# -- Phase 2c: spatial encoding -----------------------------------------------

test_that("spatial round-trips on smooth raster data", {
  nx <- 50; ny <- 50
  x_coord <- rep(1:nx, ny)
  y_coord <- rep(1:ny, each = nx)
  # smooth temperature field: gradient + small noise
  set.seed(1)
  temp <- 10 + 0.05 * x_coord + 0.03 * y_coord + rnorm(nx * ny, 0, 0.5)
  df <- data.frame(temp = temp)
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast",
            quantize = list(temp = list(precision = 0.001, type = "int16")),
            spatial = list(nx = nx, ny = ny))
  result <- tbl(f) |> collect()
  max_err <- max(abs(result$temp - df$temp))
  expect_lt(max_err, 0.001 + 1e-10)
})

test_that("spatial without quantize round-trips integers", {
  nx <- 20; ny <- 20
  vals <- as.integer(rep(1:20, each = 20) + rep(1:20, 20))
  df <- data.frame(val = vals)
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast",
            spatial = list(nx = nx, ny = ny))
  result <- tbl(f) |> collect()
  expect_equal(result$val, as.double(df$val))
})

test_that("spatial produces smaller files on smooth data", {
  nx <- 100; ny <- 100
  set.seed(2)
  temp <- 10 + 0.05 * rep(1:nx, ny) + 0.03 * rep(1:ny, each = nx)
  df <- data.frame(temp = temp)

  f_plain <- tempfile(fileext = ".vtr")
  f_spatial <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f_plain, f_spatial)))

  write_vtr(df, f_plain, compress = "fast",
            quantize = list(temp = list(precision = 0.001, type = "int16")))
  write_vtr(df, f_spatial, compress = "fast",
            quantize = list(temp = list(precision = 0.001, type = "int16")),
            spatial = list(nx = nx, ny = ny))

  expect_lt(file.size(f_spatial), file.size(f_plain))
})

test_that("spatial + quantize + fast compression round-trips", {
  nx <- 30; ny <- 30
  set.seed(3)
  temp <- 15 + 0.1 * rep(1:nx, ny) + rnorm(nx * ny, 0, 0.2)
  df <- data.frame(temp = temp)
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast",
            quantize = list(temp = list(precision = 0.001, type = "int16")),
            spatial = list(nx = nx, ny = ny))
  result <- tbl(f) |> collect()
  max_err <- max(abs(result$temp - df$temp))
  expect_lt(max_err, 0.001 + 1e-10)
})

test_that("spatial requires nx*ny == n_rows", {
  nx <- 40; ny <- 40
  df <- data.frame(temp = rnorm(100))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  expect_error(
    write_vtr(df, f, compress = "fast",
              spatial = list(nx = nx, ny = ny)),
    "nx\\*ny"
  )
})

# -- DIFF encoding (auto-selected for slowly varying data) --------------------

test_that("slowly varying integer data round-trips (triggers DIFF encoding)", {
  # Monotonically increasing with small increments - should trigger DIFF
  df <- data.frame(ts = cumsum(sample(1:5, 1000, replace = TRUE)))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast")
  result <- tbl(f) |> collect()
  expect_equal(result$ts, as.double(df$ts))
})

test_that("slowly varying doubles round-trip (may trigger DIFF)", {
  set.seed(5)
  df <- data.frame(val = cumsum(rnorm(1000, 0, 0.01)))
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast")
  result <- tbl(f) |> collect()
  expect_equal(result$val, df$val)
})

# -- Combined pipeline: narrow + compress + filter ----------------------------

test_that("full pipeline: write compressed narrow int, then filter + collect", {
  df <- data.frame(
    id = 1:1000,
    category = sample(c(-1L, 0L, 1L), 1000, replace = TRUE),
    value = rnorm(1000)
  )
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast",
            col_types = c(category = "int8"))
  result <- tbl(f) |> filter(category == 1) |> collect()
  expected <- df[df$category == 1L, ]
  expect_equal(nrow(result), nrow(expected))
  expect_equal(result$value, expected$value)
})

test_that("full pipeline: quantize + compress + summarise", {
  set.seed(10)
  df <- data.frame(
    grp = rep(c("A", "B"), each = 500),
    val = c(rnorm(500, 10, 2), rnorm(500, 20, 2))
  )
  f <- tempfile(fileext = ".vtr")
  on.exit(unlink(f))
  write_vtr(df, f, compress = "fast",
            quantize = list(val = list(precision = 0.01, type = "int16")))

  result <- tbl(f) |>
    group_by(grp) |>
    summarise(avg = mean(val)) |>
    collect()

  # Means should be close to 10 and 20 (within quantization + sampling error)
  result <- result[order(result$grp), ]
  expect_equal(nrow(result), 2L)
  expect_lt(abs(result$avg[1] - 10), 0.5)
  expect_lt(abs(result$avg[2] - 20), 0.5)
})

Try the vectra package in your browser

Any scripts or data that you put into this service are public.

vectra documentation built on May 8, 2026, 9:06 a.m.