tests/testthat/test-vtr1-tdc.R

# P3 + P4: tdc-backed row-group container — round-trip via vtr1_tdc.
#
# Exercises src/vtr1_tdc.c. Builds a multi-column data.frame, splits
# it into row groups via the writer, opens the resulting tdc container
# and reads it back, then verifies bit-identical equality column by
# column.
#
# String columns land in P4d via tdc_decode_block_varlen; the dedicated
# round-trip cases live at the bottom of this file (basic, NA, multi-
# rowgroup, unicode, varied length).

VTR_COMPRESS_NONE  <- 0L
VTR_COMPRESS_FAST  <- 1L
VTR_COMPRESS_SMALL <- 2L

vtr_tdc_roundtrip <- function(df, rowgroup_size, comp_level,
                              annotations = NULL) {
  path <- tempfile(fileext = ".vtdc")
  on.exit(unlink(path), add = TRUE)
  .Call("C_write_vtr_tdc", path, df,
        as.integer(rowgroup_size), as.integer(comp_level),
        annotations,
        PACKAGE = "vectra")
  out <- .Call("C_read_vtr_tdc", path, PACKAGE = "vectra")
  attr(out, "row.names") <- .set_row_names(length(out[[1]]))
  class(out) <- "data.frame"
  out
}

test_that("multi-column data.frame round-trips byte-exactly across comp levels", {
  set.seed(1)
  n <- 5000L
  df <- data.frame(
    x_dbl = rnorm(n, 100, 25),
    x_int = sample.int(.Machine$integer.max, n) - 1L,
    x_lgl = sample(c(TRUE, FALSE), n, replace = TRUE),
    x_seq = as.double(seq_len(n)),
    stringsAsFactors = FALSE
  )

  for (level in c(VTR_COMPRESS_NONE, VTR_COMPRESS_FAST, VTR_COMPRESS_SMALL)) {
    for (rg in c(64L, 1024L, n)) {
      rt <- vtr_tdc_roundtrip(df, rg, level)
      expect_identical(names(rt), names(df),
                       info = sprintf("level=%d rg=%d", level, rg))
      for (col in names(df)) {
        expect_identical(rt[[col]], df[[col]],
                         info = sprintf("col=%s level=%d rg=%d",
                                        col, level, rg))
      }
    }
  }
})

test_that("single-rowgroup write matches the input exactly", {
  df <- data.frame(
    a = as.double(1:1024),
    b = 1024:1L,
    c = rep(c(TRUE, FALSE), 512),
    stringsAsFactors = FALSE
  )
  rt <- vtr_tdc_roundtrip(df, 4096L, VTR_COMPRESS_FAST)
  expect_identical(rt$a, df$a)
  expect_identical(rt$b, df$b)
  expect_identical(rt$c, df$c)
})

test_that("rowgroup size that does not divide n_rows works", {
  df <- data.frame(
    v = as.double(seq_len(2050)),
    stringsAsFactors = FALSE
  )
  rt <- vtr_tdc_roundtrip(df, 256L, VTR_COMPRESS_FAST)
  expect_identical(rt$v, df$v)
})

test_that("constant column compresses below raw and round-trips", {
  df <- data.frame(
    k = rep(3.14, 8192),
    stringsAsFactors = FALSE
  )
  path <- tempfile(fileext = ".vtdc")
  on.exit(unlink(path), add = TRUE)
  .Call("C_write_vtr_tdc", path, df, 8192L, VTR_COMPRESS_FAST, NULL,
        PACKAGE = "vectra")
  raw_size <- length(df$k) * 8
  expect_lt(file.info(path)$size, raw_size)
  rt <- .Call("C_read_vtr_tdc", path, PACKAGE = "vectra")
  expect_identical(rt$k, df$k)
})

# ----- P4a: per-column user annotations propagate end-to-end -----------------

test_that("user annotations round-trip through the schema slot", {
  df <- data.frame(
    a = as.double(1:32),
    b = 1:32L,
    c = rep(c(TRUE, FALSE), 16),
    stringsAsFactors = FALSE
  )
  ann <- c("factor|low|mid|high", NA, "units=kg")

  path <- tempfile(fileext = ".vtdc")
  on.exit(unlink(path), add = TRUE)
  .Call("C_write_vtr_tdc", path, df, 32L, VTR_COMPRESS_FAST, ann,
        PACKAGE = "vectra")

  back <- .Call("C_read_vtr_tdc_annotations", path, PACKAGE = "vectra")
  expect_identical(back, ann)

  # Empty-string annotation collapses to NA on the read side.
  ann2 <- c("", "x", "")
  .Call("C_write_vtr_tdc", path, df, 32L, VTR_COMPRESS_FAST, ann2,
        PACKAGE = "vectra")
  back2 <- .Call("C_read_vtr_tdc_annotations", path, PACKAGE = "vectra")
  expect_identical(back2, c(NA_character_, "x", NA_character_))
})

test_that("annotation roundtrip survives multi-rowgroup writes", {
  df <- data.frame(
    x = as.double(1:1000),
    y = rep(c(TRUE, FALSE), 500),
    stringsAsFactors = FALSE
  )
  ann <- c("scale=0.5", "factor|TRUE|FALSE")
  path <- tempfile(fileext = ".vtdc")
  on.exit(unlink(path), add = TRUE)
  .Call("C_write_vtr_tdc", path, df, 128L, VTR_COMPRESS_FAST, ann,
        PACKAGE = "vectra")
  back <- .Call("C_read_vtr_tdc_annotations", path, PACKAGE = "vectra")
  expect_identical(back, ann)
  # Body data still round-trips byte-exactly with annotations attached.
  rt <- .Call("C_read_vtr_tdc", path, PACKAGE = "vectra")
  expect_identical(rt$x, df$x)
  expect_identical(rt$y, df$y)
})

# ----- P4a: per-column statistics round-trip ---------------------------------

test_that("stats round-trip min/max/null_count for double/int/bool", {
  df <- data.frame(
    d = c(1.5, 2.5, 3.5, 4.5, 5.5),
    i = c(10L, 20L, 30L, 40L, 50L),
    b = c(TRUE, TRUE, FALSE, TRUE, FALSE),
    stringsAsFactors = FALSE
  )
  path <- tempfile(fileext = ".vtdc")
  on.exit(unlink(path), add = TRUE)
  .Call("C_write_vtr_tdc", path, df, 5L, VTR_COMPRESS_FAST, NULL,
        PACKAGE = "vectra")

  stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
  expect_length(stats, 1L)        # one rowgroup
  expect_length(stats[[1]], 3L)   # three columns

  # double
  v <- stats[[1]][[1]]
  expect_equal(v[1], 1)             # has_stats
  expect_equal(v[2], 1.5)           # min
  expect_equal(v[3], 5.5)           # max
  expect_equal(v[4], 0)             # null_count

  # int (stored at int64 granularity in stats)
  v <- stats[[1]][[2]]
  expect_equal(v[1], 1)
  expect_equal(v[2], 10)
  expect_equal(v[3], 50)
  expect_equal(v[4], 0)

  # bool: layout is {has_false, has_true} in the {min, max} slots
  v <- stats[[1]][[3]]
  expect_equal(v[1], 1)
  expect_equal(v[2], 1)             # has_false
  expect_equal(v[3], 1)             # has_true
  expect_equal(v[4], 0)
})

test_that("stats split correctly across multiple rowgroups", {
  df <- data.frame(
    x = c(seq(1, 100), seq(201, 300)) * 1.0,  # rg1 in [1,100], rg2 in [201,300]
    stringsAsFactors = FALSE
  )
  path <- tempfile(fileext = ".vtdc")
  on.exit(unlink(path), add = TRUE)
  .Call("C_write_vtr_tdc", path, df, 100L, VTR_COMPRESS_FAST, NULL,
        PACKAGE = "vectra")

  stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
  expect_length(stats, 2L)

  v1 <- stats[[1]][[1]]
  expect_equal(v1[2], 1)
  expect_equal(v1[3], 100)

  v2 <- stats[[2]][[1]]
  expect_equal(v2[2], 201)
  expect_equal(v2[3], 300)
})

test_that("constant column produces equal min and max", {
  df <- data.frame(
    k = rep(42.0, 256),
    stringsAsFactors = FALSE
  )
  path <- tempfile(fileext = ".vtdc")
  on.exit(unlink(path), add = TRUE)
  .Call("C_write_vtr_tdc", path, df, 256L, VTR_COMPRESS_FAST, NULL,
        PACKAGE = "vectra")
  stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
  v <- stats[[1]][[1]]
  expect_equal(v[1], 1)
  expect_equal(v[2], 42)
  expect_equal(v[3], 42)
})

test_that("bool-only-true column reports has_false=0", {
  df <- data.frame(
    b = rep(TRUE, 128),
    stringsAsFactors = FALSE
  )
  path <- tempfile(fileext = ".vtdc")
  on.exit(unlink(path), add = TRUE)
  .Call("C_write_vtr_tdc", path, df, 128L, VTR_COMPRESS_FAST, NULL,
        PACKAGE = "vectra")
  stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
  v <- stats[[1]][[1]]
  expect_equal(v[1], 1)
  expect_equal(v[2], 0)             # has_false
  expect_equal(v[3], 1)             # has_true
})

# ----- P4b: NA round-trip via validity bitmap --------------------------------

test_that("NA values round-trip for double / int / logical, single rowgroup", {
  df <- data.frame(
    d = c(1.5, NA_real_, 3.5, NA_real_, 5.5),
    i = c(NA_integer_, 20L, 30L, NA_integer_, 50L),
    b = c(TRUE, NA, FALSE, NA, TRUE),
    stringsAsFactors = FALSE
  )
  rt <- vtr_tdc_roundtrip(df, 5L, VTR_COMPRESS_FAST)
  expect_identical(rt$d, df$d)
  expect_identical(rt$i, df$i)
  expect_identical(rt$b, df$b)
})

test_that("NA values round-trip across rowgroup boundaries", {
  set.seed(42)
  n <- 1000L
  d <- rnorm(n)
  d[sample.int(n, 200)] <- NA_real_
  i <- sample.int(.Machine$integer.max, n) - 1L
  i[sample.int(n, 150)] <- NA_integer_
  b <- sample(c(TRUE, FALSE), n, replace = TRUE)
  b[sample.int(n, 100)] <- NA
  df <- data.frame(d = d, i = i, b = b, stringsAsFactors = FALSE)

  for (rg in c(64L, 128L, 333L, n)) {
    for (level in c(VTR_COMPRESS_NONE, VTR_COMPRESS_FAST,
                    VTR_COMPRESS_SMALL)) {
      rt <- vtr_tdc_roundtrip(df, rg, level)
      expect_identical(rt$d, df$d, info = sprintf("rg=%d lvl=%d", rg, level))
      expect_identical(rt$i, df$i, info = sprintf("rg=%d lvl=%d", rg, level))
      expect_identical(rt$b, df$b, info = sprintf("rg=%d lvl=%d", rg, level))
    }
  }
})

test_that("all-NA column round-trips correctly", {
  df <- data.frame(
    d = rep(NA_real_, 256),
    i = rep(NA_integer_, 256),
    b = rep(NA, 256),
    stringsAsFactors = FALSE
  )
  rt <- vtr_tdc_roundtrip(df, 64L, VTR_COMPRESS_FAST)
  expect_identical(rt$d, df$d)
  expect_identical(rt$i, df$i)
  expect_identical(rt$b, df$b)
})

test_that("NA at rowgroup boundary positions survives", {
  # NAs deliberately placed at first / last index of each rowgroup.
  rg <- 32L
  n  <- 128L
  d <- as.double(seq_len(n))
  edges <- c(1L, rg, rg + 1L, 2L * rg, 2L * rg + 1L, n)
  d[edges] <- NA_real_
  df <- data.frame(d = d, stringsAsFactors = FALSE)
  rt <- vtr_tdc_roundtrip(df, rg, VTR_COMPRESS_FAST)
  expect_identical(rt$d, df$d)
})

# ----- P4c: parallel reader correctness --------------------------------------

test_that("many-rowgroup file decodes correctly under parallel reader", {
  # 200 rowgroups exercises OpenMP scheduling and per-thread FILE* handling.
  set.seed(7)
  n <- 20000L
  df <- data.frame(
    d = rnorm(n),
    i = sample.int(.Machine$integer.max, n) - 1L,
    b = sample(c(TRUE, FALSE), n, replace = TRUE),
    stringsAsFactors = FALSE
  )
  # Salt with NAs every 73rd row across all three types.
  na_idx <- seq.int(1L, n, by = 73L)
  df$d[na_idx] <- NA_real_
  df$i[na_idx] <- NA_integer_
  df$b[na_idx] <- NA

  rt <- vtr_tdc_roundtrip(df, 100L, VTR_COMPRESS_FAST)  # 200 rowgroups
  expect_identical(rt$d, df$d)
  expect_identical(rt$i, df$i)
  expect_identical(rt$b, df$b)
})

test_that("null_count stat reflects NA presence", {
  df <- data.frame(
    d = c(1.0, NA_real_, 3.0, NA_real_, 5.0),
    i = c(NA_integer_, NA_integer_, 30L, 40L, 50L),
    b = c(TRUE, NA, NA, NA, FALSE),
    stringsAsFactors = FALSE
  )
  path <- tempfile(fileext = ".vtdc")
  on.exit(unlink(path), add = TRUE)
  .Call("C_write_vtr_tdc", path, df, 5L, VTR_COMPRESS_FAST, NULL,
        PACKAGE = "vectra")
  stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
  expect_equal(stats[[1]][[1]][4], 2)  # d: 2 NAs
  expect_equal(stats[[1]][[2]][4], 2)  # i: 2 NAs
  expect_equal(stats[[1]][[3]][4], 3)  # b: 3 NAs

  # min/max ignore NAs.
  expect_equal(stats[[1]][[1]][2], 1)
  expect_equal(stats[[1]][[1]][3], 5)
  expect_equal(stats[[1]][[2]][2], 30)
  expect_equal(stats[[1]][[2]][3], 50)
})

# ---------------------------------------------------------------------------
# P4d — VEC_STRING round-trip
# ---------------------------------------------------------------------------

test_that("string column round-trips byte-exactly across rowgroup sizes", {
  s <- c("alpha", "beta", "", "gamma", "delta", "epsilon", "zeta", "eta")
  df <- data.frame(s = s, stringsAsFactors = FALSE)
  for (rg in c(1L, 2L, 3L, 8L, 64L)) {
    rt <- vtr_tdc_roundtrip(df, rg, VTR_COMPRESS_FAST)
    expect_identical(rt$s, df$s, info = sprintf("rg=%d", rg))
  }
})

test_that("string column with NA_character_ round-trips and counts nulls", {
  df <- data.frame(
    s = c("a", NA_character_, "", "longer string", NA_character_, "z"),
    stringsAsFactors = FALSE
  )
  path <- tempfile(fileext = ".vtdc")
  on.exit(unlink(path), add = TRUE)
  .Call("C_write_vtr_tdc", path, df, 6L, VTR_COMPRESS_FAST, NULL,
        PACKAGE = "vectra")
  out <- .Call("C_read_vtr_tdc", path, PACKAGE = "vectra")
  expect_identical(out$s, df$s)

  stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
  expect_equal(stats[[1]][[1]][4], 2)  # 2 NA strings
})

test_that("string column survives many rowgroups (boundary stress)", {
  s <- sprintf("row-%05d", 0:999)
  df <- data.frame(s = s, stringsAsFactors = FALSE)
  rt <- vtr_tdc_roundtrip(df, 137L, VTR_COMPRESS_FAST)  # 8 rowgroups
  expect_identical(rt$s, df$s)
})

test_that("unicode and varied-length strings round-trip", {
  df <- data.frame(
    s = c("café", "naïve", "日本語", "emoji \U1F600",
          strrep("x", 1000), "", "single"),
    stringsAsFactors = FALSE
  )
  rt <- vtr_tdc_roundtrip(df, 3L, VTR_COMPRESS_FAST)
  expect_identical(rt$s, df$s)
})

test_that("string columns mix with numeric/logical columns in one container", {
  set.seed(7)
  n <- 500L
  df <- data.frame(
    name  = sprintf("item-%03d", seq_len(n)),
    value = rnorm(n),
    flag  = sample(c(TRUE, FALSE), n, replace = TRUE),
    stringsAsFactors = FALSE
  )
  rt <- vtr_tdc_roundtrip(df, 73L, VTR_COMPRESS_FAST)
  expect_identical(rt$name,  df$name)
  expect_identical(rt$value, df$value)
  expect_identical(rt$flag,  df$flag)
})

Try the vectra package in your browser

Any scripts or data that you put into this service are public.

vectra documentation built on May 8, 2026, 9:06 a.m.