Nothing
# P3 + P4: tdc-backed row-group container — round-trip via vtr1_tdc.
#
# Exercises src/vtr1_tdc.c. Builds a multi-column data.frame, splits
# it into row groups via the writer, opens the resulting tdc container
# and reads it back, then verifies bit-identical equality column by
# column.
#
# String columns land in P4d via tdc_decode_block_varlen; the dedicated
# round-trip cases live at the bottom of this file (basic, NA, multi-
# rowgroup, unicode, varied length).
VTR_COMPRESS_NONE <- 0L
VTR_COMPRESS_FAST <- 1L
VTR_COMPRESS_SMALL <- 2L
vtr_tdc_roundtrip <- function(df, rowgroup_size, comp_level,
annotations = NULL) {
path <- tempfile(fileext = ".vtdc")
on.exit(unlink(path), add = TRUE)
.Call("C_write_vtr_tdc", path, df,
as.integer(rowgroup_size), as.integer(comp_level),
annotations,
PACKAGE = "vectra")
out <- .Call("C_read_vtr_tdc", path, PACKAGE = "vectra")
attr(out, "row.names") <- .set_row_names(length(out[[1]]))
class(out) <- "data.frame"
out
}
test_that("multi-column data.frame round-trips byte-exactly across comp levels", {
set.seed(1)
n <- 5000L
df <- data.frame(
x_dbl = rnorm(n, 100, 25),
x_int = sample.int(.Machine$integer.max, n) - 1L,
x_lgl = sample(c(TRUE, FALSE), n, replace = TRUE),
x_seq = as.double(seq_len(n)),
stringsAsFactors = FALSE
)
for (level in c(VTR_COMPRESS_NONE, VTR_COMPRESS_FAST, VTR_COMPRESS_SMALL)) {
for (rg in c(64L, 1024L, n)) {
rt <- vtr_tdc_roundtrip(df, rg, level)
expect_identical(names(rt), names(df),
info = sprintf("level=%d rg=%d", level, rg))
for (col in names(df)) {
expect_identical(rt[[col]], df[[col]],
info = sprintf("col=%s level=%d rg=%d",
col, level, rg))
}
}
}
})
test_that("single-rowgroup write matches the input exactly", {
df <- data.frame(
a = as.double(1:1024),
b = 1024:1L,
c = rep(c(TRUE, FALSE), 512),
stringsAsFactors = FALSE
)
rt <- vtr_tdc_roundtrip(df, 4096L, VTR_COMPRESS_FAST)
expect_identical(rt$a, df$a)
expect_identical(rt$b, df$b)
expect_identical(rt$c, df$c)
})
test_that("rowgroup size that does not divide n_rows works", {
df <- data.frame(
v = as.double(seq_len(2050)),
stringsAsFactors = FALSE
)
rt <- vtr_tdc_roundtrip(df, 256L, VTR_COMPRESS_FAST)
expect_identical(rt$v, df$v)
})
test_that("constant column compresses below raw and round-trips", {
df <- data.frame(
k = rep(3.14, 8192),
stringsAsFactors = FALSE
)
path <- tempfile(fileext = ".vtdc")
on.exit(unlink(path), add = TRUE)
.Call("C_write_vtr_tdc", path, df, 8192L, VTR_COMPRESS_FAST, NULL,
PACKAGE = "vectra")
raw_size <- length(df$k) * 8
expect_lt(file.info(path)$size, raw_size)
rt <- .Call("C_read_vtr_tdc", path, PACKAGE = "vectra")
expect_identical(rt$k, df$k)
})
# ----- P4a: per-column user annotations propagate end-to-end -----------------
test_that("user annotations round-trip through the schema slot", {
df <- data.frame(
a = as.double(1:32),
b = 1:32L,
c = rep(c(TRUE, FALSE), 16),
stringsAsFactors = FALSE
)
ann <- c("factor|low|mid|high", NA, "units=kg")
path <- tempfile(fileext = ".vtdc")
on.exit(unlink(path), add = TRUE)
.Call("C_write_vtr_tdc", path, df, 32L, VTR_COMPRESS_FAST, ann,
PACKAGE = "vectra")
back <- .Call("C_read_vtr_tdc_annotations", path, PACKAGE = "vectra")
expect_identical(back, ann)
# Empty-string annotation collapses to NA on the read side.
ann2 <- c("", "x", "")
.Call("C_write_vtr_tdc", path, df, 32L, VTR_COMPRESS_FAST, ann2,
PACKAGE = "vectra")
back2 <- .Call("C_read_vtr_tdc_annotations", path, PACKAGE = "vectra")
expect_identical(back2, c(NA_character_, "x", NA_character_))
})
test_that("annotation roundtrip survives multi-rowgroup writes", {
df <- data.frame(
x = as.double(1:1000),
y = rep(c(TRUE, FALSE), 500),
stringsAsFactors = FALSE
)
ann <- c("scale=0.5", "factor|TRUE|FALSE")
path <- tempfile(fileext = ".vtdc")
on.exit(unlink(path), add = TRUE)
.Call("C_write_vtr_tdc", path, df, 128L, VTR_COMPRESS_FAST, ann,
PACKAGE = "vectra")
back <- .Call("C_read_vtr_tdc_annotations", path, PACKAGE = "vectra")
expect_identical(back, ann)
# Body data still round-trips byte-exactly with annotations attached.
rt <- .Call("C_read_vtr_tdc", path, PACKAGE = "vectra")
expect_identical(rt$x, df$x)
expect_identical(rt$y, df$y)
})
# ----- P4a: per-column statistics round-trip ---------------------------------
test_that("stats round-trip min/max/null_count for double/int/bool", {
df <- data.frame(
d = c(1.5, 2.5, 3.5, 4.5, 5.5),
i = c(10L, 20L, 30L, 40L, 50L),
b = c(TRUE, TRUE, FALSE, TRUE, FALSE),
stringsAsFactors = FALSE
)
path <- tempfile(fileext = ".vtdc")
on.exit(unlink(path), add = TRUE)
.Call("C_write_vtr_tdc", path, df, 5L, VTR_COMPRESS_FAST, NULL,
PACKAGE = "vectra")
stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
expect_length(stats, 1L) # one rowgroup
expect_length(stats[[1]], 3L) # three columns
# double
v <- stats[[1]][[1]]
expect_equal(v[1], 1) # has_stats
expect_equal(v[2], 1.5) # min
expect_equal(v[3], 5.5) # max
expect_equal(v[4], 0) # null_count
# int (stored at int64 granularity in stats)
v <- stats[[1]][[2]]
expect_equal(v[1], 1)
expect_equal(v[2], 10)
expect_equal(v[3], 50)
expect_equal(v[4], 0)
# bool: layout is {has_false, has_true} in the {min, max} slots
v <- stats[[1]][[3]]
expect_equal(v[1], 1)
expect_equal(v[2], 1) # has_false
expect_equal(v[3], 1) # has_true
expect_equal(v[4], 0)
})
test_that("stats split correctly across multiple rowgroups", {
df <- data.frame(
x = c(seq(1, 100), seq(201, 300)) * 1.0, # rg1 in [1,100], rg2 in [201,300]
stringsAsFactors = FALSE
)
path <- tempfile(fileext = ".vtdc")
on.exit(unlink(path), add = TRUE)
.Call("C_write_vtr_tdc", path, df, 100L, VTR_COMPRESS_FAST, NULL,
PACKAGE = "vectra")
stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
expect_length(stats, 2L)
v1 <- stats[[1]][[1]]
expect_equal(v1[2], 1)
expect_equal(v1[3], 100)
v2 <- stats[[2]][[1]]
expect_equal(v2[2], 201)
expect_equal(v2[3], 300)
})
test_that("constant column produces equal min and max", {
df <- data.frame(
k = rep(42.0, 256),
stringsAsFactors = FALSE
)
path <- tempfile(fileext = ".vtdc")
on.exit(unlink(path), add = TRUE)
.Call("C_write_vtr_tdc", path, df, 256L, VTR_COMPRESS_FAST, NULL,
PACKAGE = "vectra")
stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
v <- stats[[1]][[1]]
expect_equal(v[1], 1)
expect_equal(v[2], 42)
expect_equal(v[3], 42)
})
test_that("bool-only-true column reports has_false=0", {
df <- data.frame(
b = rep(TRUE, 128),
stringsAsFactors = FALSE
)
path <- tempfile(fileext = ".vtdc")
on.exit(unlink(path), add = TRUE)
.Call("C_write_vtr_tdc", path, df, 128L, VTR_COMPRESS_FAST, NULL,
PACKAGE = "vectra")
stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
v <- stats[[1]][[1]]
expect_equal(v[1], 1)
expect_equal(v[2], 0) # has_false
expect_equal(v[3], 1) # has_true
})
# ----- P4b: NA round-trip via validity bitmap --------------------------------
test_that("NA values round-trip for double / int / logical, single rowgroup", {
df <- data.frame(
d = c(1.5, NA_real_, 3.5, NA_real_, 5.5),
i = c(NA_integer_, 20L, 30L, NA_integer_, 50L),
b = c(TRUE, NA, FALSE, NA, TRUE),
stringsAsFactors = FALSE
)
rt <- vtr_tdc_roundtrip(df, 5L, VTR_COMPRESS_FAST)
expect_identical(rt$d, df$d)
expect_identical(rt$i, df$i)
expect_identical(rt$b, df$b)
})
test_that("NA values round-trip across rowgroup boundaries", {
set.seed(42)
n <- 1000L
d <- rnorm(n)
d[sample.int(n, 200)] <- NA_real_
i <- sample.int(.Machine$integer.max, n) - 1L
i[sample.int(n, 150)] <- NA_integer_
b <- sample(c(TRUE, FALSE), n, replace = TRUE)
b[sample.int(n, 100)] <- NA
df <- data.frame(d = d, i = i, b = b, stringsAsFactors = FALSE)
for (rg in c(64L, 128L, 333L, n)) {
for (level in c(VTR_COMPRESS_NONE, VTR_COMPRESS_FAST,
VTR_COMPRESS_SMALL)) {
rt <- vtr_tdc_roundtrip(df, rg, level)
expect_identical(rt$d, df$d, info = sprintf("rg=%d lvl=%d", rg, level))
expect_identical(rt$i, df$i, info = sprintf("rg=%d lvl=%d", rg, level))
expect_identical(rt$b, df$b, info = sprintf("rg=%d lvl=%d", rg, level))
}
}
})
test_that("all-NA column round-trips correctly", {
df <- data.frame(
d = rep(NA_real_, 256),
i = rep(NA_integer_, 256),
b = rep(NA, 256),
stringsAsFactors = FALSE
)
rt <- vtr_tdc_roundtrip(df, 64L, VTR_COMPRESS_FAST)
expect_identical(rt$d, df$d)
expect_identical(rt$i, df$i)
expect_identical(rt$b, df$b)
})
test_that("NA at rowgroup boundary positions survives", {
# NAs deliberately placed at first / last index of each rowgroup.
rg <- 32L
n <- 128L
d <- as.double(seq_len(n))
edges <- c(1L, rg, rg + 1L, 2L * rg, 2L * rg + 1L, n)
d[edges] <- NA_real_
df <- data.frame(d = d, stringsAsFactors = FALSE)
rt <- vtr_tdc_roundtrip(df, rg, VTR_COMPRESS_FAST)
expect_identical(rt$d, df$d)
})
# ----- P4c: parallel reader correctness --------------------------------------
test_that("many-rowgroup file decodes correctly under parallel reader", {
# 200 rowgroups exercises OpenMP scheduling and per-thread FILE* handling.
set.seed(7)
n <- 20000L
df <- data.frame(
d = rnorm(n),
i = sample.int(.Machine$integer.max, n) - 1L,
b = sample(c(TRUE, FALSE), n, replace = TRUE),
stringsAsFactors = FALSE
)
# Salt with NAs every 73rd row across all three types.
na_idx <- seq.int(1L, n, by = 73L)
df$d[na_idx] <- NA_real_
df$i[na_idx] <- NA_integer_
df$b[na_idx] <- NA
rt <- vtr_tdc_roundtrip(df, 100L, VTR_COMPRESS_FAST) # 200 rowgroups
expect_identical(rt$d, df$d)
expect_identical(rt$i, df$i)
expect_identical(rt$b, df$b)
})
test_that("null_count stat reflects NA presence", {
df <- data.frame(
d = c(1.0, NA_real_, 3.0, NA_real_, 5.0),
i = c(NA_integer_, NA_integer_, 30L, 40L, 50L),
b = c(TRUE, NA, NA, NA, FALSE),
stringsAsFactors = FALSE
)
path <- tempfile(fileext = ".vtdc")
on.exit(unlink(path), add = TRUE)
.Call("C_write_vtr_tdc", path, df, 5L, VTR_COMPRESS_FAST, NULL,
PACKAGE = "vectra")
stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
expect_equal(stats[[1]][[1]][4], 2) # d: 2 NAs
expect_equal(stats[[1]][[2]][4], 2) # i: 2 NAs
expect_equal(stats[[1]][[3]][4], 3) # b: 3 NAs
# min/max ignore NAs.
expect_equal(stats[[1]][[1]][2], 1)
expect_equal(stats[[1]][[1]][3], 5)
expect_equal(stats[[1]][[2]][2], 30)
expect_equal(stats[[1]][[2]][3], 50)
})
# ---------------------------------------------------------------------------
# P4d — VEC_STRING round-trip
# ---------------------------------------------------------------------------
test_that("string column round-trips byte-exactly across rowgroup sizes", {
s <- c("alpha", "beta", "", "gamma", "delta", "epsilon", "zeta", "eta")
df <- data.frame(s = s, stringsAsFactors = FALSE)
for (rg in c(1L, 2L, 3L, 8L, 64L)) {
rt <- vtr_tdc_roundtrip(df, rg, VTR_COMPRESS_FAST)
expect_identical(rt$s, df$s, info = sprintf("rg=%d", rg))
}
})
test_that("string column with NA_character_ round-trips and counts nulls", {
df <- data.frame(
s = c("a", NA_character_, "", "longer string", NA_character_, "z"),
stringsAsFactors = FALSE
)
path <- tempfile(fileext = ".vtdc")
on.exit(unlink(path), add = TRUE)
.Call("C_write_vtr_tdc", path, df, 6L, VTR_COMPRESS_FAST, NULL,
PACKAGE = "vectra")
out <- .Call("C_read_vtr_tdc", path, PACKAGE = "vectra")
expect_identical(out$s, df$s)
stats <- .Call("C_read_vtr_tdc_stats", path, PACKAGE = "vectra")
expect_equal(stats[[1]][[1]][4], 2) # 2 NA strings
})
test_that("string column survives many rowgroups (boundary stress)", {
s <- sprintf("row-%05d", 0:999)
df <- data.frame(s = s, stringsAsFactors = FALSE)
rt <- vtr_tdc_roundtrip(df, 137L, VTR_COMPRESS_FAST) # 8 rowgroups
expect_identical(rt$s, df$s)
})
test_that("unicode and varied-length strings round-trip", {
df <- data.frame(
s = c("café", "naïve", "日本語", "emoji \U1F600",
strrep("x", 1000), "", "single"),
stringsAsFactors = FALSE
)
rt <- vtr_tdc_roundtrip(df, 3L, VTR_COMPRESS_FAST)
expect_identical(rt$s, df$s)
})
test_that("string columns mix with numeric/logical columns in one container", {
set.seed(7)
n <- 500L
df <- data.frame(
name = sprintf("item-%03d", seq_len(n)),
value = rnorm(n),
flag = sample(c(TRUE, FALSE), n, replace = TRUE),
stringsAsFactors = FALSE
)
rt <- vtr_tdc_roundtrip(df, 73L, VTR_COMPRESS_FAST)
expect_identical(rt$name, df$name)
expect_identical(rt$value, df$value)
expect_identical(rt$flag, df$flag)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.