Nothing
# Helper to create two tables for join tests
make_join_tables <- function() {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
write_vtr(data.frame(id = c(1, 2, 3), x = c(10, 20, 30)), f1)
write_vtr(data.frame(id = c(1, 2, 4), y = c(100, 200, 400)), f2)
list(f1 = f1, f2 = f2)
}
# --- inner_join ---
test_that("inner_join keeps only matching rows", {
ft <- make_join_tables()
on.exit(unlink(c(ft$f1, ft$f2)))
result <- inner_join(tbl(ft$f1), tbl(ft$f2), by = "id") |> collect()
expect_equal(nrow(result), 2)
expect_equal(sort(result$id), c(1, 2))
expect_equal(result$x[result$id == 1], 10)
expect_equal(result$y[result$id == 1], 100)
})
# --- left_join ---
test_that("left_join keeps all left rows, NAs for unmatched", {
ft <- make_join_tables()
on.exit(unlink(c(ft$f1, ft$f2)))
result <- left_join(tbl(ft$f1), tbl(ft$f2), by = "id") |> collect()
expect_equal(nrow(result), 3)
expect_equal(result$y[result$id == 1], 100)
expect_equal(result$y[result$id == 2], 200)
expect_true(is.na(result$y[result$id == 3]))
})
# --- semi_join ---
test_that("semi_join keeps left rows with matches, no right cols", {
ft <- make_join_tables()
on.exit(unlink(c(ft$f1, ft$f2)))
result <- semi_join(tbl(ft$f1), tbl(ft$f2), by = "id") |> collect()
expect_equal(nrow(result), 2)
expect_equal(sort(result$id), c(1, 2))
expect_equal(names(result), c("id", "x"))
})
# --- anti_join ---
test_that("anti_join keeps left rows without matches", {
ft <- make_join_tables()
on.exit(unlink(c(ft$f1, ft$f2)))
result <- anti_join(tbl(ft$f1), tbl(ft$f2), by = "id") |> collect()
expect_equal(nrow(result), 1)
expect_equal(result$id, 3)
expect_equal(result$x, 30)
})
# --- named by (different column names) ---
test_that("join with named by works", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
write_vtr(data.frame(a = c(1, 2, 3), x = c(10, 20, 30)), f1)
write_vtr(data.frame(b = c(1, 2, 4), y = c(100, 200, 400)), f2)
result <- inner_join(tbl(f1), tbl(f2), by = c("a" = "b")) |> collect()
expect_equal(nrow(result), 2)
expect_true("x" %in% names(result))
expect_true("y" %in% names(result))
})
# --- natural join (no by) ---
test_that("natural join finds common columns", {
ft <- make_join_tables()
on.exit(unlink(c(ft$f1, ft$f2)))
expect_message(
result <- inner_join(tbl(ft$f1), tbl(ft$f2)) |> collect(),
"Joining by"
)
expect_equal(nrow(result), 2)
})
# --- duplicate handling ---
test_that("inner_join with duplicates on right produces multiple rows", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
write_vtr(data.frame(id = c(1, 2), x = c(10, 20)), f1)
write_vtr(data.frame(id = c(1, 1, 2), y = c(100, 101, 200)), f2)
result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
expect_equal(nrow(result), 3)
expect_equal(sum(result$id == 1), 2)
})
# --- NA key handling ---
test_that("NAs in join keys do not match", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
write_vtr(data.frame(id = c(1, NA, 3), x = c(10, 20, 30)), f1)
write_vtr(data.frame(id = c(1, NA, 4), y = c(100, 200, 400)), f2)
result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
expect_equal(nrow(result), 1)
expect_equal(result$id, 1)
})
test_that("left_join with NA keys: unmatched NAs get NA right cols", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
write_vtr(data.frame(id = c(1, NA), x = c(10, 20)), f1)
write_vtr(data.frame(id = c(1, NA), y = c(100, 200)), f2)
result <- left_join(tbl(f1), tbl(f2), by = "id") |> collect()
expect_equal(nrow(result), 2)
expect_equal(result$y[which(result$id == 1)], 100)
expect_true(is.na(result$y[which(is.na(result$id))]))
})
# --- suffix for name collisions ---
test_that("suffix disambiguates columns with same name", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
write_vtr(data.frame(id = c(1, 2), val = c(10, 20)), f1)
write_vtr(data.frame(id = c(1, 2), val = c(100, 200)), f2)
result <- left_join(tbl(f1), tbl(f2), by = "id") |> collect()
expect_true("val.x" %in% names(result))
expect_true("val.y" %in% names(result))
})
# --- string keys ---
test_that("join works with string keys", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
write_vtr(data.frame(name = c("alice", "bob"), x = c(1, 2),
stringsAsFactors = FALSE), f1)
write_vtr(data.frame(name = c("bob", "carol"), y = c(20, 30),
stringsAsFactors = FALSE), f2)
result <- inner_join(tbl(f1), tbl(f2), by = "name") |> collect()
expect_equal(nrow(result), 1)
expect_equal(result$name, "bob")
expect_equal(result$y, 20)
})
# --- composite keys ---
test_that("join works with composite keys", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
write_vtr(data.frame(a = c(1, 1, 2), b = c("x", "y", "x"),
v1 = c(10, 20, 30), stringsAsFactors = FALSE), f1)
write_vtr(data.frame(a = c(1, 2), b = c("x", "x"),
v2 = c(100, 300), stringsAsFactors = FALSE), f2)
result <- inner_join(tbl(f1), tbl(f2), by = c("a", "b")) |> collect()
expect_equal(nrow(result), 2)
expect_equal(sort(result$v1), c(10, 30))
})
# --- join after filter ---
test_that("string-key join survives many unique keys", {
# 200 unique string keys forces hash table to work hard.
# Regression test for join key lifetime / string data integrity.
set.seed(42)
n_keys <- 200
keys <- paste0("key_", seq_len(n_keys))
left_df <- data.frame(
id = rep(keys, each = 3),
x = rnorm(n_keys * 3),
stringsAsFactors = FALSE
)
right_df <- data.frame(
id = keys,
y = rnorm(n_keys),
stringsAsFactors = FALSE
)
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
write_vtr(left_df, f1)
write_vtr(right_df, f2)
# left_join: all left rows should have a match
result <- tbl(f1) |> left_join(tbl(f2), by = "id") |> collect()
expect_equal(nrow(result), nrow(left_df))
expect_false(any(is.na(result$y)))
# full_join with partial overlap
partial_right <- data.frame(
id = keys[1:100],
z = rnorm(100),
stringsAsFactors = FALSE
)
f3 <- tempfile(fileext = ".vtr")
on.exit(unlink(f3), add = TRUE)
write_vtr(partial_right, f3)
result2 <- tbl(f1) |> full_join(tbl(f3), by = "id") |> collect()
# All left rows present, no extra right-only rows (right keys are subset)
expect_equal(nrow(result2), nrow(left_df))
# Keys 101-200 have NA z
unmatched <- result2[result2$id %in% keys[101:200], ]
expect_true(all(is.na(unmatched$z)))
# Chain a downstream op on the result to verify string integrity
f4 <- tempfile(fileext = ".vtr")
on.exit(unlink(f4), add = TRUE)
write_vtr(result, f4)
roundtrip <- tbl(f4) |> filter(y > 0) |> collect()
expect_true(all(roundtrip$y > 0))
expect_true(nrow(roundtrip) > 0)
})
# --- key type mismatch ---
test_that("join auto-coerces compatible key types (int64 vs double)", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
# integer keys on left, double keys on right -> should auto-coerce
write_vtr(data.frame(id = c(1L, 2L, 3L), x = c(10, 20, 30)), f1)
write_vtr(data.frame(id = c(1, 2, 4), y = c(100, 200, 400)), f2)
result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
expect_equal(nrow(result), 2)
expect_equal(sort(result$id), c(1, 2))
})
test_that("join errors on key type mismatch (string vs double)", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
write_vtr(data.frame(id = c("a", "b"), x = c(10, 20),
stringsAsFactors = FALSE), f1)
write_vtr(data.frame(id = c(1, 2), y = c(100, 200)), f2)
expect_error(
left_join(tbl(f1), tbl(f2), by = "id") |> collect(),
"left_join key type mismatch: x\\.id \\(string\\) vs y\\.id \\(double\\)"
)
})
test_that("composite key auto-coerces mixed numeric types", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
# First key (a) matches (both double), second key (b) int vs double
write_vtr(data.frame(a = c(1, 2), b = c(10L, 20L), x = c(1, 2)), f1)
write_vtr(data.frame(a = c(1, 2), b = c(10, 20), y = c(3, 4)), f2)
result <- inner_join(tbl(f1), tbl(f2), by = c("a", "b")) |> collect()
expect_equal(nrow(result), 2)
})
test_that("join works when both sides have same numeric type", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
# Both double
write_vtr(data.frame(id = c(1, 2, 3), x = c(10, 20, 30)), f1)
write_vtr(data.frame(id = c(1, 2, 4), y = c(100, 200, 400)), f2)
result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
expect_equal(nrow(result), 2)
# Both integer
write_vtr(data.frame(id = c(1L, 2L, 3L), x = c(10, 20, 30)), f1)
write_vtr(data.frame(id = c(1L, 2L, 4L), y = c(100, 200, 400)), f2)
result2 <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
expect_equal(nrow(result2), 2)
})
# --- long string keys ---
test_that("join handles very long string keys", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
# Keys with 100k+ bytes each
long_a <- paste(rep("a", 100000), collapse = "")
long_b <- paste(rep("b", 100000), collapse = "")
long_c <- paste(rep("c", 100000), collapse = "")
write_vtr(data.frame(key = c(long_a, long_b, long_c),
x = c(1, 2, 3), stringsAsFactors = FALSE), f1)
write_vtr(data.frame(key = c(long_b, long_c, "short"),
y = c(20, 30, 99), stringsAsFactors = FALSE), f2)
result <- inner_join(tbl(f1), tbl(f2), by = "key") |> collect()
expect_equal(nrow(result), 2)
expect_equal(result$x[result$key == long_b], 2)
expect_equal(result$y[result$key == long_c], 30)
})
test_that("join handles many distinct long string keys", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
n <- 500
# Each key is ~1000 chars with a unique suffix
base <- paste(rep("x", 1000), collapse = "")
keys <- paste0(base, "_", seq_len(n))
write_vtr(data.frame(key = keys, x = seq_len(n),
stringsAsFactors = FALSE), f1)
write_vtr(data.frame(key = keys[1:250], y = seq_len(250),
stringsAsFactors = FALSE), f2)
result <- left_join(tbl(f1), tbl(f2), by = "key") |> collect()
expect_equal(nrow(result), n)
expect_equal(sum(!is.na(result$y)), 250)
expect_equal(sum(is.na(result$y)), 250)
})
# --- join after filter ---
# --- streaming probe tests (multi-batch left side) ---
test_that("streaming inner_join across multiple probe batches", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
# 100 left rows in 10 batches of 10 rows each (all integer)
left_df <- data.frame(id = rep(1L:10L, each = 10L),
x = seq_len(100L))
right_df <- data.frame(id = c(1L, 5L, 10L), y = c(10L, 50L, 100L))
write_vtr(left_df, f1, batch_size = 10L)
write_vtr(right_df, f2)
result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
expect_equal(nrow(result), 30) # 3 matching ids * 10 rows each
expect_equal(sort(unique(result$id)), c(1, 5, 10))
})
test_that("streaming left_join with unmatched rows across batches", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
left_df <- data.frame(id = 1L:50L, x = seq_len(50L))
right_df <- data.frame(id = c(1L, 25L, 50L), y = c(100L, 250L, 500L))
write_vtr(left_df, f1, batch_size = 10L) # 5 batches
write_vtr(right_df, f2)
result <- left_join(tbl(f1), tbl(f2), by = "id") |> collect()
expect_equal(nrow(result), 50)
expect_equal(sum(!is.na(result$y)), 3)
expect_equal(sum(is.na(result$y)), 47)
expect_equal(result$y[result$id == 25], 250)
})
test_that("streaming semi_join across multiple batches", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
left_df <- data.frame(id = 1L:30L, x = seq_len(30L))
right_df <- data.frame(id = c(5L, 15L, 25L), y = c(1L, 2L, 3L))
write_vtr(left_df, f1, batch_size = 10L) # 3 batches
write_vtr(right_df, f2)
result <- semi_join(tbl(f1), tbl(f2), by = "id") |> collect()
expect_equal(nrow(result), 3)
expect_equal(sort(result$id), c(5, 15, 25))
expect_equal(names(result), c("id", "x"))
})
test_that("streaming anti_join across batches where some batches produce no output", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
# Batch 1 (ids 1-10): all matched -> 0 output rows
# Batch 2 (ids 11-20): none matched -> 10 output rows
left_df <- data.frame(id = 1L:20L, x = seq_len(20L))
right_df <- data.frame(id = 1L:10L, y = seq_len(10L))
write_vtr(left_df, f1, batch_size = 10L)
write_vtr(right_df, f2)
result <- anti_join(tbl(f1), tbl(f2), by = "id") |> collect()
expect_equal(nrow(result), 10)
expect_equal(sort(result$id), 11:20)
})
test_that("streaming full_join emits unmatched build rows after probe", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
left_df <- data.frame(id = 1L:20L, x = seq_len(20L))
right_df <- data.frame(id = c(5L, 15L, 99L, 100L),
y = c(50L, 150L, 990L, 1000L))
write_vtr(left_df, f1, batch_size = 10L)
write_vtr(right_df, f2)
result <- full_join(tbl(f1), tbl(f2), by = "id") |> collect()
# 20 left rows + 2 unmatched right rows (99, 100)
expect_equal(nrow(result), 22)
# Unmatched right rows have NA x
unmatched <- result[result$id %in% c(99, 100), ]
expect_equal(nrow(unmatched), 2)
expect_true(all(is.na(unmatched$x)))
expect_equal(sort(unmatched$y), c(990, 1000))
})
test_that("streaming many-to-many join works across batches", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
# 30 left rows, 3 batches of 10 (all integer)
# Right side has 5 rows per key for keys 1-3
left_df <- data.frame(id = rep(1L:3L, each = 10L),
x = seq_len(30L))
right_df <- data.frame(id = rep(1L:3L, each = 5L),
y = seq_len(15L))
write_vtr(left_df, f1, batch_size = 10L)
write_vtr(right_df, f2)
result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
# Each of 30 left rows matches 5 right rows -> 150
expect_equal(nrow(result), 150)
})
# --- join after filter ---
test_that("join composes with filter", {
f1 <- tempfile(fileext = ".vtr")
f2 <- tempfile(fileext = ".vtr")
on.exit(unlink(c(f1, f2)))
write_vtr(data.frame(id = c(1, 2, 3), x = c(10, 20, 30)), f1)
write_vtr(data.frame(id = c(1, 2, 3), y = c(100, 200, 300)), f2)
result <- tbl(f1) |>
filter(x > 10) |>
inner_join(tbl(f2), by = "id") |>
collect()
expect_equal(nrow(result), 2)
expect_true(all(result$x > 10))
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.