tests/testthat/test-joins.R

# Helper to create two tables for join tests
make_join_tables <- function() {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  write_vtr(data.frame(id = c(1, 2, 3), x = c(10, 20, 30)), f1)
  write_vtr(data.frame(id = c(1, 2, 4), y = c(100, 200, 400)), f2)
  list(f1 = f1, f2 = f2)
}

# --- inner_join ---

test_that("inner_join keeps only matching rows", {
  ft <- make_join_tables()
  on.exit(unlink(c(ft$f1, ft$f2)))
  result <- inner_join(tbl(ft$f1), tbl(ft$f2), by = "id") |> collect()
  expect_equal(nrow(result), 2)
  expect_equal(sort(result$id), c(1, 2))
  expect_equal(result$x[result$id == 1], 10)
  expect_equal(result$y[result$id == 1], 100)
})

# --- left_join ---

test_that("left_join keeps all left rows, NAs for unmatched", {
  ft <- make_join_tables()
  on.exit(unlink(c(ft$f1, ft$f2)))
  result <- left_join(tbl(ft$f1), tbl(ft$f2), by = "id") |> collect()
  expect_equal(nrow(result), 3)
  expect_equal(result$y[result$id == 1], 100)
  expect_equal(result$y[result$id == 2], 200)
  expect_true(is.na(result$y[result$id == 3]))
})

# --- semi_join ---

test_that("semi_join keeps left rows with matches, no right cols", {
  ft <- make_join_tables()
  on.exit(unlink(c(ft$f1, ft$f2)))
  result <- semi_join(tbl(ft$f1), tbl(ft$f2), by = "id") |> collect()
  expect_equal(nrow(result), 2)
  expect_equal(sort(result$id), c(1, 2))
  expect_equal(names(result), c("id", "x"))
})

# --- anti_join ---

test_that("anti_join keeps left rows without matches", {
  ft <- make_join_tables()
  on.exit(unlink(c(ft$f1, ft$f2)))
  result <- anti_join(tbl(ft$f1), tbl(ft$f2), by = "id") |> collect()
  expect_equal(nrow(result), 1)
  expect_equal(result$id, 3)
  expect_equal(result$x, 30)
})

# --- named by (different column names) ---

test_that("join with named by works", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  write_vtr(data.frame(a = c(1, 2, 3), x = c(10, 20, 30)), f1)
  write_vtr(data.frame(b = c(1, 2, 4), y = c(100, 200, 400)), f2)
  result <- inner_join(tbl(f1), tbl(f2), by = c("a" = "b")) |> collect()
  expect_equal(nrow(result), 2)
  expect_true("x" %in% names(result))
  expect_true("y" %in% names(result))
})

# --- natural join (no by) ---

test_that("natural join finds common columns", {
  ft <- make_join_tables()
  on.exit(unlink(c(ft$f1, ft$f2)))
  expect_message(
    result <- inner_join(tbl(ft$f1), tbl(ft$f2)) |> collect(),
    "Joining by"
  )
  expect_equal(nrow(result), 2)
})

# --- duplicate handling ---

test_that("inner_join with duplicates on right produces multiple rows", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  write_vtr(data.frame(id = c(1, 2), x = c(10, 20)), f1)
  write_vtr(data.frame(id = c(1, 1, 2), y = c(100, 101, 200)), f2)
  result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
  expect_equal(nrow(result), 3)
  expect_equal(sum(result$id == 1), 2)
})

# --- NA key handling ---

test_that("NAs in join keys do not match", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  write_vtr(data.frame(id = c(1, NA, 3), x = c(10, 20, 30)), f1)
  write_vtr(data.frame(id = c(1, NA, 4), y = c(100, 200, 400)), f2)
  result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
  expect_equal(nrow(result), 1)
  expect_equal(result$id, 1)
})

test_that("left_join with NA keys: unmatched NAs get NA right cols", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  write_vtr(data.frame(id = c(1, NA), x = c(10, 20)), f1)
  write_vtr(data.frame(id = c(1, NA), y = c(100, 200)), f2)
  result <- left_join(tbl(f1), tbl(f2), by = "id") |> collect()
  expect_equal(nrow(result), 2)
  expect_equal(result$y[which(result$id == 1)], 100)
  expect_true(is.na(result$y[which(is.na(result$id))]))
})

# --- suffix for name collisions ---

test_that("suffix disambiguates columns with same name", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  write_vtr(data.frame(id = c(1, 2), val = c(10, 20)), f1)
  write_vtr(data.frame(id = c(1, 2), val = c(100, 200)), f2)
  result <- left_join(tbl(f1), tbl(f2), by = "id") |> collect()
  expect_true("val.x" %in% names(result))
  expect_true("val.y" %in% names(result))
})

# --- string keys ---

test_that("join works with string keys", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  write_vtr(data.frame(name = c("alice", "bob"), x = c(1, 2),
                       stringsAsFactors = FALSE), f1)
  write_vtr(data.frame(name = c("bob", "carol"), y = c(20, 30),
                       stringsAsFactors = FALSE), f2)
  result <- inner_join(tbl(f1), tbl(f2), by = "name") |> collect()
  expect_equal(nrow(result), 1)
  expect_equal(result$name, "bob")
  expect_equal(result$y, 20)
})

# --- composite keys ---

test_that("join works with composite keys", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  write_vtr(data.frame(a = c(1, 1, 2), b = c("x", "y", "x"),
                       v1 = c(10, 20, 30), stringsAsFactors = FALSE), f1)
  write_vtr(data.frame(a = c(1, 2), b = c("x", "x"),
                       v2 = c(100, 300), stringsAsFactors = FALSE), f2)
  result <- inner_join(tbl(f1), tbl(f2), by = c("a", "b")) |> collect()
  expect_equal(nrow(result), 2)
  expect_equal(sort(result$v1), c(10, 30))
})

# --- join after filter ---

test_that("string-key join survives many unique keys", {
  # 200 unique string keys forces hash table to work hard.
  # Regression test for join key lifetime / string data integrity.
  set.seed(42)
  n_keys <- 200
  keys <- paste0("key_", seq_len(n_keys))
  left_df <- data.frame(
    id = rep(keys, each = 3),
    x = rnorm(n_keys * 3),
    stringsAsFactors = FALSE
  )
  right_df <- data.frame(
    id = keys,
    y = rnorm(n_keys),
    stringsAsFactors = FALSE
  )
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  write_vtr(left_df, f1)
  write_vtr(right_df, f2)

  # left_join: all left rows should have a match
  result <- tbl(f1) |> left_join(tbl(f2), by = "id") |> collect()
  expect_equal(nrow(result), nrow(left_df))
  expect_false(any(is.na(result$y)))

  # full_join with partial overlap
  partial_right <- data.frame(
    id = keys[1:100],
    z = rnorm(100),
    stringsAsFactors = FALSE
  )
  f3 <- tempfile(fileext = ".vtr")
  on.exit(unlink(f3), add = TRUE)
  write_vtr(partial_right, f3)
  result2 <- tbl(f1) |> full_join(tbl(f3), by = "id") |> collect()
  # All left rows present, no extra right-only rows (right keys are subset)
  expect_equal(nrow(result2), nrow(left_df))
  # Keys 101-200 have NA z
  unmatched <- result2[result2$id %in% keys[101:200], ]
  expect_true(all(is.na(unmatched$z)))

  # Chain a downstream op on the result to verify string integrity
  f4 <- tempfile(fileext = ".vtr")
  on.exit(unlink(f4), add = TRUE)
  write_vtr(result, f4)
  roundtrip <- tbl(f4) |> filter(y > 0) |> collect()
  expect_true(all(roundtrip$y > 0))
  expect_true(nrow(roundtrip) > 0)
})

# --- key type mismatch ---

test_that("join auto-coerces compatible key types (int64 vs double)", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  # integer keys on left, double keys on right -> should auto-coerce
  write_vtr(data.frame(id = c(1L, 2L, 3L), x = c(10, 20, 30)), f1)
  write_vtr(data.frame(id = c(1, 2, 4), y = c(100, 200, 400)), f2)
  result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
  expect_equal(nrow(result), 2)
  expect_equal(sort(result$id), c(1, 2))
})

test_that("join errors on key type mismatch (string vs double)", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  write_vtr(data.frame(id = c("a", "b"), x = c(10, 20),
                       stringsAsFactors = FALSE), f1)
  write_vtr(data.frame(id = c(1, 2), y = c(100, 200)), f2)
  expect_error(
    left_join(tbl(f1), tbl(f2), by = "id") |> collect(),
    "left_join key type mismatch: x\\.id \\(string\\) vs y\\.id \\(double\\)"
  )
})

test_that("composite key auto-coerces mixed numeric types", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  # First key (a) matches (both double), second key (b) int vs double
  write_vtr(data.frame(a = c(1, 2), b = c(10L, 20L), x = c(1, 2)), f1)
  write_vtr(data.frame(a = c(1, 2), b = c(10, 20), y = c(3, 4)), f2)
  result <- inner_join(tbl(f1), tbl(f2), by = c("a", "b")) |> collect()
  expect_equal(nrow(result), 2)
})

test_that("join works when both sides have same numeric type", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  # Both double
  write_vtr(data.frame(id = c(1, 2, 3), x = c(10, 20, 30)), f1)
  write_vtr(data.frame(id = c(1, 2, 4), y = c(100, 200, 400)), f2)
  result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
  expect_equal(nrow(result), 2)
  # Both integer
  write_vtr(data.frame(id = c(1L, 2L, 3L), x = c(10, 20, 30)), f1)
  write_vtr(data.frame(id = c(1L, 2L, 4L), y = c(100, 200, 400)), f2)
  result2 <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
  expect_equal(nrow(result2), 2)
})

# --- long string keys ---

test_that("join handles very long string keys", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  # Keys with 100k+ bytes each
  long_a <- paste(rep("a", 100000), collapse = "")
  long_b <- paste(rep("b", 100000), collapse = "")
  long_c <- paste(rep("c", 100000), collapse = "")
  write_vtr(data.frame(key = c(long_a, long_b, long_c),
                       x = c(1, 2, 3), stringsAsFactors = FALSE), f1)
  write_vtr(data.frame(key = c(long_b, long_c, "short"),
                       y = c(20, 30, 99), stringsAsFactors = FALSE), f2)
  result <- inner_join(tbl(f1), tbl(f2), by = "key") |> collect()
  expect_equal(nrow(result), 2)
  expect_equal(result$x[result$key == long_b], 2)
  expect_equal(result$y[result$key == long_c], 30)
})

test_that("join handles many distinct long string keys", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  n <- 500
  # Each key is ~1000 chars with a unique suffix
  base <- paste(rep("x", 1000), collapse = "")
  keys <- paste0(base, "_", seq_len(n))
  write_vtr(data.frame(key = keys, x = seq_len(n),
                       stringsAsFactors = FALSE), f1)
  write_vtr(data.frame(key = keys[1:250], y = seq_len(250),
                       stringsAsFactors = FALSE), f2)
  result <- left_join(tbl(f1), tbl(f2), by = "key") |> collect()
  expect_equal(nrow(result), n)
  expect_equal(sum(!is.na(result$y)), 250)
  expect_equal(sum(is.na(result$y)), 250)
})

# --- join after filter ---

# --- streaming probe tests (multi-batch left side) ---

test_that("streaming inner_join across multiple probe batches", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  # 100 left rows in 10 batches of 10 rows each (all integer)
  left_df <- data.frame(id = rep(1L:10L, each = 10L),
                        x = seq_len(100L))
  right_df <- data.frame(id = c(1L, 5L, 10L), y = c(10L, 50L, 100L))
  write_vtr(left_df, f1, batch_size = 10L)
  write_vtr(right_df, f2)
  result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
  expect_equal(nrow(result), 30)  # 3 matching ids * 10 rows each
  expect_equal(sort(unique(result$id)), c(1, 5, 10))
})

test_that("streaming left_join with unmatched rows across batches", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  left_df <- data.frame(id = 1L:50L, x = seq_len(50L))
  right_df <- data.frame(id = c(1L, 25L, 50L), y = c(100L, 250L, 500L))
  write_vtr(left_df, f1, batch_size = 10L)  # 5 batches
  write_vtr(right_df, f2)
  result <- left_join(tbl(f1), tbl(f2), by = "id") |> collect()
  expect_equal(nrow(result), 50)
  expect_equal(sum(!is.na(result$y)), 3)
  expect_equal(sum(is.na(result$y)), 47)
  expect_equal(result$y[result$id == 25], 250)
})

test_that("streaming semi_join across multiple batches", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  left_df <- data.frame(id = 1L:30L, x = seq_len(30L))
  right_df <- data.frame(id = c(5L, 15L, 25L), y = c(1L, 2L, 3L))
  write_vtr(left_df, f1, batch_size = 10L)  # 3 batches
  write_vtr(right_df, f2)
  result <- semi_join(tbl(f1), tbl(f2), by = "id") |> collect()
  expect_equal(nrow(result), 3)
  expect_equal(sort(result$id), c(5, 15, 25))
  expect_equal(names(result), c("id", "x"))
})

test_that("streaming anti_join across batches where some batches produce no output", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  # Batch 1 (ids 1-10): all matched -> 0 output rows
  # Batch 2 (ids 11-20): none matched -> 10 output rows
  left_df <- data.frame(id = 1L:20L, x = seq_len(20L))
  right_df <- data.frame(id = 1L:10L, y = seq_len(10L))
  write_vtr(left_df, f1, batch_size = 10L)
  write_vtr(right_df, f2)
  result <- anti_join(tbl(f1), tbl(f2), by = "id") |> collect()
  expect_equal(nrow(result), 10)
  expect_equal(sort(result$id), 11:20)
})

test_that("streaming full_join emits unmatched build rows after probe", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  left_df <- data.frame(id = 1L:20L, x = seq_len(20L))
  right_df <- data.frame(id = c(5L, 15L, 99L, 100L),
                         y = c(50L, 150L, 990L, 1000L))
  write_vtr(left_df, f1, batch_size = 10L)
  write_vtr(right_df, f2)
  result <- full_join(tbl(f1), tbl(f2), by = "id") |> collect()
  # 20 left rows + 2 unmatched right rows (99, 100)
  expect_equal(nrow(result), 22)
  # Unmatched right rows have NA x
  unmatched <- result[result$id %in% c(99, 100), ]
  expect_equal(nrow(unmatched), 2)
  expect_true(all(is.na(unmatched$x)))
  expect_equal(sort(unmatched$y), c(990, 1000))
})

test_that("streaming many-to-many join works across batches", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  # 30 left rows, 3 batches of 10 (all integer)
  # Right side has 5 rows per key for keys 1-3
  left_df <- data.frame(id = rep(1L:3L, each = 10L),
                        x = seq_len(30L))
  right_df <- data.frame(id = rep(1L:3L, each = 5L),
                         y = seq_len(15L))
  write_vtr(left_df, f1, batch_size = 10L)
  write_vtr(right_df, f2)
  result <- inner_join(tbl(f1), tbl(f2), by = "id") |> collect()
  # Each of 30 left rows matches 5 right rows -> 150
  expect_equal(nrow(result), 150)
})

# --- join after filter ---

test_that("join composes with filter", {
  f1 <- tempfile(fileext = ".vtr")
  f2 <- tempfile(fileext = ".vtr")
  on.exit(unlink(c(f1, f2)))
  write_vtr(data.frame(id = c(1, 2, 3), x = c(10, 20, 30)), f1)
  write_vtr(data.frame(id = c(1, 2, 3), y = c(100, 200, 300)), f2)
  result <- tbl(f1) |>
    filter(x > 10) |>
    inner_join(tbl(f2), by = "id") |>
    collect()
  expect_equal(nrow(result), 2)
  expect_true(all(result$x > 10))
})

Try the vectra package in your browser

Any scripts or data that you put into this service are public.

vectra documentation built on May 8, 2026, 9:06 a.m.