tests/testthat/test-relational.R

# Run this file with testthat::test_local(filter = "^relational$")

con <- dbConnect(duckdb())
on.exit(dbDisconnect(con, shutdown = TRUE))

test_that("we can create a relation from a df", {
  rel <- rel_from_df(con, mtcars)
  expect_type(rel, "externalptr")
  expect_s3_class(rel, "duckdb_relation")
})

test_that("we don't add optional quotes to columns", {
  df1 <- data.frame(a.b = 1)
  rel3 <- rel_from_df(con, df1)
  rel4 <- rel_project(rel3, list(expr_reference("a.b")))
  rel5 <- rel_to_altrep(rel4)
  expect_equal(df1, rel5)
})

test_that("we won't crash when creating a relation from odd things", {
  # na seems to be fine, single col data frame with a single row with NA in it
  rel_from_df(con, NA)

  expect_error(rel_from_df(NULL, NULL))
  expect_error(rel_from_df(con, NULL))

  expect_true(TRUE)
})

test_that("we can round-trip a data frame", {
  expect_equivalent(mtcars, as.data.frame.duckdb_relation(rel_from_df(con, mtcars)))
  expect_equivalent(iris, as.data.frame.duckdb_relation(rel_from_df(con, iris)))
})

test_that("we can recognize if a df is materialized", {
  expect_error(df_is_materialized(data.frame(a = "x")))
  expect_error(df_is_materialized(data.frame(a = 1:2)))

  rel <- rel_from_df(con, data.frame(a = 1:3))
  df <- rel_to_altrep(rel)
  expect_false(df_is_materialized(df))
  # Side effect: materialization
  expect_equal(nrow(df), 3)
  expect_true(df_is_materialized(df))
})


test_that("we can create various expressions and don't crash", {
  ref <- expr_reference("asdf")
  print(ref)
  expect_error(expr_reference(NA))
  #  expect_error(expr_reference(as.character(NA)))
  expect_error(expr_reference(""))
  expect_error(expr_reference(NULL))

  expr_constant(TRUE)
  expr_constant(FALSE)
  expr_constant(NA)
  expr_constant(42L)
  expr_constant(42)
  const <- expr_constant("asdf")
  print(const)

  expect_error(expr_constant(NULL))
  expect_error(expr_constant())


  expr_function("asdf", list())

  expect_error(expr_function("", list()))
  #  expect_error(expr_function(as.character(NA), list()))
  expect_error(expr_function(NULL, list()))
  expect_error(expr_function("asdf"))

  expect_true(TRUE)
})


# TODO should maybe be a different file, test_enum_strings.R

test_that("we can cast R strings to DuckDB strings", {
  chars <- c(letters, LETTERS)
  max_len <- 100
  n <- 100000

  # yay R one-liners
  gen_rand_string <- function(x, max_len) paste0(chars[sample(1:length(chars), runif(1) * max_len, replace = TRUE)], collapse = "")

  test_string_vec <- c(vapply(1:n, gen_rand_string, "character", max_len), NA, NA, NA, NA, NA, NA, NA, NA) # batman

  df <- data.frame(s = test_string_vec, stringsAsFactors = FALSE)
  expect_equivalent(df, as.data.frame.duckdb_relation(rel_from_df(con, df)))

  res <- rel_sql(
    rel_from_df(con, df),
    "SELECT s::string FROM _"
  )
  expect_equivalent(df, res)

  res <- rel_sql(
    rel_from_df(con, df),
    "SELECT COUNT(*) c FROM _"
  )
  expect_equal(nrow(df), res$c)

  # many rounds yay
  df2 <- df
  for (i in 1:10) {
    df2 <- as.data.frame.duckdb_relation(rel_from_df(con, df2))
    expect_equivalent(df, df2)
  }

  df2 <- df
  for (i in 1:10) {
    df2 <- as.data.frame(
      rel_sql(
        rel_from_df(con, df2),
        "SELECT s::string s FROM _"
      )
    )
    expect_equivalent(df, df2)
  }
})

test_that("the altrep-conversion for relations works", {
  iris$Species <- as.character(iris$Species)
  rel <- rel_from_df(con, iris)
  df <- rel_to_altrep(rel)
  expect_false(df_is_materialized(df))
  inspect_output <- capture.output(.Internal(inspect(df)))
  expect_true(any(grepl("DUCKDB_ALTREP_REL_VECTOR", inspect_output, fixed = TRUE)))
  expect_true(any(grepl("DUCKDB_ALTREP_REL_ROWNAMES", inspect_output, fixed = TRUE)))
  expect_false(df_is_materialized(df))
  dim(df)
  expect_true(df_is_materialized(df))
  expect_equal(iris, df)
})

test_that("the altrep-conversion for relations work for weirdo types", {
  test_df <- data.frame(col_date = as.Date("2019-11-26"), col_ts = as.POSIXct("2019-11-26 21:11Z", "UTC"), col_factor = factor(c("a")))
  rel <- rel_from_df(con, test_df)
  df <- rel_to_altrep(rel)
  expect_false(df_is_materialized(df))
  expect_equal(test_df, df)
})

test_that("we can get the relation object back from an altrep df", {
  iris$Species <- as.character(iris$Species)
  rel <- rel_from_df(con, iris)
  df <- rel_to_altrep(rel)
  rel2 <- rel_from_altrep_df(df)
  expect_true(TRUE)
})

test_that("rel_order() sorts NAs last", {
  test_df <- rel_from_df(con, data.frame(a = c(NA, 1:3)))

  orders <- list(expr_reference("a"))

  rel <- rel_order(test_df, orders)
  rel_df <- rel_to_altrep(rel)
  expect_false(df_is_materialized(rel_df))

  expected_result <- data.frame(a = c(1:3, NA))
  expect_equal(rel_df, expected_result)
})

test_that("Inner join returns all inner relations", {
  dbExecute(con, "CREATE OR REPLACE MACRO eq(a, b) AS a = b")
  left <- rel_from_df(con, data.frame(left_a = c(1, 2, 3), left_b = c(1, 1, 2)))
  right <- rel_from_df(con, data.frame(right_b = c(1, 3), right_c = c(4, 5)))
  cond <- list(expr_function("eq", list(expr_reference("left_b"), expr_reference("right_b"))))
  rel2 <- rel_join(left, right, cond, "inner")
  rel_df <- rel_to_altrep(rel2)
  dim(rel_df)
  expected_result <- data.frame(left_a = c(1, 2), left_b = c(1, 1), right_b = c(1, 1), right_c = c(4, 4))
  expect_equal(rel_df, expected_result)
})

test_that("Left join returns all left relations", {
  dbExecute(con, "CREATE OR REPLACE MACRO eq(a, b) AS a = b")
  left <- rel_from_df(con, data.frame(left_a = c(1, 2, 3), left_b = c(1, 1, 2)))
  right <- rel_from_df(con, data.frame(right_b = c(1)))
  cond <- list(expr_function("eq", list(expr_reference("left_b"), expr_reference("right_b"))))
  rel2 <- rel_join(left, right, cond, "left")
  rel_df <- rel_to_altrep(rel2)
  dim(rel_df)
  expected_result <- data.frame(left_a = c(1, 2, 3), left_b = c(1, 1, 2), right_b = c(1, 1, NA))
  expect_equal(rel_df, expected_result)
})

test_that("Right join returns all right relations", {
  dbExecute(con, "CREATE OR REPLACE MACRO eq(a, b) AS a = b")
  left <- rel_from_df(con, data.frame(left_b = c(1)))
  right <- rel_from_df(con, data.frame(right_a = c(1, 2, 3), right_b = c(1, 1, 2)))
  cond <- list(expr_function("eq", list(expr_reference("left_b"), expr_reference("right_b"))))
  rel2 <- rel_join(left, right, cond, "right")
  rel_df <- rel_to_altrep(rel2)
  dim(rel_df)
  expected_result <- data.frame(left_b = c(1, 1, NA), right_a = c(1, 2, 3), right_b = c(1, 1, 2))
  expect_equal(rel_df, expected_result)
})

test_that("Full join returns all outer relations", {
  dbExecute(con, "CREATE OR REPLACE MACRO eq(a, b) AS a = b")
  left <- rel_from_df(con, data.frame(left_a = c(1, 2, 5), left_b = c(4, 5, 6)))
  right <- rel_from_df(con, data.frame(right_a = c(1, 2, 3), right_b = c(1, 1, 2)))
  cond <- list(expr_function("eq", list(expr_reference("left_a"), expr_reference("right_a"))))
  rel2 <- rel_join(left, right, cond, "outer")
  rel_df <- rel_to_altrep(rel2)

  expected_result <- data.frame(
    left_a = c(1, 2, 5, NA),
    left_b = c(4, 5, 6, NA),
    right_a = c(1, 2, NA, 3),
    right_b = c(1, 1, NA, 2)
  )

  rel_df_sorted <- rel_df[order(rel_df$left_a), ]
  rownames(rel_df_sorted) <- NULL
  expect_equal(rel_df_sorted, expected_result)
})

test_that("cross join works", {
  left <- rel_from_df(con, data.frame(left_a = c(1, 2, 3), left_b = c(1, 1, 2)))
  right <- rel_from_df(con, data.frame(right_a = c(1, 4, 5), right_b = c(7, 8, 9)))
  cross <- rel_join(left, right, list(), "cross")
  order_by <- rel_order(cross, list(expr_reference("right_a"), expr_reference("right_a")))
  rel_df <- rel_to_altrep(order_by)
  dim(rel_df)
  expected_result <- data.frame(
    left_a = c(1, 2, 3, 1, 2, 3, 1, 2, 3),
    left_b = c(1, 1, 2, 1, 1, 2, 1, 1, 2),
    right_a = c(1, 1, 1, 4, 4, 4, 5, 5, 5),
    right_b = c(7, 7, 7, 8, 8, 8, 9, 9, 9)
  )
  expect_equal(rel_df, expected_result)
})

test_that("semi join works", {
  left <- rel_from_df(con, data.frame(left_b = c(1, 5, 6)))
  right <- rel_from_df(con, data.frame(right_a = c(1, 2, 3), right_b = c(1, 1, 2)))
  cond <- list(expr_function("eq", list(expr_reference("left_b"), expr_reference("right_a"))))
  # select * from left semi join right on (left_b = right_a)
  rel2 <- rel_join(left, right, cond, "semi")
  rel_df <- rel_to_altrep(rel2)
  dim(rel_df)
  expected_result <- data.frame(left_b = c(1))
  expect_equal(rel_df, expected_result)

})

test_that("anti join works", {
  left <- rel_from_df(con, data.frame(left_b = c(1, 5, 6)))
  right <- rel_from_df(con, data.frame(right_a = c(1, 2, 3), right_b = c(1, 1, 2)))
  cond <- list(expr_function("eq", list(expr_reference("left_b"), expr_reference("right_a"))))
  # select * from left anti join right on (left_b = right_a)
  rel2 <- rel_join(left, right, cond, "anti")
  rel_df <- rel_to_altrep(rel2)
  dim(rel_df)
  expected_result <- data.frame(left_b = c(5, 6))
  expect_equal(rel_df, expected_result)
})

test_that("Union all does not immediately materialize", {
  test_df_a <- rel_from_df(con, data.frame(a = c("1", "2"), b = c("3", "4")))
  test_df_b <- rel_from_df(con, data.frame(a = c("5", "6"), b = c("7", "8")))
  rel <- rel_union_all(test_df_a, test_df_b)
  rel_df <- rel_to_altrep(rel)
  expect_false(df_is_materialized(rel_df))
  dim(rel_df)
  expect_true(df_is_materialized(rel_df))
})

test_that("Union all has the correct values", {
  test_df_a <- rel_from_df(con, data.frame(a = c("1", "2"), b = c("3", "4")))
  test_df_b <- rel_from_df(con, data.frame(a = c("5", "6"), b = c("7", "8")))
  rel <- rel_union_all(test_df_a, test_df_b)
  order_by_rel <- rel_order(rel, list(expr_reference("a"), expr_reference("b")))
  rel_df <- rel_to_altrep(order_by_rel)
  expect_false(df_is_materialized(rel_df))
  dim(rel_df)
  expect_true(df_is_materialized(rel_df))
  expected_result <- data.frame(a = c("1", "2", "5", "6"), b = c("3", "4", "7", "8"))
  expect_equal(rel_df, expected_result)
})

test_that("Union all keeps duplicates", {
  test_df_a2 <- rel_from_df(con, data.frame(a = c("1", "2"), b = c("3", "4")))
  test_df_b2 <- rel_from_df(con, data.frame(a = c("1", "2"), b = c("3", "4")))
  rel <- rel_union_all(test_df_a2, test_df_b2)
  order_by_rel <- rel_order(rel, list(expr_reference("a"), expr_reference("b")))
  rel_df <- rel_to_altrep(order_by_rel)
  dim(rel_df)
  expect_true(df_is_materialized(rel_df))
  expected_result <- data.frame(a = c("1", "1", "2", "2"), b = c("3", "3", "4", "4"))
  expect_equal(rel_df, expected_result)
})

test_that("Inner join returns all inner relations", {
  dbExecute(con, "CREATE OR REPLACE MACRO eq(a, b) AS a = b")
  left <- rel_from_df(con, data.frame(left_a = c(1, 2, 3), left_b = c(1, 1, 2)))
  right <- rel_from_df(con, data.frame(right_b = c(1, 3), right_c = c(4, 5)))
  cond <- list(expr_function("eq", list(expr_reference("left_b"), expr_reference("right_b"))))
  rel2 <- rel_join(left, right, cond, "inner")
  rel_df <- rel_to_altrep(rel2)
  dim(rel_df)
  expected_result <- data.frame(left_a = c(1, 2), left_b = c(1, 1), right_b = c(1, 1), right_c = c(4, 4))
  expect_equal(rel_df, expected_result)
})

test_that("ASOF join works", {
  dbExecute(con, "CREATE OR REPLACE MACRO gte(a, b) AS a >= b")
  test_df1 <- rel_from_df(con, data.frame(ts = c(1, 2, 3, 4, 5, 6, 7, 8, 9)))
  test_df2 <- rel_from_df(con, data.frame(event_ts = c(1, 3, 6, 8), event_id = c(0, 1, 2, 3)))
  cond <- list(expr_function("gte", list(expr_reference("ts"), expr_reference("event_ts"))))
  rel <- rel_join(test_df1, test_df2, cond, join_ref_type = "asof")
  rel_proj <- rel_project(rel, list(expr_reference("ts"), expr_reference("event_id")))
  rel_df <- rel_to_altrep(rel_proj)
  expected_result <- data.frame(ts = c(1, 2, 3, 4, 5, 6, 7, 8, 9), event_id = c(0, 0, 1, 1, 1, 2, 2, 3, 3))
  expect_equal(expected_result, rel_df)
})

test_that("LEFT ASOF join works", {
  dbExecute(con, "CREATE OR REPLACE MACRO gte(a, b) AS a >= b")
  test_df1 <- rel_from_df(con, data.frame(ts = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)))
  test_df2 <- rel_from_df(con, data.frame(event_ts = c(2, 4, 6, 8), event_id = c(0, 1, 2, 3)))
  cond <- list(expr_function("gte", list(expr_reference("ts"), expr_reference("event_ts"))))
  rel <- rel_join(test_df1, test_df2, cond, join = "left", join_ref_type = "asof")
  rel_proj <- rel_project(rel, list(expr_reference("ts"), expr_reference("event_ts"), expr_reference("event_id")))
  order <- rel_order(rel_proj, list(expr_reference("ts")))
  rel_df <- rel_to_altrep(order)
  expected_result <- data.frame(ts = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), event_ts = c(NA, NA, 2, 2, 4, 4, 6, 6, 8, 8), event_id = c(NA, NA, 0, 0, 1, 1, 2, 2, 3, 3))
  expect_equal(expected_result, rel_df)
})

test_that("Positional cross join works", {
  test_df1 <- rel_from_df(con, data.frame(a = c(11, 12, 13), b = c(1, 2, 3)))
  test_df2 <- rel_from_df(con, data.frame(c = c(11, 12), d = c(1, 2)))
  rel <- rel_join(test_df1, test_df2, list(), join = "cross", join_ref_type = "positional")
  rel_df <- rel_to_altrep(rel)
  expected_result <- data.frame(a = c(11, 12, 13), b = c(1, 2, 3), c = c(11, 12, NA), d = c(1, 2, NA))
  expect_equal(expected_result, rel_df)
})

test_that("regular positional join works", {
  dbExecute(con, "CREATE OR REPLACE MACRO eq(a, b) AS a = b")
  test_df1 <- rel_from_df(con, data.frame(a = c(11, 12, 13), b = c(1, 2, 3)))
  test_df2 <- rel_from_df(con, data.frame(c = c(11, 12, 14, 11), d = c(4, 5, 6, 8)))
  cond <- expr_function("eq", list(expr_reference("a"), expr_reference("c")))
  rel <- rel_join(test_df1, test_df2, list(cond), join_ref_type = "positional")
  rel_df <- rel_to_altrep(rel)
  expected_result <- data.frame(a = c(11, 12), b = c(1, 2), c = c(11, 12), d = c(4, 5))
  expect_equal(expected_result, rel_df)
})

test_that("Invalid asof join condition throws error", {
  dbExecute(con, "CREATE OR REPLACE MACRO neq(a, b) AS a <> b")
  test_df1 <- rel_from_df(con, data.frame(ts = c(1, 2, 3, 4, 5, 6, 7, 8, 9)))
  test_df2 <- rel_from_df(con, data.frame(begin = c(1, 3, 6, 8), value = c(0, 1, 2, 3)))
  cond <- list(expr_function("neq", list(expr_reference("ts"), expr_reference("begin"))))
  expect_error(rel_join(test_df1, test_df2, cond, join_ref_type = "asof"), "Binder")
})

test_that("multiple inequality conditions for asof join throws error", {
  dbExecute(con, "CREATE OR REPLACE MACRO gte(a, b) AS a >= b")
  test_df1 <- rel_from_df(con, data.frame(ts = c(1, 2, 3, 4, 5, 6, 7, 8, 9)))
  test_df2 <- rel_from_df(con, data.frame(begin = c(1, 3, 6, 8), value = c(0, 1, 2, 3)))
  cond1 <- expr_function("gte", list(expr_reference("ts"), expr_reference("begin")))
  cond2 <- expr_function("gte", list(expr_reference("ts"), expr_reference("value")))
  conds <- list(cond1, cond2)
  expect_error(rel_join(test_df1, test_df2, conds, join_ref_type = "asof"), "Binder")
})


test_that("Inequality joins work", {
  dbExecute(con, "CREATE OR REPLACE MACRO gte(a, b) AS a >= b")
  timing_df <- rel_from_df(con, data.frame(ts = c(1, 2, 3, 4, 5, 6)))
  events_df <- rel_from_df(con, data.frame(event_ts = c(1, 3, 6, 8), event_id = c(0, 1, 2, 3)))
  cond <- list(expr_function("gte", list(expr_reference("ts"), expr_reference("event_ts"))))
  rel <- rel_inner_join(timing_df, events_df, cond)
  rel_proj <- rel_project(rel, list(expr_reference("ts"), expr_reference("event_ts")))
  rel_order <- rel_order(rel_proj, list(expr_reference("ts"), expr_reference("event_ts")))
  rel_df <- rel_to_altrep(rel_order)
  expected_result <- data.frame(ts = c(1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6), event_ts = c(1, 1, 1, 3, 1, 3, 1, 3, 1, 3, 6))
  expect_equal(expected_result, rel_df)
})


test_that("Inequality join works to perform between operation", {
  dbExecute(con, "CREATE OR REPLACE MACRO gt(a, b) AS a > b")
  dbExecute(con, "CREATE OR REPLACE MACRO lt(a, b) AS a < b")
  timing_df <- rel_from_df(con, data.frame(ts = c(1, 2, 3, 4, 5, 6, 7, 8, 9)))
  events_df <- rel_from_df(con, data.frame(event_ts = c(1, 3, 6, 8), event_id = c(0, 1, 2, 3)))
  lead <- expr_function("lead", list(expr_reference("event_ts")))
  window_lead <- expr_window(lead, offset_expr = expr_constant(1))
  expr_set_alias(window_lead, "lead")
  proj_window <- rel_project(events_df, list(expr_reference("event_ts"), window_lead, expr_reference("event_id")))
  cond1 <- expr_function("gt", list(expr_reference("ts"), expr_reference("event_ts")))
  cond2 <- expr_function("lt", list(expr_reference("ts"), expr_reference("lead")))
  conds <- list(cond1, cond2)
  rel <- rel_inner_join(timing_df, proj_window, conds)
  rel_proj <- rel_project(rel, list(expr_reference("ts")))
  rel_order <- rel_order(rel_proj, list(expr_reference("ts")))
  rel_df <- rel_to_altrep(rel_order)
  expected_result <- data.frame(ts = c(2, 4, 5, 7))
  expect_equal(expected_result, rel_df)
})


# nobody should do this in reality. It's a pretty dumb idea
test_that("we can union the same relation to itself", {
  test_df_a2 <- rel_from_df(con, data.frame(a = c("1", "2"), b = c("3", "4")))
  rel <- rel_union_all(test_df_a2, test_df_a2)
  rel_df <- rel_to_altrep(rel)
  expected_result <- data.frame(a = c("1", "2", "1", "2"), b = c("3", "4", "3", "4"))
  expect_equal(rel_df, expected_result)
})

test_that("we throw an error when attempting to union all relations that are not compatible", {
  test_df_a2 <- rel_from_df(con, data.frame(a = c("1", "2"), b = c("3", "4")))
  test_df_b2 <- rel_from_df(con, data.frame(a = c("1", "2"), b = c("3", "4"), c = c("5", "6")))
  # The two data frames have different dimensions, therefore you get a binding error.
  expect_error(rel_union_all(test_df_a2, test_df_b2), "Binder")
})

test_that("A union with different column types casts to the richer type", {
  test_df_a1 <- rel_from_df(con, data.frame(a = c(1)))
  test_df_a2 <- rel_from_df(con, data.frame(a = c("1")))
  rel <- rel_union_all(test_df_a1, test_df_a2)
  res <- rapi_rel_to_df(rel)
  expected <- data.frame(a = c("1.0", "1"))
  expect_equal(class(res$a), class(expected$a))
  expect_equal(res$a[1], expected$a[1])
  expect_equal(res$a[2], expected$a[2])
})

test_that("Set Intersect returns set intersection", {
  test_df_a <- rel_from_df(con, data.frame(a = c(1, 2), b = c(3, 4)))
  test_df_b <- rel_from_df(con, data.frame(a = c(1, 6), b = c(3, 8)))
  rel <- rel_set_intersect(test_df_a, test_df_b)
  ordered_rel <- rel_order(rel, list(expr_reference("a")))
  rel_df <- rel_to_altrep(rel)
  expect_false(df_is_materialized(rel_df))
  dim(rel_df)
  expected_result <- data.frame(a = c(1), b = c(3))
  expect_equal(rel_df, expected_result)
})

test_that("Set intersect casts columns to the richer type", {
  # df1 is Integer
  df1 <- data.frame(x = 1:4)
  # df2 has Double
  df2 <- data.frame(x = 4)
  expect_equal(class(df1$x), "integer")
  expect_equal(class(df2$x), "numeric")
  out <- rel_set_intersect(
    rel_from_df(con, df1),
    rel_from_df(con, df2)
  )
  out_df <- rapi_rel_to_df(out)
  expect_equal(class(out_df$x), "numeric")
})

test_that("Set Diff returns the set difference", {
  test_df_a <- rel_from_df(con, data.frame(a = c(1, 2), b = c(3, 4)))
  test_df_b <- rel_from_df(con, data.frame(a = c(1, 6), b = c(3, 8)))
  rel <- rel_set_diff(test_df_a, test_df_b)
  rel_df <- rel_to_altrep(rel)
  expect_false(df_is_materialized(rel_df))
  dim(rel_df)
  expected_result <- data.frame(a = c(2), b = c(4))
  expect_equal(rel_df, expected_result)
})

test_that("Symmetric difference returns the symmetric difference", {
  test_df_a <- rel_from_df(con, data.frame(a = c(1, 2), b = c(3, 4)))
  test_df_b <- rel_from_df(con, data.frame(a = c(1, 6), b = c(3, 8)))
  rel <- rel_set_symdiff(test_df_a, test_df_b)
  ordered_rel <- rel_order(rel, list(expr_reference("a"), expr_reference("b")))
  rel_df <- rel_to_altrep(ordered_rel)
  expect_false(df_is_materialized(rel_df))
  dim(rel_df)
  expected_result <- data.frame(a = c(2, 6), b = c(4, 8))
  expect_equal(rel_df, expected_result)
})

test_that("rel aggregate with no groups but a sum over a column, sums the column", {
  rel_a <- rel_from_df(con, data.frame(a = c(1, 2), b = c(3, 4)))
  aggrs <- list(sum = expr_function("sum", list(expr_reference("a"))))
  res <- rel_aggregate(rel_a, list(), aggrs)
  rel_df <- rel_to_altrep(res)
  expected_result <- data.frame(sum = c(3))
  expect_equal(rel_df, expected_result)
})

test_that("rel aggregate with groups and aggregate function works", {
  rel_a <- rel_from_df(con, data.frame(a = c(1, 2, 5, 5), b = c(3, 3, 4, 4)))
  aggrs <- list(sum = expr_function("sum", list(expr_reference("a"))))
  rel_b <- rel_aggregate(rel_a, list(expr_reference("b")), aggrs)
  res <- rel_order(rel_b, list(expr_reference("b")))
  rel_df <- rel_to_altrep(res)
  expected_result <- data.frame(b = c(3, 4), sum = c(3, 10))
  expect_equal(rel_df, expected_result)
})

test_that("Window sum expression function test works", {
  #   select j, i, sum(i) over (partition by j) from a order by 1,2
  rel_a <- rel_from_df(con, data.frame(a = c(1:8), b = c(1, 1, 2, 2, 3, 3, 4, 4)))
  sum_func <- expr_function("sum", list(expr_reference("a")))
  aggrs <- expr_window(sum_func, partitions = list(expr_reference("b")))
  expr_set_alias(aggrs, "window_result")
  window_proj <- rel_project(rel_a, list(expr_reference("a"), aggrs))
  order_over_window <- rel_order(window_proj, list(expr_reference("window_result")))
  res <- rel_to_altrep(order_over_window)
  expected_result <- data.frame(a = c(1:8), window_result = c(3, 3, 7, 7, 11, 11, 15, 15))
  expect_equal(res, expected_result)
})

test_that("Window count function works", {
  #   select a, b, count(b) over (partition by a) from a order by a
  rel_a <- rel_from_df(con, data.frame(a = c(1:8), b = c(1, 1, 2, 2, 3, 3, 4, 4)))
  count_func <- expr_function("count", list(expr_reference("a")))
  count <- expr_window(count_func, partitions = list(expr_reference("b")))
  expr_set_alias(count, "window_result")
  window_proj <- rel_project(rel_a, list(count))
  res <- rel_to_altrep(window_proj)
  expected_result <- data.frame(window_result = c(2, 2, 2, 2, 2, 2, 2, 2))
  expect_equal(res, expected_result)
})

test_that("Window avg function works", {
  #     select a, b, avg(b) over (partition by a) from a order by a
  rel_a <- rel_from_df(con, data.frame(a = c(1:8), b = c(1, 1, 2, 2, 3, 3, 4, 4)))
  avg_func <- expr_function("avg", list(expr_reference("a")))
  avg_window <- expr_window(avg_func, partitions = list(expr_reference("b")))
  expr_set_alias(avg_window, "window_result")
  window_proj <- rel_project(rel_a, list(avg_window))
  ordered <- rel_order(window_proj, list(expr_reference("window_result")))
  res <- rel_to_altrep(ordered)
  expected_result <- data.frame(window_result = c(1.5, 1.5, 3.5, 3.5, 5.5, 5.5, 7.5, 7.5))
  expect_equal(res, expected_result)
})

test_that("Window sum with Partition, order, and window boundaries works", {
  #     SUM(x) OVER (partition by b ROWS BETWEEN 3 PRECEDING AND CURRENT ROW)
  rel_a <- rel_from_df(con, data.frame(a = c(1:8), b = c(1, 1, 1, 1, 2, 2, 2, 2)))
  partitions <- list(expr_reference("b"))
  order_by_a <- list(rel_order(rel_a, list(expr_reference("a"))))
  sum_func <- expr_function("sum", list(expr_reference("a")))
  sum_window <- expr_window(sum_func,
    partitions = partitions,
    order_bys = list(expr_reference("a")),
    window_boundary_start = "expr_preceding_rows",
    window_boundary_end = "current_row_rows",
    start_expr = expr_constant(2)
  )
  expr_set_alias(sum_window, "window_result")
  window_proj <- rel_project(rel_a, list(expr_reference("a"), sum_window))
  proj_order <- rel_order(window_proj, list(expr_reference("a")))
  res <- rel_to_altrep(proj_order)
  expected_result <- data.frame(a = c(1:8), window_result = c(1, 3, 6, 9, 5, 11, 18, 21))
  expect_equal(res, expected_result)
})

test_that("Window boundaries boundaries are CaSe INsenSItive", {
  #     SUM(x) OVER (partition by b ROWS BETWEEN 3 PRECEDING AND CURRENT ROW)
  rel_a <- rel_from_df(con, data.frame(a = c(1:8), b = c(1, 1, 1, 1, 2, 2, 2, 2)))
  partitions <- list(expr_reference("b"))
  order_by_a <- list(rel_order(rel_a, list(expr_reference("a"))))
  sum_func <- expr_function("sum", list(expr_reference("a")))
  sum_window <- expr_window(sum_func,
    partitions = partitions,
    order_bys = list(expr_reference("a")),
    window_boundary_start = "exPr_PREceding_rOWs",
    window_boundary_end = "cURrEnt_rOw_RoWs",
    start_expr = expr_constant(2)
  )
  expr_set_alias(sum_window, "window_result")
  window_proj <- rel_project(rel_a, list(expr_reference("a"), sum_window))
  proj_order <- rel_order(window_proj, list(expr_reference("a")))
  res <- rel_to_altrep(proj_order)
  expected_result <- data.frame(a = c(1:8), window_result = c(1, 3, 6, 9, 5, 11, 18, 21))
  expect_equal(res, expected_result)
})

test_that("Window avg with a filter expression and partition works", {
  #   select a, b, avg(a) FILTER (WHERE x % 2 = 0) over (partition by b)
  DBI::dbExecute(con, "CREATE OR REPLACE MACRO mod(a, b) as a % b")
  DBI::dbExecute(con, "CREATE OR REPLACE MACRO eq(a, b) as a = b")
  rel_a <- rel_from_df(con, data.frame(a = c(1:8), b = c(1, 1, 2, 2, 3, 3, 4, 4)))
  partitions <- list(expr_reference("b"))
  mod_function <- expr_function("mod", list(expr_reference("a"), expr_constant(2)))
  zero <- expr_constant(0)
  filters <- list(expr_function("eq", list(zero, mod_function)))
  avg_func <- expr_function("avg", args = list(expr_reference("a")), filter_bys = filters)
  avg_filter_window <- expr_window(avg_func, partitions = partitions)
  expr_set_alias(avg_filter_window, "avg_filter")
  window_proj <- rel_project(rel_a, list(avg_filter_window))
  proj_order <- rel_order(window_proj, list(expr_reference("avg_filter")))
  expected_result <- data.frame(avg_filter = c(2, 2, 4, 4, 6, 6, 8, 8))
  res <- rel_to_altrep(proj_order)
  expect_equal(res, expected_result)
})

test_that("Window lag function works as expected", {
  #   select a, b, lag(a, 1) OVER () order by a
  rel_a <- rel_from_df(con, data.frame(a = c(1:8), b = c(1, 1, 2, 2, 3, 3, 4, 4)))
  lag <- expr_function("lag", list(expr_reference("a")))
  window_lag <- expr_window(lag, offset_expr = expr_constant(1))
  expr_set_alias(window_lag, "lag")
  proj_window <- rel_project(rel_a, list(expr_reference("a"), window_lag))
  order_over_window <- rel_order(proj_window, list(expr_reference("a")))
  expected_result <- data.frame(a = c(1:8), lag = c(NA, 1, 2, 3, 4, 5, 6, 7))
  res <- rel_to_altrep(order_over_window)
  expect_equal(res, expected_result)
})


test_that("function name for window is case insensitive", {
  #   select a, b, lag(a, 1) OVER () order by a
  rel_a <- rel_from_df(con, data.frame(a = c(1:8), b = c(1, 1, 2, 2, 3, 3, 4, 4)))
  lag <- expr_function("LAG", list(expr_reference("a")))
  window_lag <- expr_window(lag, offset_expr = expr_constant(1))
  expr_set_alias(window_lag, "lag")
  proj_window <- rel_project(rel_a, list(expr_reference("a"), window_lag))
  order_over_window <- rel_order(proj_window, list(expr_reference("a")))
  expected_result <- data.frame(a = c(1:8), lag = c(NA, 1, 2, 3, 4, 5, 6, 7))
  res <- rel_to_altrep(order_over_window)
  expect_equal(res, expected_result)
})

test_that("Window lead function works as expected", {
  #   select a, b, lag(a, 1) OVER () order by a
  rel_a <- rel_from_df(con, data.frame(a = c(1:8), b = c(1, 1, 2, 2, 3, 3, 4, 4)))
  lead <- expr_function("lead", list(expr_reference("a")))
  window_lead <- expr_window(lead, offset_expr = expr_constant(1))
  expr_set_alias(window_lead, "lead")
  proj_window <- rel_project(rel_a, list(expr_reference("a"), window_lead))
  order_over_window <- rel_order(proj_window, list(expr_reference("a")))
  expected_result <- data.frame(a = c(1:8), lead = c(2, 3, 4, 5, 6, 7, 8, NA))
  res <- rel_to_altrep(order_over_window)
  expect_equal(res, expected_result)
})

test_that("Window function with string aggregate works", {
  #   select j, s, string_agg(s, '|') over (partition by b) from a order by j, s;
  rel_a <- rel_from_df(con, data.frame(r = c(1, 2, 3, 4), a = c("hello", "Big", "world", "42"), b = c(1, 1, 2, 2)))
  str_agg <- expr_function("string_agg", list(expr_reference("a")))
  partitions <- list(expr_reference("b"))
  window_str_cat <- expr_window(str_agg, partitions = partitions)
  expr_set_alias(window_str_cat, "str_agg_res")
  proj_window <- rel_project(rel_a, list(expr_reference("r"), window_str_cat))
  order_over_window <- rel_order(proj_window, list(expr_reference("r")))
  expected_result <- data.frame(r = c(1:4), str_agg_res = c("hello,Big", "hello,Big", "world,42", "world,42"))
  res <- rel_to_altrep(order_over_window)
  expect_equal(res, expected_result)
})

test_that("You can perform window functions on row_number", {
  # select a, b, row_number() OVER () from tmp order by a;
  rel_a <- rel_from_df(con, data.frame(a = c(8:1), b = c(1, 1, 2, 2, 3, 3, 4, 4)))
  row_number <- expr_function("row_number", list())
  window_function <- expr_window(row_number)
  expr_set_alias(window_function, "row_number")
  proj <- rel_project(rel_a, list(expr_reference("a"), window_function))
  order_by_a <- rel_order(proj, list(expr_reference("a")))
  expected_result <- data.frame(a = c(1:8), row_number = (8:1))
  res <- rel_to_altrep(order_by_a)
  expect_equal(res, expected_result)
})

# also tests order by inside the rank function.
# these tests come from https://dplyr.tidyverse.org/articles/window-functions.html
# in dplyr min_rank = rank
test_that("You can perform the window function min_rank", {
  # select rank() OVER (order by a) from t1
  rel_a <- rel_from_df(con, data.frame(a = c(1, 1, 2, 2, 2)))
  rank_func <- expr_function("rank", list())
  min_rank_window <- expr_window(rank_func, order_bys = list(expr_reference("a")))
  expr_set_alias(min_rank_window, "window_result")
  window_proj <- rel_project(rel_a, list(expr_reference("a"), min_rank_window))
  res <- rel_to_altrep(window_proj)
  expected_result <- data.frame(a = c(1, 1, 2, 2, 2), window_result = c(1, 1, 3, 3, 3))
  expect_equal(res, expected_result)
})

test_that("You can perform the window function dense_rank", {
  # select dense_rank() OVER (order by a) from t1;
  rel_a <- rel_from_df(con, data.frame(a = c(1, 1, 2, 2, 2)))
  dense_rank_fun <- expr_function("dense_rank", list())
  min_rank_window <- expr_window(dense_rank_fun, order_bys = list(expr_reference("a")))
  expr_set_alias(min_rank_window, "window_result")
  window_proj <- rel_project(rel_a, list(expr_reference("a"), min_rank_window))
  res <- rel_to_altrep(window_proj)
  expected_result <- data.frame(a = c(1, 1, 2, 2, 2), window_result = c(1, 1, 2, 2, 2))
  expect_equal(res, expected_result)
})

test_that("You can perform the window function cume_dist", {
  # select cume_dist() OVER (order by a) from t1;
  rel_a <- rel_from_df(con, data.frame(a = c(1, 1, 2, 2, 2)))
  cume_dist_func <- expr_function("cume_dist", list())
  cume_dist_window <- expr_window(cume_dist_func, order_bys = list(expr_reference("a")))
  expr_set_alias(cume_dist_window, "cume_dist")
  window_proj <- rel_project(rel_a, list(expr_reference("a"), cume_dist_window))
  order_proj <- rel_order(window_proj, list(expr_reference("a")))
  res <- rel_to_altrep(order_proj)
  expected_result <- data.frame(a = c(1, 1, 2, 2, 2), cume_dist = c(0.4, 0.4, 1.0, 1.0, 1.0))
  expect_equal(res, expected_result)
})

test_that("You can perform the window function percent rank", {
  # select percent_rank() OVER (order by a) from t1;
  rel_a <- rel_from_df(con, data.frame(a = c(5, 1, 3, 2, 2)))
  percent_rank_func <- expr_function("percent_rank", list())
  percent_rank_wind <- expr_window(percent_rank_func, order_bys = list(expr_reference("a")))
  expr_set_alias(percent_rank_wind, "percent_rank")
  window_proj <- rel_project(rel_a, list(expr_reference("a"), percent_rank_wind))
  order_proj <- rel_order(window_proj, list(expr_reference("a")))
  res <- rel_to_altrep(order_proj)
  expected_result <- data.frame(a = c(1, 2, 2, 3, 5), percent_rank = c(0.00, 0.25, 0.25, 0.75, 1.00))
  expect_equal(res, expected_result)
})

# with and without offsets
test_that("R semantics for adding NaNs is respected", {
  dbExecute(con, "CREATE OR REPLACE MACRO eq(a, b) AS a = b")
  test_df_a <- rel_from_df(con, data.frame(a = c(1, 2), b = c(3, 4)))
  test_df_b <- rel_from_df(con, data.frame(c = c(NaN, 6), d = c(3, 8)))
  cond <- list(expr_function("eq", list(expr_reference("b"), expr_reference("d"))))
  rel_join <- rel_join(test_df_a, test_df_b, cond, "inner")
  addition_expression <- expr_function("+", list(expr_reference("a"), expr_reference("c")))
  proj <- rel_project(rel_join, list(addition_expression))
  res <- rapi_rel_to_df(proj)
  expect_true(is.na(res[[1]]))
})

test_that("R semantics for arithmetics sum function are respected", {
  test_df_a <- rel_from_df(con, data.frame(a = c(1:5, NA)))
  sum_rel <- expr_function("sum", list(expr_reference("a")))
  ans <- rel_aggregate(test_df_a, list(), list(sum_rel))
  res <- rel_to_altrep(ans)
  expect_equal(res[[1]], 15)
})

test_that("anti joins for eq_na_matches works", {
  dbExecute(con, 'CREATE OR REPLACE MACRO "___eq_na_matches_na"(a, b) AS ((a IS NULL AND b IS NULL) OR (a = b))')
  rel1 <- rel_from_df(con, data.frame(x = c(1, 1, 2, 3)))
  rel2 <- rel_from_df(con, data.frame(y = c(2, 3, 3, 4)))
  cond <- list(expr_function("___eq_na_matches_na", list(expr_reference("x"), expr_reference("y"))))
  out <- rel_join(rel1, rel2, cond, "anti")
  res <- rel_to_altrep(out)
  expect_equal(res, data.frame(x = c(1, 1)))
})

test_that("semi joins for eq_na_matches works", {
  dbExecute(con, 'CREATE OR REPLACE MACRO "___eq_na_matches_na"(a, b) AS ((a IS NULL AND b IS NULL) OR (a = b))')
  rel1 <- rel_from_df(con, data.frame(x = c(1, 1, 2, 2)))
  rel2 <- rel_from_df(con, data.frame(y = c(2, 2, 2, 2, 3, 3, 3)))
  cond <- list(expr_function("___eq_na_matches_na", list(expr_reference("x"), expr_reference("y"))))
  out <- rel_join(rel1, rel2, cond, "semi")
  res <- rel_to_altrep(out)
  expect_equal(res, data.frame(x = c(2, 2)))
})

test_that("rel_project does not automatically quote upper-case column names", {
  df <- data.frame(B = 1)
  rel <- rel_from_df(con, df)
  ref <- expr_reference(names(df))
  exprs <- list(ref)
  proj <- rel_project(rel, exprs)
  ans <- rapi_rel_to_altrep(proj)
  expect_equal(df, ans)
})

test_that("rel_to_sql works for row_number", {
  invisible(DBI::dbExecute(con, "CREATE MACRO \"==\"(a, b) AS a = b"))
  df1 <- data.frame(a = 1)
  rel1 <- rel_from_df(con, df1)
  rel2 <- rel_project(
    rel1,
    list({
      tmp_expr <- expr_window(expr_function("row_number", list()), list(), list(), offset_expr = NULL, default_expr = NULL)
      expr_set_alias(tmp_expr, "___row_number")
      tmp_expr
    })
  )
  sql <- rel_to_sql(rel2)
  sub_str_sql <- substr(sql, 0, 44)
  expect_equal(sub_str_sql, "SELECT row_number() OVER () AS ___row_number")
})

test_that("rel_from_table_function works", {
  rel <- rel_from_table_function(con, "generate_series", list(1L, 10L, 2L))
  df <- as.data.frame(rel)
  expect_equal(df$generate_series, c(1, 3, 5, 7, 9))
})

test_that("we don't crash with evaluation errors", {
  invisible(DBI::dbExecute(con, "CREATE OR REPLACE MACRO \"==\"(x, y) AS x = y"))
  df1 <- data.frame(a = "a")

  rel1 <- rel_from_df(con, df1)
  rel2 <- rel_project(
    rel1,
    list(
      {
        tmp_expr <- expr_reference("a")
        expr_set_alias(tmp_expr, "a")
        tmp_expr
      },
      {
        tmp_expr <- expr_function(
          "==",
          list(expr_reference("a"), expr_constant(1))
        )
        expr_set_alias(tmp_expr, "b")
        tmp_expr
      }
    )
  )

  ans <- rapi_rel_to_altrep(rel2)

  # This query is supposed to throw a runtime error.
  # If this succeeds, find a new query that throws a runtime error.
  expect_error(nrow(ans), "Error evaluating duckdb query")
})

test_that("we don't crash with evaluation errors", {
  invisible(DBI::dbExecute(con, "CREATE OR REPLACE MACRO \"==\"(x, y) AS x = y"))
  df1 <- data.frame(a = "a")

  rel1 <- rel_from_df(con, df1)
  rel2 <- rel_project(
    rel1,
    list(
      {
        tmp_expr <- expr_reference("a")
        expr_set_alias(tmp_expr, "a")
        tmp_expr
      },
      {
        tmp_expr <- expr_function(
          "==",
          list(expr_reference("a"), expr_constant(1))
        )
        expr_set_alias(tmp_expr, "b")
        tmp_expr
      }
    )
  )

  ans <- rapi_rel_to_altrep(rel2)

  # This query is supposed to throw a runtime error.
  # If this succeeds, find a new query that throws a runtime error.
  expect_error(nrow(ans), "Error evaluating duckdb query")
})

Try the duckdb package in your browser

Any scripts or data that you put into this service are public.

duckdb documentation built on Sept. 11, 2024, 6:33 p.m.