tests/testthat/test-db_joins.R

test_that("*_join() works", {
  for (conn in get_test_conns()) {

    # Define two test datasets
    x <- get_table(conn, "__mtcars") |>
      dplyr::select(name, mpg, cyl, hp, vs, am, gear, carb)

    y <- get_table(conn, "__mtcars") |>
      dplyr::select(name, drat, wt, qsec)


    # Test the implemented joins
    q  <- dplyr::left_join(x, y, by = "name") |> dplyr::collect()
    qr <- dplyr::left_join(dplyr::collect(x), dplyr::collect(y), by = "name")
    expect_equal(q, qr)

    q <- dplyr::right_join(x, y, by = "name") |> dplyr::collect()
    qr <- dplyr::right_join(dplyr::collect(x), dplyr::collect(y), by = "name")
    expect_equal(q, qr)

    q <- dplyr::inner_join(x, y, by = "name") |> dplyr::collect()
    qr <- dplyr::inner_join(dplyr::collect(x), dplyr::collect(y), by = "name")
    expect_equal(q, qr)


    # Create two more synthetic test data set with NA data

    # First test case
    x <- data.frame(number = c("1", "2", NA),
                    t = c("strA", NA, "strB"))

    y <- data.frame(letter = c("A", "B", "A", "B"),
                    number = c(NA, "2", "1", "1"))

    # Copy x and y to conn
    x <- dplyr::copy_to(conn, x, name = id("test.SCDB_tmp1", conn), overwrite = TRUE, temporary = FALSE)

    y <- dplyr::copy_to(conn, y, name = id("test.SCDB_tmp2", conn), overwrite = TRUE, temporary = FALSE)


    q  <- dplyr::left_join(x, y, na_by = "number") |>
      dplyr::collect() |>
      dplyr::arrange(number, t, letter)
    qr <- dplyr::left_join(dplyr::collect(x), dplyr::collect(y),  by = "number", multiple = "all") |>
      dplyr::arrange(number, t, letter)
    expect_mapequal(q, qr)

    q  <- dplyr::right_join(x, y, na_by = "number") |>
      dplyr::collect() |>
      dplyr::arrange(number, t, letter)
    qr <- dplyr::right_join(dplyr::collect(x), dplyr::collect(y), by = "number", multiple = "all") |>
      dplyr::arrange(number, t, letter)
    expect_equal(q, qr)

    q  <- dplyr::inner_join(x, y, na_by = "number") |>
      dplyr::collect() |>
      dplyr::arrange(number, t, letter)
    qr <- dplyr::full_join(dplyr::collect(x), dplyr::collect(y),  by = "number", multiple = "all") |>
      dplyr::arrange(number, t, letter)
    expect_equal(q, qr)


    # Second test case
    x <- data.frame(date = as.Date(c("2022-05-01", "2022-05-01", "2022-05-02", "2022-05-02")),
                    region_id = c("1", NA, NA, "1"),
                    n_start = c(3, NA, NA, NA))

    y <- data.frame(date = as.Date("2022-05-02"),
                    region_id = "1",
                    n_add = 4)

    # Copy x and y to conn
    x <- dplyr::copy_to(conn, x, name = id("test.SCDB_tmp1", conn), overwrite = TRUE, temporary = FALSE)

    y <- dplyr::copy_to(conn, y, name = id("test.SCDB_tmp2", conn), overwrite = TRUE, temporary = FALSE)


    q  <- dplyr::full_join(x, y, by = "date", na_by = "region_id") |>
      dplyr::collect() |>
      dplyr::arrange(date, region_id)
    qr <- dplyr::full_join(dplyr::collect(x), dplyr::collect(y), by = c("date", "region_id")) |>
      dplyr::arrange(date, region_id)
    expect_equal(q, qr)




    # Some other test cases
    x <- get_table(conn, "__mtcars") |>
      dplyr::select(name, mpg, cyl, hp, vs, am, gear, carb)

    y <- get_table(conn, "__mtcars") |>
      dplyr::select(name, drat, wt, qsec)

    xx <- x |> dplyr::mutate(name = dplyr::if_else(dplyr::row_number() == 1, NA, name))
    yy <- y |> dplyr::mutate(name = dplyr::if_else(dplyr::row_number() == 1, NA, name))

    # Using by should give 1 mismatch
    # Using na_by should give no mismatch
    expect_equal(dplyr::left_join(xx, xx, by    = "name") |>
                   dplyr::summarize(n = sum(dplyr::if_else(is.na(cyl.y), 1, 0), na.rm = TRUE)) |>                       # nolint: redundant_ifelse_linter
                   dplyr::pull(n), 1)
    expect_equal(dplyr::left_join(xx, xx, na_by = "name") |>
                   dplyr::summarize(n = sum(dplyr::if_else(is.na(cyl.y), 1, 0), na.rm = TRUE)) |>                       # nolint: redundant_ifelse_linter
                   dplyr::pull(n), 0)

    # And they should be identical with the simple case
    expect_equal(dplyr::left_join(xx, xx, na_by = "name") |>
                   dplyr::select(!"name") |>
                   dplyr::collect(),
                 dplyr::left_join(x,  x,  na_by = "name") |>
                   dplyr::select(!"name") |>
                   dplyr::collect())

    connection_clean_up(conn)
  }
})

Try the SCDB package in your browser

Any scripts or data that you put into this service are public.

SCDB documentation built on Oct. 4, 2024, 1:09 a.m.