tests/testthat/test-rejoin.r

describe("rejoin", {
  it("is left-inverse for lossless full-dep database creation, outside of row permutations, for tables with unique rows", {
    autodb_inverted_by_rejoin <- expect_bi(
      with_args(df_equiv, digits = NA),
      with_args(autodb, ensure_lossless = TRUE) %>>%
        rejoin,
      identity
    )
    table_dum <- data.frame()
    table_dee <- data.frame(a = 1)[, -1, drop = FALSE]
    autodb_inverted_by_rejoin(table_dum)
    autodb_inverted_by_rejoin(table_dee)
    # 6 columns allows for interesting cases, such as a table containing two
    # independent ones, or a reference involving several attributes
    forall(
      gen_df(6, 7, remove_dup_rows = TRUE),
      autodb_inverted_by_rejoin
    )
    forall(
      gen_df(6, 7, remove_dup_rows = FALSE),
      expect_bi(
        with_args(df_equiv, digits = NA),
        with_args(autodb, ensure_lossless = TRUE) %>>%
          rejoin,
        df_unique
      )
    )
  })
  it("is possible for any database constructed from a data frame", {
    forall(
      gen.choice(0, 10) |>
        gen.list(of = 2) |>
        gen.and_then(uncurry(\(rows, cols) {
          gen_df(rows, cols, remove_dup_rows = TRUE)
        })) |>
        gen.with(with_args(autodb, ensure_lossless = TRUE)),
      rejoin %>>%
        with_args(try, silent = TRUE) %>>%
        with_args(expect_s3_class, class = "data.frame")
    )
  })
  it("handles databases with zero-column relations properly", {
    db <- database(
      relation(
        list(
          constants = list(
            df = data.frame(a = 1L)[, FALSE, drop = FALSE],
            keys = list(character())
          ),
          a = list(df = data.frame(a = 1L), keys = list("a"))
        ),
        "a"
      ),
      list()
    )
    expect_identical(rejoin(db), data.frame(a = 1L))
  })
  it("returns an error if any attributes aren't present in relations", {
    expect_error(
      rejoin(database(
        relation(setNames(list(), character()), "a"),
        list()
      )),
      "^database is not lossless: attributes in attrs_order not present in relations\\na$"
    )
    expect_error(
      rejoin(database(
        relation(setNames(list(), character()), c("a", "b")),
        list()
      )),
      "^database is not lossless: attributes in attrs_order not present in relations\\na, b$"
    )
  })
  it("returns an error if rejoining fails, with best joined attribute sets", {
    expect_error(
      rejoin(database(
        relation(
          list(
            a = list(df = data.frame(a = c(FALSE, TRUE)), keys = list("a")),
            b = list(df = data.frame(b = c(FALSE, TRUE)), keys = list("b"))
          ),
          c("a", "b")
        ),
        list()
      )),
      "^database can not be fully rejoined\\nbest joined sets:\\na\\nb$"
    )
    # below doesn't return {c} as a set, since we have {b, c}.
    expect_error(
      rejoin(database(
        relation(
          list(
            a = list(
              df = data.frame(a = c(FALSE, TRUE), b = c(FALSE, TRUE)),
              keys = list("a")
            ),
            c = list(
              df = data.frame(c = 1:4),
              keys = list("c")
            ),
            b_c = list(
              df = data.frame(b = c(FALSE, TRUE), c = 1:4),
              keys = list(c("b", "c"))
            ),
            b_c.1 = list(
              df = data.frame(b = c(FALSE, TRUE), c = 1:4),
              keys = list(c("b", "c"))
            )
          ),
          c("a", "b", "c")
        ),
        list()
      )),
      "^database can not be fully rejoined\\nbest joined sets:\\na, b\\nb, c$"
    )
    # sorts set contents, removes repeat sets
    expect_error(
      rejoin(database(
        relation(
          list(
            a = list(
              df = data.frame(a = logical()),
              keys = list("a")
            ),
            d = list(
              df = data.frame(d = logical(), c = logical()),
              keys = list("d")
            ),
            b_d = list(
              df = data.frame(b = logical(), d = logical()),
              keys = list(c("d"))
            )
          ),
          c("a", "b", "c", "d")
        ),
        list()
      )),
      "^database can not be fully rejoined\\nbest joined sets:\\na\\nb, c, d$"
    )
  })
  it("properly merges tables with close floating-point values (no duplicates)", {
    # The motivating cause of spurious duplicates on re-merges is that
    # merge.data.frame works differently if there is more than one "by" column:
    # in this case, it creates a grouping vector by pasting all the values in a
    # row together, with "\r" as a separator. This implicitly turns floats into
    # characters, i.e. represents them using only 15 sig. fig.s. Since going
    # above 15 sig. figs. results in behaviour that varies across machines (see
    # print.default), we have to test this behaviour without
    # rounding, so digits = NA.
    x <- data.frame(
      a = c(F, F, F, F, F, F, T, T),
      b = c(
        2.69353840461766669279,
        2.69353840461766669279,
        2.69353840461766713688,
        2.69353840461766713688,
        3.74416921885076714460,
        5.50801230419353693435,
        3.72990161259524111159,
        5.50801230419353693435
      ),
      c = c(
        -3.19131542233864262670,
        -3.19131542233864262670,
        -2.87397721325655908231,
        -1.98786466514381765514,
        -3.33190719804050772268,
        -3.33190719804050772268,
        -3.33190719804050772268,
        -3.33190719804050772268
      ),
      d = c(
        2.56310969849146363941,
        2.56310969849146363941,
        2.56310969849146363941,
        2.56310969849146363941,
        2.94505568569789533129,
        4.88640238864414033770,
        2.94505568569789533129,
        4.88640238864414211406
      ),
      e = 1:8
    )
    # rows are unique, even when using only 1 sig. fig., so no rows can be
    # considered duplicate on any machine.
    # abce[e].{a,b} -> abd[ab,bd].{a,b},
    # abcd[acd].{a,b} -> abd[ab,bd].{a,b}
    # abcd[acd].{b,d} -> abd[ab,bd].{b,d}
    # the merge duplicates occur when joining e and ab along the former
    # reference.
    db <- autodb(x, digits = NA)
    expect_silent(y <- rejoin(db))
    expect_true(df_equiv(y, x, digits = NA))
  })
})

Try the autodb package in your browser

Any scripts or data that you put into this service are public.

autodb documentation built on April 4, 2025, 5:12 a.m.