test-discover.r
In autodb: Automatic Database Normalisation for Data Frames

# Various tests should have random accuracy draws, probably with lots of
# weight on 1

describe("discover", {
  expect_equiv_deps <- function(deps1, deps2) {
    expect_setequal(attrs_order(deps1), attrs_order(deps2))
    expect_setequal(
      deps1,
      functional_dependency(
        unclass(deps2),
        attrs_order(deps1)
      )
    )
  }
  expect_equiv_deps_except_names <- function(deps1, deps2) {
    nms1 <- attrs_order(deps1)
    nms2 <- attrs_order(deps2)
    renamed_deps1 <- functional_dependency(
      Map(
        list,
        lapply(detset(deps1), \(dets) nms2[match(dets, nms1)]),
        nms2[match(dependant(deps1), nms1)]
      ),
      nms2
    )
    expect_equiv_deps(renamed_deps1, deps2)
  }
  expect_equiv_deps_except_classes <- function(deps1, deps2) {
    nms1 <- attrs_order(deps1)
    nms2 <- attrs_order(deps2)
    reclassed_deps1 <- functional_dependency(
      Map(
        list,
        lapply(detset(deps1), \(dets) nms2[match(dets, nms1)]),
        nms2[match(dependant(deps1), nms1)]
      ),
      nms1
    )
    expect_equiv_deps(reclassed_deps1, deps2)
  }
  expect_equiv_non_removed_attr_deps <- function(deps1, deps2) {
    removed_attr <- setdiff(attrs_order(deps1), attrs_order(deps2))
    expect_length(removed_attr, 1)
    filtered <- deps1
    filtered <- functional_dependency(
      unclass(filtered[vapply(
        filtered,
        \(fd) !is.element(removed_attr, unlist(fd)),
        logical(1)
      )]),
      setdiff(attrs_order(deps1), removed_attr)
    )
    expect_equiv_deps(filtered, deps2)
  }
  expect_det_subsets_kept <- function(deps1, deps2) {
    expect_identical(attrs_order(deps1), attrs_order(deps2))
    expect_true(all(vapply(
      deps1,
      \(ds) any(
        vapply(dependant(deps2), identical, logical(1), dependant(ds)) &
          vapply(
            detset(deps2),
            \(detset) all(is.element(detset, detset(ds)[[1L]])),
            logical(1)
          )
      ),
      logical(1)
    )))
  }
  terminates_then <- function(fn, accuracy, ...) {
    function(df) {
      res <- R.utils::withTimeout(discover(df, accuracy = accuracy, ...), timeout = 5, onTimeout = "silent")
      if (is.null(res)) {
        return(fail("discover() timed out"))
      }
      succeed() # dummy success, otherwise tests complain about no expectations
      fn(res)
    }
  }
  both_terminate_then <- function(fn, accuracy, ...) {
    function(df1, df2) {
      res1 <- R.utils::withTimeout(discover(df1, accuracy = accuracy, ...), timeout = 5, onTimeout = "silent")
      expect_true(!is.null(res1))
      res2 <- R.utils::withTimeout(discover(df2, accuracy = accuracy, ...), timeout = 5, onTimeout = "silent")
      expect_true(!is.null(res2))
      fn(res1, res2)
    }
  }
  terminates_with_and_without_full_cache_then <- function(fn, accuracy, ...) {
    function(df) {
      res_cache <- R.utils::withTimeout(
        discover(df, accuracy = accuracy, full_cache = TRUE, ...),
        timeout = 5,
        onTimeout = "silent"
      )
      expect_true(!is.null(res_cache))
      res_nocache <- R.utils::withTimeout(
        discover(df, accuracy = accuracy, full_cache = FALSE, ...),
        timeout = 5,
        onTimeout = "silent"
      )
      expect_true(!is.null(res_nocache))
      fn(res_cache, res_nocache)
    }
  }
  terminates_with_and_without_store_cache_then <- function(fn, accuracy, ...) {
    function(df) {
      res_store <- R.utils::withTimeout(
        discover(df, accuracy = accuracy, full_cache = TRUE, store_cache = TRUE, ...),
        timeout = 5,
        onTimeout = "silent"
      )
      expect_true(!is.null(res_store))
      res_nostore <- R.utils::withTimeout(
        discover(df, accuracy = accuracy, full_cache = TRUE, store_cache = FALSE, ...),
        timeout = 5,
        onTimeout = "silent"
      )
      expect_true(!is.null(res_nostore))
      fn(res_store, res_nostore)
    }
  }
  terminates_with_and_without_bijection_skip_then <- function(fn, accuracy, ...) {
    function(df) {
      res_skip <- R.utils::withTimeout(
        discover(df, accuracy = accuracy, full_cache = TRUE, store_cache = TRUE, skip_bijections = TRUE, ...),
        timeout = 5,
        onTimeout = "silent"
      )
      if (is.null(res_skip))
        return(fail("discover() with bijection skip timed out"))
      res_noskip <- R.utils::withTimeout(
        discover(df, accuracy = accuracy, full_cache = TRUE, store_cache = TRUE, skip_bijections = FALSE, ...),
        timeout = 5,
        onTimeout = "silent"
      )
      if (is.null(res_noskip))
        return(fail("discover() without bijection skip timed out"))
      fn(res_skip, res_noskip)
    }
  }

  it("gives a deterministic result, except for per-dependant dependency order", {
    two_copies <- function(fn) {
      function(df) {
        fn(df, df)
      }
    }
    forall(
      gen_df(4, 6),
      two_copies(both_terminate_then(expect_equiv_deps, accuracy = 1, method = "DFD"))
    )
    forall(
      gen_df(4, 6),
      two_copies(both_terminate_then(expect_equiv_deps, accuracy = 1, method = "FDHitsSep"))
    )
    forall(
      gen_df(4, 6),
      two_copies(both_terminate_then(expect_equiv_deps, accuracy = 1, method = "FDHitsJoint"))
    )
  })
  it("returns dependencies where shared dependant <=> not sub/supersets for determinants", {
    has_non_nested_determinant_sets <- function(deps) {
      det_groups <- split(detset(deps), dependant(deps))
      for (det_sets in det_groups) {
        len <- length(det_sets)
        if (len <= 1)
          succeed()
        else{
          for (n in seq_len(max(0, len - 1))) {
            for (m in seq.int(n + 1L, len)) {
              expect_true(length(setdiff(det_sets[[n]], det_sets[[m]])) > 0)
              expect_true(length(setdiff(det_sets[[m]], det_sets[[n]])) > 0)
            }
          }
        }
      }
    }
    forall(gen_df(4, 6), terminates_then(has_non_nested_determinant_sets, 1, method = "DFD"))
    forall(gen_df(4, 6), terminates_then(has_non_nested_determinant_sets, 1, method = "FDHitsSep"))
    forall(gen_df(4, 6), terminates_then(has_non_nested_determinant_sets, 1, method = "FDHitsJoint"))
  })
  it("is invariant to an attribute's values being permuted", {
    gen_perm <- function(vals) {
      uniq <- unique(vals)
      matches <- match(vals, uniq)
      gen.sample(uniq, length(uniq)) |>
        gen.with(\(perm) perm[matches])
    }
    gen_df_and_value_perm <- function(
      nrow,
      ncol,
      remove_dup_rows = FALSE
    ) {
      gen_df(nrow, ncol, minrow = 1L, mincol = 1L, remove_dup_rows) |>
        gen.and_then(\(df) list(gen.pure(df), gen.int(ncol(df)))) |>
        gen.and_then(\(lst) c(lst, list(gen_perm(lst[[1]][[lst[[2]]]])))) |>
        gen.with(\(lst) {
          df <- lst[[1]]
          attr <- lst[[2]]
          permuted_attr <- lst[[3]]
          permed <- df
          permed[[attr]] <- permuted_attr
          list(df, permed)
        })
    }
    forall(
      gen_df_and_value_perm(4, 6),
      both_terminate_then(expect_equiv_deps, 1, method = "DFD"),
      curry = TRUE
    )
    forall(
      gen_df_and_value_perm(4, 6),
      both_terminate_then(expect_equiv_deps, 1, method = "FDHitsSep"),
      curry = TRUE
    )
    forall(
      gen_df_and_value_perm(4, 6),
      both_terminate_then(expect_equiv_deps, 1, method = "FDHitsJoint"),
      curry = TRUE
    )
  })
  it("is invariant to an attribute's class being losslessly changed (except for class info)", {
    gen_df_and_type_change <- function(
      nrow,
      ncol,
      remove_dup_rows = FALSE
    ) {
      classes <- c("logical", "integer", "numeric", "character")
      changes <- list(
        logical = c("integer", "numeric", "character"),
        integer = c("numeric", "character"),
        numeric = c("character"),
        character = c("logical"),
        factor = c("integer", "numeric", "character")
      )
      gen_df(nrow, ncol, minrow = 1L, mincol = 1L, remove_dup_rows) |>
        gen.and_then(\(df) list(df, gen.sample(ncol(df)))) |>
        gen.and_then(uncurry(\(df, attr) {
          list(
            gen.pure(df),
            gen.pure(attr),
            gen.element(changes[[class(df[[attr]])[[1]]]])
          )
        })) |>
        gen.with(uncurry(\(df, attr, new_class) {
          permed <- df
          permed[[attr]] <- as(permed[[attr]], new_class)
          list(df, permed)
        }))
    }
    forall(
      gen_df_and_type_change(4, 6),
      both_terminate_then(expect_equiv_deps_except_classes, 1, method = "DFD"),
      curry = TRUE
    )
    forall(
      gen_df_and_type_change(4, 6),
      both_terminate_then(expect_equiv_deps_except_classes, 1, method = "FDHitsSep"),
      curry = TRUE
    )
    forall(
      gen_df_and_type_change(4, 6),
      both_terminate_then(expect_equiv_deps_except_classes, 1, method = "FDHitsJoint"),
      curry = TRUE
    )
  })
  it("terminates properly when attributes have parameter names for paste", {
    df1 <- data.frame(
      b = NA,
      j = c(TRUE, NA, NA, NA),
      u = c(FALSE, TRUE, TRUE, NA),
      l = c(FALSE, TRUE, TRUE, TRUE),
      t = c(FALSE, FALSE, TRUE, NA),
      sep = c(TRUE, TRUE, NA, NA)
    )
    df2 <- df1[, c("l", "j", "t", "b", "u", "sep")]
    terminates_with_and_without_cache <- function(...) {
      terminates_with_and_without_full_cache_then(
        \(x, y) {},
        1,
        ...
      )
    }
    terminates_with_and_without_cache(method = "DFD")(df1)
    terminates_with_and_without_cache(method = "FDHitsSep")(df1)
    terminates_with_and_without_cache(method = "FDHitsJoint")(df1)
    terminates_with_and_without_cache(method = "DFD")(df2)
    terminates_with_and_without_cache(method = "FDHitsSep")(df2)
    terminates_with_and_without_cache(method = "FDHitsJoint")(df2)
  })
  it("is invariant, under reordering, to attributes being reordered", {
    gen_df_and_attr_perm <- function(
      nrow,
      ncol,
      remove_dup_rows = FALSE
    ) {
      gen_df(nrow, ncol, minrow = 1L, mincol = 1L, remove_dup_rows) |>
        gen.and_then(\(df) list(df, sample.int(ncol(df)))) |>
        gen.with(\(lst) {
          df <- lst[[1]]
          perm <- lst[[2]]
          list(df, df[, perm, drop = FALSE])
        })
    }

    forall(
      gen_df_and_attr_perm(4, 6),
      both_terminate_then(expect_equiv_deps, 1, method = "DFD"),
      curry = TRUE
    )
    forall(
      gen_df_and_attr_perm(4, 6),
      both_terminate_then(expect_equiv_deps, 1, method = "FDHitsSep"),
      curry = TRUE
    )
    forall(
      gen_df_and_attr_perm(4, 6),
      both_terminate_then(expect_equiv_deps, 1, method = "FDHitsJoint"),
      curry = TRUE
    )
  })
  it("loses FDs involving a removed attribute, keeps the rest", {
    gen_df_and_remove_col <- function(nrow, ncol, remove_dup_rows = FALSE) {
      gen_df(nrow, ncol, minrow = 1L, mincol = 1L, remove_dup_rows) |>
        gen.and_then(\(df) list(df, gen.int(ncol(df)))) |>
        gen.with(\(lst) {
          df <- lst[[1]]
          n <- lst[[2]]
          list(df, df[, -n, drop = FALSE])
        })
    }
    forall(
      gen_df_and_remove_col(4, 6),
      both_terminate_then(expect_equiv_non_removed_attr_deps, 1, method = "DFD"),
      curry = TRUE
    )
    forall(
      gen_df_and_remove_col(4, 6),
      both_terminate_then(expect_equiv_non_removed_attr_deps, 1, method = "FDHitsSep"),
      curry = TRUE
    )
    forall(
      gen_df_and_remove_col(4, 6),
      both_terminate_then(expect_equiv_non_removed_attr_deps, 1, method = "FDHitsJoint"),
      curry = TRUE
    )
  })
  it("is invariant to changes of accuracy within same required row count when using DFD", {
    gen_df_and_accuracy_nrow <- function(nrow, ncol, remove_dup_rows = FALSE) {
      gen_df(nrow, ncol, minrow = 1L, mincol = 1L, remove_dup_rows) |>
        gen.and_then(\(df) list(df, gen.int(nrow(df)))) |>
        gen.with(\(lst) {
          df <- lst[[1]]
          n <- lst[[2]]
          prop <- n/nrow(df)
          low <- (n - 1)/nrow(df) + 1e-9
          list(df, low, prop)
        })
    }
    both_bounds_terminate_then <- function(fn, ...) {
      function(df, low, high) {
        res1 <- R.utils::withTimeout(discover(df, accuracy = low, ...), timeout = 5, onTimeout = "silent")
        expect_true(!is.null(res1))
        res2 <- R.utils::withTimeout(discover(df, accuracy = high, ...), timeout = 5, onTimeout = "silent")
        expect_true(!is.null(res2))
        fn(res1, res2)
      }
    }
    forall(
      gen_df_and_accuracy_nrow(4, 6),
      both_bounds_terminate_then(expect_equiv_deps, method = "DFD"),
      curry = TRUE
    )
  })
  it("keeps subsets of all FDs if a row is removed, might have more", {
    gen_df_and_remove_row <- function(nrow, ncol) {
      gen_df(nrow, ncol, minrow = 1L, mincol = 1L, remove_dup_rows = TRUE) |>
        gen.and_then(\(df) list(df, gen.element(seq_len(nrow(df))))) |>
        gen.with(\(lst) {
          df <- lst[[1]]
          n <- lst[[2]]
          list(df, df[-n, , drop = FALSE])
        })
    }
    forall(
      gen_df_and_remove_row(4, 6),
      both_terminate_then(expect_det_subsets_kept, 1, method = "DFD"),
      curry = TRUE
    )
    forall(
      gen_df_and_remove_row(4, 6),
      both_terminate_then(expect_det_subsets_kept, 1, method = "FDHitsSep"),
      curry = TRUE
    )
    forall(
      gen_df_and_remove_row(4, 6),
      both_terminate_then(expect_det_subsets_kept, 1, method = "FDHitsJoint"),
      curry = TRUE
    )
  })
  it("discover -> change attribute names is equivalent to change names -> discover", {
    gen_df_and_name_change <- function(nrow, ncol, remove_dup_rows = FALSE) {
      gen_df(nrow, ncol, minrow = 1L, mincol = 1L, remove_dup_rows) |>
        gen.and_then(\(df) list(df, gen.sample(LETTERS, ncol(df)))) |>
        gen.with(\(lst) {
          df <- lst[[1]]
          new_names <- lst[[2]]
          list(df, stats::setNames(df, new_names))
        })
    }
    forall(
      gen_df_and_name_change(4, 6),
      both_terminate_then(expect_equiv_deps_except_names, 1, method = "DFD"),
      curry = TRUE
    )
    forall(
      gen_df_and_name_change(4, 6),
      both_terminate_then(expect_equiv_deps_except_names, 1, method = "FDHitsSep"),
      curry = TRUE
    )
    forall(
      gen_df_and_name_change(4, 6),
      both_terminate_then(expect_equiv_deps_except_names, 1, method = "FDHitsJoint"),
      curry = TRUE
    )
  })
  it("correctly simplifies date attributes with varying standard/daylight savings", {
    # example from nycflights13::weather
    df <- data.frame(
      month = c(11L, 11L, 11L),
      day = c(3L, 3L, 4L),
      hour = 1L,
      time = as.POSIXct(
        # 2013-11-03 01:00:00 EDT,
        # 2013-11-03 01:00:00 EST,
        # 2013-11-04 01:00:00 EST
        c(1383454800L, 1383458400L, 1383544800L),
        origin = "1970-01-01 00:00:00 UTC",
        tz = "America/New_York"
      )
    )
    stopifnot(df[1, "time"] != df[2, "time"])
    deps <- discover(df, method = "DFD")
    expect_length(deps[dependant(deps) == "time"], 0L)
    deps2 <- discover(df, method = "FDHitsSep")
    expect_length(deps2[dependant(deps2) == "time"], 0L)
    deps3 <- discover(df, method = "FDHitsJoint")
    expect_length(deps3[dependant(deps3) == "time"], 0L)
  })
  it("correctly simplifies floating-point numbers to high accuracy", {
    df <- data.frame(
      x = c(
        47.37661580000000327573,
        47.37661580000000327573
      ),
      y = c(
        8.549177500000007,
        8.549177499999999
      )
    )
    expect_identical(
      discover(df, digits = 8, method = "DFD"),
      functional_dependency(
        list(list(character(), "x"), list(character(), "y")),
        c("x", "y")
      )
    )
    expect_identical(
      discover(df, digits = 8, method = "FDHitsSep"),
      functional_dependency(
        list(list(character(), "x"), list(character(), "y")),
        c("x", "y")
      )
    )
    expect_identical(
      discover(df, digits = 8, method = "FDHitsJoint"),
      functional_dependency(
        list(list(character(), "x"), list(character(), "y")),
        c("x", "y")
      )
    )
    expect_identical(
      discover(df, digits = 15, method = "DFD"),
      functional_dependency(
        list(list(character(), "x")),
        c("x", "y")
      )
    )
    expect_identical(
      discover(df, digits = 15, method = "FDHitsSep"),
      functional_dependency(
        list(list(character(), "x")),
        c("x", "y")
      )
    )
    expect_identical(
      discover(df, digits = 15, method = "FDHitsJoint"),
      functional_dependency(
        list(list(character(), "x")),
        c("x", "y")
      )
    )
  })
  it("doesn't have an excluded attribute in any determinant sets", {
    gen_df_and_exclude <- function(nrow, ncol, remove_dup_rows = FALSE) {
      gen_df(nrow, ncol, minrow = 1L, mincol = 1L, remove_dup_rows) |>
        gen.and_then(\(df) list(df, gen.element(names(df))))
    }
    terminates_with_exclusion_then_no_trival <- function(accuracy, ...) {
      function(df, attr) {
        deps <- R.utils::withTimeout(
          discover(df, accuracy = accuracy, exclude = attr, ...),
          timeout = 5,
          onTimeout = "silent"
        )
        if (is.null(deps)) {
          return(fail("discover() with exclude timed out"))
        }
        # test exclusion_not_in_determinant_sets
        expect_false(attr %in% unlist(detset(deps)))
      }
    }
    forall(
      gen_df_and_exclude(4, 6),
      terminates_with_exclusion_then_no_trival(1, method = "DFD"),
      curry = TRUE
    )
    forall(
      gen_df_and_exclude(4, 6),
      terminates_with_exclusion_then_no_trival(1, method = "FDHitsSep"),
      curry = TRUE
    )
    forall(
      gen_df_and_exclude(4, 6),
      terminates_with_exclusion_then_no_trival(1, method = "FDHitsJoint"),
      curry = TRUE
    )
  })
  it("gives same result from excluding class and exclude attributes with that class", {
    exclude_and_exclude_class_terminate_then <- function(fn, accuracy, class, ...) {
      function(df) {
        attrs_with_class <- names(df)[vapply(df, inherits, logical(1), class)]
        deps1 <- R.utils::withTimeout(
          discover(df, accuracy = accuracy, exclude = attrs_with_class, ...),
          timeout = 5,
          onTimeout = "silent"
        )
        if (is.null(deps1))
          return(fail("discover() with exclude timed out"))
        deps2 <- R.utils::withTimeout(
          discover(df, accuracy = accuracy, exclude_class = class, ...),
          timeout = 5,
          onTimeout = "silent"
        )
        if (is.null(deps2))
          return(fail("discover() with exclude_class timed out"))
        fn(deps1, deps2)
      }
    }
    forall(
      gen_df(4, 6),
      exclude_and_exclude_class_terminate_then(
        expect_equiv_deps,
        accuracy = 1,
        "logical",
        method = "DFD"
      )
    )
    forall(
      gen_df(4, 6),
      exclude_and_exclude_class_terminate_then(
        expect_equiv_deps,
        accuracy = 1,
        "logical",
        method = "FDHitsSep"
      )
    )
    forall(
      gen_df(4, 6),
      exclude_and_exclude_class_terminate_then(
        expect_equiv_deps,
        accuracy = 1,
        "logical",
        method = "FDHitsJoint"
      )
    )
  })
  it("gives same result from using filter arguments and from post-filtering", {
    superset <- function(x, y) {
      if (length(x) == 0)
        return(logical(1))
      mat <- outer(x, y, Vectorize(\(u, v) setequal(intersect(u, v), v)))
      apply(mat, 1, any)
    }
    same_under_constraints_and_filtering <- function(
      x,
      dependants,
      detset_limit,
      method,
      ...
    ) {
      by_arguments <- R.utils::withTimeout(
        discover(
          x,
          dependants = dependants,
          detset_limit = detset_limit,
          method = method,
          ...
        ),
        timeout = 5,
        onTimeout = "silent"
      )
      if (is.null(by_arguments))
        return(fail("discover() with filtering arguments timed out"))
      by_filtering <- R.utils::withTimeout(
        discover(x, method = method, ...),
        timeout = 5,
        onTimeout = "silent"
      )
      if (is.null(by_filtering))
        return(fail("discover() without filtering arguments timed out"))
      by_filtering <- by_filtering[
        dependant(by_filtering) %in% dependants &
          lengths(detset(by_filtering)) <= detset_limit
      ]
      expect_setequal(by_arguments, by_filtering)
    }
    forall(
      gen_df(4, 6) |>
        gen.and_then(\(x) {
          list(
            gen.pure(x),
            gen.sample_resampleable(names(x), from = 0, to = ncol(x)),
            gen.element(0:(ncol(x) - 1L)),
            gen.element(c("DFD", "FDHitsSep", "FDHitsJoint"))
          )
        }),
      same_under_constraints_and_filtering,
      curry = TRUE
    )

    # example
    x <- data.frame(
      a = c(0, NA, NA, 1),
      b = c(NA, FALSE, NA, NA),
      c = c("TRUE", NA, "TRUE", "FALSE"),
      d = c(1L, 1L, NA, 1L),
      e = c(NA, 1L, 1L, 0L)
    )
    same_under_constraints_and_filtering(x, c("a", "c"), 2L, method = "DFD")
    same_under_constraints_and_filtering(x, c("a", "c"), 2L, method = "FDHitsSep")
    same_under_constraints_and_filtering(x, c("a", "c"), 2L, method = "FDHitsJoint")
  })
  it("gives dependencies for unique attributes (in case don't want them as key)", {
    df <- data.frame(A = 1:3, B = c(1, 1, 2), C = c(1, 2, 2))

    deps_dfd <- discover(df, method = "DFD")
    A_deps_dfd <- dependant(deps_dfd) == "A"
    A_detsets_dfd <- detset(deps_dfd[A_deps_dfd])
    expect_identical(A_detsets_dfd, list(c("B", "C")))

    deps_sep <- discover(df, method = "FDHitsSep")
    A_deps_sep <- dependant(deps_sep) == "A"
    A_detsets_sep <- detset(deps_sep[A_deps_sep])
    expect_identical(A_detsets_sep, list(c("B", "C")))

    deps_jnt <- discover(df, method = "FDHitsJoint")
    A_deps_jnt <- dependant(deps_jnt) == "A"
    A_detsets_jnt <- detset(deps_jnt[A_deps_jnt])
    expect_identical(A_detsets_jnt, list(c("B", "C")))
  })
  it("finds dependencies for the team data in test-synthesise", {
    df <- data.frame(
      team = c(
        'Red', 'Red', 'Red', 'Orange', 'Orange',
        'Yellow', 'Yellow', 'Green', 'Green', 'Blue'
      ),
      jersey_num = c(
        1, 2, 3, 1, 2,
        1, 5, 8, 2, 2
      ),
      player_name = c(
        'A', 'B', 'C', 'D', 'A',
        'E', 'B', 'A', 'G', 'H'
      ),
      city = c(
        'boston', 'boston', 'boston', 'chicago', 'chicago',
        'honolulu', 'honolulu', 'boston', 'boston', 'austin'
      ),
      state = c(
        'MA', 'MA', 'MA', 'IL', 'IL',
        'HI', 'HI', 'MA', 'MA', 'TX'
      )
    )
    expected_deps <- functional_dependency(
      list(
        list(c('player_name', 'jersey_num'), "team"),
        list(c('player_name', 'team'), "jersey_num"),
        list(c('team', 'jersey_num'), "player_name"),
        list('team', "city"),
        list('state', "city"),
        list(c('player_name', 'jersey_num'), "city"),
        list('team', "state"),
        list(c('player_name', 'jersey_num'), "state"),
        list('city', "state")
      ),
      c("team", "jersey_num", "player_name", "city", "state")
    )

    deps_dfd <- discover(df, method = "DFD")
    expect_identical(attrs_order(deps_dfd), attrs_order(expected_deps))
    expect_true(all(is.element(expected_deps, deps_dfd)))

    deps_sep <- discover(df, method = "FDHitsSep")
    expect_identical(attrs_order(deps_sep), attrs_order(expected_deps))
    expect_true(all(is.element(expected_deps, deps_sep)))

    deps_jnt <- discover(df, method = "FDHitsJoint")
    expect_identical(attrs_order(deps_jnt), attrs_order(expected_deps))
    expect_true(all(is.element(expected_deps, deps_jnt)))
  })
  it("finds dependencies for the team data in original's edit demo", {
    df <- data.frame(
      team = c("tigers", "elephants", "foxes", "snakes", "dolphins", "eagles"),
      city = c("boston", "chicago", "miami", "austin", "honolulu", "houston"),
      state = c("MA", "IL", "FL", "TX", "HI", "TX"),
      roster_size = c(20L, 21L, 20L, 20L, 19L, 21L)
    )
    expected_deps <- functional_dependency(
      list(
        list("city", "team"),
        list("team", "city"),
        list("team", "state"),
        list("city", "state"),
        list("team", "roster_size"),
        list("city", "roster_size")
      ),
      c("team", "city", "state", "roster_size")
    )

    deps_dfd <- discover(df, method = "DFD")
    expect_identical(attrs_order(deps_dfd), attrs_order(expected_deps))
    expect_true(all(is.element(expected_deps, deps_dfd)))

    deps_sep <- discover(df, method = "FDHitsSep")
    expect_identical(attrs_order(deps_sep), attrs_order(expected_deps))
    expect_true(all(is.element(expected_deps, deps_sep)))

    deps_jnt <- discover(df, method = "FDHitsJoint")
    expect_identical(attrs_order(deps_jnt), attrs_order(expected_deps))
    expect_true(all(is.element(expected_deps, deps_jnt)))
  })
  it("finds dependencies for Wikipedia 1NF->2NF->3NF example", {
    df <- data.frame(
      Title = rep(
        c(
          "Beginning MySQL Database Design and Optimization",
          "The Relational Model for Database Management: Version 2"
        ),
        each = 2
      ),
      Format = c("Hardcover", "E-book", "E-book", "Paperback"),
      Author = rep(c("Chad Russell", "E.F. Codd"), each = 2),
      Author_Nationality = rep(c("American", "British"), each = 2),
      Price = c(4999L, 2234L, 1388L, 3999L),
      Thickness = "Thick",
      Genre_ID = rep(1:2, each = 2),
      Genre_Name = rep(c("Tutorial", "Popular science"), each = 2),
      Publisher_ID = rep(1:2, each = 2)
    )
    expected_deps <- functional_dependency(
      list(
        list("Title", "Author"),
        list("Author", "Author_Nationality"),
        list(c("Title", "Format"), "Price"),
        list(character(), "Thickness"),
        list("Title", "Genre_ID"),
        list("Genre_ID", "Genre_Name"),
        list("Title", "Publisher_ID")
      ),
      c(
        "Title",
        "Format",
        "Author",
        "Author_Nationality",
        "Price",
        "Thickness",
        "Genre_ID",
        "Genre_Name",
        "Publisher_ID"
      )
    )
    deps_dfd <- discover(df, method = "DFD")
    deps_sep <- discover(df, method = "FDHitsSep")
    deps_jnt <- discover(df, method = "FDHitsJoint")
    expect_identical(attrs_order(deps_dfd), attrs_order(expected_deps))
    expect_identical(attrs_order(deps_sep), attrs_order(expected_deps))
    expect_identical(attrs_order(deps_jnt), attrs_order(expected_deps))
    expect_true(all(is.element(expected_deps, deps_dfd)))
    expect_true(all(is.element(expected_deps, deps_sep)))
    expect_true(all(is.element(expected_deps, deps_jnt)))
  })
  it("correctly handles attributes with non-df-standard names", {
    df <- data.frame(1:3, c(1, 1, 2), c(1, 2, 2)) |>
      stats::setNames(c("A 1", "B 2", "C 3"))

    deps_dfd <- discover(df, method = "DFD")
    A_1_deps_dfd <- dependant(deps_dfd) == "A 1"
    A_1_detsets_dfd <- detset(deps_dfd[A_1_deps_dfd])
    expect_identical(A_1_detsets_dfd, list(c("B 2", "C 3")))

    deps_sep <- discover(df, method = "FDHitsSep")
    A_1_deps_sep <- dependant(deps_sep) == "A 1"
    A_1_detsets_sep <- detset(deps_sep[A_1_deps_sep])
    expect_identical(A_1_detsets_sep, list(c("B 2", "C 3")))

    deps_jnt <- discover(df, method = "FDHitsJoint")
    A_1_deps_jnt <- dependant(deps_jnt) == "A 1"
    A_1_detsets_jnt <- detset(deps_jnt[A_1_deps_jnt])
    expect_identical(A_1_detsets_jnt, list(c("B 2", "C 3")))
  })
  it("expects attribute names to be unique", {
    df <- data.frame(A = 1:3, B = c(1, 1, 2), A = c(1, 2, 2), check.names = FALSE)
    expect_error(discover(df, method = "DFD"), "^duplicate column names: A$")
    expect_error(discover(df, method = "FDHitsSep"), "^duplicate column names: A$")
    expect_error(discover(df, method = "FDHitsJoint"), "^duplicate column names: A$")
  })
  it("gets the same results with and without storing partitions", {
    forall(
      gen_df(20, 5),
      terminates_with_and_without_full_cache_then(
        expect_equiv_deps,
        accuracy = 1,
        method = "DFD"
      ),
      shrink.limit = Inf
    )
    forall(
      gen_df(20, 5),
      terminates_with_and_without_full_cache_then(
        expect_equiv_deps,
        accuracy = 1,
        method = "FDHitsSep"
      ),
      shrink.limit = Inf
    )
    forall(
      gen_df(20, 5),
      terminates_with_and_without_full_cache_then(
        expect_equiv_deps,
        accuracy = 1,
        method = "FDHitsJoint"
      ),
      shrink.limit = Inf
    )
    forall(
      gen_df(20, 5),
      terminates_with_and_without_full_cache_then(
        expect_equiv_deps,
        accuracy = 3/4,
        method = "DFD"
      ),
      shrink.limit = Inf
    )
  })
  it("is invariant to whether partition is transferred between dependants, also for accuracy < 1 for DFD", {
    forall(
      gen_df(20, 5),
      terminates_with_and_without_store_cache_then(
        expect_equiv_deps,
        accuracy = 1,
        method = "DFD"
      ),
      shrink.limit = Inf
    )
    forall(
      gen_df(20, 5),
      terminates_with_and_without_store_cache_then(
        expect_equiv_deps,
        accuracy = 1,
        method = "FDHitsSep"
      ),
      shrink.limit = Inf
    )
    forall(
      gen_df(20, 5),
      terminates_with_and_without_store_cache_then(
        expect_equiv_deps,
        accuracy = 1,
        method = "FDHitsJoint"
      ),
      shrink.limit = Inf
    )
    forall(
      gen_df(20, 5),
      terminates_with_and_without_store_cache_then(
        expect_equiv_deps,
        accuracy = 3/4,
        method = "DFD"
      ),
      shrink.limit = Inf
    )
  })
  it("is invariant to whether bijections are skipped, under full accuracy", {
    df1 <- data.frame(
      a = c(FALSE, FALSE, TRUE),
      b = FALSE,
      c = c(FALSE, TRUE, NA),
      d = c(FALSE, NA, NA),
      e = c(FALSE, TRUE, NA)
    )
    df2 <- data.frame(
      a = c(FALSE, TRUE, NA, TRUE, NA),
      b = c(TRUE, TRUE, TRUE, TRUE, NA),
      c = c(FALSE, TRUE, NA, NA, NA),
      d = c(NA, FALSE, TRUE, FALSE, TRUE),
      e = c(FALSE, FALSE, FALSE, TRUE, TRUE)
    )
    df3 <- data.frame(
      a = c(NA, TRUE, NA),
      b = c(TRUE, FALSE, TRUE),
      c = c(NA, NA, FALSE),
      d = c(NA, NA, TRUE),
      e = c(FALSE, TRUE, NA)
    )
    invariant_to_bijection_skip <- function(...) {
      terminates_with_and_without_bijection_skip_then(
        expect_equiv_deps,
        accuracy = 1,
        ...
      )
    }
    invariant_to_bijection_skip(method = "DFD")(df1)
    invariant_to_bijection_skip(method = "DFD")(df2)
    invariant_to_bijection_skip(method = "DFD")(df3)
    invariant_to_bijection_skip(method = "FDHitsSep")(df1)
    invariant_to_bijection_skip(method = "FDHitsSep")(df2)
    invariant_to_bijection_skip(method = "FDHitsSep")(df3)
    invariant_to_bijection_skip(method = "FDHitsJoint")(df1)
    invariant_to_bijection_skip(method = "FDHitsJoint")(df2)
    invariant_to_bijection_skip(method = "FDHitsJoint")(df3)
    forall(
      gen_df(20, 5),
      invariant_to_bijection_skip(method = "DFD"),
      shrink.limit = Inf
    )
    forall(
      gen_df(20, 5),
      invariant_to_bijection_skip(method = "FDHitsSep"),
      shrink.limit = Inf
    )
    forall(
      gen_df(20, 5),
      invariant_to_bijection_skip(method = "FDHitsJoint"),
      shrink.limit = Inf
    )
  })
  it(
    paste(
      "for accuracy = 1, is invariant to:",
      "- having a non-false keep_rownames vs. adding row names as first column",
      "- method used",
      "- excluding a class vs. excluding attributes in that class",
      "- filtering by arguments (dependants/detset_limit) or by subsetting results",
      "- whether partition is transferred between dependants",
      "- whether bijections are skipped",
      sep = "\n"
    ),
    {
      forall(
        gen_df(4, 6) |>
          gen.and_then(\(x) {
            list(
              gen.pure(x),
              gen.choice(gen.element(c(FALSE, TRUE)), gen_attr_name(9)),
              gen.sample_resampleable(names(x), from = 0, to = ncol(x)),
              gen.element(0:(ncol(x) - 1L))
            )
          }),
        function(df, keep_rownames, dependants, detset_limit) {
          arglists <- expand.grid(
            if (!isFALSE(keep_rownames)) {
              tmp <- if (isTRUE(keep_rownames)) "row" else keep_rownames
              nms <- make.unique(c(names(df), tmp))
              nm <- nms[[length(nms)]]
              list(
                list(df = df, accuracy = 1, keep_rownames = keep_rownames),
                list(
                  df = cbind(setNames(data.frame(rownames(df)), nm), df),
                  accuracy = 1,
                  keep_rownames = FALSE
                )
              )
            }else
              list(list(df = df, accuracy = 1, keep_rownames = FALSE)),
            list(
              list(method = "DFD"),
              list(method = "FDHitsSep"),
              list(method = "FDHitsJoint")
            ),
            list(
              list(exclude_class = "logical"),
              list(exclude = names(df)[vapply(df, is.logical, logical(1))])
            ),
            list(
              list(),
              list(dependants = dependants),
              list(detset_limit = detset_limit),
              list(dependants = dependants, detset_limit = detset_limit)
            ),
            list(
              list(store_cache = FALSE),
              list(store_cache = TRUE)
            ),
            list(
              list(skip_bijections = FALSE),
              list(skip_bijections = TRUE)
            )
          ) |>
            unname() |>
            apply(1, \(x) do.call(c, x), simplify = FALSE)
          results <- lapply(
            arglists,
            \(lst) {
              base <- R.utils::withTimeout(
                do.call(discover, lst),
                timeout = 5,
                onTimeout = "silent"
              )
              if (is.null(base))
                return(base)
              if (is.null(lst$dependants))
                base <- base[dependant(base) %in% dependants]
              if (is.null(lst$detset_limit))
                base <- base[lengths(detset(base)) <= detset_limit]
              base
            }
          )
          if (any(vapply(results, is.null, logical(1))))
            return(fail("some argument lists fail"))
          expect_true(all(vapply(results, setequal, logical(1), results[[1]])))
        },
        curry = TRUE
      )
    }
  )
  it("returns a minimal functional dependency set", {
    forall(
      gen_df(6, 7),
      terminates_then(is_valid_minimal_functional_dependency, 1, method = "DFD")
    )
    forall(
      gen_df(6, 7),
      terminates_then(is_valid_minimal_functional_dependency, 1, method = "FDHitsSep")
    )
    forall(
      gen_df(6, 7),
      terminates_then(is_valid_minimal_functional_dependency, 1, method = "FDHitsJoint")
    )
  })
  it("can include the rownames, equivalent to adding them as first column", {
    x <- data.frame(
      a = c(1, 1, 1, 2, 2, 3, 3, 3, 4),
      b = c(1, 1, 1, 1, 1, 2, 2, 2, 3),
      row.names = letters[1:9]
    )
    default <- discover(x, keep_rownames = TRUE)
    explicit <- discover(x, keep_rownames = "id")
    expected <- discover(x)
    expect_setequal(
      default,
      discover(cbind(data.frame(row = rownames(x)), x))
    )
    expect_setequal(
      explicit,
      discover(cbind(data.frame(id = rownames(x)), x))
    )
  })
})
Any scripts or data that you put into this service are public.
autodb documentation built on June 26, 2025, 1:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
autodb
Automatic Database Normalisation for Data Frames

tests/testthat/test-discover.r
In autodb: Automatic Database Normalisation for Data Frames

Try the autodb package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

autodb Automatic Database Normalisation for Data Frames

tests/testthat/test-discover.r In autodb: Automatic Database Normalisation for Data Frames

Try the autodb package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

autodb
Automatic Database Normalisation for Data Frames

tests/testthat/test-discover.r
In autodb: Automatic Database Normalisation for Data Frames