Nothing
DFD <- function(
lookup,
accuracy = 1,
full_cache = TRUE,
store_cache = TRUE,
skip_bijections = FALSE,
determinants = seq_along(lookup),
dependants = seq_along(lookup),
detset_limit = ncol(lookup) - 1L,
report = reporter(report = FALSE, con = "", new = TRUE)
) {
attrs <- seq_along(lookup)
attr_names <- names(lookup)
n_cols <- length(attrs)
dependencies <- stats::setNames(rep(list(list()), n_cols), attr_names)
# check for constant-value columns, because if columns are fixed we can
# ignore them for the rest of the search
fixed <- integer()
for (attr in attrs) {
if (all(lookup[[attr]] == 1L)) {
report(paste(attr_names[[attr]], "is fixed"))
fixed <- c(fixed, attr)
if (attr %in% dependants)
dependencies[[attr]] <- list(character())
}
}
nonfixed <- setdiff(attrs, fixed)
valid_dependant_attrs <- intersect(dependants, nonfixed)
# check for zero dependants before removing simple keys, otherwise
# returning early would leave out the simple-key results
if (
length(valid_dependant_attrs) == 0 ||
detset_limit < 1
) {
report("no valid dependants, or detset_limit < 1, skipping search")
return(flatten(
filter_nonflat_dependencies(dependencies, detset_limit),
attr_names
))
}
# For non-fixed non-key attributes, all can be dependants,
# but might not all be valid determinants.
valid_determinant_attrs_prekeys <- intersect(
nonfixed,
determinants
)
# Non-fixed attributes might be single-attribute keys: we can list them as
# determining all other non-fixed attributes, use them in the main search only
# as dependants. If there are several single-attribute keys, and we're
# skipping bijections, then we can also remove all but one of them as
# dependants.
valid_determinant_attrs <- valid_determinant_attrs_prekeys
# Can't just check column values are seq_len(nrow(df)),
# because df can have duplicate rows, and we can't remove
# the duplicate rows in df, because it changes the behaviour
# for accuracy < 1.
df_uniq <- df_unique(lookup)
simple_keys <- nonfixed[vapply(
df_uniq[nonfixed],
Negate(anyDuplicated),
logical(1)
)]
determinant_keys <- intersect(simple_keys, valid_determinant_attrs_prekeys)
dependant_keys <- intersect(simple_keys, valid_dependant_attrs)
if (length(simple_keys) > 0) {
report(paste("single-attribute keys:", toString(attr_names[simple_keys])))
valid_determinant_attrs <- setdiff(valid_determinant_attrs, simple_keys)
if (skip_bijections) {
valid_dependant_attrs <- setdiff(valid_dependant_attrs, dependant_keys[-1])
}
}
if (length(valid_determinant_attrs) < n_cols) {
report(
paste(
"attributes not considered as determinants:",
toString(attr_names[-valid_determinant_attrs])
)
)
}
valid_determinant_nonfixed_indices <- match(valid_determinant_attrs, nonfixed)
# Maximum size of determinant set for a dependant is number
# of other valid determinants.
# If there are dependants that aren't valid determinants,
# this is number of valid determinant attributes. If there
# aren't, subtract one.
n_dependant_only <- length(setdiff(valid_dependant_attrs, valid_determinant_attrs))
max_n_lhs_attrs <- length(valid_determinant_attrs) -
as.integer(n_dependant_only == 0)
# using 0 would allow for one more column, but makes indexing a pain
lhs_attrs_limit <- floor(log(.Machine$integer.max, 2))
if (max_n_lhs_attrs > lhs_attrs_limit)
stop(paste(
"only data.frames with up to",
lhs_attrs_limit,
"columns possible in a determinant set currently supported"
))
bijections <- list()
# main search
if (max_n_lhs_attrs > 0) {
report("constructing powerset")
powerset <- nonempty_powerset(
max_n_lhs_attrs,
use_visited = TRUE
)
# cache generated powerset and reductions, otherwise we spend a lot
# of time duplicating reduction work
all_powersets <- stats::setNames(list(powerset), max_n_lhs_attrs)
partition_handler <- checkable_partition_handler(
unname(lookup[, nonfixed, drop = FALSE]),
key_class = "integer",
accuracy = accuracy,
full_cache = full_cache
)
for (rhs in which(nonfixed %in% valid_dependant_attrs)) {
report(paste("dependant", attr_names[nonfixed][rhs]))
lhs_nonfixed_indices <- setdiff(valid_determinant_nonfixed_indices, rhs)
n_lhs_attrs <- length(lhs_nonfixed_indices)
expected_n_lhs_attrs <- max_n_lhs_attrs -
(n_dependant_only > 0 && is.element(rhs, valid_determinant_nonfixed_indices))
stopifnot(n_lhs_attrs == expected_n_lhs_attrs)
bijection_candidate_nonfixed_indices <- if (skip_bijections)
match(
names(dependencies)[
vapply(
dependencies,
\(x) any(vapply(x, identical, logical(1), attr_names[nonfixed][[rhs]])),
logical(1)
)
],
attr_names[nonfixed]
) |>
intersect(lhs_nonfixed_indices)
else
integer()
if (n_lhs_attrs > 0) {
if (n_lhs_attrs %in% names(all_powersets))
nodes <- all_powersets[[as.character(n_lhs_attrs)]]
else{
nodes <- reduce_powerset(powerset, n_lhs_attrs)
all_powersets[[as.character(n_lhs_attrs)]] <- nodes
}
report("determinants available, starting search")
lhss <- find_LHSs_dfd(
rhs,
lhs_nonfixed_indices,
nodes,
n_lhs_attrs,
partition_handler,
bijection_candidate_nonfixed_indices,
detset_limit
)
if (!store_cache)
partition_handler$reset()
if (lhss[[2]]) {
stopifnot(
is.element(lhss[[1]], bijection_candidate_nonfixed_indices),
lhss[[1]] < rhs
)
bij_ind <- match(lhss[[1]], names(bijections))
if (is.na(bij_ind))
bijections <- c(
bijections,
stats::setNames(list(c(lhss[[1]], rhs)), lhss[[1]])
)
else{
bijections[[bij_ind]] <- c(
bijections[[bij_ind]],
rhs
)
}
valid_determinant_nonfixed_indices <- setdiff(
valid_determinant_nonfixed_indices,
rhs
)
max_n_lhs_attrs <- max_n_lhs_attrs - 1L
if (max_n_lhs_attrs %in% names(all_powersets))
powerset <- all_powersets[[as.character(max_n_lhs_attrs)]]
else{
powerset <- reduce_powerset(powerset, max_n_lhs_attrs)
all_powersets[[as.character(max_n_lhs_attrs)]] <- powerset
}
}else
dependencies[[attr_names[nonfixed][rhs]]] <- c(
dependencies[[attr_names[nonfixed][rhs]]],
lapply(lhss[[1]], \(x) attr_names[nonfixed][x])
)
}
}
}
report(paste0(
"DFD complete",
"\n",
with_number(partition_handler$cache_size(), "partition", " cached", "s cached")
))
dependencies <- add_simple_key_deps(
dependencies,
attr_names[determinant_keys],
attr_names[dependant_keys],
attr_names[valid_dependant_attrs]
)
if (skip_bijections) {
dependencies <- add_deps_implied_by_bijections(
dependencies,
bijections,
attr_names[nonfixed],
attr_names
)
dependencies <- add_deps_implied_by_simple_keys(
dependencies,
attr_names[determinant_keys],
attr_names[dependant_keys],
attr_names[valid_dependant_attrs]
)
}
flatten(
filter_nonflat_dependencies(dependencies, detset_limit),
attr_names
)
}
find_LHSs_dfd <- function(
rhs,
lhs_nonfixed_indices,
nodes,
n_lhs_attrs,
partition_handler,
bijection_candidate_nonfixed_indices,
detset_limit
) {
# The original library "names" nodes with their attribute set,
# so finding a node involves matching a character vector against
# a list of character vectors. This is slow.
# Instead, we assign each possible attribute set an integer,
# by taking the integer that matches the bitset. This limits
# us to floor(.Machine$integer.max) attributes as potential determinants.
# We could use numerics to get to .Machine$double.digits determinant
# attributes, but these would be arbitrary floats rather than integers,
# so more work would be required to match indices etc.
# Assigning the integers per find_LHSs call allows one more column,
# since we don't need to include the dependant.
# Node categories:
# -3 = candidate maximal non-dependency
# -2 = maximal non-dependency
# -1 = non-maximal non-dependency
# 0 = unresolved
# 1 = non-minimal dependency
# 2 = minimal dependency
# 3 = candidate minimal dependency
# initial seeds are the single-attribute nodes, possibly pruned by detset
# constraints
lhs_attr_nodes <- to_nodes(seq_len(n_lhs_attrs), nodes)
initial_seeds <- lhs_attr_nodes
seeds <- generate_next_seeds(
max_non_deps = integer(),
min_deps = integer(),
initial_seeds = lhs_attr_nodes,
nodes = nodes,
detset_limit = detset_limit
)
min_deps <- integer()
max_non_deps <- integer()
trace <- integer()
bijection_nodes <- to_nodes(
match(
bijection_candidate_nonfixed_indices,
lhs_nonfixed_indices
),
nodes
)
while (length(seeds) != 0) {
node <- seeds[sample.int(length(seeds), 1)]
while (!is.na(node)) {
if (nodes$visited[node]) {
if (nodes$category[node] == 3) { # dependency
min_infer <- is_minimal(node, nodes)
if (isTRUE(min_infer)) {
nodes$category[node] <- 2L
min_deps <- c(min_deps, node)
if (is.element(node, bijection_nodes)) {
lhs_index <- lhs_nonfixed_indices[nodes$bits[[node]]]
stopifnot(is.element(
lhs_index,
bijection_candidate_nonfixed_indices
))
return(list(lhs_index, TRUE))
}
}
if (isFALSE(min_infer))
nodes$category[node] <- 1L
}
if (nodes$category[node] == -3) { # non-dependency
max_infer <- is_maximal(node, nodes)
if (isTRUE(max_infer)) {
nodes$category[node] <- -2L
max_non_deps <- c(max_non_deps, node)
}
if (isFALSE(max_infer)) {
nodes$category[node] <- -1L
}
}
nodes$category[node] <- update_dependency_type(
node,
nodes,
min_deps,
max_non_deps
)
}else{
inferred_type <- infer_type(node, nodes)
if (!is.na(inferred_type)) {
nodes$category[node] <- inferred_type
}
if (nodes$category[node] == 0L) {
lhs_set <- lhs_nonfixed_indices[nodes$bits[[node]]]
result <- partition_handler$check(
rhs,
lhs_set
)
if (result) {
min_infer <- is_minimal(node, nodes)
if (isFALSE(min_infer))
nodes$category[node] <- 1L
if (isTRUE(min_infer)) {
min_deps <- c(min_deps, node)
nodes$category[node] <- 2L
if (is.element(node, bijection_nodes)) {
lhs_index <- lhs_nonfixed_indices[nodes$bits[[node]]]
stopifnot(is.element(
lhs_index,
bijection_candidate_nonfixed_indices
))
return(list(lhs_index, TRUE))
}
}
if (is.na(min_infer))
nodes$category[node] <- 3L
}else{
max_infer <- is_maximal(node, nodes)
if (isFALSE(max_infer)) {
nodes$category[node] <- -1L
}
if (isTRUE(max_infer)) {
max_non_deps <- c(max_non_deps, node)
nodes$category[node] <- -2L
}
if (is.na(max_infer))
nodes$category[node] <- -3L
}
}
nodes$visited[node] <- TRUE
}
res <- pick_next_node(
node,
nodes,
trace,
min_deps,
max_non_deps
)
trace <- res[[2]]
nodes <- res[[3]]
min_deps <- res[[4]]
max_non_deps <- res[[5]]
node <- res[[1]]
}
seeds <- generate_next_seeds(
max_non_deps,
min_deps,
initial_seeds,
nodes,
detset_limit
)
}
list(
lapply(min_deps, \(md) lhs_nonfixed_indices[nodes$bits[[md]]]),
FALSE
)
}
pick_next_node <- function(node, nodes, trace, min_deps, max_non_deps) {
if (nodes$category[node] == 3) { # candidate dependency
s <- nonvisited_children(node, nodes)
s <- remove_pruned_subsets(s, min_deps, nodes$bits)
if (length(s) == 0) {
min_deps <- c(min_deps, node)
nodes$category[node] <- 2L
}else{
trace <- c(node, trace)
return(list(min(s), trace, nodes, min_deps, max_non_deps))
}
}else if (nodes$category[node] == -3) { # candidate non-dependency
s <- nonvisited_parents(node, nodes)
s <- remove_pruned_supersets(s, max_non_deps, nodes$bits)
if (length(s) == 0) {
max_non_deps <- c(max_non_deps, node)
nodes$category[node] <- -2L
}else{
trace <- c(node, trace)
return(list(min(s), trace, nodes, min_deps, max_non_deps))
}
}
if (length(trace) == 0)
return(list(NA, trace, nodes, min_deps, max_non_deps))
list(trace[1], trace[-1], nodes, min_deps, max_non_deps)
}
remove_pruned_subsets <- function(subsets, supersets, bitsets) {
if (length(subsets) == 0 || length(supersets) == 0)
return(subsets)
check_pairs <- outer(
bitsets[subsets],
bitsets[supersets],
Vectorize(is_subset)
)
prune <- apply(check_pairs, 1, any)
subsets[!prune]
}
remove_pruned_supersets <- function(supersets, subsets, bitsets) {
if (length(subsets) == 0 || length(supersets) == 0)
return(supersets)
check_pairs <- outer(
bitsets[supersets],
bitsets[subsets],
Vectorize(is_superset)
)
prune <- rowSums(check_pairs) > 0
supersets[!prune]
}
generate_next_seeds <- function(
max_non_deps,
min_deps,
initial_seeds,
nodes,
detset_limit
) {
# Seed generation assumes that the empty set is known to be a non-determinant.
# The below is equivalent to beginning with a single empty seed, and having
# the empty set as an additional non-determinant. Being able to refer to the
# empty set directly would remove the special cases we have below for
# max_non_deps being empty, and for applying the first one.
# Nodes are completely unknown iff they're non-visited
stopifnot(
all(nodes$visited == (nodes$category != 0)),
!any(abs(nodes$category) == 3)
)
if (length(max_non_deps) == 0) {
# original DFD paper doesn't mention case where no maximal non-dependencies
# found yet, so this approach could be inefficient
seeds <- remove_pruned_subsets(
initial_seeds,
min_deps,
nodes$bits
)
# At generation time, all seed nodes are non-categorised, and nodes in
# general are non-categorised or minimal dependencies, because to be in this
# case the only explored nodes must be single-attribute nodes found to be
# minimal. i.e.:
stopifnot(
all(nodes$category[seeds] == 0),
all(nodes$category %in% c(0L, 2L))
)
}else{
seeds <- integer()
for (n in seq_along(max_non_deps)) {
nfd <- max_non_deps[[n]]
max_non_dep_c <- remove_pruned_subsets(initial_seeds, nfd, nodes$bits)
# paper condition is "seeds is empty", trimming cross-intersections
# by detset_limit means seeds can be empty before the end,
# which would cause the remaining nfds to start from scratch.
# we therefore just initiate the seeds on the first nfd, as
# intended anyway.
if (n == 1) {
seeds <- max_non_dep_c
}else{
seeds <- cross_intersection(seeds, max_non_dep_c, nodes, detset_limit)
}
}
}
remove_pruned_supersets(seeds, min_deps, nodes$bits)
}
cross_intersection <- function(seeds, max_non_dep, powerset, detset_limit) {
new_seed_full_indices <- integer()
for (dep in seeds) {
seed_bitset <- powerset$bits[[dep]]
for (set in max_non_dep) {
set_bitset <- powerset$bits[[set]]
new_seed_bitset <- seed_bitset | set_bitset
if (sum(new_seed_bitset) > detset_limit)
next
new_seed_bitset_index <- to_bitset_index(which(new_seed_bitset))
new_seed_full_indices <- c(new_seed_full_indices, new_seed_bitset_index)
}
}
seeds <- powerset$bitset_index[new_seed_full_indices]
seeds <- seeds[!is.na(seeds)]
minimise_seeds(seeds, powerset$bits)
}
to_bitset_index <- function(bits) {
sum(2^(bits - 1))
}
minimise_seeds <- function(seeds, bitsets) {
if (length(seeds) == 0)
return(seeds)
# remove duplicates first instead of comparing identical bitsets
unique_seeds <- unique(seeds)
n_seeds <- length(unique_seeds)
include <- rep(TRUE, length(unique_seeds))
for (n in seq_len(n_seeds - 1)) {
if (include[n]) {
for (m in seq.int(n + 1L, n_seeds)) {
if (include[[m]]) {
if (is_subset(bitsets[[unique_seeds[n]]], bitsets[[unique_seeds[m]]]))
include[m] <- FALSE
else{
if (is_superset(bitsets[[unique_seeds[n]]], bitsets[[unique_seeds[m]]])) {
include[n] <- FALSE
break
}
}
}
}
}
}
unique_seeds[include]
}
add_simple_key_deps <- function(
dependencies,
determinant_keys,
dependant_keys,
valid_dependant_attrs
) {
nonkey_dependants <- setdiff(valid_dependant_attrs, dependant_keys)
dependencies[nonkey_dependants] <- lapply(
dependencies[nonkey_dependants],
\(dets) c(as.list(determinant_keys), dets)
)
dependencies[dependant_keys] <- lapply(
dependant_keys,
\(key) c(as.list(setdiff(determinant_keys, key)), dependencies[[key]])
)
dependencies
}
add_deps_implied_by_bijections <- function(
dependencies,
bijections,
nonfixed,
column_names
) {
for (b in bijections) {
first_index <- nonfixed[[b[[1]]]]
# add the bijection
for (nonfixed_index in b[-1]) {
replacement <- nonfixed[[nonfixed_index]]
dependencies[[replacement]] <- c(
first_index,
setdiff(dependencies[[first_index]], replacement)
)
stopifnot(!anyDuplicated(dependencies[[nonfixed_index]]))
}
# add dependencies implied by the bijection
# only needed when bijection attribute is earlier than dependant, since
# later ones were added before the bijection was known
for (rhs in setdiff(seq_along(dependencies), match(nonfixed[b], column_names))) {
for (nonfixed_index in b[-1]) {
replacement <- nonfixed[[nonfixed_index]]
if (match(replacement, column_names) < rhs) {
dependencies[[rhs]] <- union(
dependencies[[rhs]],
lapply(
Filter(\(d) is.element(first_index, d), dependencies[[rhs]]),
\(d) c(setdiff(d, first_index), replacement)
)
)
}
stopifnot(!anyDuplicated(dependencies[[rhs]]))
}
}
}
dependencies
}
add_deps_implied_by_simple_keys <- function(
dependencies,
determinant_keys,
dependant_keys,
valid_dependant_attrs
) {
# transfer determinants of kept dependant key to others
if (length(dependant_keys) > 0) {
first_dep <- dependant_keys[[1]]
deps <- setdiff(dependencies[[first_dep]], as.list(determinant_keys))
for (key in dependant_keys) {
replacements <- setdiff(determinant_keys, key)
dependencies[[key]] <- c(deps, as.list(replacements))
stopifnot(!anyDuplicated(dependencies[[key]]))
}
}
# swap determinant keys around in compound determinants
if (length(determinant_keys) > 0) {
first_det <- determinant_keys[[1]]
for (rhs in setdiff(valid_dependant_attrs, dependant_keys)) {
for (replacement in determinant_keys[-1]) {
dependencies[[rhs]] <- c(
dependencies[[rhs]],
lapply(
Filter(
\(d) is.element(first_det, d) && length(d) > 1,
dependencies[[rhs]]
),
\(d) c(setdiff(d, first_det), replacement)
)
)
stopifnot(!anyDuplicated(dependencies[[rhs]]))
}
}
}
dependencies
}
flatten <- function(dependencies, attributes) {
result <- list()
for (i in seq_along(dependencies)) {
rhs <- names(dependencies)[i]
result <- c(
result,
lapply(dependencies[[i]], \(lhs) list(lhs, rhs))
)
}
functional_dependency(result, attributes)
}
filter_nonflat_dependencies <- function(
dependencies,
detset_limit
) {
lapply(
dependencies,
\(x) {
if (length(x) == 0)
return(x[FALSE])
wanted <- lengths(x) <= detset_limit
x[wanted]
}
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.