#' Decompose a data frame based on given normalised dependencies
#'
#' Decomposes a data frame into several relations, based on the given database
#' schema. It's intended that the data frame satisfies all the functional
#' dependencies implied by the schema, such as if the schema was constructed
#' from the same data frame. If this is not the case, the function returns
#' an error.
#'
#' If the schema was constructed using approximate dependencies for the same
#' data frame, `decompose` returns an error, to prevent either duplicate records
#' or lossy decompositions. This is temporary: for the next update, we plan to
#' add an option to allow this, or to add "approximate" equivalents of databases
#' and database schemas.
#'
#' @param df a data.frame, containing the data to be normalised.
#' @param schema a database schema with foreign key references, such as given by
#' \code{\link{autoref}}.
#'
#' @return A \code{\link{database}} object, containing the data in \code{df}
#' within the database schema given in \code{schema}.
#' @export
decompose <- function(df, schema) {
  stopifnot(!anyDuplicated(names(schema)))
  stopifnot(identical(names(df), attrs_order(schema)))
  # Collect every FD implied by the schema's keys, flattened to a single
  # list of {determinant attributes, dependent attribute} pairs.
  inferred_fds <- synthesised_fds(attrs(schema), keys(schema))
  if (length(inferred_fds) > 0L)
    inferred_fds <- unlist(inferred_fds, recursive = FALSE)
  # An FD holds iff projecting onto just the determinant loses no rows
  # relative to projecting onto determinant + dependent together.
  holds <- vapply(
    inferred_fds,
    function(fd) {
      combined <- df_unique(df[, unlist(fd), drop = FALSE])
      determinant_only <- df_unique(combined[, fd[[1]], drop = FALSE])
      nrow(determinant_only) == nrow(combined)
    },
    logical(1L)
  )
  if (any(!holds)) {
    # Report every violated FD, one per line, as {determinant} -> dependent.
    failed <- vapply(
      inferred_fds[!holds],
      \(fd) paste0("{", toString(fd[[1]]), "} -> ", fd[[2]]),
      character(1)
    )
    stop(paste(
      "df doesn't satisfy functional dependencies in schema:",
      paste(failed, collapse = "\n"),
      sep = "\n"
    ))
  }
  database(create_insert(df, schema), references(schema))
}
create_insert <- function(df, schema) {
  # Build one relation per schema element: project df onto that element's
  # attributes (dropping duplicate rows) and pair it with the element's keys.
  project <- function(rel_attrs, rel_keys) {
    list(
      # drop = FALSE also covers the 0-attrs projection,
      # i.e. decomposing to table_dum and table_dee
      df = df_unique(df[, rel_attrs, drop = FALSE]),
      keys = rel_keys
    )
  }
  relations <- stats::setNames(
    Map(project, attrs(schema), keys(schema)),
    names(schema)
  )
  relation(relations, attrs_order(schema))
}
drop_primary_dups <- function(df, prim_key) {
  # Reduces a data.frame to have unique values of the attributes in the given
  # primary key. If the other columns are not uniquely determined by the
  # primary key, as can be the case for approximate dependencies, the most
  # common value for each primary key value is used.
  if (nrow(df_unique(df[, prim_key, drop = FALSE])) == nrow(df))
    return(df)
  groups <- split(
    df,
    as.list(df[, prim_key, drop = FALSE]),
    drop = TRUE
  )
  # One row per key group: the modal value of each column. Building the full
  # list up front and binding once avoids quadratic growth from appending
  # inside a loop.
  df_lst <- lapply(groups, \(group) data.frame(lapply(group, Mode)))
  result <- `rownames<-`(
    stats::setNames(do.call(rbind, df_lst), colnames(df)),
    NULL
  )
  # rbind can drop extended column classes; restore them from the input.
  for (i in seq_along(df)) {
    class(result[[i]]) <- class(df[[i]])
  }
  result
}
Mode <- function(x) {
  # Most frequent value in x; ties are broken in favour of the value that
  # appears first.
  vals <- unique(x)
  counts <- tabulate(match(x, vals))
  vals[[which.max(counts)]]
}