Nothing
#' Compute linearized leaf IDs for microdata
#'
#' This function calculates linearized integer IDs for each record in a micro dataset
#' based on a set of hierarchies. These IDs match the `leaf_id` and
#' `contributing_leaf_ids` generated by [hier_grid()].
#'
#' @param data a `data.table` containing the microdata.
#' @param dims a named `list` of `sdc_hierarchy` objects. The names of the list
#' elements must correspond to existing column names in `data`.
#'
#' @export
#' @examples
#' # Setup Hierarchies
#' h1 <- hier_create("Total", nodes = LETTERS[1:3])
#' h1 <- hier_add(h1, root = "A", node = "a1")
#' h1 <- hier_add(h1, root = "a1", node = "aa1") # h1 terminals: aa1, B, C (N=3)
#'
#' h2 <- hier_create("Total", letters[1:2]) # h2 terminals: a, b (N=2)
#'
#' # Create the Grid
#' # With add_dups = FALSE, bogus parents 'A' and 'a1' are removed.
#' grid <- hier_grid(h1, h2, add_dups = FALSE, add_contributing_cells = TRUE)
#'
#' # The 'leaf_id' in `grid` is calculated using Column-Major order:
#' # ID = i1 + (i2 - 1) * N1
#'
#' # Generate micro data
#' microdata <- data.table::data.table(
#' region = c("aa1", "aa1", "B", "C", "B"),
#' sector = c("a", "b", "a", "b", "b"),
#' turnover = c(100, 200, 50, 300, 150)
#' )
#'
#' # Map the strings in microdata to the same integer leaf_ids.
#' # We provide a named list where names 'region' and 'sector' match microdata.
#' microdata[, leaf_id := hier_create_ids(
#' data = microdata,
#' dims = list(region = h1, sector = h2)
#' )]
#'
#' # Aggregation Example:
#' # To get 'Total Region' for 'Sector a' from the grid:
#' target_cell <- grid[v1 == "Total" & v2 == "a"]
#' ids <- target_cell$contributing_leaf_ids[[1]]
#'
#' val_total_a <- sum(microdata[leaf_id %in% ids, turnover])
#' # Result: 150 (Records: aa1_a [100] + B_a [50])
#' @return an integer vector of leaf_ids matching the 'leaf_id' column in [hier_grid()]
hier_create_ids <- function(data, dims) {
# validate inputs
if (!inherits(data, "data.table")) {
stop("'data' must be a data.table.", call. = FALSE)
}
v_names <- names(dims)
if (is.null(v_names) || any(v_names == "")) {
stop("'dims' must be a fully named list where names match columns in 'data'.", call. = FALSE)
}
if (!all(v_names %in% names(data))) {
missing_vars <- v_names[!v_names %in% names(data)]
stop(sprintf(
"The following variables in 'dims' were not found in 'data': %s",
paste(missing_vars, collapse = ", ")
), call. = FALSE)
}
# prepare sets of required (base-level) leafs and dimension sizes
dim_sizes <- integer(length(dims))
local_idx_list <- list()
for (i in seq_along(dims)) {
# et basge-level (lowest) leaves using the same recursive logic as in hier_grid
raw_leaves <- rcpp_get_leaves_list(dims[[i]])
terminals <- sort(unique(unlist(raw_leaves)), method = "radix")
dim_sizes[i] <- length(terminals)
# match microdata column (using list names) to these indices
col_name <- v_names[i]
idx <- match(data[[col_name]], terminals)
if (any(is.na(idx))) {
warning(sprintf(
"Some values in column '%s' are not terminal leaves. They will result in NA leaf_ids.",
col_name
))
}
local_idx_list[[i]] <- idx
}
# linearization multipliers (column-major)
multipliers <- c(1, cumprod(dim_sizes[-length(dim_sizes)]))
# calculate ids
idx_matrix <- do.call(cbind, local_idx_list)
leaf_ids <- as.integer((idx_matrix - 1) %*% multipliers + 1)
return(leaf_ids)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.