R/hier_create_ids.R

Defines functions hier_create_ids

Documented in hier_create_ids

#' Compute linearized leaf IDs for microdata
#'
#' This function calculates linearized integer IDs for each record in a micro dataset
#' based on a set of hierarchies. These IDs match the `leaf_id` and
#' `contributing_leaf_ids` generated by [hier_grid()].
#'
#' @param data a `data.table` containing the microdata.
#' @param dims a named `list` of `sdc_hierarchy` objects. The names of the list
#' elements must correspond to existing column names in `data`.
#'
#' @export
#' @examples
#' # Setup Hierarchies
#' h1 <- hier_create("Total", nodes = LETTERS[1:3])
#' h1 <- hier_add(h1, root = "A", node = "a1")
#' h1 <- hier_add(h1, root = "a1", node = "aa1") # h1 terminals: aa1, B, C (N=3)
#'
#' h2 <- hier_create("Total", letters[1:2])      # h2 terminals: a, b (N=2)
#'
#' # Create the Grid
#' # With add_dups = FALSE, bogus parents 'A' and 'a1' are removed.
#' grid <- hier_grid(h1, h2, add_dups = FALSE, add_contributing_cells = TRUE)
#'
#' # The 'leaf_id' in `grid` is calculated using Column-Major order:
#' # ID = i1 + (i2 - 1) * N1
#'
#' # Generate micro data
#' microdata <- data.table::data.table(
#'    region = c("aa1", "aa1", "B", "C", "B"),
#'    sector = c("a", "b", "a", "b", "b"),
#'    turnover = c(100, 200, 50, 300, 150)
#' )
#'
#' # Map the strings in microdata to the same integer leaf_ids.
#' # We provide a named list where names 'region' and 'sector' match microdata.
#' microdata[, leaf_id := hier_create_ids(
#'    data = microdata,
#'    dims = list(region = h1, sector = h2)
#' )]
#'
#' # Aggregation Example:
#' # To get 'Total Region' for 'Sector a' from the grid:
#' target_cell <- grid[v1 == "Total" & v2 == "a"]
#' ids <- target_cell$contributing_leaf_ids[[1]]
#'
#' val_total_a <- sum(microdata[leaf_id %in% ids, turnover])
#' # Result: 150 (Records: aa1_a [100] + B_a [50])
#' @return an integer vector of leaf_ids matching the 'leaf_id' column in [hier_grid()]
hier_create_ids <- function(data, dims) {
  # validate inputs
  if (!inherits(data, "data.table")) {
    stop("'data' must be a data.table.", call. = FALSE)
  }

  v_names <- names(dims)
  if (is.null(v_names) || any(v_names == "")) {
    stop("'dims' must be a fully named list where names match columns in 'data'.", call. = FALSE)
  }

  if (!all(v_names %in% names(data))) {
    missing_vars <- v_names[!v_names %in% names(data)]
    stop(sprintf(
      "The following variables in 'dims' were not found in 'data': %s",
      paste(missing_vars, collapse = ", ")
    ), call. = FALSE)
  }

  # prepare sets of required (base-level) leafs and dimension sizes
  dim_sizes <- integer(length(dims))
  local_idx_list <- list()

  for (i in seq_along(dims)) {
    # et basge-level (lowest) leaves using the same recursive logic as in hier_grid
    raw_leaves <- rcpp_get_leaves_list(dims[[i]])
    terminals <- sort(unique(unlist(raw_leaves)), method = "radix")
    dim_sizes[i] <- length(terminals)

    # match microdata column (using list names) to these indices
    col_name <- v_names[i]
    idx <- match(data[[col_name]], terminals)

    if (any(is.na(idx))) {
      warning(sprintf(
        "Some values in column '%s' are not terminal leaves. They will result in NA leaf_ids.",
        col_name
      ))
    }
    local_idx_list[[i]] <- idx
  }

  # linearization multipliers (column-major)
  multipliers <- c(1, cumprod(dim_sizes[-length(dim_sizes)]))

  # calculate ids
  idx_matrix <- do.call(cbind, local_idx_list)
  leaf_ids <- as.integer((idx_matrix - 1) %*% multipliers + 1)

  return(leaf_ids)
}

Try the sdcHierarchies package in your browser

Any scripts or data that you put into this service are public.

sdcHierarchies documentation built on March 20, 2026, 1:06 a.m.