Nothing
#' Apply Threshold-Based Masking to a Data Frame
#'
#' @description
#' The `mask_table` function applies threshold-based masking to specified columns in a data frame.
#' It uses the `mask_counts` function to mask counts that are below a certain threshold, adhering
#' to data privacy requirements. The function can handle grouped data and calculate percentages if required.
#' It ensures convergence by checking specific criteria after each iteration.
#'
#' @param data A data frame containing the counts to be masked. Must be a data frame.
#' @param threshold A positive numeric value specifying the threshold below which values must be suppressed. Default is 11.
#' @param col_groups A character vector or a list of character vectors, where each character vector specifies columns in `data` to which masking should be applied.
#' @param group_by An optional character string specifying a column name in `data` to group the data by before masking.
#' @param overwrite_columns Logical; if `TRUE`, the original columns are overwritten with masked counts. If `FALSE`, new columns are added with masked counts. Default is `TRUE`.
#' @param percentages Logical; if `TRUE`, percentages are calculated and masked accordingly. Default is `FALSE`.
#' @param perc_decimal = A positive numeric value specifying the decimals for percentages. Default is 0.
#' @param zero_masking Logical; if `TRUE`, zeros can be masked as secondary cells when present. Passed to `mask_counts`. Default is `FALSE`.
#' @param secondary_cell Character string specifying the method for selecting secondary cells when necessary. Options are `"min"`, `"max"`, or `"random"`. Passed to `mask_counts`. Default is `"min"`.
#' @param .verbose Logical; if `TRUE`, progress messages are printed during masking. Default is `FALSE`.
#'
#' @return A data frame with masked counts in specified columns. If `percentages = TRUE`, additional columns with percentages are added. The structure of the returned data frame depends on the `overwrite_columns` parameter.
#'
#' @examples
#' data("countmaskr_data")
#'
#' aggregate_table <- countmaskr_data %>%
#' select(-c(id, age)) %>%
#' gather(block, Characteristics) %>%
#' group_by(block, Characteristics) %>%
#' summarise(N = n()) %>%
#' ungroup()
#'
#' mask_table(aggregate_table,
#' group_by = "block",
#' col_groups = list("N")
#' )
#'
#' mask_table(aggregate_table,
#' group_by = "block",
#' col_groups = list("N"),
#' overwrite_columns = FALSE,
#' percentages = TRUE
#' )
#'
#' countmaskr_data %>%
#' count(race, gender) %>%
#' pivot_wider(names_from = gender, values_from = n) %>%
#' mutate(across(all_of(c("Male", "Other")), ~ ifelse(is.na(.), 0, .)),
#' Overall = Female + Male + Other, .after = 1
#' ) %>%
#' countmaskr::mask_table(.,
#' col_groups = list(c("Overall", "Female", "Male", "Other")),
#' overwrite_columns = TRUE,
#' percentages = FALSE
#' )
#'
#' @seealso \code{\link{mask_counts}}
#' @importFrom tibble tibble
#' @importFrom dplyr bind_rows
#' @importFrom tidyr gather pivot_wider
#' @export
mask_table <- function(data,
threshold = 11,
col_groups,
group_by = NULL,
overwrite_columns = TRUE,
percentages = FALSE,
perc_decimal = 0,
zero_masking = FALSE,
secondary_cell = "min",
.verbose = FALSE) {
# Input validation
if (!is.data.frame(data)) {
stop("Argument 'data' must be a data frame.")
}
if (!is.numeric(threshold) || length(threshold) != 1 || threshold <= 0) {
stop("Argument 'threshold' must be a positive numeric value.")
}
if (!is.logical(overwrite_columns) || length(overwrite_columns) != 1) {
stop("Argument 'overwrite_columns' must be a logical value (TRUE or FALSE).")
}
if (!is.logical(percentages) || length(percentages) != 1) {
stop("Argument 'percentages' must be a logical value (TRUE or FALSE).")
}
if (!is.logical(zero_masking) || length(zero_masking) != 1) {
stop("Argument 'zero_masking' must be a logical value (TRUE or FALSE).")
}
if (!secondary_cell %in% c("min", "max", "random")) {
stop("Argument 'secondary_cell' must be 'min', 'max', or 'random'.")
}
if (!is.logical(.verbose) || length(.verbose) != 1) {
stop("Argument '.verbose' must be a logical value (TRUE or FALSE).")
}
if (percentages && !is.numeric(perc_decimal)) {
stop("Argument 'perc_decimal' must be a positive numeric value.")
}
# Validate 'group_by'
if (!is.null(group_by)) {
if (!is.character(group_by) || length(group_by) != 1 || !group_by %in% names(data)) {
stop("Argument 'group_by' must be a single column name present in 'data'.")
}
}
# Requesting percentages will always return both original and masked percentages
if (percentages) {
overwrite_columns <- FALSE
}
# Ensure col_groups is a list of vectors of column names
if (!is.list(col_groups)) {
if (!is.character(col_groups)) {
stop("Argument 'col_groups' must be a character vector or a list of character vectors.")
}
col_groups <- list(col_groups)
}
# Check if all columns in 'col_groups' exist in 'data'
all_col_groups <- unique(unlist(col_groups))
if (!all(all_col_groups %in% names(data))) {
missing_cols <- all_col_groups[!all_col_groups %in% names(data)]
stop(paste("Some columns in 'col_groups' do not exist in 'data':", paste(missing_cols, collapse = ", ")))
}
# Split data into blocks if group_by is specified
if (!is.null(group_by)) {
data_blocks <- split(data, data[[group_by]])
} else {
data_blocks <- list(data)
}
# Initialize list to store masked data blocks
masked_blocks <- list()
# Loop over each block
for (block_name in seq_along(data_blocks)) {
block_data <- data_blocks[[block_name]]
if (isTRUE(.verbose)) {
message(paste0("Starting masking for block: ", names(data_blocks)[block_name]))
}
# Loop over each group in col_groups
for (group in col_groups) {
if (!all(group %in% names(block_data))) {
stop(paste("Some columns in 'col_groups' do not exist in the data block:", paste(group, collapse = ", ")))
}
if (isTRUE(.verbose)) {
message(paste0("Masking columns: ", paste(group, collapse = ", ")))
}
# Extract the counts for the current group
original_counts <- block_data[, group, drop = FALSE]
# Initialize masked_counts
masked_counts <- original_counts
# Start the repeat loop for masking
repeat {
if (isTRUE(.verbose)) {
message("Applying mask_counts across columns...")
}
# Apply mask_counts across columns
across_column_mask <- apply(
masked_counts,
MARGIN = 2,
mask_counts,
threshold = threshold,
zero_masking = zero_masking,
secondary_cell = secondary_cell
)
# Ensure the result is a data frame
across_column_mask <- data.frame(across_column_mask, check.names = FALSE)
if (isTRUE(.verbose)) {
message("Applying mask_counts across rows...")
}
# Apply mask_counts across rows
if(length(group) > 1) {
across_row_mask <- t(apply(
across_column_mask,
MARGIN = 1,
mask_counts,
threshold = threshold,
zero_masking = zero_masking,
secondary_cell = secondary_cell
))
} else {
across_row_mask <- across_column_mask
}
# Convert to data frame
across_row_mask <- data.frame(across_row_mask, check.names = FALSE)
colnames(across_row_mask) <- colnames(masked_counts)
masked_counts <- across_row_mask
# Check convergence criteria
if (nrow(masked_counts) >= 1) {
# Calculate total masked cells per column
total_masked_cells <- sapply(masked_counts, function(col) sum(grepl("<", col), na.rm = TRUE))
# Calculate total available (unmasked) cells per column
total_available_cells <- sapply(masked_counts, function(col) sum(!grepl("<", col), na.rm = TRUE))
# Calculate total zeros per column
total_zeros <- sapply(masked_counts, function(col) sum(col == "0" | is.na(col), na.rm = TRUE))
# Check if convergence criteria are met
if ((nrow(masked_counts) == 1) ||
(any(total_masked_cells == 1) &&
all(total_available_cells[total_masked_cells == 1] == total_zeros[total_masked_cells == 1])) ||
(!any(total_masked_cells == 1))) {
if (isTRUE(.verbose)) {
message("Convergence criteria met. Exiting loop.")
}
# Overwrite or add masked counts to block_data
if (isTRUE(overwrite_columns)) {
block_data[, group] <- masked_counts
} else {
masked_colnames <- paste0(group, "_masked")
block_data[, masked_colnames] <- masked_counts
}
break # Exit the repeat loop
} else {
if (isTRUE(.verbose)) {
message("Convergence criteria not met. Repeating masking process.")
}
# Continue to the next iteration with updated masked_counts
}
} else {
# No data to process, break the loop
if (isTRUE(.verbose)) {
message("No data to process. Exiting loop.")
}
break
}
}
# Handle percentages if required
if (isTRUE(percentages)) {
if (isTRUE(.verbose)) {
message("Calculating percentages...")
}
# Convert counts to numeric
original_counts_numeric <- data.frame(lapply(original_counts, extract_digits), check.names = FALSE)
masked_counts_numeric <- data.frame(lapply(masked_counts, extract_digits), check.names = FALSE)
# Calculate original totals
original_totals <- colSums(original_counts_numeric, na.rm = TRUE)
# Avoid division by zero
original_totals[original_totals == 0] <- NA
# Calculate original percentages
original_percentages <- sweep(original_counts_numeric, 2, original_totals, FUN = "/") * 100
# Keep unrounded copy for boundary comparisons (0% / 100% edge-case checks)
original_percentages_unrounded <- original_percentages
original_percentages <- round(original_percentages, digits = perc_decimal)
# Create original percentages data frame with suffix '_perc'
original_percentages_char <- matrix(NA_character_, nrow = nrow(original_percentages), ncol = ncol(original_percentages))
colnames(original_percentages_char) <- paste0(colnames(original_counts), "_perc")
# Assign original percentages
original_percentages_char[!is.na(original_percentages)] <- paste0(original_percentages[!is.na(original_percentages)], " %")
original_percentages_char[is.na(original_percentages)] <- NA_character_
# Convert to data frame
original_percentages_df <- data.frame(original_percentages_char, check.names = FALSE)
# Now handle masked percentages
# Calculate masked percentages
masked_percentages <- sweep(masked_counts_numeric, 2, original_totals, FUN = "/") * 100
masked_percentages <- round(masked_percentages, digits = perc_decimal)
# Initialize masked_percentages_char with appropriate dimensions and names
masked_percentages_char <- matrix(NA_character_, nrow = nrow(masked_percentages), ncol = ncol(masked_percentages))
colnames(masked_percentages_char) <- paste0(colnames(masked_counts), "_perc_masked")
# Create logical matrices for conditions
is_masked_sec_cell <- as.matrix(apply(masked_counts, 2, function(col) grepl("<", col) & !grepl(paste0("<", threshold), col)))
is_small_cell <- as.matrix(apply(masked_counts, 2, function(col) grepl(paste0("^<", threshold, "$"), col)))
# Ensure is_na_cell has the same dimensions as other logical matrices
is_na_cell <- as.matrix(is.na(masked_counts_numeric))
# Ensure all matrices have the same dimensions
if (nrow(masked_counts_numeric) == 1) {
# For single row case, ensure all matrices are row matrices
is_masked_sec_cell <- matrix(is_masked_sec_cell, nrow = 1)
is_small_cell <- matrix(is_small_cell, nrow = 1)
is_na_cell <- matrix(is_na_cell, nrow = 1)
# Set column names to match
colnames(is_masked_sec_cell) <- colnames(masked_counts)
colnames(is_small_cell) <- colnames(masked_counts)
colnames(is_na_cell) <- colnames(masked_counts)
}
# Compute dynamic percentage boundary labels based on perc_decimal.
# These apply for any perc_decimal value (0, 1, 2, …):
# min_perc = 1 / 10^perc_decimal (e.g. 1 % for 0, 0.1 % for 1, 0.01 % for 2)
# max_perc = 100 - min_perc (e.g. 99 % for 0, 99.9 % for 1, 99.99 % for 2)
min_perc <- 10^(-perc_decimal)
max_perc <- 100 - min_perc
min_perc_label <- format(min_perc, nsmall = perc_decimal, trim = TRUE)
max_perc_label <- format(max_perc, nsmall = perc_decimal, trim = TRUE)
# Flag: rounded masked percentage is exactly 0 but the underlying count is > 0.
# This catches values too small to display at the chosen decimal precision.
# Applies regardless of masking status (primary, secondary, or unmasked).
is_zero_perc_nonzero_count <- as.matrix(
masked_percentages == 0 & !is.na(masked_percentages) & masked_counts_numeric > 0
)
# Flag: rounded masked percentage is exactly 100 but the TRUE (unrounded) original
# percentage was not 100 – i.e. rounding inflated the display to 100 %.
is_hundred_perc_not_original <- as.matrix(
masked_percentages >= 100 &
!is.na(masked_percentages) &
original_percentages_unrounded != 100
)
# Ensure new flag matrices share dimensions in the single-row edge case
if (nrow(masked_counts_numeric) == 1) {
is_zero_perc_nonzero_count <- matrix(is_zero_perc_nonzero_count, nrow = 1)
colnames(is_zero_perc_nonzero_count) <- colnames(masked_counts)
is_hundred_perc_not_original <- matrix(is_hundred_perc_not_original, nrow = 1)
colnames(is_hundred_perc_not_original) <- colnames(masked_counts)
}
# Assign percentages with priority order:
# 1. Small cells (primary masked cells) -> "masked cell"
masked_percentages_char[is_small_cell] <- "masked cell"
# 2. Secondary masked cells (excluding boundary overrides below) -> "<percentage %"
is_sec_normal <- is_masked_sec_cell & !is_small_cell &
!is_zero_perc_nonzero_count & !is_hundred_perc_not_original
masked_percentages_char[is_sec_normal] <-
paste0("<", masked_percentages[is_sec_normal], " %")
# 3. Unmasked percentages (excluding boundary overrides below) -> "percentage %"
is_unmasked_normal <- !is_masked_sec_cell & !is_na_cell & !is_small_cell &
!is_zero_perc_nonzero_count & !is_hundred_perc_not_original
masked_percentages_char[is_unmasked_normal] <-
paste0(masked_percentages[is_unmasked_normal], " %")
masked_percentages_char[!is_masked_sec_cell & !is_small_cell & is_na_cell] <- NA_character_
# 4. Override – rounded 0 % with non-zero count -> "<min_perc %"
# Applied AFTER steps 1-3 so it overrides "masked cell" too, as the count
# is genuinely present but the magnitude is below the displayable resolution.
masked_percentages_char[is_zero_perc_nonzero_count & !is_small_cell] <-
paste0("<", min_perc_label, " %")
# 5. Override – rounded 100 % but true original is not 100 % -> ">max_perc %"
# Prevents rounding artefacts from implying complete coverage.
masked_percentages_char[
is_hundred_perc_not_original & !is_small_cell
] <-
paste0(">", max_perc_label, " %")
# Convert to data frame
masked_percentages_df <- as.data.frame(masked_percentages_char, check.names = FALSE)
# Add original and masked percentages to block_data
if (isTRUE(overwrite_columns)) {
block_data <- cbind(block_data, masked_percentages_df)
} else {
block_data <- cbind(block_data, original_percentages_df, masked_percentages_df)
}
}
# Overwrite or add masked counts to block_data (if not already done)
if (!isTRUE(overwrite_columns)) {
masked_colnames <- paste0(group, "_masked")
block_data[, masked_colnames] <- masked_counts
}
} # End of col_groups loop
# Add the processed block to masked_blocks
masked_blocks[[block_name]] <- block_data
} # End of data_blocks loop
# Combine all blocks back into a single data frame
masked_data <- tibble::tibble(dplyr::bind_rows(masked_blocks))
return(masked_data)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.