R/format_template.R

Defines functions format_template filter_on_marginal_of_spanning_var get_combinations contains_non_total

Documented in contains_non_total filter_on_marginal_of_spanning_var format_template get_combinations

#' Check for Non-Total Values in a Data Frame
#'
#' This function checks if any column in a data frame contains values
#' that are not part of a given set of totals.
#'
#' @param data A data frame containing the data to be checked.
#' @param totals A vector of values considered as totals.
#'
#' @return A logical value: `TRUE` if at least one non-total value exists, otherwise `FALSE`.
#'
#' @examples
#' \dontrun{
#' df <- data.frame(
#'   col1 = c("A", "B", "TOTAL"),
#'   col2 = c("X", "TOTAL", "TOTAL")
#' )
#' contains_non_total(df, c("TOTAL"))
#' # Returns TRUE
#' }
#'
#' @importFrom dplyr summarise
#'
contains_non_total <- function(data, totals) {
  any(data %>%
        summarise(across(everything(), ~ any(!(.x %in% totals)))) %>%
        unlist())
}

#' Generate All Combinations of spanning variables
#'
#' This function generates all possible combinations of keys from a given named list or vector.
#'
#' @param criteria A named list or vector, where the names represent the keys.
#'
#' @return A list of combinations, where each combination is a character vector of key names.
#'
#' @examples
#' \dontrun{
#' criteria <- list(key1 = "value1", key2 = "value2", key3 = "value3")
#' get_combinations(criteria)
#' # Returns a list of combinations: c("key1"), c("key2"), c("key3"), c("key1", "key2"), ...
#' }
get_combinations <- function(criteria) {
  keys <- names(criteria)
  unlist(lapply(1:length(keys), function(k) combn(keys, k, simplify = FALSE)), recursive = FALSE)
}

#' Filter a Data Frame Based on Marginal Criteria
#'
#' This function filters rows in a data frame based on marginal criteria.
#' It applies filtering conditions to a subset of keys and their corresponding values,
#' and excludes rows where other keys match the given totals.
#'
#' @param data A data frame to be filtered.
#' @param criteria A named list or vector where the names are column names, and the values
#'        are the totals or values to be used in filtering.
#' @param subset_keys A character vector of keys (column names) to be included in the filter criteria.
#'
#' @return A filtered data frame where rows meet the specified criteria and exclude the subset keys.
#'
#' @examples
#' \dontrun{
#' df <- data.frame(
#'   col1 = c("A", "B", "TOTAL"),
#'   col2 = c("X", "TOTAL", "Y"),
#'   col3 = c("Z", "Z", "Z")
#' )
#' criteria <- list(col1 = "TOTAL", col2 = "TOTAL", col3 = "TOTAL")
#' filter_on_marginal_of_spanning_var(df, criteria, subset_keys = c("col1", "col2"))
#' # Filters the data frame based on the criteria and subset_keys.
#' }
filter_on_marginal_of_spanning_var <- function(data, criteria, subset_keys) {
  # Create filter expressions for subset_keys with ==
  filter_expr_in <- purrr::map2(
    subset_keys,
    criteria[subset_keys],
    ~ rlang::expr(!!rlang::sym(.x) == !!.y)
  )
  # Create filter expressions for all other keys with !=
  other_keys <- setdiff(names(criteria), subset_keys)
  filter_expr_not_in <- purrr::map2(
    other_keys,
    criteria[other_keys],
    ~ rlang::expr(!!rlang::sym(.x) != !!.y)
  )
  # Combine the two sets of expressions
  combined_filter_expr <- c(filter_expr_in, filter_expr_not_in)
  # Apply the combined filter
  data %>% filter(!!!combined_filter_expr) %>% select(-!!subset_keys)
}

#' Determines the tables described in a template gathering all the published cells
#'
#' @param data template gathering all the published cells
#' @param indicator_column name of the column in which the indicators are
#' @param spanning_var_tot a named list of the spanning variables and their totals
#' @param field_columns vecotr of all the columns that are fields (ex: year of collect)
#'
#' @return named list of a dataframe describing the tables (metadata) and a list of
#' the modalities of each hierarchical variable (modalities)
#' @export
#'
#' @examples
#' data(enterprise_template)
#'
#' template_formatted <- format_template(
#'   data = enterprise_template,
#'   indicator_column = "INDICATOR",
#'   spanning_var_tot = list(
#'     ACTIVITY = "BTSXO_S94",
#'     NUMBER_EMPL = "_T",
#'     LEGAL_FORM = "_T"),
#'   field_columns = c("TIME_PERIOD")
#' )
#' View(template_formatted$metadata)
#'
#' @importFrom purrr compact
#' @importFrom tidyr pivot_longer
#' @importFrom dplyr everything across all_of pull row_number bind_rows n_distinct
#'
#' @export
format_template <- function(data,indicator_column,spanning_var_tot,field_columns) {
  data <- data %>% mutate(field = apply(across(all_of(field_columns)), 1, paste0, collapse = "_"))
  # each modality of field variables can be treated independently
  list_df_field <- split(data,data$field)
  list_metadata <- purrr::imap(list_df_field, function(df_field,field_name){
    # if there is no link between the indicators, then they can all be treated independently
    list_df_indicator <- split(df_field,df_field %>% select(!!indicator_column))
    list_df_metadata <- purrr::imap(list_df_indicator, function(df_indicator,indicator_name){
      # Step 1: Identify valid columns with more than one unique value
      valid_columns <- df_indicator %>%
        summarise(across(all_of(names(spanning_var_tot)), dplyr::n_distinct)) %>%
        tidyr::pivot_longer(everything(), names_to = "column", values_to = "n_unique") %>%
        filter(n_unique > 1) %>%
        pull(column)
      spanning_totals_valid <- spanning_var_tot[valid_columns]
      df_valid_columns <- df_indicator %>% select(all_of(valid_columns))

      # Step 2 : get all the possible combinations of spanning variables
      combinations <- get_combinations(spanning_totals_valid)
      # Step 3: Filter and keep valid combinations, i.e. combinations with something
      # else than just the total
      tables_crossing_non_totals <- purrr::map(
        combinations,
        ~ {
          # Apply the filter to get the filtered dataframe
          filtered_df <- filter_on_marginal_of_spanning_var(df_valid_columns, spanning_totals_valid, .x)
          # Check if there are non-total values in the filtered dataframe
          if (contains_non_total(filtered_df, unlist(spanning_totals_valid))) {
            return(filtered_df)  # Keep the dataframe if non-total values remain
          } else {
            return(NULL)  # Discard the dataframe if no non-total values remain
          }
        }
      ) %>% compact()

      # Step 4: Name the list of dataframes
      valid_combinations <- purrr::keep(combinations, ~ {
        filtered_df <- filter_on_marginal_of_spanning_var(df_valid_columns, spanning_totals_valid, .x)
        contains_non_total(filtered_df, unlist(spanning_totals_valid))
      })
      names(tables_crossing_non_totals) <- purrr::map_chr(tables_crossing_non_totals, ~ {
        # Get the column names of the current data frame
        remaining_columns <- names(.x)
        # Construct the name based on these columns
        paste0("crossing.", paste(remaining_columns, collapse = ".and."))
      })
      tables_crossing_variables <- purrr::discard(tables_crossing_non_totals, function(single_var) {
        # Compare the modalities only for the dataframes with one column, i.e. one spanning variable
        if (ncol(single_var) == 1) {
          name_var <- names(single_var)
          # Check if the modalities of this column are all included in at least one of
          # the dataframes crossing multiple spanning variables
          any(purrr::map_lgl(tables_crossing_non_totals, function(multiple_var) {
            ncol(multiple_var) > 1 &&
              name_var %in% names(multiple_var) &&
              setequal(single_var %>% dplyr::pull(!!name_var),
                       multiple_var %>% dplyr::pull(!!name_var) %>% unique())
          }))
        } else {
          FALSE
        }
      })
      spanning_description <- purrr::imap_dfr(
        tables_crossing_variables,
        ~ {
          # Count unique values for each column and rename with prefixes
          formatted_counts <- setNames(
            paste0("hrc_", tolower(names(.x)), "_", sapply(.x, function(col) length(unique(col)))),
            names(.x)
          )
          # Extract variable names and formatted counts
          variable_names <- names(formatted_counts) # Spanning variables
          hrc_values <- formatted_counts            # Corresponding hrc values
          unique_modalities <- setNames(
            sapply(.x, function(col) unique(col), simplify = FALSE),
            formatted_counts
          )
          # Create a dynamic data frame
          data.frame(
            # Dynamically create spanning columns
            setNames(as.list(variable_names), paste0("spanning_", seq_along(variable_names))),
            # Dynamically create hrc spanning columns
            setNames(as.list(hrc_values), paste0("hrc_spanning_", seq_along(hrc_values))),
            stringsAsFactors = FALSE
          ) %>% mutate(indicator = indicator_name, unique_modalities = list(unique_modalities))
        }
      )
      spanning_description <- spanning_description %>%
        mutate(field = field_name) %>%
        mutate(
          table_name = paste(
            "table", field_name, indicator_name, row_number(),
            sep = "_"
          )
        )

      modalities <- spanning_description$unique_modalities
      names(modalities) <- spanning_description$table_name

      spanning_description <- spanning_description %>% select(-unique_modalities)

      return(list(
        metadata = spanning_description,
        modalities = modalities
      ))
    })
    # Combine metadata and modalities
    combined_metadata <- do.call(bind_rows, purrr::map(list_df_metadata, "metadata"))
    combined_modalities <- purrr::reduce(purrr::map(list_df_metadata, "modalities"), c)
    return(list(
      metadata = combined_metadata,
      modalities = combined_modalities
    ))
  })

  # Combine all metadata and modalities across fields
  final_metadata <- do.call(bind_rows, purrr::map(list_metadata, "metadata"))
  final_modalities <- purrr::reduce(purrr::map(list_metadata, "modalities"), c)

  return(list(
    metadata = final_metadata %>% select(table_name,field,indicator,everything()),
    modalities = final_modalities
  ))
}
InseeFrLab/rtauargus documentation built on Feb. 25, 2025, 6:32 a.m.