R/sp_from_5_to_3.R

Defines functions from_5_to_3 nb_nodes

Documented in from_5_to_3

# Count the number of nodes in a hierarchical file
# Expects 2 arguments:
# - Either a named list and a variable,
# - Or an hrc (hierarchical file) and hrc_name = FALSE
nb_nodes <- function(hrcfiles, v = NULL, hrc_name = TRUE) {
  # Check if the variable has an associated hrc file or if hrc_name == FALSE
  if (hrc_name && !(v %in% names(hrcfiles)) || (!hrc_name && is.null(hrcfiles))) {
    # Non-hierarchical variable or hrcfiles == NULL
    return(1)
  }

  # Take the specified file if hrc_name = TRUE, otherwise take the hrc directly provided
  hrc <- ifelse(hrc_name, hrcfiles[[v]], hrcfiles)

  # Unimportant value for the following steps
  total <- "This_Is_My_Total"

  # Convert to hierarchy
  res_sdc <- sdcHierarchies::hier_import(inp = hrc, from = "hrc", root = total) %>%
    sdcHierarchies::hier_convert(as = "sdc")

  # Return the number of nodes
  return(length(res_sdc$dims))
}

#' Function reducing from 5 to 3 categorical variables
#'
#' @param dfs data.frame with 5 categorical variables (n >= 3 in the general case)
#' @param dfs_name name of the data.frame in the list provided by the user
#' @param totcode named vector of totals for categorical variables
#' @param hrcfiles named vector indicating the hrc files of hierarchical variables
#' among the categorical variables of dfs
#' @param sep_dir allows forcing the writing of hrc files in a separate folder
#' defaulted to FALSE
#' @param hrc_dir folder where to write the hrc files if forcing the writing
#' in a new folder or if no folder is specified in hrcfiles
#' @param v1 allows forcing the value of the first variable to merge
#' when reducing from 5 to 4 dimensions, not specified by default (NULL)
#' @param v2 allows forcing the value of the second variable to merge
#' when reducing from 5 to 4 dimensions, not specified by default (NULL)
#' @param v3 allows forcing the value of the first variable to merge
#' when reducing from 4 to 3 dimensions, not specified by default (NULL)
#' @param v4 allows forcing the value of the second variable to merge
#' when reducing from 4 to 3 dimensions, not specified by default (NULL)
#' @param sep separator used during concatenation of variables
#' @param maximize_nb_tabs specifies whether to prefer selecting hierarchical variables with
#' the most nodes as a priority (TRUE), which generates more tables
#' but of smaller size, or non-hierarchical variables with the least modality (FALSE)
#' to create fewer tables
#' @param verbose prints the different steps of the function to notify
#' the user of the progress, mainly for the general function gen_tabs_5_4_to_3()
#'
#' @return a list containing the following components:
#' \itemize{
#'   \item `tabs`: named list of dataframes with 3 dimensions
#'   (n-2 dimensions in the general case) endowed with nested hierarchies
#'   \item `hrcs5_4`: named list of hrc specific to the variable created
#'   via the merge when reducing from 5 to 4 dimensions
#'   \item `hrcs4_3`: named list of hrc specific to the variable created
#'   via the merge when reducing from 4 to 3 dimensions
#'   \item `alt_tot5_4`: named list of totals when reducing from 5 to 4 dimensions
#'   \item `alt_tot4_3`: named list of totals when reducing from 4 to 3 dimensions
#'   \item `vars`: named list of vectors representing the merged variables
#'   during the two steps of dimension reduction
#' }
#'
#' @examples
#' library(dplyr)
#' data <- expand.grid(
#'   ACT = c("Total", "A", "B", "A1", "A2", "B1", "B2"),
#'   GEO = c("Total", "GA", "GB", "GA1", "GA2", "GB1", "GB2"),
#'   SEX = c("Total", "F", "M","F1","F2","M1","M2"),
#'   AGE = c("Total", "AGE1", "AGE2", "AGE11", "AGE12", "AGE21", "AGE22"),
#'   ECO = c("PIB","Ménages","Entreprises"),
#'   stringsAsFactors = FALSE,
#'   KEEP.OUT.ATTRS = FALSE
#' ) %>%
#'   as.data.frame()
#'
#' data <- data %>% mutate(VALUE = 1:n())
#'
#' hrc_act <- "hrc_ACT.hrc"
#' sdcHierarchies::hier_create(root = "Total", nodes = c("A","B")) %>%
#'   sdcHierarchies::hier_add(root = "A", nodes = c("A1","A2")) %>%
#'   sdcHierarchies::hier_convert(as = "argus") %>%
#'   slice(-1) %>%
#'   mutate(levels = substring(paste0(level,name),3)) %>%
#'   select(levels) %>%
#'   write.table(file = hrc_act, row.names = FALSE, col.names = FALSE, quote = FALSE)
#'
#' hrc_geo <- "hrc_GEO.hrc"
#' sdcHierarchies::hier_create(root = "Total", nodes = c("GA","GB")) %>%
#'   sdcHierarchies::hier_add(root = "GA", nodes = c("GA1","GA2")) %>%
#'   sdcHierarchies::hier_add(root = "GB", nodes = c("GB1","GB2")) %>%
#'   sdcHierarchies::hier_convert(as = "argus") %>%
#'   slice(-1) %>%
#'   mutate(levels = substring(paste0(level,name),3)) %>%
#'   select(levels) %>%
#'   write.table(file = hrc_geo, row.names = FALSE, col.names = FALSE, quote = FALSE)
#'
#' hrc_sex <- "hrc_SEX.hrc"
#' sdcHierarchies::hier_create(root = "Total", nodes = c("F","M")) %>%
#'   sdcHierarchies::hier_add(root = "F", nodes = c("F1","F2")) %>%
#'   sdcHierarchies::hier_add(root = "M", nodes = c("M1","M2")) %>%
#'   sdcHierarchies::hier_convert(as = "argus") %>%
#'   slice(-1) %>%
#'   mutate(levels = substring(paste0(level,name),3)) %>%
#'   select(levels) %>%
#'   write.table(file = hrc_sex, row.names = FALSE, col.names = FALSE, quote = FALSE)
#'
#' # Results of the function
#' res1 <- from_5_to_3(
#'   dfs = data,
#'   dfs_name = "tab",
#'   totcode = c(SEX="Total",AGE="Total", GEO="Total", ACT="Total", ECO = "PIB"),
#'   hrcfiles = c(ACT = hrc_act, GEO = hrc_geo, SEX = hrc_sex),
#'   sep_dir = TRUE,
#'   hrc_dir = "output",
#'   v1 = "ACT",
#'   v2 = "AGE",
#'   v3 = "SEX",
#'   v4 = "ECO"
#' )
#'
#' res2 <- from_5_to_3(
#'   dfs = data,
#'   dfs_name = "tab",
#'   totcode = c(SEX="Total",AGE="Total", GEO="Total", ACT="Total", ECO = "PIB"),
#'   hrcfiles = c(ACT = hrc_act, GEO = hrc_geo, SEX = hrc_sex),
#'   sep_dir = TRUE,
#'   hrc_dir = "output",
#'   verbose = TRUE
#' )
#' @keywords internal
#' @export
from_5_to_3 <- function(
    dfs,
    dfs_name,
    totcode,
    hrcfiles = NULL,
    sep_dir = FALSE,
    hrc_dir = "hrc_alt",
    v1 = NULL,
    v2 = NULL,
    v3 = NULL,
    v4 = NULL,
    sep = "_",
    maximize_nb_tabs = FALSE,
    verbose = FALSE)
{
  # Update the output folder containing the hierarchies
  if( (length(hrcfiles) != 0) & !sep_dir){
    dir_name <- dirname(hrcfiles[[1]])
  } else {
    dir_name <- hrc_dir
  }

  # We remove a dimension from our starting dataframe
  res_5_4 <- from_4_to_3(dfs = dfs,
                                 dfs_name = dfs_name,
                                 totcode = totcode,
                                 hrcfiles = hrcfiles,
                                 sep_dir = TRUE,
                                 hrc_dir = dir_name,
                                 v1 = v1,
                                 v2 = v2,
                                 sep = sep,
                                 maximize_nb_tabs = maximize_nb_tabs)
  if (verbose){
    cat(paste(dfs_name,"has generated",length(res_5_4$tabs),"tables in total\n"))
    cat("Reducing from 4 to 3...\n")
  }

  # Retrieving the merged variables
  v1f <- res_5_4$vars[[1]]
  v2f <- res_5_4$vars[[2]]
  new_var = paste(v1f, v2f, sep=sep)

  # Updating the totals
  totcode2 <- totcode
  totcode2 <- totcode2[!(names(totcode2) %in% c(v1f, v2f))]
  # totcode2[[new_var]] <- 1

  # Updating hrc files
  hrcfiles2 <- hrcfiles
  hrcfiles2 <- hrcfiles2[!(names(hrcfiles2) %in% c(v1f, v2f))]

  # Categorical variables without hierarchy in our 4D tables
  cat_vars <- c(names(totcode2),new_var)

  non_hier_vars <- intersect(
    setdiff(names(dfs), names(hrcfiles2)),
    cat_vars
  )

  # Choice of variables for the 4 -> 3 transition and verification of those provided in argument
  # We now choose v3 and v4 to be sure that the same variable
  # is created within all the sub-tables

  # First variable for the 4 to 3 transition
  if (!is.null(v3)){
    if (!(v3 %in% cat_vars)){
      stop(paste("v3 is not a categorical variable, v3 = ", v3,
                 "The categorical variables are: ",paste(cat_vars, collapse = ", ")), sep = "")
    }
  } else {
    # we choose a variable avoiding v4
    v3 <- chose_var_to_merge(dfs = dfs[setdiff(names(dfs),v4)],
                      totcode = totcode2[setdiff(names(totcode2),v4)],
                      hrcfiles = hrcfiles2[setdiff(names(hrcfiles2),v4)],
                      maximize_nb_tabs = maximize_nb_tabs)

    if (!is.null(v4)){
      # We need to do two different if statements otherwise NULL != new_var crashes!
      if (v4 != new_var & maximize_nb_tabs == TRUE){
        v3 <- new_var
      }
      # If v4 = NULL no need to compare v4 != new_var
    } else if (maximize_nb_tabs == TRUE){
      v3 <- new_var
    }
  }

  # Second variable for the 4 to 3 transition
  if (!is.null(v4)){
    if (!(v4 %in% cat_vars)){
      stop(paste("v4 is not a categorical variable, v4 = ", v4,
                 "The categorical variables are: ",paste(cat_vars, collapse = ", ")), sep = "")
    }
    if (v3 == v4){
      stop("Error. You are trying to merge a variable with itself")
    }

  } else {
    # we choose a variable avoiding v3
    v4 <- chose_var_to_merge(dfs = dfs[setdiff(names(dfs),v3)],
                      totcode = totcode2[setdiff(names(totcode2),v3)],
                      hrcfiles = hrcfiles2[setdiff(names(hrcfiles2),v3)],
                      maximize_nb_tabs = maximize_nb_tabs)

    # Rq : v3 can not be NULL
    if (v3 != new_var & maximize_nb_tabs == TRUE){
      v4 <- new_var
    }
  }

  appel_4_3_gen <- function(nom_dfsb){
    # Update the arguments of the function
    dfsb <- res_5_4$tabs[[nom_dfsb]]

    hrcfiles2b <-  c(hrcfiles2, res_5_4$hrcs[[nom_dfsb]])
    names(hrcfiles2b)[length(hrcfiles2b)] <- new_var

    totcode2[[new_var]] <- res_5_4$alt_tot[[nom_dfsb]]

    from_4_to_3(dfs = dfsb,
                        dfs_name = nom_dfsb,
                        totcode = totcode2,
                        hrcfiles = hrcfiles2b,
                        sep_dir = TRUE,
                        hrc_dir = dir_name,
                        v1 = v3,
                        v2 = v4,
                        sep = sep)
  }

  # Transform all our 4-var tables into 3-var tables
  res_5_3 <- lapply(
    names(res_5_4$tabs),
    appel_4_3_gen
  )

  tabs <- unlist(lapply(res_5_3, function(x) x$tabs), recursive = FALSE)
  hrcs4_3 <- unlist(lapply(res_5_3, function(x) x$hrcs), recursive = FALSE)
  alt_tot4_3 <- unlist(lapply(res_5_3, function(x) x$alt_tot), recursive = FALSE)

  vars1 <- res_5_4$vars
  vars2 <- res_5_3[[1]]$vars # merged variables are always the same
  vars_tot <- list(vars1,vars2)
  names(vars_tot) <- c("five_to_three","four_to_three")

  # Memorization of res5_4

  # Case we merge 4 different variables
  if (!(new_var %in% c(v3,v4))){
    # We repeat as many times res5_4[i] as the table will create
    # 3-dimensional tables

    # Each 4-dimensional table will create the same number of 3-dimensional tables
    # because the selected variables have the same modes in each of them
    nb_rep <- length(tabs) / length(res_5_4$tabs)
    hrcs5_4 <- as.list(unlist(lapply(res_5_4$hrcs,
                                     function(x) rep(x,nb_rep))))

    alt_tot5_4 <- as.list(unlist(lapply(res_5_4$alt_tot,
                                        function(x) rep(x,nb_rep))))

    # If we merge 3 variables into one, the number of tables
    # created by each table changes!
  } else {
    # Store the name of the variable that is not new_var in a new object
    non_fused_var <- ifelse(v3 == new_var, v4, v3)

    # Calculate the value of nb_nodes once for each res_5_4$hrcs[[x]]
    # to avoid calculating the same quantity twice
    results <- lapply(1:length(res_5_4$hrcs), function(x) {
      nb_node_value <- 2 * nb_nodes(res_5_4$hrcs[[x]], hrc_name = FALSE) *
                           nb_nodes(hrcfiles2, non_fused_var)

      # Use the calculated value for hrcs5_4 and alt_tot5_4
      list(
        hrcs = rep(res_5_4$hrcs[[x]], nb_node_value),
        alt_tot = rep(res_5_4$alt_tot[[x]], nb_node_value)
      )
    })

    # Extract the values for hrcs5_4 and alt_tot5_4
    hrcs5_4 <- as.list(unlist(lapply(results, function(x) x$hrcs)))
    alt_tot5_4 <- as.list(unlist(lapply(results, function(x) x$alt_tot)))
  }

  return(list(tabs = tabs,
              hrcs5_4 = hrcs5_4,
              hrcs4_3 = hrcs4_3,
              alt_tot5_4 = alt_tot5_4,
              alt_tot4_3 = alt_tot4_3,
              vars = vars_tot)
  )
}
InseeFrLab/rtauargus documentation built on Feb. 25, 2025, 6:32 a.m.