reduce_dims: General function that selects the appropriate separator and applies dimension reduction

View source: R/sp_reduce_dims.R

reduce_dims — R Documentation

General function that selects the appropriate separator and applies dimension reduction.

Description

General function that selects the appropriate separator and applies dimension reduction.

Usage

reduce_dims(
  dfs,
  dfs_name,
  totcode,
  hrcfiles = NULL,
  sep_dir = FALSE,
  hrc_dir = "hrc_alt",
  vars_to_merge = NULL,
  nb_tab_option = "min",
  limit = NULL,
  over_split = FALSE,
  vec_sep = c("___", "_XXX_", "_YYY_", "_TTT_", "_UVW_"),
  verbose = FALSE
)

Arguments

dfs

data.frame with 4 or 5 categorical variables

dfs_name

name of the data.frame in the list provided by the user

totcode

named vector of totals for categorical variables

hrcfiles

named vector indicating the hrc files of hierarchical variables among the categorical variables of dfs

sep_dir

if TRUE, forces the hrc files to be written into a separate folder (see hrc_dir); default is FALSE

hrc_dir

folder to write hrc files if writing to a new folder is forced or if no folder is specified in hrcfiles

vars_to_merge

NULL or vector of variables to be merged: 2 in dimension 4; 3 or 4 in dimension 5

nb_tab_option

strategy to follow for choosing variables automatically:

  • 'min': minimize the number of tables;

  • 'max': maximize the number of tables;

  • 'smart': minimize the number of tables under the constraint of their row count.

limit

maximum number of rows allowed per table; used when nb_tab_option = 'smart' or over_split = TRUE

over_split

if TRUE, the tables that are still bigger than limit at the end of the reduction process are split into several smaller tables; this decreases the number of hierarchies in these tables

vec_sep

vector of candidate separators to use

verbose

print the different steps of the function to inform the user of progress

Value

A list containing:

  • tabs: named list of 3-dimensional dataframes with nested hierarchies

  • alt_hrc: named list of hrc specific to the variables created during merging to go to dimension 3

  • alt_totcode: named list of totals specific to the variables created during merging to go to dimension 3

  • vars: categorical variables of the output dataframes

  • sep: separator used to link the variables

  • totcode: named vector of totals for all categorical variables

  • hrcfiles: named vector of hrc files for the categorical variables (except the merged ones)

  • fus_vars: named vector of vectors representing the merged variables during dimension reduction

Examples

library(dplyr)
# Examples for dimension 4

# Cross every modality of the four categorical variables (ACT, GEO, SEX,
# AGE) and give each resulting row the same VALUE of 1.
# expand.grid() already returns a data.frame, so no conversion is needed.
data <- expand.grid(
  ACT = c(
    "Total", "A", "B", "A1", "A2", "A3", "B1", "B2",
    "B3", "B4", "C", "D", "E", "F", "G", "B5"
  ),
  GEO = c("Total", "G1", "G2"),
  SEX = c("Total", "F", "M"),
  AGE = c("Total", "AGE1", "AGE2"),
  stringsAsFactors = FALSE
) %>%
  mutate(VALUE = 1)

# Make sure the folder receiving the example hrc files exists.
if (!dir.exists("hrc")) {
  dir.create("hrc")
}
hrc_act <- "hrc/hrc_ACT4.hrc"

# ACT hierarchy: "Total" has children A to G; A1-A3 sit under A and
# B1-B5 under B. The hierarchy is converted with as = "argus", then:
#   - slice(-1) drops the first row (presumably the root entry -- confirm
#     against the sdcHierarchies output);
#   - substring(..., 3) strips the first two characters of the
#     concatenated level and name columns;
#   - the single remaining column is written without header, row names or
#     quotes.
sdcHierarchies::hier_create(root = "Total", nodes = LETTERS[1:7]) %>%
  sdcHierarchies::hier_add(root = "A", nodes = paste0("A", 1:3)) %>%
  sdcHierarchies::hier_add(root = "B", nodes = paste0("B", 1:5)) %>%
  sdcHierarchies::hier_convert(as = "argus") %>%
  slice(-1) %>%
  mutate(levels = substring(paste0(level, name), 3)) %>%
  select(levels) %>%
  write.table(
    file = hrc_act,
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
  )

# Go from 4 to 3 dimensions by explicitly merging ACT and GEO
# (vars_to_merge); hrc files are written to the separate folder "output".
res1 <- reduce_dims(
  dfs = data,
  dfs_name = "tab",
  totcode = c(SEX = "Total", AGE = "Total", GEO = "Total", ACT = "Total"),
  hrcfiles = c(ACT = hrc_act),
  vars_to_merge = c("ACT", "GEO"),
  hrc_dir = "output",
  sep_dir = TRUE,
  verbose = TRUE
)

# Let the function choose the variables to merge ("smart" strategy: no
# vars_to_merge is supplied) and, with over_split = TRUE, split the
# resulting tables that are bigger than `limit` rows.
res1b <- reduce_dims(
  dfs = data,
  dfs_name = "tab",
  totcode = c(SEX = "Total", AGE = "Total", GEO = "Total", ACT = "Total"),
  hrcfiles = c(ACT = hrc_act),
  sep_dir = TRUE,
  hrc_dir = "output",
  nb_tab_option = "smart",
  limit = 100,
  over_split = TRUE,
  verbose = TRUE
)

# Automatic choice of the variables to merge with the default strategy
# (nb_tab_option = "min"), which minimizes the number of created tables.
res2 <- reduce_dims(
  dfs = data,
  dfs_name = "tab",
  totcode = c(SEX = "Total", AGE = "Total", GEO = "Total", ACT = "Total"),
  hrcfiles = c(ACT = hrc_act),
  hrc_dir = "output",
  sep_dir = TRUE,
  verbose = TRUE
)

# Automatic choice of the variables to merge with the "max" strategy,
# which maximizes the number of created tables.
res3 <- reduce_dims(
  dfs = data,
  dfs_name = "tab",
  totcode = c(SEX = "Total", AGE = "Total", GEO = "Total", ACT = "Total"),
  hrcfiles = c(ACT = hrc_act),
  nb_tab_option = "max",
  hrc_dir = "output",
  sep_dir = TRUE,
  verbose = TRUE
)

# Example for dimension 5

# Cross every modality of the five categorical variables (ACT, GEO, SEX,
# AGE, ECO); VALUE numbers the rows from 1 to the number of rows.
# expand.grid() already returns a data.frame, so no conversion is needed.
data <- expand.grid(
  ACT = c(
    "Total_A",
    paste0("A", 1:5, "_"),
    paste0("A1_", 1:7),
    paste0("A2_", 1:9)
  ),
  GEO = c(
    "Total_G", "GA", "GB", "GA1", "GA2",
    "GB1", "GB2", "GA3", "GB3", "GB4"
  ),
  SEX = c("Total_S", "F", "M", "F1", "F2", "M1", "M2"),
  AGE = c("Ensemble", "AGE1", "AGE2", "AGE11", "AGE12", "AGE21", "AGE22"),
  ECO = c("PIB", "Ménages", "Entreprises"),
  stringsAsFactors = FALSE,
  KEEP.OUT.ATTRS = FALSE
) %>%
  mutate(VALUE = seq_len(n()))

hrc_act <- "hrc/hrc_ACT5.hrc"

# ACT hierarchy: "Total_A" has children A1_ to A5_; A1_1-A1_7 sit under
# A1_ and A2_1-A2_9 under A2_. After conversion with as = "argus", the
# first row is dropped, the first two characters of level + name are
# stripped, and the resulting single column is written as plain text.
sdcHierarchies::hier_create(
  root = "Total_A",
  nodes = paste0("A", 1:5, "_")
) %>%
  sdcHierarchies::hier_add(root = "A1_", nodes = paste0("A1_", 1:7)) %>%
  sdcHierarchies::hier_add(root = "A2_", nodes = paste0("A2_", 1:9)) %>%
  sdcHierarchies::hier_convert(as = "argus") %>%
  slice(-1) %>%
  mutate(levels = substring(paste0(level, name), 3)) %>%
  select(levels) %>%
  write.table(
    file = hrc_act,
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
  )

hrc_age <- "hrc/hrc_AGE5.hrc"

# AGE hierarchy: "Ensemble" has children AGE1 and AGE2; AGE11/AGE12 sit
# under AGE1 and AGE21/AGE22 under AGE2. After conversion with
# as = "argus", the first row is dropped, the first two characters of
# level + name are stripped, and the single column is written as plain text.
sdcHierarchies::hier_create(
  root = "Ensemble",
  nodes = paste0("AGE", 1:2)
) %>%
  sdcHierarchies::hier_add(root = "AGE1", nodes = paste0("AGE1", 1:2)) %>%
  sdcHierarchies::hier_add(root = "AGE2", nodes = paste0("AGE2", 1:2)) %>%
  sdcHierarchies::hier_convert(as = "argus") %>%
  slice(-1) %>%
  mutate(levels = substring(paste0(level, name), 3)) %>%
  select(levels) %>%
  write.table(
    file = hrc_age,
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
  )

hrc_geo <- "hrc/hrc_GEO5.hrc"

# GEO hierarchy: "Total_G" has children GA and GB; GA1-GA3 sit under GA
# and GB1-GB4 under GB. After conversion with as = "argus", the first row
# is dropped, the first two characters of level + name are stripped, and
# the single column is written as plain text.
sdcHierarchies::hier_create(
  root = "Total_G",
  nodes = c("GA", "GB")
) %>%
  sdcHierarchies::hier_add(root = "GA", nodes = paste0("GA", 1:3)) %>%
  sdcHierarchies::hier_add(root = "GB", nodes = paste0("GB", 1:4)) %>%
  sdcHierarchies::hier_convert(as = "argus") %>%
  slice(-1) %>%
  mutate(levels = substring(paste0(level, name), 3)) %>%
  select(levels) %>%
  write.table(
    file = hrc_geo,
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
  )

# Results of the function
# Default strategy (nb_tab_option = "min"), with hrc files supplied for
# ACT, GEO and AGE.
res4 <- reduce_dims(
  dfs = data,
  dfs_name = "tab",
  totcode = c(
    SEX = "Total_S", AGE = "Ensemble", GEO = "Total_G",
    ACT = "Total_A", ECO = "PIB"
  ),
  hrcfiles = c(ACT = hrc_act, GEO = hrc_geo, AGE = hrc_age),
  hrc_dir = "output",
  sep_dir = TRUE,
  verbose = TRUE
)

# "smart" strategy: minimize the number of tables under the constraint of
# at most `limit` rows; hrc files are supplied for ACT and GEO only.
res5 <- reduce_dims(
  dfs = data,
  dfs_name = "tab",
  totcode = c(
    SEX = "Total_S", AGE = "Ensemble", GEO = "Total_G",
    ACT = "Total_A", ECO = "PIB"
  ),
  hrcfiles = c(ACT = hrc_act, GEO = hrc_geo),
  nb_tab_option = "smart",
  limit = 1300,
  hrc_dir = "output",
  sep_dir = TRUE,
  verbose = TRUE
)

# "min" strategy combined with over_split = TRUE: tables bigger than
# `limit` rows at the end of the reduction are split into several tables.
res6 <- reduce_dims(
  dfs = data,
  dfs_name = "tab",
  totcode = c(
    SEX = "Total_S", AGE = "Ensemble", GEO = "Total_G",
    ACT = "Total_A", ECO = "PIB"
  ),
  hrcfiles = c(ACT = hrc_act, GEO = hrc_geo),
  nb_tab_option = "min",
  limit = 4470,
  over_split = TRUE,
  hrc_dir = "output",
  sep_dir = TRUE,
  verbose = TRUE
)

InseeFrLab/rtauargus documentation built on Feb. 25, 2025, 6:32 a.m.