R/tab_rtauargus.R

Defines functions tab_rtauargus2 tab_rtauargus

Documented in tab_rtauargus tab_rtauargus2

#' Protect one table by suppressing cells with Tau-Argus
#'
#' The function prepares all the files needed by Tau-Argus and launches the
#' software with the good settings and gets back the result.
#'
#' @inheritParams tab_rda
#' @inheritParams tab_arb
#' @inheritParams run_arb
#'
#' @param files_name string used to name all the files needed to process.
#' All files will have the same name, only their extension will be different.
#' @param dir_name string indicated the path of the directory in which to save
#' all the files (.rda, .hst, .txt, .arb, .csv) generated by the function.
#' @param unif_labels boolean, if explanatory variables have to be standardized
#' @param split_tab `r lifecycle::badge("experimental")` boolean,
#' whether to reduce dimension to 3 while treating a table of dimension 4 or 5
#' (default to `FALSE`)
#' @param limit `r lifecycle::badge("experimental")` numeric, used to choose
#' which variable to merge (if nb_tab_option = 'smart')
#' and split table with a number of row above this limit in order to avoid
#' tauargus failures
#' @param nb_tab_option `r lifecycle::badge("experimental")` strategy to follow
#' to choose variables automatically while splitting:
#' \itemize{
#'   \item{`"min"`: minimize the number of tables;}
#'   \item{`"max"`: maximize the number of tables;}
#'   \item{`"smart"`: minimize the number of tables under the constraint
#'   of their row count.}
#' }
#' @param ... any parameter of the tab_rda, tab_arb or run_arb functions, relevant
#' for the treatment of tabular.
#'
#' @return
#' If output_type equals to 4 and split_tab = FALSE,
#' then the original tabular is returned with a new
#' column called Status, indicating the status of the cell coming from Tau-Argus :
#' "A" for a primary secret due to frequency rule, "B" for a primary secret due
#' to dominance rule, "D" for secondary secret and "V" for no secret cell.
#'
#' If split_tab = TRUE,
#' then the original tabular is returned with some new columns which are boolean
#' variables indicating the status of a cell at each iteration of the protection
#' process as we get with `tab_multi_manager()` function. `TRUE`
#' denotes a cell that have to be suppressed. The last column is then the
#' final status of the suppression process of the original table.
#'
#' If `split_tab = FALSE` and  `output_type` doesn't equal to `4`,
#' then the raw result from tau-argus is returned.
#'
#' @section Standardization of explanatory variables and hierarchies:
#'
#' The boolean argument `unif_labels` is useful to
#' prevent some common errors in using Tau-Argus. Indeed, Tau-Argus needs that,
#' within a same level of a hierarchy, the labels have the same number of
#' characters. When the argument is set to TRUE, `tab_rtauargus`
#' standardizes the explanatory variables to prevent this issue.
#' Hierarchical explanatory variables (explanatory variables associated to
#' a hrc file) are then modified in the tabular data and an another hrc file is
#' created to be relevant with the tabular. In the output, these modifications
#' are removed.
#'
#' @examples
#'\dontrun{
#' library(dplyr)
#' data(turnover_act_size)
#'
#' # Prepare data with primary secret ----
#' turnover_act_size <- turnover_act_size %>%
#'   mutate(
#'     is_secret_freq = N_OBS > 0 & N_OBS < 3,
#'     is_secret_dom = ifelse(MAX == 0, FALSE, MAX/TOT>0.85),
#'     is_secret_prim = is_secret_freq | is_secret_dom
#'   )
#'
#' # Make hrc file of business sectors ----
#' data(activity_corr_table)
#' hrc_file_activity <- activity_corr_table %>%
#'   write_hrc2(file_name = "hrc/activity")
#'
#' # Compute the secondary secret ----
#' options(
#'   rtauargus.tauargus_exe =
#'     "Y:/Logiciels/TauArgus/TauArgus4.2.3/TauArgus.exe"
#' )
#'
#' res <- tab_rtauargus(
#'   tabular = turnover_act_size,
#'   files_name = "turn_act_size",
#'   dir_name = "tauargus_files",
#'   explanatory_vars = c("ACTIVITY", "SIZE"),
#'   hrc = c(ACTIVITY = hrc_file_activity),
#'   totcode = c(ACTIVITY = "Total", SIZE = "Total"),
#'   secret_var = "is_secret_prim",
#'   value = "TOT",
#'   freq = "N_OBS",
#'   verbose = FALSE
#' )
#'
#' # Reduce dims feature
#'
#' data(datatest1)
#' res_dim4 <- tab_rtauargus(
#'   tabular = datatest1,
#'   dir_name = "tauargus_files",
#'   explanatory_vars = c("A10", "treff","type_distrib","cj"),
#'   totcode = rep("Total", 4),
#'   secret_var = "is_secret_prim",
#'   value = "pizzas_tot_abs",
#'   freq = "nb_obs_rnd",
#'   split_tab = TRUE
#' )
#' }
#' @export
tab_rtauargus <- function(
    tabular,
    explanatory_vars,
    files_name = NULL,
    dir_name = NULL,
    totcode = getOption("rtauargus.totcode"),
    hrc = NULL,
    secret_var = NULL,
    secret_no_pl = NULL,
    cost_var = NULL,
    value = "value",
    freq = "freq",
    ip = 10,
    maxscore = NULL,
    suppress = "MOD(1,5,1,0,0)",
    safety_rules = paste0("MAN(",ip,")"),
    show_batch_console = FALSE,
    output_type = 4,
    output_options = "",
    unif_labels = TRUE,
    split_tab = FALSE,
    nb_tab_option = "smart",
    limit = 14700,
    ...
){

  .dots <- list(...)

  ## 0. CONFLITS PARAMETRES .................

  # tabular not a data.frame
  if(!is.data.frame(tabular)){
    stop("tabular has to be a dataframe.")
  }
  if(any(!explanatory_vars %in% names(tabular))){
    stop("At least one of the explanatory vars is not a tabular's column name")
  }
  if(any(!c(value, freq) %in% names(tabular))){
    stop(paste0(value, " or ", freq, " is not a tabular's column name"))
  }
  if(!is.null(maxscore)){
    if(!maxscore %in% names(tabular)){
      stop(paste0(maxscore, " is not a tabular's column name"))
    }
  }
  if(!is.null(cost_var)){
    if(!cost_var %in% names(tabular)){
      stop(paste0(cost_var, " is not a tabular's column name"))
    }
  }
  if(!is.null(secret_var)){
    if(!secret_var %in% names(tabular)){
      stop(paste0(secret_var, " is not a tabular's column name"))
    }
  }
  if(length(totcode) < length(explanatory_vars)){
    stop("totcode must have the same length as explanatory_vars")
  }
  if(length(names(totcode)) < length(explanatory_vars)){
    names(totcode) <- explanatory_vars
  }

  if(is.null(files_name)) files_name <- "targus_file"
  if(is.null(dir_name)) dir_name <- getwd()

  if (split_tab){
    # detect secret_var = NULL
    # We want to split the table but the primary secret have not been posed
    if ( !grepl("MAN", safety_rules) ){
      stop("While using split_tab = TRUE, you can't use tauargus to put primary secret")
    }
    if ( is.null(secret_var) ){
      stop("While using split_tab = TRUE, a secret_var has to be provided")
    }
    # split_tab strategy only work with dimension 4 or 5 tables
    if ( ! length(explanatory_vars) %in% c(4,5) ){
      stop(
        "You use split_tab = TRUE. However it only works with 4 or 5 dimensions
      tables."
      )
    }
  }

  if (length(explanatory_vars) %in% c(4,5)){
    if (split_tab){

      params_rt4 <- formals(fun = "tab_rtauargus4")
      params_rt4 <- params_rt4[1:(length(params_rt4)-1)]
      call <- sys.call(); call[[1]] <- as.name('list')
      new_params <- eval.parent(call)

      for(param in intersect(names(params_rt4), names(new_params))){
        params_rt4[[param]] <- new_params[[param]]
      }

      params_rt4$tabular <- tabular
      params_rt4$totcode <- totcode
      params_rt4$dir_name <- dir_name
      params_rt4$files_name <- files_name

      return(do.call("tab_rtauargus4", params_rt4))

    } else {
      message("Warning :
It is highly recommended to use split_tab = TRUE when using rtauargus with 4 or 5 dimensions tables.
It allows to split the table in several tables with 3 dimensions.

With split_tab = FALSE, tauargus treats the table in 4 or 5 dimensions.
In this case, the secondary secret may not being optimal according to tauargus itself
and the process may take longer.")
    }
  }


  ## 1. TAB_RDA  .....................
  tabular_original <- tabular
  # uniformisation des chaines de caractères des variables catégorielles, hors total
  # tabular ......................
  if(unif_labels){
    res_unif <- uniformize_labels(tabular, explanatory_vars, hrc, totcode)
    tabular <- res_unif$data
    if(!is.null(hrc)) hrc <- res_unif$hrc_unif
  }

  # parametres
  param_tab_rda <- param_function(tab_rda, .dots)
  param_tab_rda$tabular <- tabular
  param_tab_rda$tab_filename <- file.path(dir_name, paste0(files_name, ".tab"))
  param_tab_rda$rda_filename <- file.path(dir_name, paste0(files_name, ".rda"))
  param_tab_rda$hst_filename <- if(is.null(secret_var) & is.null(cost_var)) NULL else file.path(dir_name, paste0(files_name, ".hst"))
  param_tab_rda$explanatory_vars <- explanatory_vars
  param_tab_rda$hrc <- hrc

  param_tab_rda$totcode <- totcode
  param_tab_rda$secret_var <- secret_var
  param_tab_rda$secret_no_pl <- secret_no_pl
  param_tab_rda$cost_var <- cost_var
  param_tab_rda$value <- value
  param_tab_rda$freq <- freq
  param_tab_rda$ip <- ip
  param_tab_rda$maxscore <- maxscore

  # appel (+ récuperation noms tab hst et rda)
  input <- do.call(tab_rda, param_tab_rda)


  ## 2. TAB_ARB .........................

  # parametres
  param_arb <- param_function(tab_arb, .dots)
  param_arb$tab_filename <- input$tab_filename
  param_arb$rda_filename <- input$rda_filename
  param_arb$hst_filename <- input$hst_filename
  param_arb$arb_filename <- file.path(dir_name, paste0(files_name, ".arb"))
  param_arb$output_names <- file.path(dir_name, paste0(files_name, ".csv"))
  #TODO : generaliser le choix de l'extension
  param_arb$output_type <- output_type
  param_arb$output_options <- output_options
  param_arb$explanatory_vars <- explanatory_vars
  param_arb$value <- value
  param_arb$safety_rules <- safety_rules
  param_arb$suppress <- suppress

  # appel (+ récupération nom batch)
  batch <- do.call(tab_arb, param_arb)

  ## 3. RUN_ARB ...........................

  # parametres
  param_run0 <- param_function(run_arb, .dots)
  param_system <- param_function(system, .dots)
  param_run <- c(param_run0, param_system)
  param_run$arb_filename <- param_arb$arb_filename
  param_run$logbook <- file.path(dir_name, paste0(files_name, ".txt"))
  param_run$is_tabular <- TRUE
  param_run$show_batch_console <- show_batch_console

  # appel
  res <- do.call(run_arb, param_run)

  # RESULTAT .............................
  if(output_type == 4){

    res_import <- utils::read.csv(
      param_arb$output_names,
      header = FALSE,
      col.names = c(explanatory_vars, value, freq, "Status","Dom"),
      colClasses = c(rep("character", length(explanatory_vars)), rep("numeric",2), "character", "numeric"),
      stringsAsFactors = FALSE,
      na.strings = ""
    )
    if(unif_labels){
      res_import <- cbind.data.frame(
        apply(res_import[,explanatory_vars,drop=FALSE], 2, rev_var_pour_tau_argus),
        res_import[, !names(res_import) %in% explanatory_vars]
      )
    }
    mask <- merge(tabular_original, res_import[,c(explanatory_vars,"Status")], by = explanatory_vars, all = TRUE)

    utils::write.csv(
      res_import,
      file = param_arb$output_names,
      row.names = FALSE
    )

    return(mask)

  }else{
    if(unif_labels){
      res <- cbind.data.frame(
        apply(res[,explanatory_vars,drop=FALSE], 2, rev_var_pour_tau_argus),
        res[, !names(res) %in% explanatory_vars]
      )
    }
    return(res)
  }

}

#' Wrapper of tab_rtauargus adapted for `tab_multi_manager` function.
#'
#' @inheritParams tab_rtauargus
#' @param ip Interval Protection Level (10 by default)
#' @param ... Other arguments of `tab_rtauargus` function
#'
#' @return
#' The original tabular is returned with a new
#' column called Status, indicating the status of the cell coming from Tau-Argus :
#' "A" for a primary secret due to frequency rule, "B" for a primary secret due
#' to dominance rule, "D" for secondary secret and "V" for no secret cell.
#'
#' @seealso `tab_rtauargus`
#'
#' @export
#'
#' @examples
#'\dontrun{
#' library(dplyr)
#' data(turnover_act_size)
#'
#' # Prepare data with primary secret ----
#' turnover_act_size <- turnover_act_size %>%
#'   mutate(
#'     is_secret_freq = N_OBS > 0 & N_OBS < 3,
#'     is_secret_dom = ifelse(MAX == 0, FALSE, MAX/TOT>0.85),
#'     is_secret_prim = is_secret_freq | is_secret_dom
#'   )
#'
#' # Make hrc file of business sectors ----
#' data(activity_corr_table)
#' hrc_file_activity <- activity_corr_table %>%
#'   write_hrc2(file_name = "hrc/activity")
#'
#' # Compute the secondary secret ----
#' options(
#'   rtauargus.tauargus_exe =
#'     "Y:/Logiciels/TauArgus/TauArgus4.2.3/TauArgus.exe"
#' )
#'
#' res <- tab_rtauargus2(
#'   tabular = turnover_act_size,
#'   files_name = "turn_act_size",
#'   dir_name = "tauargus_files",
#'   explanatory_vars = c("ACTIVITY", "SIZE"),
#'   hrc = c(ACTIVITY = hrc_file_activity),
#'   totcode = c(ACTIVITY = "Total", SIZE = "Total"),
#'   secret_var = "is_secret_prim",
#'   value = "TOT",
#'   freq = "N_OBS"
#' )
#'
#' # reduce dimensions feature
#' data(datatest1)
#' res_dim4 <- tab_rtauargus2(
#'   tabular = datatest1,
#'   dir_name = "tauargus_files",
#'   explanatory_vars = c("A10", "treff","type_distrib","cj"),
#'   totcode = rep("Total", 4),
#'   secret_var = "is_secret_prim",
#'   value = "pizzas_tot_abs",
#'   freq = "nb_obs_rnd",
#'   split_tab = TRUE
#' )
#' }
tab_rtauargus2 <- function(
    tabular,
    files_name = NULL,
    dir_name = NULL,
    explanatory_vars,
    totcode,
    hrc = NULL,
    secret_var = NULL,
    secret_no_pl = NULL,
    cost_var = NULL,
    value = "value",
    freq = "freq",
    ip = 10,
    suppress = "MOD(1,5,1,0,0)",
    split_tab = TRUE,
    nb_tab_option = "smart",
    limit = 14700,
    ...
){

  .dots = list(...)

  params <- param_function(tab_rtauargus, .dots)
  params$tabular = tabular
  params$files_name = files_name
  params$dir_name = dir_name
  params$explanatory_vars = explanatory_vars
  params$totcode = totcode
  params$hrc = hrc
  params$secret_var = secret_var
  params$secret_no_pl = secret_no_pl
  params$cost_var = cost_var
  params$value = value
  params$freq = freq
  if(is.null(ip)){
    if(!"safety_rules" %in% names(params)){
      stop("Either ip or safety_rules has to be set.")
    }
  }else{
    params$safety_rules = "MAN(0)" #paste0("MAN(",ip,")")
  }
  params$ip = ip
  params$suppress = suppress
  params$split_tab = if(length(explanatory_vars) > 3) split_tab else FALSE
  params$dir_name = if(params$split_tab) file.path(dir_name, files_name) else dir_name
  params$nb_tab_option = nb_tab_option
  params$limit = limit
  params$show_batch_console = FALSE
  params$output_type = 4
  params$output_options = ""
  params$unif_labels = TRUE
  params$separator = ","
  params$verbose = FALSE

  res <- do.call("tab_rtauargus", params)

  if(params$split_tab & length(explanatory_vars) > 3){
    vars_secret_iteration <-
      names(res)[grepl("^is_secret_[1-9].*$", names(res), perl=TRUE)]
    last_iteration <-  names(res)[length(names(res))]
    res$Status <- ifelse(res[[secret_var]], "B",
                         ifelse(res[[last_iteration]], "D", "V")
    )
    res <- res[,c(setdiff(names(res), vars_secret_iteration))]
  }

  return(res)

}
InseeFrLab/rtauargus documentation built on Feb. 25, 2025, 6:32 a.m.