R/modify_cnt_ptable.R

Defines functions modify_cnt_ptable

Documented in modify_cnt_ptable

#' @title Modify a ptable suitable for frequency count variables
#' 
#' @description [modify_cnt_ptable()] is a function to modify the standard 
#' ptable for count variables that is generated by [create_cnt_ptable()] or 
#' within the 'cellKey'-package. The noise intervals in 
#' the standard ptable are ordered from -D to D. A modified ptable still has 
#' the same properties as the standard ptable but can ensure a higher 
#' protection of perturbed frequency tables since the noise probabilities are
#' split and the intervals are rearranged. 
#' 
#' @details 
#' In a first step, the noise probabilities larger than a threshold value 
#' will be split. Then, the split noise probabilities are randomly rearranged 
#' using a seed (the modifications is replicable). Finally, the intervals of 
#' the ptable will be adjusted. 
#' 
#' @seealso [create_ptable()]
#' 
#' @param input The ptable-object of class 'ptable', 'ck_params' or data.table 
#' @param threshold The maximum width of the intervals after modification
#' @param seed A seed for the rearrangement of the split intervals
#' @return Returns an object of class [ptable-class] or a data.table.
#' @author Tobias Enderle, \email{tobias.enderle@@destatis.de}
#' @examples 
#' # Original ptable
#' ptab <- create_cnt_ptable(3, 1)
#' 
#' # modified ptable
#' ptab_mod <- modify_cnt_ptable(ptab, 0.3, seed = 5467)
#' ptab_mod@pTable
#' 
#' @export
#' @rdname modify_cnt_ptable
#' @md
modify_cnt_ptable <- function(input,
                              threshold = 0.2,
                              seed = NULL){
  
  . <- ptab_obj_new <- ptab <- NULL
  
  if (!(threshold <= 1 & threshold > 0))
    stop("Please, specify a threshold larger than 0 and less equal 1.")
  
  # Check Input Type: ptable-object or ptable
  if (class(input)[1] == "ptable"){
    
    if (!(input@table == "cnts"))
      stop("Only works with ptables for frequency count tables!")
    
    ptab_obj_new <- copy(input)
    ptab <- ptab_obj_new@pTable
    
  } else if (class(input)[1] == "data.table") {
    
    stopifnot(identical(
      names(input),
      c("i", "j", "p", "v", "p_int_lb", "p_int_ub", "type")
    ))
    ptab <- input
    
  } else if (class(input)[1] == "ck_params") {
    
    if (!(input$type == "cnts"))
      stop("Only works with ptables for frequency count tables!")
    
    
    stopifnot(identical(
      names(input$params$ptable),
      c("i", "j", "p", "v", "lb", "ub", "type")
    ))
    ck_obj_new <- copy(input)
    ptab <- ck_obj_new$params$ptable
    setnames(ptab, "lb", "p_int_lb")
    setnames(ptab, "ub", "p_int_ub")
    
    
  } else {
    
    stop("Input must be an object of class 'ptable', 'ck_params' or a ptable 
         of class 'data.table'.")
  }
  
  
  
  modify <- function(ptab, seed, threshold){
    
    . <- p <- i <- p_int_ub <- p_int_lb <- v <- NULL
    
    # Function to split the probabilities into smaller parts 
    # (of size 'threshold' for maximum)
    # - each interval must be at least of size 'threshold'
    # - only split intervals that are smaller than 1 
    #   (e.g. i=0 has p=1 and doesn't have to be splitted)
    splitter <- function(x, threshold){
      
      if (x >= threshold && x != 1){
        
        times <- floor(x / threshold)
        y <- c(rep(threshold, times))
        y <- c(y, x-sum(y))
        
      } else {
        y <- x
      }
      
      return(y)
      
    }
    
    # split the probabilities of each noise into smaller parts
    # (according to the threshold value)
    erg <- lapply(ptab$p, function(x) splitter(x = x, threshold = threshold ))
    
    # enlarge the ptable using the split noise probabilities
    ptab_mod <- ptab[rep(1:.N, lapply(erg, length)),]
    # add the new probabilities to the enlarged ptable
    ptab_mod[, p:= unlist(erg)][]
    
    # reorder the noises the within each group 'i' using a seed for replication
    set.seed(seed)
    ptab_mod[, order := sample(1:.N, .N), by = list(i)]
    ptab_mod <- ptab_mod[order(i,order)]
    
    
    
    # new computation of intervals using the split and reordered noise probs
    ptab_mod[, p_int_ub := cumsum(p), by = list(i)]
    ptab_mod[, p_int_lb := p_int_ub - p]
    
    # rounding to have 'ndigits'
    ndigits <- 8
    ptab_mod[, p_int_lb := round(p_int_lb, ndigits)]
    ptab_mod[, p_int_ub := round(p_int_ub, ndigits)]
    # IMPORTANT step: Due to rounding errors, 'p' is replaced 
    # by the differences of the rounded intervals
    ptab_mod[, p := p_int_ub - p_int_lb][]
    
    
    # help variable: consecutive intervals with identical noise
    ptab_mod[, diff:= c(NA, diff(v)), by = list(i)]
    # meta information about consecutive intervals with identical noise
    #ptab_mod[, sum(diff==0, na.rm = TRUE), by = list(i)]
    cnt_cons_intervals <- ptab_mod[, sum(diff==0, na.rm = TRUE)]
    if (cnt_cons_intervals > 0) 
      message(
        paste0(
          "There are ",
          cnt_cons_intervals,
          " consecutive intervals with identical noise. You can try another 
          'seed' or proceed. At least, check whether the modified ptable has 
          sufficiently rearranged sub-intervals."
        )
      )
    
    ptab_mod[, c("diff", "order") := NULL][]
    
    return(ptab_mod)
  }
  
  
  ptab_mod <- modify(ptab = ptab, seed = seed, threshold = threshold)
  attr(ptab_mod, "intervals") <- "modified"
  
  if (class(input)[1] == "ptable"){
    
    ptab_obj_new@pTable <- ptab_mod
    output <- ptab_obj_new
  } else if (class(input)[1] == "ck_params") {
    
    setnames(ptab_mod, "p_int_ub", "ub")
    setnames(ptab_mod, "p_int_lb", "lb")
    ck_obj_new$params$ptable <- ptab_mod
    output <- ck_obj_new
  } else {
    
    output <- ptab_mod
    
  }
  
  message("\n NOTE for Tau-Argus: 
          Please use a new Tau-Argus Release (>= 4.2.3).")
  return(output)
  
}
#' @rdname modify_cnt_ptable
#' @usage NULL
#' @export
modify_cnts_ptable <- modify_cnt_ptable
tenderle/ptable documentation built on March 5, 2023, 3:35 a.m.