R/GaussSuppressDec.R

Defines functions GaussSuppressDec

Documented in GaussSuppressDec

#' Cell suppression with synthetic decimal numbers
#' 
#' \code{\link{GaussSuppressionFromData}}, or one of its wrappers, is run and decimal numbers are added to output by
#' executing  \code{\link[RegSDC]{SuppressDec}}. 
#'
#' @param data Input daata as a data frame 
#' @param ... Further parameters to \code{\link{GaussSuppressionFromData}}
#' @param fun A function: \code{\link{GaussSuppressionFromData}} or one of its wrappers such as
#'              \code{\link{SuppressSmallCounts}} and \code{\link{SuppressDominantCells}}.
#' @param output NULL (default), `"publish"`, `"inner"`, `"publish_inner"`, or `"publish_inner_x"` (x also).
#' @param use_freqVar Logical (`TRUE`/`FALSE`) with a default value of `NA`. Determines whether the variable  
#'   `freqVar` is used as the basis for generating decimal numbers. 
#'   If `NA`, the parameter is set to `TRUE`, except in the following cases, where it is set to `FALSE`:
#'   - If `freqVar` is not available.
#'   - If `runIpf` is `FALSE` and `fun` is one of the functions `SuppressFewContributors` or `SuppressDominantCells`.
#' 
#'   When `use_freqVar` is `FALSE`, only zeros are used instead. This approach is more robust in practice, 
#'   as decimal numbers can then be stored more accurately. 
#'   The default value is chosen to ensure compatibility with existing code and to allow for the use of `freqVar` 
#'   when dealing with frequency tables, which may be useful.

#' @param digits Parameter to \code{\link[SSBtools]{RoundWhole}}. Values close to whole numbers will be rounded.
#' @param nRep NULL or an integer. When >1, several decimal numbers will be generated.
#' @param rmse Desired root mean square error of decimal numbers. 
#'            Variability around the expected, according to the linear model, inner frequencies.
#'            The expected frequencies are calculated from the non-suppressed publishable frequencies.     
#' @param sparseLimit Limit for the number of rows of a reduced x-matrix within the algorithm. When exceeded, a new sparse algorithm is used.
#' @param rndSeed If non-NULL, a random generator seed to be used locally within the function without affecting the random value stream in R. 
#' @param runIpf When TRUE, additional frequencies are generated by iterative proportional fitting using \code{\link[SSBtools]{Mipf}}.
#' @param eps  Parameter to \code{\link[SSBtools]{Mipf}}.
#' @param iter Parameter to \code{\link[SSBtools]{Mipf}}.
#' @param mismatchWarning  Whether to produce the warning "`Mismatch between whole numbers and suppression`", when relevant.
#'                       When `nRep>1`, all replicates must satisfy the whole number requirement for non-suppressed cells.  
#'                       When `mismatchWarning` is integer (`>0`), this will be used as parameter `digits` to \code{\link[SSBtools]{RoundWhole}} 
#'                       when doing mismatch checking (can be quite low when `nRep>1`). 
#' @param whenDuplicatedInner Function to be called when default output and when cells marked as inner correspond to 
#'                            several input cells (aggregated) since they correspond to published cells. 
#' @param whenMixedDuplicatedInner Function to be called in the case above when some inner cells correspond
#'                                 to published cells (aggregated) and some not (not aggregated).
#' @return A data frame where inner cells and cells to be published are combined or output according to parameter `output`. 
#' 
#' @importFrom SSBtools RoundWhole Match Mipf RbindAll
#' @importFrom RegSDC SuppressDec
#' @importFrom Matrix crossprod
#' @importFrom stats runif
#' @export
#' 
#' @author Øyvind Langrsud
#' 
#' @seealso [SuppressionFromDecimals()]
#' 
#'
#' @examples
#' a <- GaussSuppressDec(data = SSBtoolsData("example1"), 
#'                       fun = SuppressSmallCounts, 
#'                       dimVar = c("age", "geo"),
#'                       preAggregate = TRUE, 
#'                       freqVar = "freq", maxN = 3)
#' a                       
#'                  
#' 
#' b <- GaussSuppressDec(data = SSBtoolsData("magnitude1"), 
#'                       fun = SuppressDominantCells, 
#'                       numVar = "value", 
#'                       formula = ~sector2 * geo + sector4 * eu,
#'                       contributorVar = "company", k = c(80, 99))
#' b  
#'  
#' # FormulaSelection() works on this output as well 
#' FormulaSelection(b, ~sector2 * geo)                       
#'                       
GaussSuppressDec = function(data, 
                            ..., 
                            fun = GaussSuppressionFromData,
                            output = NULL, 
                            use_freqVar = NA,
                            digits = 9, 
                            nRep = NULL,
                            rmse = pi/3,
                            sparseLimit = 500,
                            rndSeed = 123,
                            runIpf = FALSE,
                            eps = 0.01,
                            iter = 100,
                            mismatchWarning = TRUE,
                            whenDuplicatedInner = NULL,
                            whenMixedDuplicatedInner = warning){

  if (!is.null(rndSeed)) {
    if (!exists(".Random.seed")) 
      if (runif(1) < 0) 
        stop("Now seed exists")
    exitSeed <- .Random.seed
    on.exit(.Random.seed <<- exitSeed)
    set.seed(rndSeed)
  }
  
  if(!is.null(output)){
    if(!(output %in% c("publish_inner_x", "publish_inner", "inner", "publish")))
      stop('Allowed non-NULL values of parameter output are "publish_inner_x", "publish_inner", "inner" and "publish".')
    
  } else {
    output <- ""
  }
  
  if (is.null(nRep)) {
    freqDecNames <- "freqDec"
    nRep <- 1
  } else {
    freqDecNames <- paste0("freqDec", paste(seq_len(nRep)))[seq_len(nRep)]
  }
  
  a <- fun(data, ..., output = "publish_inner_x")
  
  startRow <- attr(a$publish, "startRow", exact = TRUE)

  freqVar <- attr(a$inner, "freqVar")
  weightVar <- attr(a$inner, "weightVar")
  numVar <- attr(a$inner, "numVar")
  
  dimVarPub <- colnames(a$publish)
  dimVarPub <- dimVarPub[!(dimVarPub %in% c(freqVar, "primary", "suppressed", weightVar, numVar, MoreVars(...)))]
  dimVarPub <- dimVarPub[(dimVarPub %in% colnames(a$inner))]
  
  
  if (is.na(use_freqVar)) {
    use_freqVar <- TRUE
    if (!length(freqVar)) {
      use_freqVar <- FALSE
    } else {
      if (!runIpf) {
        if (identical(fun, SuppressFewContributors))
          use_freqVar <- FALSE
        if (identical(fun, SuppressDominantCells))
          use_freqVar <- FALSE
      }
    }
  }
  
  if (use_freqVar) {
    freq_for_dec_publish <- a$publish[[freqVar]]
    freq_for_dec_inner <- a$inner[[freqVar]]
  } else {
    freq_for_dec_publish <- rep(0L, nrow(a$publish))
    freq_for_dec_inner <- rep(0L, nrow(a$inner))
  }
  
  z <- as.matrix(freq_for_dec_publish)
  y <- as.matrix(freq_for_dec_inner)
  
  if (nRep) {
    yDec <- SuppressDec(a$x, z = z, y = y, suppressed = a$publish$suppressed, digits = digits, nRep = nRep, rmse = rmse, sparseLimit = sparseLimit)
    zDec <- RoundWhole(as.matrix(Matrix::crossprod(a$x, yDec)), digits = digits)
  } else {
    yDec <- matrix(0, nrow(y),0)
    zDec <- matrix(0, nrow(z),0)
  }
  
  if(runIpf){
    freqDecNames <- c(freqDecNames, "freqIpf")
    yIpf <- Mipf(x= a$x[,!a$publish$suppressed, drop=FALSE], z = z[!a$publish$suppressed, 1, drop=FALSE], iter = iter, eps = eps)
    cat("\n")
    yDec <- cbind(yDec,  as.matrix(yIpf))
    zDec <- cbind(zDec,  as.matrix(Matrix::crossprod(a$x, yIpf)))
  }
  
  
  colnames(yDec) <- freqDecNames
  colnames(zDec) <- freqDecNames
  
  
  a$publish <- cbind(a$publish, zDec)
  rownames(a$publish) <- NULL
  a$inner <- cbind(a$inner, yDec)
  
  if (nRep & mismatchWarning) {
    
    if (is.numeric(mismatchWarning)) {
      digitsPrimary <- mismatchWarning
    } else {
      digitsPrimary <- digits
    }
    
    # Re-use primary-function originally made for SuppressionFromDecimals
    suppressionFromDecimals <- PrimaryDecimals(freq = freq_for_dec_publish, num = a$publish[freqDecNames[1:nRep]], nDec = nRep, digitsPrimary = digitsPrimary)
    
    if (any(a$publish$suppressed != suppressionFromDecimals)) {
      incorrectly_suppressed <- !a$publish$suppressed & suppressionFromDecimals
      incorrectly_unsuppressed <- a$publish$suppressed & !suppressionFromDecimals
      n_incorrectly_suppressed <- sum(incorrectly_suppressed)
      n_incorrectly_unsuppressed <- sum(incorrectly_unsuppressed)
      if (n_incorrectly_suppressed) {
        show_var <- c(freqVar, numVar)[1]
        max_incorrect_value <- round(max(a$publish[[show_var]][incorrectly_suppressed]), 1)
        parenthesis1 <- paste0(" (", max_incorrect_value, " max ", show_var, ")")
      } else {
        parenthesis1 <- ""
      }
      if (n_incorrectly_unsuppressed) {
        n_incorrectly_primary <- sum(incorrectly_unsuppressed & a$publish$primary)
        parenthesis2 <- paste0(" (", n_incorrectly_primary, " primary)")
      } else {
        parenthesis2 <- ""
      }
      warning(paste0(
        "Mismatch between whole numbers and suppression: ",
        n_incorrectly_suppressed, " incorrectly suppressed", 
        parenthesis1, ", ",
        n_incorrectly_unsuppressed, " incorrectly unsuppressed", 
        parenthesis2))
    }
  }
  
  if (!is.null(startRow)) {
    attr(a$publish, "startRow") <- startRow
  }
  
  if (output == "publish_inner_x") 
    return(a)
  
  if (output == "publish_inner") 
    return(a[c("publish", "inner")])
  
  if (output == "publish") 
    return(a$publish)
  
  if (output == "inner") 
    return(a$inner)
  
  ma <- Match(a$inner[dimVarPub], a$publish[dimVarPub])
  
  anyDuplicated_ma <- anyDuplicated(ma[!is.na(ma)])
  
  a$publish$isPublish <- TRUE
  a$publish$isInner <- FALSE
  a$publish$isInner[ma[!is.na(ma)]] <- TRUE
  
  if (!anyNA(ma)) {
    if (anyDuplicated_ma & !is.null(whenDuplicatedInner)) {
      whenDuplicatedInner("Duplicated inner rows identified. Aggregation applied to some variables.")
    }
  } else {
    if (anyDuplicated_ma & !is.null(whenMixedDuplicatedInner)) {
      whenMixedDuplicatedInner("Duplicated inner rows identified. Aggregation applied to some variables in some rows.")
    }
  }
  
  extra_inner <- which(!(names(a$inner) %in% names(a$publish)))
  extra_inner_to_publish <- integer(0)
  if (length(extra_inner)) {
    ma_rows <- !is.na(ma)
    ma_ok <- ma[ma_rows]
    ma_unique <- unique(ma_ok)
    n_ma_unique <- length(ma_unique)
    if (anyDuplicated_ma) {
      for (i in extra_inner) {  
        if (nrow(unique(cbind(ma_ok, a$inner[ma_rows, i]))) == n_ma_unique) {
          # Variable is ok when equal value in duplicate rows. 
          # Only one of them will be copied to a$publish
          extra_inner_to_publish <- c(extra_inner_to_publish, i)
        }
      }
    } else {
      extra_inner_to_publish <- extra_inner
    }
  }
  extra_inner_to_remove <- extra_inner[!(extra_inner %in% extra_inner_to_publish)]
  if (length(extra_inner_to_publish)) {
    extra_inner_to_publish <- names(a$inner)[extra_inner_to_publish]
    a$publish[extra_inner_to_publish] <- NA
    if (length(ma_unique)) {
      a$publish[ma_unique, extra_inner_to_publish] <- a$inner[match(ma_unique, ma), extra_inner_to_publish]
    }
  }
  if (length(extra_inner_to_remove)) {
    a$inner <- a$inner[-extra_inner_to_remove]
  }
  
  
  if (!anyNA(ma)) {
    return(a$publish)
  }
  
  a$inner <- a$inner[is.na(ma), , drop = FALSE]
  
  a$inner$isPublish <- FALSE
  a$inner$isInner <- TRUE
  
  
  if (!is.null(startRow)) {
    startRow <-   c(startRow, StaRt_InneR = nrow(a$publish) + 1L)
  }
  
  a <- RbindAll(a$publish, a$inner)
  
  if (!is.null(startRow)) {
    attr(a, "startRow") <- startRow
  }
  
  a
}

Try the GaussSuppression package in your browser

Any scripts or data that you put into this service are public.

GaussSuppression documentation built on June 8, 2025, 10:43 a.m.