R/riskyCells.R

Defines functions riskyCellsWork riskyCells

Documented in riskyCells

#' riskyCells
#'
#' Allows to compute risky (unweighted) combinations of key variables either
#' up to a specified dimension or using identification level. This mimics the
#' approach taken in mu-argus.
#'
#' @name riskyCells
#' @md
#' @docType methods
#' @param obj a `data.frame`, `data.table` or an [sdcMicroObj-class] object
#' @param useIdentificationLevel (logical) specifies if tabulation should be
#' done up to a specific dimension (`useIdentificationLevel = FALSE` using
#' argument `maxDim`) or taking identification levels
#' (`useIdentificationLevel = FALSE` using argument `level`) into account.
#' @param threshold a numeric vector specifiying the thresholds at which cells
#' are considered to be unsafe. In case a tabulation is done up to a specific
#' level (`useIdentificationLevel = FALSE`), the thresholds may be specified
#' differently for each dimension. In the other case, the same threshold is
#' used for all tables.
#' @param ... see possible arguments below
#' - `keyVars`: index or variable-names within `obj` that should be used for
#' tabulation. In case `obj` is a [sdcMicroObj-class] object, this argument is
#' not used and the pre-defined key-variables are used.
#' - `level`: in case `useIdentificationLevel = TRUE`, this numeric vector
#' specifies the importance of the key variables. The construction of output
#' tables follows the implementation in mu-argus, see e.g
#' [mu-argus](https://github.com/sdcTools/manuals/raw/master/mu-argus/MUmanual5.1.pdf).
#' The length of this numeric vector must match the number of key variables.
#' - `maxDim`: in case `useIdentificationLevel = FALSE`, this number specifies
#' maximal number of variables to tablulate.
#' @return a `data.table` showing the number of unsafe cells, thresholds for
#' any combination of the key variables. If the input was a [sdcMicroObj-class]
#' object and some modifications have been already applied to the categorical
#' key variables, the resulting output contains the number of unsafe cells
#' both for the original and the modified data.
#' @keywords manip
#' @author Bernhard Meindl
#' @export
#' @examples
#' ## data.frame method / all combinations up to maxDim
#' # riskyCells(
#' #  obj = testdata2,
#' #  keyVars = 1:5,
#' #  threshold = c(50, 25, 10, 5),
#' #  useIdentificationLevel = FALSE,
#' #  maxDim = 4
#' # )
#' #riskyCells(
#' #  obj  = testdata2,
#' #  keyVars = 1:5,
#' #  threshold = 10,
#' #  useIdentificationLevel = FALSE,
#' #  maxDim = 3
#' #)
#' #
#' ### data.frame method / using identification levels
#' #riskyCells(
#' #  obj = testdata2,
#' #  keyVars = 1:6,
#' #  threshold = 20,
#' #  useIdentificationLevel = TRUE,
#' #  level = c(1, 1, 2, 3, 3, 5)
#' #)
#' #riskyCells(
#' #  obj = testdata2,
#' #  keyVars = c(1, 3, 4, 6),
#' #  threshold = 10,
#' #  useIdentificationLevel = TRUE,
#' #  level = c(1, 2, 2, 4)
#' #)
#' #
#' ### sdcMicroObj-method / all combinations up to maxDim
#' #testdata2[1:6] <- lapply(1:6, function(x) {
#' #  testdata2[[x]] <- as.factor(testdata2[[x]])
#' #})
#' #
#' #sdc <- createSdcObj(
#' #  dat = testdata2,
#' #  keyVars = c("urbrur", "roof", "walls", "water", "electcon", "relat", "sex"),
#' #  numVars = c("expend", "income", "savings"),
#' #  w = "sampling_weight")
#' #
#' #r0 <- riskyCells(
#' #  obj = sdc,
#' #  useIdentificationLevel=FALSE,
#' # threshold = c(20, 10, 5),
#' # maxDim = 3
#' #)
#' #
#' ### in case key-variables have been modified, we get counts for
#' ### original and modified data
#' #sdc <- groupAndRename(
#' #  obj = sdc,
#' #  var = "roof",
#' #  before = c("5", "6", "9"),
#' #  after = "5+"
#' #)
#' #r1 <- riskyCells(
#' #  obj = sdc,
#' #  useIdentificationLevel = FALSE,
#' #  threshold = c(10, 5, 3),
#' #  maxDim = 3
#' #)
#' #
#' ### sdcMicroObj-method / using identification levels
#' #riskyCells(
#' #  obj = sdc,
#' #  useIdentificationLevel = TRUE,
#' #  threshold = 10,
#' #  level = c(1, 1, 3, 4, 5, 5, 5)
#' #)
riskyCells <- function(obj, useIdentificationLevel=FALSE, threshold, ...) {
  # checks
  stopifnot(is.logical(useIdentificationLevel))
  stopifnot(length(useIdentificationLevel)==1)

  params <- list(...)

  if (inherits(obj, "sdcMicroObj")) {
    if (!is.null(params$keyVars)) {
      warning("argument 'keyVars' has been specified, but is ignored because argument 'obj' is not a data.frame!")
    }
    keyVars <- obj@keyVars
  } else {
    keyVars <- params$keyVars

    if (is.character(keyVars)) {
      stopifnot(all(keyVars %in% colnames(obj)))
      keyVars <- match(keyVars, colnames(obj))
    }
    if (is.numeric(keyVars)) {
      stopifnot(all(keyVars %in% 1:ncol(obj)))
    }
    if (!is.data.table(obj)) {
      obj <- as.data.table(obj)
    }
  }

  if (useIdentificationLevel==TRUE) {
    if (!is.null(params$maxDim)) {
      stop("you can either specify 'maxDim' or set 'useIdentificationLevel' to TRUE!\n")
    }
    # check level argument
    level <- params$level
    if (!is.numeric(level)) {
      stop("you need to specify argument 'level' in case 'useIdentificationLevel' is TRUE!\n")
    }
    if (length(level)!=length(keyVars)) {
      stop("length(level) must match the number of key-variables!\n")
    }
    if (!all(level==as.integer(level))) {
      stop("argument 'levels' must be whole numbers!\n")
    }
    if (!all(level>0)) {
      stop("argument 'levels' must be whole numbers > 0!\n")
    }
  } else {
    ## checks for useIdentificationLevel==FALSE
    if (!is.null(params$level)) {
      stop("you can either specify 'level' or set 'useIdentificationLevel' to FALSE!\n")
    }
    maxDim <- params$maxDim
    if (!is.numeric(maxDim)) {
      stop("argument 'maxDim' needs to be numeric!\n")
    }
    if (maxDim != as.integer(maxDim)) {
      stop("argument 'maxDim' must be an integer!\n")
    }
    if (maxDim > length(keyVars)) {
      stop("argument 'maxDim' needs to be <= the number of key variables!\n")
    }
  }

  if (!is.numeric(threshold)) {
    stop("argument 'threshold' needs to be numeric!\n")
  }
  if (!all(threshold==as.integer(threshold))) {
    stop("argument 'threshold' needs to be whole numbers!\n")
  }
  riskyCellsX(obj=obj, useIdentificationLevel=useIdentificationLevel, threshold=threshold, ...)
}
setGeneric("riskyCellsX", function(obj, useIdentificationLevel, threshold, ...) {
  standardGeneric("riskyCellsX")
})

setMethod(f="riskyCellsX", signature=c(obj="data.frame"),
definition = function(obj, keyVars, useIdentificationLevel, threshold, ...) {
  res <- riskyCellsWork(df=obj, keyVars=keyVars, useIdentificationLevel=useIdentificationLevel, threshold=threshold, ...)
  res
})

setMethod(f="riskyCellsX", signature=c(obj="sdcMicroObj"),
definition=function(obj, useIdentificationLevel, threshold, ...) {
  params <- list(...)
  unsafe_cells <- NULL
  mod_keyvars <- !identical(obj@manipKeyVars, obj@origData[,obj@keyVars])
  res <- riskyCellsX(obj=obj@origData, keyVars=obj@keyVars, useIdentificationLevel=useIdentificationLevel, threshold=threshold, ...)
  if (mod_keyvars) {
    setnames(res,ncol(res),c("unsafe_cells_orig"))
  }

  if (!identical(obj@manipKeyVars, obj@origData[,obj@keyVars])) {
    res2 <- riskyCellsX(obj=obj@manipKeyVars, keyVars=obj@keyVars, useIdentificationLevel=useIdentificationLevel, threshold=threshold, ...)
    res$unsafe_cells_manip <- res2[,unsafe_cells]
  }
  res
})

riskyCellsWork <- function(df, keyVars, useIdentificationLevel=FALSE, threshold, ...) {
  N <- th <- NULL

  riskycells_upto_dimension <- function(dat, keyVars, maxDim, threshold) {
    if (length(threshold)==1) {
      threshold <- rep(threshold, maxDim)
    } else {
      stopifnot(length(threshold)==maxDim)
    }

    out <- list()
    length(out) <- maxDim

    tmp <- dat[,keyVars, with=F]
    keyVars <- 1:ncol(tmp)
    for (dd in 1:maxDim) {
      todo_combs <- utils::combn(names(tmp)[keyVars], dd)
      res <- matrix(NA, ncol=maxDim+1, nrow=ncol(todo_combs))

      for (i in 1:ncol(todo_combs)) {
        vv <- todo_combs[,i]
        setkeyv(tmp, vv)
        agg <- tmp[,.N, by=key(tmp)]
        agg <- agg[N<=threshold[dd]]

        nr_unsafe <- nrow(agg)
        res[i, 1] <- nr_unsafe
        res[i, 2:(2+length(vv)-1)] <- vv
      }
      res <- as.data.table(res)
      res[,th:=threshold[dd]]
      out[[dd]] <- as.data.table(res)
    }
    out <- rbindlist(out)
    setnames(out, c("unsafe_cells", paste0("dim", 1:maxDim),"threshold"))
    out
  }
  riskycells_using_identification_level <- function(dat, keyVars, level, threshold) {
    stopifnot(is.numeric(threshold))
    stopifnot(length(threshold)==1)
    stopifnot(threshold==as.integer(threshold))
    stopifnot(threshold>=1)

    dt <- data.table(v=names(dat)[keyVars], l=level)

    ## compute combinations
    maxDimN <- length(unique(dt$l))

    res <- riskycells_upto_dimension(dat, keyVars=keyVars, maxDim=maxDimN, threshold=threshold)

    # add levels to result
    #ids <- 2:(ncol(res)-1)
    #dims <- do.call("cbind",
    #lapply(ids, function(x) {
    #  dt$l[match(res[[x]], dt$v)]
    #}))
    #dims <- as.data.table(dims)
    #setnames(dims, paste0("dim",1:ncol(dims)))
    #res <- cbind(res, dims)

    # remove some combinations, aka implement the "dutch approach"
    #has_dups <- sapply(1:nrow(dims), function(x) {
    #  v <- na.omit(as.numeric(dims[x]))
    #  !any(duplicated(v)) | 1 %in% v | length(unique(v))==1
    #})
    #cbind(dims, has_dups)
    #dims[,has_dups:=has_dups]
    res
  }

  stopifnot(inherits(df, "data.frame"))
  if (!is.data.table(df)) {
    df <- as.data.table(df)
  }
  params <- list(...)
  if (useIdentificationLevel) {
    res <- riskycells_using_identification_level(dat=df, keyVars=keyVars, level=params$level, threshold=threshold)
  } else {
    res <- riskycells_upto_dimension(dat=df, keyVars=keyVars, maxDim=params$maxDim, threshold=threshold)
  }
  res <- res[,c(2:ncol(res),1), with=F]
  res[]
}
sdcTools/sdcMicro documentation built on March 15, 2024, 12:32 p.m.