R/localSuppression.R

Defines functions kAnon plot.localSuppression print.localSuppression localSuppressionWORK sum_na suppSubset localSuppression

Documented in kAnon localSuppression plot.localSuppression print.localSuppression

#' Local Suppression to obtain k-anonymity
#'
#' Algorithm to achieve k-anonymity by performing local suppression.
#'
#' The algorithm provides a k-anonymized data set by suppressing values in key
#' variables. The algorithm tries to find an optimal solution to suppress as
#' few values as possible and considers the specified importance vector. If not
#' specified, the importance vector is constructed in a way such that key
#' variables with a high number of characteristics are considered less
#' important than key variables with a low number of characteristics.
#'
#' The implementation provides k-anonymity per strata, if slot 'strataVar' has
#' been set in \code{\link{sdcMicroObj-class}} or if parameter 'strataVar' is
#' used when appying the data.frame method. For details, have a look
#' at the examples provided.
#'
#' @rdname localSuppression
#' @param obj a \code{\link{sdcMicroObj-class}}-object or a \code{data.frame}
#' @param k threshold for k-anonymity
#' @param importance numeric vector of numbers between 1 and n (n=length of
#' vector keyVars).  This vector represents the "importance" of variables that
#' should be used for local suppression in order to obtain k-anonymity.
#' key-variables with importance=1 will - if possible - not suppressed,
#' key-variables with importance=n will be used whenever possible.
#' @param combs numeric vector. if specified, the algorithm will provide k-anonymity
#' for each combination of n key variables (with n being the value of the ith element
#' of this parameter. For example, if combs=c(4,3), the algorithm will provide
#' k-anonymity to all combinations of 4 key variables and then k-anonymity to all
#' combinations of 3 key variables. It is possible to apply different k to these
#' subsets by specifying k as a vector. If k has only one element, the same value
#' of k will be used for all subgroups.
#' @param ... see arguments below
#' \describe{
#' \item{keyVars: }{names (or indices) of categorical key variables (for data-frame method)}
#' \item{strataVars: }{name (or index) of variable which is used for stratification purposes, used
#' in the data.frame method. This means that k-anonymity is provided within each category
#' of the specified variable.}
#' \item{alpha: }{numeric value between 0 and 1 specifying how much keys that
#' contain missing values (\code{NAs}) should contribute to the calculation
#' of \code{fk} and \code{Fk}. For the default value of \code{1}, nothing changes with
#' respect to the implementation in prior versions. Each \emph{wildcard-match} would
#' be counted while for \code{alpha=0} keys with missing values would be basically ignored.
#' Used in the data-frame method only because in the method for \code{\link{sdcMicroObj-class}}-objects,
#' this value is extracted from slot \code{options}.}
#' }
#' @return Manipulated data set with suppressions that has k-anonymity with
#' respect to specified key-variables or the manipulated data stored in the
#' \code{\link{sdcMicroObj-class}}.
#' @author Bernhard Meindl, Matthias Templ
#' @references
#' Templ, M. Statistical Disclosure Control for Microdata: Methods and Applications in R.
#' \emph{Springer International Publishing}, 287 pages, 2017. ISBN 978-3-319-50272-4.
#' \doi{10.1007/978-3-319-50272-4}
#'
#' Templ, M. and Kowarik, A. and Meindl, B.
#' Statistical Disclosure Control for Micro-Data Using the R Package sdcMicro.
#' \emph{Journal of Statistical Software}, \strong{67} (4), 1--36, 2015. \doi{10.18637/jss.v067.i04}
#'
#' @keywords manip
#' @export
#' @note Deprecated methods 'localSupp2' and 'localSupp2Wrapper' are no longer available
#' in sdcMicro > 4.5.0.
#' \code{kAnon} is a more intutitive term for localSuppression because the aim is always
#' to obtain k-anonymity for some parts of the data.
#' @examples
#' \donttest{
#' data(francdat)
#'
#' ## Local Suppression
#' localS <- localSuppression(francdat, keyVar=c(4,5,6))
#' localS
#' plot(localS)
#'
#' ## for objects of class sdcMicro, no stratification
#' data(testdata2)
#' kv <- c("urbrur", "roof", "walls", "water", "electcon", "relat", "sex")
#' sdc <- createSdcObj(testdata2, keyVars = kv, w = "sampling_weight")
#' sdc <- localSuppression(sdc)
#'
#' ## for objects of class sdcMicro, with stratification
#' testdata2$ageG <- cut(testdata2$age, 5, labels=paste0("AG",1:5))
#' sdc <- createSdcObj(
#'   dat = testdata2,
#'   keyVars = kv,
#'   w = "sampling_weight",
#'   strataVar = "ageG"
#' )
#' sdc <- localSuppression(sdc)
#'
#' ## it is also possible to provide k-anonymity for subsets of key-variables
#' ## with different parameter k!
#' ## in this case we want to provide 10-anonymity for all combinations
#' ## of 5 key variables, 20-anonymity for all combinations with 4 key variables
#' ## and 30-anonymity for all combinations of 3 key variables.
#' sdc <- createSdcObj(testdata2, keyVars = kv, w = "sampling_weight")
#' combs <- 5:3
#' k <- c(10, 20, 30)
#' sdc <- localSuppression(sdc, k = k, combs = combs)
#'
#' ## data.frame method (no stratification)
#' inp <- testdata2[,c(kv, "ageG")]
#' ls <- localSuppression(inp, keyVars = 1:7)
#' print(ls)
#' plot(ls)
#'
#' ## data.frame method (with stratification)
#' ls <- kAnon(inp, keyVars = 1:7, strataVars = 8)
#' print(ls)
#' plot(ls)
#' }
localSuppression <- function(obj, k = 2, importance = NULL, combs = NULL, ...) {
  localSuppressionX(
    obj = obj,
    k = k,
    importance = importance,
    combs = combs,
    ...)
}

setGeneric("localSuppressionX", function(obj, k = 2, importance = NULL, combs = NULL, ...) {
  standardGeneric("localSuppressionX")
})

setMethod(
  f = "localSuppressionX",
  signature = c("sdcMicroObj"),
  definition = function(obj, k = 2, importance = NULL, combs = NULL) {
    obj <- nextSdcObj(obj)
    ### get data from manipKeyVars
    df <- as.data.frame(get.sdcMicroObj(obj, type = "manipKeyVars"))
    strataVars <- get.sdcMicroObj(obj, "strataVar")
    keyVars <- 1:length(obj@keyVars)
    if (!is.null(strataVars)) {
      df <- cbind(df, get.sdcMicroObj(obj, type = "origData")[, strataVars, drop = FALSE])
      stratV <- length(keyVars) + 1:length(strataVars)
    } else {
      stratV <- NULL
    }

    alpha <- get.sdcMicroObj(obj, type = "options")$alpha
    ls <- localSuppressionWORK(
      x = df,
      keyVars = keyVars,
      strataVars = stratV,
      k = k,
      combs = combs,
      importance = importance,
      alpha = alpha)

    # create final output
    obj <- set.sdcMicroObj(obj, type = "manipKeyVars", input = list(ls$xAnon))
    ls$xAnon <- NULL
    class(ls) <- unclass("list")
    obj <- set.sdcMicroObj(obj, type = "localSuppression", input = list(ls))

    # transfer suppression patterns if ghostVars is specified
    ghostVars <- get.sdcMicroObj(obj, type = "ghostVars")
    if (!is.null(ghostVars)) {
      manipData <- get.sdcMicroObj(obj, type = "manipKeyVars")
      manipGhostVars <- get.sdcMicroObj(obj, type = "manipGhostVars")
      cn <- colnames(get.sdcMicroObj(obj, type = "origData"))
      for (i in seq_along(ghostVars)) {
        # index of keyVar within manipData
        kV <- match(cn[ghostVars[[i]][[1]]], colnames(manipData))
        isna <- is.na(manipData[[kV]])

        # get indices of linked variables within ghostVars and
        # transfer suppression pattern
        vv <- match(cn[ghostVars[[i]][[2]]], colnames(manipGhostVars))
        for (j in 1:length(vv)) {
          manipGhostVars[[vv[j]]][isna] <- NA
        }
      }
      obj <- set.sdcMicroObj(obj, type = "manipGhostVars", input = list(manipGhostVars))
    }
    obj <- calcRisks(obj)
    obj
  }
)

setMethod(
  f = "localSuppressionX",
  signature = c("data.frame"),
  definition = function(obj,
                        k = 2,
                        importance = NULL,
                        combs = NULL,
                        keyVars,
                        strataVars = NULL,
                        alpha = 1) {

    localSuppressionWORK(
      x = obj,
      keyVars = keyVars,
      k = k,
      strataVars = strataVars,
      importance = importance,
      combs = combs,
      alpha = alpha)
  }
)

suppSubset <- function(x, k, importance, alpha)  {
  # checks
  if (length(k) != 1 | k < 1) {
    stop("argument 'k' must be of length 1 and > 0.", call. = FALSE)
  }
  if (!is.null(importance)) {
    if (length(importance) != ncol(x)) {
      stop("length of importance-vector does not match number of key variables!", call. = FALSE)
    }
  }

  # k can be at most the number of obs!
  if (nrow(x) < k) {
    stop("k is larger or equal the group size (in at least one stratum).", call. = FALSE)
  }
  k <- min(k, nrow(x))

  keyVars <- names(x)
  x <- data.table(x)

  # calculate number of suppressions for each keyVar
  # before trying to achieve k-anonymity
  NAinKey <- x[, lapply(.SD, function(x) sum(is.na(x)))]

  x[, idvarextraforsls := .I]

  ##############
  # The dataset is reduced to a smaller dataset in the following way
  # 1) all NAs are initialized with the first unique value
  # of the corresponding keyVariable
  xKeys <- x[, c(keyVars, "idvarextraforsls"), with = FALSE]
  for (kV in keyVars) {
    if (NAinKey[[kV]] > 0) {
      e1 <- parse(text = paste0("is.na(", kV, ")"))
      e2 <- parse(text = paste0(kV, ":=unique(", kV, ")[1]"))
      xKeys[eval(e1), eval(e2)]
    }
  }
  setkeyv(xKeys, keyVars)

  # 2) fk is computed
  erg <- xKeys[, list(fk = .N), by = key(xKeys)]
  xKeys <- merge(xKeys, erg)

  # 3) from groups with fk > k, all observations except k observations are removed
  weg <- fkd <- idvarextraforsls <- fk <- NA # for CHECK-NOTES
  erg <- xKeys[fk > k] # more than k
  erg[, fkd := fk - k]
  if (nrow(erg) > 0) {
    erg2 <- erg[, utils::tail(.SD, fkd[1]), by = key(erg)]
    xKeys <- data.table(x)
    setkey(xKeys, "idvarextraforsls")
    erg2 <- erg2[, list(idvarextraforsls)]
    erg2[, weg := 1]
    setkey(erg2, "idvarextraforsls")
    xKeys <- merge(xKeys, erg2, all = TRUE)
    x <- xKeys[is.na(weg)]
  }

  # 4) afterwards the old lS-Algo is applied
  ff <- freqCalc(x, keyVars = keyVars, alpha = alpha)
  rk <- indivRisk(ff)
  runInd <- TRUE

  importanceI <- (length(importance) + 1) - importance

  # prepare data input for cpp_calcSuppInds()
  # factors must be recoded as numeric
  mat <- x[, keyVars, with = FALSE]
  for (kV in names(mat)) {
    if (is.factor(mat[[kV]])) {
      ex <- parse(text = paste0(kV, ":=as.numeric(", kV, ")"))
      mat[, eval(ex)]
    }
  }
  mat <- as.matrix(mat)
  while (runInd) {
    ind.problem <- which(ff$fk < k)
    ind.problem  <- ind.problem[order(rk$rk[ind.problem], decreasing = TRUE)]
    for (i in seq_along(ind.problem)) {
      params <- list(alpha = alpha, id = as.integer(ind.problem[i]))
      res <- cpp_calcSuppInds(mat, mat[ind.problem[i], ], params = params)
      ind <- res$ids
      if (length(ind) > 0) {
        colInd <- NULL
        colIndsSorted <- keyVars[order(importanceI)]
        while (is.null(colInd)) {
          for (cc in colIndsSorted) {
            # special case where we have to suppress values in the problematic instance itself
            # because no other candidates are available
            if (length(ind) == 1 && ind == ind.problem[i]) {
              z <- which(!is.na(mat[ind, cc]))
            } else {
              z <- which(mat[ind.problem[i], cc] != mat[ind, cc] & !is.na(mat[ind, cc]))
            }
            if (length(z) > 0) {
              colInd <- cc
              break
            }
          }
        }
        x[[colInd]][ind.problem[i]] <- NA
        mat[ind.problem[i], colInd] <- NA # required for cpp_calcSuppInds()
      } else {
        stop("Error the length of the index vector is 0.", call. = FALSE)
      }
    }
    ff <- freqCalc(x, keyVars = keyVars, alpha = alpha)
    rk <- indivRisk(ff)
    if (all(ff$fk >= k)) {
      runInd <- FALSE
    }
  }
  # 5) the last step is to merge the smaller k-anonymized data set back to the
  # original data set with initial NAs introduced again
  if (nrow(erg) > 0) {
    xrem <- data.table(x[, "idvarextraforsls", with = F], weg = 1)
    x[, weg := NULL]

    setkey(xrem, "idvarextraforsls")
    xKeys[, weg := NULL]
    setkey(xKeys, "idvarextraforsls")
    xKeys <- merge(xKeys, xrem, all = TRUE)
    xKeys <- xKeys[is.na(weg), ]
    xKeys[, weg := NULL]
    x <- rbind(x, xKeys)
    setkey(x, "idvarextraforsls")
    x[, idvarextraforsls := NULL]
  } else {
    setkey(x, "idvarextraforsls")
    x[, idvarextraforsls := NULL]
  }
  out <- list(xAnon = x)#, supps=supps, totalSupps=totalSupps)
  return(out)
}
sum_na <- function(x) {
  sum(is.na(x))
}
localSuppressionWORK <- function(x, keyVars, strataVars, k=2, combs, importance=NULL, alpha) {
  # find a suppression pattern for a simple subset that is not stratified
  # input: df=data.table with only keyVars
  # k: parameter for k-anonymity (length 1)
  # importance: importance-vector with length equals ncol(df)
  strata <- NULL
  if (!"data.table" %in% class(x)) {
    x <- as.data.table(x)
  }
  if (is.numeric(keyVars)) {
    keyVarsNum <- keyVars
    keyVars <- names(x)[keyVars]
  } else {
    keyVarsNum <- match(keyVars, colnames(x))
  }
  if (is.numeric(strataVars)) {
    strataVarsNum <- strataVars
    strataVars <- names(x)[strataVars]
  } else {
    strataVarsNum <- match(strataVars, colnames(x))
  }

  # checks and preparations if we apply localSuppression on
  # subsets of key variables
  if (!is.null(combs)) {
    combs <- as.integer(combs)
    if (length(combs) != length(k)) {
      # using the same k!
      k <- rep(k, length(combs))
    }
    if (!all(combs > 0)) {
      stop("each element of 'comb' must be > 0!", call. = FALSE)
    }
    if (any(combs > length(keyVars))) {
      stop("at least one element of 'combs' is to large!", call. = FALSE)
    }

    # all combinations we need to tackle are stored here
    tree <- lapply(combs, function(x) {
      utils::combn(keyVarsNum, x)
    })
  }

  # calculate importance if specified
  if (is.null(importance)) {
    xx <- x[, lapply(.SD, function(y) {
      length(table(y))
    }), .SDcols = keyVars]
    importance <- match(names(xx), names(
      sort(unlist(xx), decreasing = FALSE))
      )
  } else {
    if (length(setdiff(sort(importance), 1:length(keyVars))) > 0) {
      e <- "The importance vector needs to be discrete numbers between 1 and the number of key variables."
      stop(e, call. = FALSE)
    }
  }

  # calculate number of suppressions for each keyVar
  # before trying to achieve k-anonymity
  NABefore <- is.na(x)
  if (is.null(strataVars)) {
    NAinKey <- x[, lapply(.SD, sum_na), .SDcols = keyVars, by = strataVars]
  } else {
    x[, strata := apply(x[, strataVars, with = F], 1, paste, collapse = "-")]
    NAinKey <- x[, lapply(.SD, sum_na), .SDcols = keyVars, by = "strata"]
    NAinKey_tot <- x[, lapply(.SD, sum_na), .SDcols = keyVars]
    NAinKey_tot[, strata := "Total"]
    NAinKey_tot <- NAinKey_tot[, c("strata", keyVars), with = F]
    NAinKey <- rbind(NAinKey, NAinKey_tot)
    x[, strata := NULL]
  }

  # performing the k-Anon algorithm
  # no stratification required
  if (is.null(strataVars)) {
    if (is.null(combs)) {
      inpDat <- x[, keyVars, with = FALSE]
      res <- suppSubset(
        x = inpDat,
        k = k,
        importance = importance,
        alpha = alpha)
      xAnon <- res$xAnon
    } else {
      # no strata but subsets of key variables (combs)
      counter <- 0
      tmpDat <- copy(x)
      for (gr in seq_along(tree)) {
        cur_k <- k[gr]
        #log <- paste0("providing ",cur_k,"-Anonymity for ",ncol(tree[[gr]])," combinations ")
        #log <- paste0(log, "of ",combs[gr]," key variables.\n")
        #message(log)
        for (comb in 1:ncol(tree[[gr]])) {
          #message("combination",comb,"|",ncol(tree[[gr]]),"\n")
          counter <- counter + 1
          kV <- tree[[gr]][, comb]
          cur_importance <- rank(importance[kV], ties.method = "min")
          inpDat <- tmpDat[, kV, with = F]
          res <- suppSubset(
            x = inpDat,
            k = cur_k,
            importance = cur_importance,
            alpha = alpha)

          # replace: is there a more elegant way?
          for (z in 1:length(kV)) {
            set(tmpDat, i = NULL, j = kV[z], res$xAnon[[z]])
          }
        }
      }
      # prepare output
      xAnon <- tmpDat
    }
  } else {
    ## we want k-anonymity in each strata!
    inpDat <- x[, keyVars, with = FALSE]
    inpDat[, strata := apply(x[, strataVars, with = FALSE], 1, paste, collapse = "-")]
    inpDat[, sortid := 1:nrow(inpDat)]
    spl <- split(inpDat, inpDat$strata)

    # to be able to sort back later!
    sortid <- as.numeric(unlist(sapply(spl, function(x) {
      x[["sortid"]]
    })))

    supps <- xAnon <- vector("list", length = length(spl))
    totalSupps <- rep(NA, length(spl))
    if (is.null(combs)) {
      # todo: using parallel/mclapply?
      for (i in seq_along(spl)) {
        res <- suppSubset(
          x = spl[[i]][, keyVars, with = FALSE],
          k = k,
          importance = importance,
          alpha = alpha)
        xAnon[[i]] <- res$xAnon
      }
    } else {
      # local Suppression by strata and combination of subsets!
      for (i in seq_along(spl)) {
        counter <- 0
        tmpDat <- copy(spl[[i]])
        for (gr in seq_along(tree)) {
          cur_k <- k[gr]
          #log <- paste0("providing ",cur_k,"-Anonymity for ",ncol(tree[[gr]])," combinations ")
          #log <- paste0(log, "of ",combs[gr]," key variables in strata ", names(spl)[i],"!\n")
          #message(log)
          for (comb in 1:ncol(tree[[gr]])) {
            counter <- counter + 1
            kV <- tree[[gr]][, comb]
            cur_importance <- rank(importance[kV], ties.method = "min")
            inpDat <- tmpDat[, kV, with = FALSE]
            res <- suppSubset(
              x = inpDat,
              k = cur_k,
              importance = cur_importance,
              alpha = alpha)

            # replace: is there a more elegant way?
            for (z in 1:length(kV)) {
              set(tmpDat, i = NULL, j = kV[z], res$xAnon[[z]])
            }
          }
        }
        # prepare output
        tmpDat[, sortid := NULL]
        xAnon[[i]] <- tmpDat
      }
    }
    xAnon <- rbindlist(xAnon)
    xAnon[, sortid := sortid]
    setkey(xAnon, sortid)
    xAnon[, sortid := NULL]
  }

  ## compute number of suppressions
  if (is.null(strataVars)) {
    totalSupps <- xAnon[, lapply(.SD, sum_na), .SDcols = keyVars]
    supps <- totalSupps - NAinKey
    totalSupps <- as.data.frame(totalSupps)
    supps <- as.data.frame(supps)
  } else {
    if (is.null(combs)) {
      xAnon[, strata := inpDat[, strata]]
    }
    totalSupps <- xAnon[, lapply(.SD, sum_na), .SDcols = keyVars, by = "strata"]
    totalSupps_tot <- xAnon[, lapply(.SD, sum_na), .SDcols = keyVars]
    totalSupps_tot[, strata := "Total"]
    totalSupps_tot <- totalSupps_tot[, c("strata", keyVars), with = FALSE]
    totalSupps <- rbind(totalSupps, totalSupps_tot)
    xAnon[, strata := NULL]

    supps <- copy(totalSupps)
    supps[, c(keyVars) := totalSupps[, keyVars, with = FALSE] - NAinKey[, keyVars, with = FALSE]]

    supps <- as.data.frame(supps)
    rownames(supps) <- supps$strata; supps$strata <- NULL
    totalSupps <- as.data.frame(totalSupps)
    rownames(totalSupps) <- totalSupps$strata; totalSupps$strata <- NULL
  }

  newSupps <- utils::tail(rowSums(supps), 1)
  res <- list(
    xAnon = as.data.frame(xAnon),
    supps = supps,
    totalSupps = totalSupps,
    newSupps = newSupps,
    anonymity = TRUE,
    keyVars = keyVars,
    strataVars = strataVars,
    importance = importance,
    k = k,
    threshold = NA,
    combs = combs
  )
  class(res) <- "localSuppression"
  invisible(res)
}

#' Print method for objects from class localSuppression
#'
#' @param x object from class localSuppression
#' @param \dots Additional arguments passed through.
#' @return Information about the frequency counts for key variables for object
#' of class \sQuote{localSuppression}.
#' @author Matthias Templ
#' @seealso \code{\link{localSuppression}}
#' @keywords print
#' @method print localSuppression
#' @export
#' @examples
#' ## example from Capobianchi, Polettini and Lucarelli:
#' data(francdat)
#' l1 <- localSuppression(francdat, keyVars=c(2,4,5,6))
#' l1
#'
print.localSuppression <- function(x, ...) {
  byStrata <- ifelse(nrow(x$totalSupps) == 1, FALSE, TRUE)
  totSupps <- utils::tail(rowSums(x$totalSupps), 1)
  addSupps <- utils::tail(rowSums(x$supps), 1)

  pp <- "\n-----------------------\n"
  pp <- paste0(pp, "Total number of suppressions in the key variables: ", totSupps," (new: ", addSupps, ")\n\n")
  if (!is.na(x$threshold)) {
    pp <- paste0(pp, "Number of suppressions by key variables:\n\n")
    message(pp)
    print(x$supps)

    kv <- x$keyVars[which.max(x$importance)]
    pp <- paste0("\nThe last application of localSupp() resulted")
    pp <- paste0(pp, " in ", x$newSupps, " additional suppressions in variable ", dQuote(kv), ".\n")
    pp <- paste0(pp, "These observations had individual risks >= ", x$threshold, ".\n")
    message(pp)
    return(invisible(NULL))
  }

  if (byStrata) {
    pp <- paste0(pp, "Number of suppressions by key variables and strata:\n(in parenthesis, the total number suppressions is shown)\n\n")
  } else {
    pp <- paste0(pp, "Number of suppressions by key variables:\n(in parenthesis, the total number suppressions is shown)\n\n")
  }

  dt <- x$supps
  for (i in 1:ncol(dt)) {
    dt[[i]] <- paste0(dt[[i]], " (",x$totalSupps[[i]],")")
  }
  message(pp)
  print(dt)

  if (byStrata==TRUE) {
    if (is.null(x$combs)) {
      if (all(x$anonymity)) {
        pp <- paste0("\n", x$k, "-anonymity == TRUE in all strata!\n")
      } else {
        prob <- rownames(x$supps)[which(!x$anonymity)]
        pp <- paste0("\n", x$k, "-anonymity == FALSE in the following strata:\n")
        pp <- paste0(pp, paste0(rownames(x$supps)[which(!x$anonymity)], collapse=", "))
      }
    } else {
      pp <- "\nk-anonymity has been achieved within strata and for combinations of key-variables!"
      for (i in 1:length(x$combs)) {
        pp <- paste(pp, paste0("\n", x$k[i], "-anonymity == ", all(x$anonymity)," for all ",x$combs[i],"-dimensional subsets of key variables within stratas"))
      }
      pp <- paste0(pp,"\n")
    }
  } else {
    if (!is.null(x$combs)) {
      pp <- NULL
      for (i in 1:length(x$combs)) {
        pp <- paste(pp, paste0("\n", x$k[i], "-anonymity == ", all(x$anonymity)," for all ",x$combs[i],"-dimensional subsets of key variables"))
      }
      pp <- paste(pp, "\n")
    } else {
      pp <- paste0("\n", x$k, "-anonymity == ", all(x$anonymity),"\n")
    }
  }
  pp <- paste0(pp, "-----------------------\n")
  message(pp)
  invisible(NULL)
}

#' Plots for localSuppression objects
#'
#' This function creates barplots to display the number of suppressed values
#' in categorical key variables to achieve `k`-anonymity.
#'
#' @param x object of derived from [localSuppression()]
#' @param ... Additional arguments, currently available are:
#' - `"showDetails"`: logical, if set, a plot of suppressions by
#' strata is shown (if possible)
#' @author Bernhard Meindl, Matthias Templ
#' @md
#' @seealso [localSuppression()]
#' @keywords plot
#' @method plot localSuppression
#' @export
#' @return a `ggplot` plot object
#' @examples
#' data(francdat)
#' @export plot.localSuppression
plot.localSuppression <- function(x, ...) {
  vals <- NULL
  byStrata <- !is.null(x$strataVars)
  params <- list(...)
  inp <- x$supps

  showDetails <- FALSE
  if (byStrata & !is.null(params$showDetails)) {
    if (params$showDetails) {
      showDetails <- TRUE
    }
  }

  # add overall suppressions if localSuppression was applied per strata
  if (!showDetails) {
    inp <- as.data.frame(t(apply(inp, 2, sum)))
    rownames(inp) <- "Overall"
  }

  keyVar <- rep(x$keyVars, nrow(inp))
  if (any(nchar(keyVar) >= 12)) {
    warnMsg <- "Too long variable names are cutted!\n"
    warning(warnMsg)
    keyVar <- substr(keyVar, 1, 12)
  }

  # create ggplot2-input
  df <- data.frame(
    keyVar = keyVar,
    strata = rep(rownames(inp), each = ncol(inp)),
    vals = as.vector(as.matrix(t(inp))))

  # barplot
  p <- ggplot(df, aes(x = keyVar, y = vals, fill = "darkgrey"))
  p <- p + geom_bar(color = "black", stat = "identity")
  p <- p + geom_text(aes(y = vals *  1.015, label = vals, vjust = 0), size = 3)
  if (nrow(inp) > 1) {
    p <- p + facet_wrap(~strata)
    p <- p + ggtitle("Number of suppressions to achieve k-anonymity by strata")
  } else {
    p <- p + ggtitle("Number of suppressions to achieve k-anonymity.")
  }
  p <- p + labs(x = "Key variables", y = "Number of suppressions") + theme(legend.position = "none")
  p <- p + theme(axis.text.x = element_text(angle = 90, hjust = 1))
  p
}

#' @rdname localSuppression
#' @export
kAnon <- function(obj, k=2, importance=NULL, combs=NULL, ...) {
  localSuppression(
    obj = obj,
    k = k,
    importance = importance,
    combs = combs,
    ...)
}
sdcTools/sdcMicro documentation built on March 15, 2024, 12:32 p.m.