R/printFunctions.R
In sdcMicro: Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

Documented in freq

# format as string with given number of digits
prettyF <- function(inp, digits=3) {
  formatC(as.numeric(inp), format="f", digits=digits)
}

#' Freq
#'
#' Extract sample frequency counts (fk) or estimated population frequency counts (Fk)
#'
#' @param obj an \code{\link{sdcMicroObj-class}}-object
#' @param type either \code{'fk'} or \code{'FK'}
#' @author Bernhard Meindl
#' @return a vector containing sample frequencies or weighted frequencies
#' @export
#' @examples
#' data(testdata)
#' sdc <- createSdcObj(testdata,
#'   keyVars=c('urbrur','roof','walls','relat','sex'),
#'   pramVars=c('water','electcon'),
#'   numVars=c('expend','income','savings'), w='sampling_weight')
#' head(freq(sdc, type="fk"))
#' head(freq(sdc, type="Fk"))
freq <- function(obj, type="fk") {
  freqX(obj, type=type)
}

setGeneric("freqX", function(obj, type="fk") {
  standardGeneric("freqX")
})

setMethod(f="freqX", signature=c("sdcMicroObj"),
definition = function(obj, type="fk") {
  if (type == "fk")
    ret <- obj@risk$individual[, 2] else if (type == "Fk")
    ret <- obj@risk$individual[, 3] else stop(paste("type=", type, "is unknown."))
  return(ret)
})

#' Print and Extractor Functions for objects of class \code{\link{sdcMicroObj-class}}
#'
#' Descriptive print function for Frequencies, local Supression, Recoding,
#' categorical risk and numerical risk.
#'
#' Possible values for the type argument of the print function are: "freq": for
#' Frequencies, "ls": for Local Supression output, "pram": for results of
#' post-randomization "recode":for Recodes, "risk": forCategorical risk and
#' "numrisk": for Numerical risk.
#'
#' Possible values for the type argument of the freq function are: "fk": Sample
#' frequencies and "Fk": weighted frequencies.
#'
#' @name print.sdcMicroObj
#' @aliases print,sdcMicroObj-method
#' @docType methods
#' @param x An object of class \code{\link{sdcMicroObj-class}}
#' @param type Selection of the content to be returned or printed
#' @param docat logical, if TRUE (default) the results will be actually printed
#' @param ... the type argument for the print method, currently supported are:
#' \itemize{
#' \item general: basic information on the input obj such as the number of observations
#' and variables.
#' \item kAnon: displays information about 2- and 3-anonymity
#' \item ls: displays various information if local suppression has been applied.
#' \item pram: displays various information if post-randomization has been applied.
#' \item recode: shows information about categorical key variables before and after recoding
#' \item risk: displays information on re-identification risks
#' \item numrisk: displays risk- and utility measures for numerical key variables
#' }
#' @author Alexander Kowarik, Matthias Templ, Bernhard Meindl
#' @keywords classes
#' @export
#' @examples
#' data(testdata)
#' \donttest{
#' sdc <- createSdcObj(testdata,
#'   keyVars=c('urbrur','roof','walls','relat','sex'),
#'   pramVars=c('water','electcon'),
#'   numVars=c('expend','income','savings'), w='sampling_weight')
#' sdc <- microaggregation(sdc, method="mdav", aggr=3)
#' print(sdc)
#' print(sdc, type="general")
#' print(sdc, type="ls")
#' print(sdc, type="recode")
#' print(sdc, type="risk")
#' print(sdc, type="numrisk")
#' print(sdc, type="pram")
#' print(sdc, type="kAnon")
#' print(sdc, type="comp_numvars")
#' }
setMethod(f = "print", signature = c("sdcMicroObj"),
definition = function(x, type = "kAnon", docat=TRUE, ...) {
  if (!type %in% c("kAnon","ls","pram","recode","risk","numrisk","general", "comp_numvars")) {
    stop(paste("type=", type, "is unknown."))
  }

  # type=kAnon,ls,recode,risk,numrisk,general
  obj <- x
  hr <- paste0(rep("-", 70), collapse="")
  tabsp <- "  "
  n <- nrow(get.sdcMicroObj(x, "origData"))
  cn <- colnames(get.sdcMicroObj(x, "origData"))

  TFchange <- TRUE
  if ( identical(obj@risk$individual, obj@originalRisk$individual) ) {
    TFchange <- FALSE
  }

  if (type=="kAnon") {
    txt_f <- paste0("Infos on 2/3-Anonymity:\n\n")
    txt_f <- paste0(txt_f, "Number of observations violating\n")
    txt_f <- paste0(txt_f, tabsp,"- 2-anonymity: ")
    f_2anon <- sum(obj@risk$individual[,2]<2)
    f_2anon_o <- sum(obj@originalRisk$individual[,2]<2)

    f_3anon <- sum(obj@risk$individual[,2]<3)
    f_3anon_o <- sum(obj@originalRisk$individual[,2]<3)

    f_5anon <- sum(obj@risk$individual[,2]<5)
    f_5anon_o <- sum(obj@originalRisk$individual[,2]<5)

    f_2anon_p <- prettyF(100*f_2anon/n); f_2anon_op <- prettyF(100*f_2anon_o/n)
    f_3anon_p <- prettyF(100*f_3anon/n); f_3anon_op <- prettyF(100*f_3anon_o/n)
    f_5anon_p <- prettyF(100*f_5anon/n); f_5anon_op <- prettyF(100*f_5anon_o/n)

    if (TFchange) {
      txt_f <- paste0(txt_f, f_2anon, " (",f_2anon_p,"%) | in original data: ",f_2anon_o," (",f_2anon_op,"%)\n")
    } else {
      txt_f <- paste0(txt_f, f_2anon_o," (",f_2anon_op,"%)\n")
    }
    txt_f <- paste0(txt_f, tabsp,"- 3-anonymity: ")
    if (TFchange) {
      txt_f <- paste0(txt_f, f_3anon, " (",f_3anon_p,"%) | in original data: ",f_3anon_o," (",f_3anon_op,"%)\n")
    } else {
      txt_f <- paste0(txt_f, f_3anon_o," (",f_3anon_op,"%)\n")
    }
    txt_f <- paste0(txt_f, tabsp,"- 5-anonymity: ")
    if (TFchange) {
      txt_f <- paste0(txt_f, f_5anon, " (",f_5anon_p,"%) | in original data: ",f_5anon_o," (",f_5anon_op,"%)\n")
    } else {
      txt_f <- paste0(txt_f, f_5anon_o," (",f_5anon_op,"%)\n")
    }
    txt_f <- paste0(txt_f, "\n",hr,"\n\n")
    if (docat) {
      message(txt_f)
    }
    return(invisible(list(
      "2anon"=list(orig=f_2anon_o, orig_p=f_2anon_op, mod=f_2anon, mod_p=f_2anon_p),
      "3anon"=list(orig=f_3anon_o, orig_p=f_3anon_op, mod=f_3anon, mod_p=f_3anon_p),
      "5anon"=list(orig=f_5anon_o, orig_p=f_5anon_op, mod=f_5anon, mod_p=f_5anon_p))))
  }

  if (type=="risk") {
    risk <- obj@risk
    originalRisk <- obj@originalRisk

    # individual risk in original/modified data
    iR_mod <- risk$individual[,1]
    iR_orig <- originalRisk$individual[,1]

    # risky obs
    s_mod <-sum((iR_mod > median(iR_mod) + 2*mad(iR_mod)) & (iR_mod>0.1))
    s_orig <- sum((iR_orig > median(iR_orig) + 2*mad(iR_orig)) & (iR_orig>0.1))

    # expected re-identifications for original/modified data
    exp_ident <- prettyF(round(risk$global$risk_ER, 2), digits=2)
    exp_ident_p <- prettyF(round(risk$global$risk_pct, 2), digits=2)
    exp_ident_o <- prettyF(round(originalRisk$global$risk_ER, 2), digits=2)
    exp_ident_op <- prettyF(round(originalRisk$global$risk_pct, 2), digits=2)

    txt_r <- paste0("Risk measures:\n\n")
    txt_r <- paste0(txt_r, "Number of observations with higher risk than the main part of the data: ")
    if ( TFchange ) {
      txt_r <- paste0(txt_r,"\n",tabsp,"in modified data: ",s_mod,"\n")
      txt_r <- paste0(txt_r,tabsp,"in original data: ",s_orig,"\n")
    } else {
      txt_r <- paste0(txt_r, s_mod,"\n")
    }
    txt_r <- paste0(txt_r, "Expected number of re-identifications: ")
    if (TFchange) {
      txt_r <- paste0(txt_r, "\n",tabsp,"in modified data: ",exp_ident," (",exp_ident_p," %)\n")
      txt_r <- paste0(txt_r, tabsp,"in original data: ",exp_ident_o," (",exp_ident_op," %)\n")
    } else {
      txt_r <- paste0(txt_r, exp_ident," (",exp_ident_p," %)\n")
    }

    if ("hier_risk_ER" %in% names(risk$global)) {
      if (!is.na(risk$global$hier_risk_ER)) {
        txt_r <- paste0(txt_r, "\nInformation on hierarchical risk:\n")

        hR <- prettyF(round(risk$global$hier_risk_ER, 2), digits=2)
        hR_p <- prettyF(round(risk$global$hier_risk_pct, 2), digits=2)
        hR_o <- prettyF(round(originalRisk$global$hier_risk_ER,2), digits=2)
        hR_op <- prettyF(round(originalRisk$global$hier_risk_pct, 2), digits=2)

        txt_r <- paste0(txt_r, "Expected number of re-identifications: ")
        if (TFchange) {
          txt_r <- paste0(txt_r, "\n",tabsp,"in modified data: ",hR," (",hR_p," %)\n")
          txt_r <- paste0(txt_r, tabsp,"in original data: ",hR_o," (",hR_op," %)\n")
        } else {
          txt_r <- paste0(txt_r, hR," (",hR_p," %)\n")
        }
      } else {
        txt_r <- paste0(txt_r, tabsp, "Hierarchical risk is not available!\n")
      }
      txt_r <- paste0(txt_r, hr, "\n\n")
    }
    if (docat) {
      message(txt_r)
    }
    out <- list(
      "riskyObs"=list(orig=s_orig, mod=s_mod),
      "reident"=list(orig=exp_ident_o, orig_p=exp_ident_op, mod=exp_ident, mod_p=exp_ident_p))
    if ("hier_risk_ER" %in% names(risk$global)) {
      out <- list(out,
        "hierRisk"=list(orig=hR_o, orig_p=hR_op, mod=hR, mod_p=hR_p)
      )
    } else {
      out <- list(out, "hierRisk"=NA)
    }
    return(invisible(out))
  }

  if (type=="pram") {
    pp <- get.sdcMicroObj(obj, type="pram")
    if (is.null(pp)) {
      if (docat) {
        message(paste0("PRAM has not been applied!\n"))
      }
      return(invisible(NULL))
    }
    ss <- pp$summary
    params <- pp$params
    if (docat) {
      message("Post-Randomization (PRAM):\n")
      for (i in 1:nrow(ss)) {
        message("Variable:",ss$variable[i],"\n")
        message("--> final Transition-Matrix:\n")
        print(params[[i]]$Rs)
      }
      message("\nChanged observations:\n")
      print(ss)
      message(hr,"\n\n")
    }
    return(invisible(list(
      "pram_summary"=ss,
      "params"=params
    )))
  }

  if (type=="ls") {
    ls <- get.sdcMicroObj(obj, "localSuppression")
    if (is.null(ls)) {
      if (docat) {
        message(paste0("Local suppression has not been applied!\n"))
      }
      return(invisible(NULL))
    }
    keyVars <- colnames(obj@manipKeyVars)
    txt_ls <- paste0("Local suppression")
    supps <- as.data.table(ls$supps)
    suppsT <- as.data.table(ls$totalSupps)
    if (nrow(supps) > 1) {
      txt_ls <- paste0(txt_ls, " (applied per strata given by variable(s) ", paste0(ls$strataVars, collapse=", "),")\n")
    } else {
      txt_ls <- paste0(txt_ls, ":\n")
    }
    supps_perc <- prettyF(as.numeric(100*(supps[nrow(supps)]/nrow(obj@origData))), digits=3)
    supps_percT <- prettyF(as.numeric(100*(suppsT[nrow(suppsT)]/nrow(obj@origData))), digits=3)

    out <- data.table(keyVars, xx="|", supps=as.integer(supps[nrow(supps)]), y="|",supps_perc)
    setnames(out, c("KeyVar", "|","Suppressions (#)","|","Suppressions (%)"))
    outT <- data.table(keyVars, supps=as.integer(suppsT[nrow(suppsT)]), supps_percT)
    setnames(outT, c("KeyVar", "Suppressions (#)", "Suppressions (%)"))

    if (docat) {
      message(txt_ls)
      print(out, row.names=F)
      message(hr,"\n\n")
    }
    return(invisible(list(
      "supps"=out[,c(1,3,5), with=FALSE],
      "suppsT"=outT,
      "threshold"=ls$threshold,
      "strataVars"=ls$strataVars
    )))
  }

  if (type=="recode") {
    df.o <- as.data.table(get.sdcMicroObj(obj, "origData"))
    df.o <- df.o[,get.sdcMicroObj(obj, "keyVars"),with=F]
    df.rec <- as.data.table(get.sdcMicroObj(obj, "manipKeyVars"))

    # number of categories (inkl. NA)
    .stats <- function(x) {
      tab_x <- table(x)
      res <- c(
        length(unique(x)),
        mean(tab_x))

      if (any(tab_x) > 0) {
        return(c(res, min(tab_x[tab_x>0])))
      } else {
        return(c(res, 0))
      }
    }

    stats_o <- t(df.o[,lapply(.SD, function(x) .stats(x))])
    stats_rec <- t(df.rec[,lapply(.SD, function(x) .stats(x))])

    stats_o <- cbind(names(df.o), stats_o)
    stats_rec <- cbind(names(df.rec), stats_rec)

    stats_rec <- cbind(stats_rec[,1], stats_rec[,2], paste0("(",stats_o[,2],")"),
      prettyF(stats_rec[,3]), paste0("(",prettyF(stats_o[,3]),")"),
      stats_rec[,4], paste0("(",stats_o[,4],")"))
    stats_rec <- as.data.table(stats_rec)
    setnames(stats_rec, c("Key Variable","Number of categories","","Mean size","","Size of smallest (>0)",""))

    txt_rec <- paste0("Information on categorical key variables:\n")
    txt_rec <- paste0(txt_rec,"\nReported is the number, mean size and size of the smallest category >0 for recoded variables.\n")
    txt_rec <- paste0(txt_rec, "In parenthesis, the same statistics are shown for the unmodified data.\n")
    txt_rec <- paste0(txt_rec, "Note: NA (missings) are counted as seperate categories!\n\n")

    if (docat) {
      message(txt_rec)
      print(stats_rec, nrows=nrow(stats_rec), row.names=FALSE)
      message(hr,"\n\n")
    }
    return(invisible(list(
      "keyVars"=stats_rec[[1]],
      "categories"=list(orig=stats_rec[[2]], mod=stats_rec[[3]]),
      "meansize"=list(orig=stats_rec[[4]], mod=stats_rec[[5]]),
      "minsize"=list(orig=stats_rec[[6]], mod=stats_rec[[7]])
    )))
  }

  if (type=="numrisk") {
    if (length(obj@numVars)==0) {
      return(NULL)
    }
    risk <- obj@risk
    utility <- obj@utility

    txt_r <- paste0("Numerical key variables: ", paste(cn[x@numVars], collapse = ", "), "\n\n")
    risk_up <- prettyF(round(100 * risk$numeric, 2),2)

    if (!is.null(utility)) {
      il1 <- prettyF(round(utility$il1, 2),2)
      diff_eigen <- prettyF(round(utility$eigen * 100, 2))
    } else {
      il1 <- diff_eigen <- NA
    }

    if (TFchange) {
      txt_r <- paste0(txt_r, "Disclosure risk (~100.00% in original data):\n")
      txt_r <- paste0(txt_r, tabsp,"modified data: [0.00%; ",risk_up,"%]\n\n")
      if (!is.null(utility)) {
        txt_r <- paste0(txt_r, "Current Information Loss in modified data (0.00% in original data):\n")
        txt_r <- paste0(txt_r, tabsp, "IL1: ",il1,"\n")
        txt_r <- paste0(txt_r, tabsp, "Difference of Eigenvalues: ",diff_eigen,"%\n")
      }
    } else {
      txt_r <- paste0(txt_r, "Disclosure risk is currently between [0.00%; ",risk_up,"%]\n\n")
      if (!is.null(utility)) {
        txt_r <- paste0(txt_r, "Current Information Loss:\n")
        txt_r <- paste0(txt_r, tabsp, "- IL1: ",il1,"\n")
        txt_r <- paste0(txt_r, tabsp, "- Difference of Eigenvalues: ",diff_eigen,"%\n")
      }
    }
    txt_r <- paste0(txt_r, hr, "\n\n")
    if (docat) {
      message(txt_r)
    }
    return(invisible(list(
      "numVars"=cn[x@numVars],
      "riskOrig"=100,
      "risk_up"=risk_up,
      "il1"=il1,
      "diff_eigen"=diff_eigen
    )))
  }

  if (type=="general") {
    dims <- dim(get.sdcMicroObj(x, "origData"))
    txt <- paste("The input dataset consists of", dims[1], "rows and", dims[2], "variables.\n")

    delVars <- x@deletedVars
    if (!is.null(delVars)) {
      txt <- paste0(txt, "\nThe following variables have been deleted are not available in the output dataset:\n")
      for (i in 1:length(delVars)) {
        txt <- paste0(txt, tabsp, "--> ", delVars[i],"\n")
      }
      txt <- paste0(txt, "\n\n")
    }

    kV <- get.sdcMicroObj(x, "keyVars")
    if (length(kV) > 0) {
      txt <- paste0(txt, tabsp, "--> Categorical key variables: ", paste(cn[kV], collapse = ", "), "\n")
    }
    nV <- get.sdcMicroObj(x, "numVars")
    if (length(nV) > 0) {
      txt <- paste0(txt, tabsp, "--> Numerical key variables: ", paste(cn[nV], collapse = ", "), "\n")
    }
    wV <- get.sdcMicroObj(x, "weightVar")
    if (length(wV) > 0) {
      txt <- paste0(txt, tabsp, "--> Weight variable: ", cn[wV], "\n")
    }
    sV <- get.sdcMicroObj(x, "strataVar")
    if (length(sV) > 0) {
      txt <- paste0(txt, tabsp, "--> Strata variable(s): ", paste(cn[sV], collapse = ", "), "\n")
    }
    hhV <- get.sdcMicroObj(x, "hhId")
    if (length(hhV) > 0) {
      txt <- paste0(txt, tabsp, "--> Cluster/Household-Id variable: ", cn[hhV], "\n")
    }
    gV <- get.sdcMicroObj(x, "ghostVars")
    if (length(gV) > 0) {
      txt <- paste0(txt, tabsp, "--> Ghost variable(s) exist\n")
      for (i in 1:length(gV)) {
        cur_kv <- cn[gV[[i]][[1]]]
        cur_gv <- cn[gV[[i]][[2]]]
        txt <- paste0(txt, tabsp,tabsp,"Variable(s) ",paste(cur_gv, collapse = ", ")," are linked to key variable ",cur_kv,"\n")
        gV[[i]][[1]] <- cur_kv
        gV[[i]][[2]] <- cur_gv
      }
    }
    txt <- paste0(txt, hr,"\n\n")
    if (docat) {
      message(txt)
    }
    return(invisible(list(
      "dims"=dims,
      "delVars"=delVars,
      "keyVars"=cn[kV],
      "numVars"=cn[nV],
      "weightVar"=cn[wV],
      "strataVar"=cn[sV],
      "householdId"=cn[hhV],
      "ghostVars"=gV
    )))
  }

  if (type=="comp_numvars") {
    orig <- get.sdcMicroObj(x, type="origData")
    numvars <- colnames(orig)[get.sdcMicroObj(x, type="numVars")]
    if (length(numvars) == 0) {
      return(invisible(NULL))
    }
    nv_o <- orig[, numvars, drop=F]
    nv_m <- get.sdcMicroObj(x, "manipNumVars")
    if (docat) {
      message("Compare original and modified numeric key variables\n\n")
    }
    out <- list(
      "numVars"=numvars,
      "results"=list()
    )
    anyChanges <- FALSE # Check if any changes
    for (i in 1:ncol(nv_o)) {
      if (identical(nv_o[,i], nv_m[,i])) {
        # Add identical columns to output for GUI
        summary_o <- summary_m <- summary(nv_o[,i])
        dt <- as.data.table(rbind(c("orig",as.numeric(summary_o)), c("modified",as.numeric(summary_m))))
        setnames(dt, c("Type",names(summary_o)))
        out$results[[length(out$results)+1]] <- dt
        if (docat) {
          message(paste0(tabsp,"Variable ", shQuote(numvars[i])," has not been modified.\n\n"))
        }
      } else {
        anyChanges <- TRUE
        summary_o <- summary(nv_o[,i])
        summary_m <- summary(as.numeric(nv_m[,i]))
        val_cor <- cor(nv_o[,i], nv_m[,i], use="pairwise.complete.obs")
        dt <- as.data.table(rbind(c("orig",as.numeric(summary_o)), c("modified",as.numeric(summary_m))))
        setnames(dt, c("Type",names(summary_o)))
        if (docat) {
          message(paste0(tabsp,"Variable ", shQuote(numvars[i])," has been modified. The correlation is ",prettyF(val_cor),"\n\n"))
          print(dt)
          message("\n")
        }
        out$results[[length(out$results)+1]] <- dt
      }
    }
    if (!anyChanges) {
      out$results <- NULL
    }
    if (docat) {
      message(hr,"\n\n")
    }
    return(invisible(out))
  }
})
Any scripts or data that you put into this service are public.
sdcMicro documentation built on Aug. 22, 2025, 5:13 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
sdcMicro
Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

R/printFunctions.R
In sdcMicro: Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

Defines functions freq prettyF

Documented in freq

Try the sdcMicro package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sdcMicro Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

R/printFunctions.R In sdcMicro: Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

Defines functions freq prettyF

Documented in freq

Try the sdcMicro package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sdcMicro
Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation

R/printFunctions.R
In sdcMicro: Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation