Nothing
#' Calculates file-level risk measures using a loglinear model.
#'
#' @param data Data frame containing the data to be evaluated.
#' @param weight Column name for sampling weights.
#' @param varpool Vector of column names to be used in model.
#' @param degree Highest degree of interaction terms to be used in the model.
#' @param numiter Maximum number of iterations to run iterative proportional
#' fitting for the loglinear model.
#' @param epsilon Maximum deviation allowed between observed and fitted margins.
#' @param blanks_as_missing If TRUE, character and factor variables that are
#' blank or pure whitespace are treated as missing values.
#' @param output_filename Name of the csv file to save the data set with
#' record-level risk measures, .tau1_rec and .tau2_rec, attached. NULL if no
#' output file is to be saved.
#' @details The data should not contain any missing values among \code{varpool}
#' variables or the \code{weight} variable.
#' @return An object of type \code{sdc_loglinear} containing calculated risk
#' measures.
#' @examples
#' data(exampledata)
#' vars <- c("BORNUSA", "CENREG", "DAGE3", "DRACE3", "EDUC3", "GENDER")
#' wgt <- "WEIGHT"
#'
#' results <- sdc_loglinear(exampledata, wgt, vars, degree=3)
#' print(results)
#' plot(results, plotvar1="BORNUSA", plotvar2="WEIGHT")
#' @export
sdc_loglinear <- function(data,
weight,
varpool,
degree=2,
numiter=40,
epsilon=0.001,
blanks_as_missing=TRUE,
output_filename=NULL) {
# Input validation -----------------------------------------------------------
sdc_loglinear_input_checks(data,
weight,
varpool,
degree,
numiter,
epsilon,
blanks_as_missing,
output_filename)
pi_results <- loglinear_workhorse(data,
weight,
varpool,
degree,
numiter,
epsilon,
fixed_pi=TRUE)
pi_k_results <- loglinear_workhorse(data,
weight,
varpool,
degree,
numiter,
epsilon,
fixed_pi=FALSE)
tables <- list(pi_results=pi_results$statistics,
pi_k_results=pi_k_results$statistics)
data_with_statistics <- merge(pi_k_results$data_with_statistics,
pi_results$data_with_statistics)
return_value <- list(ll_tables=tables,
data_with_statistics=data_with_statistics,
options=list(weight=weight,
varpool=varpool,
degree=degree,
numiter=numiter,
epsilon=epsilon,
blanks_as_missing=blanks_as_missing,
output_filename=output_filename))
if(!is.null(output_filename)) {
write.csv(data_with_statistics,
output_filename,
row.names=FALSE)
} # end if
class(return_value) <- "sdc_loglinear"
return(return_value)
} # end 'loglinear_nway'
# ------------------------------------------------------------------------------
# This does most of the work within sdc_loglinear
loglinear_workhorse <- function(data,
weight,
varpool,
degree=2,
numiter=40,
epsilon=0.01,
fixed_pi=TRUE) {
# Get tables -----------------------------------------------------------------
freq_tbl <- get_table(data, varpool, weight)
tmp <- setNames(lapply(varpool,
function(variable) unique((data[[variable]]))),
varpool)
grid <- merge(expand.grid(tmp), freq_tbl, all.x=TRUE)
grid[is.na(grid)] <- 0
# Check number of sample uniques ---------------------------------------------
num_unique <- sum(grid$.unweighted_freq == 1)
if(num_unique == 1)
warning(paste("Only 1 sample unique. Some statistics will not",
"be available and appear as NA."))
# Get average cell size ------------------------------------------------------
avg_cell_size <- get_average_cell_size(data, varpool)
# If no sample uniques, stop. ------------------------------------------------
if(num_unique == 0) {
values <- as.matrix(data.frame(sampsize=nrow(data),
avg_cell_size=avg_cell_size,
tau1Risk=0,
tau2Risk=0,
tau1=0,
tau2=0,
samp_unique=FALSE))
row.names(values) <- NULL
return(values)
} # end if
# Fit model ------------------------------------------------------------------
varterms <- paste(varpool, collapse=" + ")
formula <- as.formula(paste(".weighted_freq ~", varterms))
if(degree == 1)
formula2 <- as.formula(paste("~", varterms))
else
formula2 <- as.formula(paste0("~ (", varterms, ")^", degree))
tab <- xtabs(formula, grid)
model <- MASS::loglm(formula2, tab, fitted=TRUE, iter=numiter, eps=epsilon)
# Get lambdas ----------------------------------------------------------------
lambda_tbl <- as.data.frame.table(fitted(model))
names(lambda_tbl)[names(lambda_tbl) == "Freq"] <- ".lambda"
grid <- merge(grid, lambda_tbl, all.x=TRUE)
lambda <- grid$.lambda
# Get number of samples per cell, find sample uniques ------------------------
nsamp <- grid$.unweighted_freq
unique_sample <- nsamp == 1
# Calculate statistics -------------------------------------------------------
nsamp <- grid$.unweighted_freq
unique_sample <- nsamp == 1
pi <- rep(nrow(data)/sum(data[[weight]]), length(unique_sample))
if(!fixed_pi) {
pi_k <- grid$.unweighted_freq/grid$.weighted_freq
pi <- ifelse(nsamp == 0, pi, pi_k)
}
wgt <- ifelse(grid$.unweighted_freq == 0,
0,
grid$.weighted_freq/grid$.unweighted_freq)
p <- ifelse(nsamp >= 1, nsamp/wgt, 0)
arg1 <- ifelse(unique_sample & wgt <= 1, wgt,
ifelse(unique_sample, p/(1 - p)*log(1/p), 0))
arg2 <- ifelse(unique_sample, p, 0)
exp1 <- ifelse(unique_sample,
(1/((1 - pi)*lambda))*(1 - exp(-(1 - pi)*lambda)),
0)
pusu <- ifelse(unique_sample, exp(-(1 - pi)*lambda), 0)
r <- ifelse(lambda != 0,
(1/((1 - pi)*(lambda)))*(1 - exp(-(1 - pi)*lambda)),
0)
a1 <- ifelse(lambda != 0, (1 - pi)*(lambda)*exp(-lambda), 0)
b1 <- ifelse(pi != 0, ((1 - pi)^2)*(1/(2*pi))*(lambda)*exp(-lambda), 0)
aa <- ifelse(lambda != 0,
a1*(nsamp - pi*lambda) + b1*(((nsamp - pi*lambda)^2) - nsamp),
0)
vaa <- a1*a1*pi*lambda + 2*b1*b1*pi*lambda*pi*lambda
a2 <- ifelse(lambda != 0, exp(-pi*lambda)*r - (exp(-lambda)), 0)
b2 <- ifelse(lambda != 0 & pi != 0,
((1/(pi*lambda))*exp(-pi*lambda)*r) -
((1/(pi*lambda))*exp(-lambda)* (1 + .5*((1 - pi)*lambda))),
0)
bb2 <- a2*(nsamp - pi*lambda) + b2*(((nsamp - pi*lambda)^2) - nsamp)
vbb <-a2*a2*pi*lambda+2*b2*b2*pi*lambda*pi*lambda
stderr <- function(x) return(sd(x)/sqrt(length(x)))
aanew <- mean(aa)/stderr(aa) #This is B1/sqrt(Vr)
bbnew <- mean(bb2)/stderr(bb2) #This is B2/sqrt(Vr)
scaa1 <- sum(aa)/sqrt(sum(vaa)) #This is B1/sqrt(V)
scbb2 <- sum(bb2)/sqrt(sum(vbb)) #This is B2/sqrt(V)
B_tau1_type1 <- mean(aa)/stderr(aa) #This is B1/sqrt(Vr)
B_tau1_type2 <- sum(aa)/sqrt(sum(vaa)) #This is B1/sqrt(V)
B_tau2_type1 <- mean(bb2)/stderr(bb2) #This is B2/sqrt(Vr)
B_tau2_type2 <- sum(bb2)/sqrt(sum(vbb)) #This is B2/sqrt(V)
tau2 <- sum(exp1)
tau1 <- sum(pusu)
tau1argus <- sum(arg1)
tau2argus <- sum(arg2)
values <- as.matrix(data.frame(sampsize=nrow(data),
avg_cell_size=avg_cell_size,
tau1Risk=tau1/sum(nsamp),
tau2Risk=tau2/sum(nsamp),
tau1=tau1,
tau2=tau2,
B_tau1_type1=B_tau1_type1,
B_tau1_type2=B_tau1_type2,
B_tau2_type1=B_tau2_type1,
B_tau2_type2=B_tau2_type2,
samp_unique=TRUE))
if(degree > 1) {
values <- rbind(loglinear_workhorse(data,
weight,
varpool,
1,
numiter,
epsilon,
fixed_pi)$statistics,
values)
} # end if
row.names(values) <- NULL
# Add risk measures to data set ----------------------------------------------
keep_names <- !names(grid) %in%
c(".weighted_freq", ".unweighted_freq", ".lambda")
tmp <- grid[,keep_names,drop=FALSE]
tmp$.tau1_rec_cellmean <- pusu
tmp$.tau2_rec_cellmean <- exp1
data <- merge(data, tmp, all.x=TRUE)
if(any(is.na(data$.tau1_rec)))
data$.tau1_rec_cellmean[is.na(data$.tau1_rec_cellmean)] <- 0
if(any(is.na(data$.exp1)))
data$.tau2_rec_cellmean[is.na(data$.tau2_rec_cellmean)] <- 0
if(fixed_pi) {
data$.tau1_rec_overallmean <- data$.tau1_rec_cellmean
data$.tau2_rec_overallmean <- data$.tau2_rec_cellmean
keep_names <- !names(data) %in%
c(".tau1_rec_cellmean", ".tau2_rec_cellmean")
data <- data[,keep_names]
} # end if
# Return statistics and modified data set ------------------------------------
return(list(statistics=values, data_with_statistics=data))
} # end 'loglinear_workhorse'
# ------------------------------------------------------------------------------
#' @describeIn sdc_loglinear S3 print method for sdc_loglinear objects
#'
#' Prints tables of file-level reidentification risk measures.
#'
#' @param x Object of class sdc_loglinear, as returned by sdc_loglinear.
#' @param summary_outfile Name of summary output .txt file. If not NULL, console
#' output is copied to the file. Default is NULL (no logging of output).
#' Errors and warnings are not diverted (consider running in batch mode if
#' logging is needed).
#' @param ... Currently unused. For NextMethod compatibility.
#' @export
print.sdc_loglinear <- function(x,
summary_outfile=NULL,
...) {
# Log output if requested ----------------------------------------------------
if(!is.null(summary_outfile)) {
stopifnot(is.character(summary_outfile))
conn <- file(summary_outfile)
sink(conn, append=FALSE, split=TRUE)
on.exit(sink())
on.exit(close(conn), add=TRUE)
} # end if
degree <- x$options$degree
# Fixed pi -------------------------------------------------------------------
cat("RESULTS - Uses overall average weight\n")
loglinear_print_workhorse(x$ll_tables$pi_results, degree)
# pi and pi_k ----------------------------------------------------------------
cat("\nRESULTS - Uses cell average weight\n")
loglinear_print_workhorse(x$ll_tables$pi_k_results, degree)
return(invisible(NULL))
} # end 'print.sdc_loglinear'
# ------------------------------------------------------------------------------
loglinear_print_workhorse <- function(ll_table, degree) {
if(degree == 1)
interaction <- data.frame(interaction="none")
else
interaction <- data.frame(interaction=c("none", paste0(degree,"-way")))
if(any(!ll_table[,"samp_unique"])) {
interaction$interaction <- "No Sample Uniques"
tbl1 <- cbind(as.data.frame(ll_table)[,c("sampsize", "avg_cell_size")],
interaction,
as.data.frame(ll_table)[,c("tau1Risk", "tau2Risk")])
tbl2 <- as.data.frame(ll_table)[,5:6]
} else {
tbl1 <- cbind(as.data.frame(ll_table)[,c("sampsize", "avg_cell_size")],
interaction,
as.data.frame(ll_table)[,c("tau1Risk", "tau2Risk")])
tbl2 <- as.data.frame(ll_table)[,5:10]
} # end if...else
old_scipen <- options("scipen")$scipen
on.exit(options(scipen=old_scipen))
options(scipen=9)
print(tbl1, row.names=FALSE, digits=5)
cat("\n")
print(tbl2, row.names=FALSE, digits=5)
return(invisible(NULL))
} # end 'loglinear_print_workhorse'
# ------------------------------------------------------------------------------
#' @describeIn sdc_loglinear S3 plot method for \code{sdc_loglinear} objects
#'
#' Produces boxplots and scatterplots of record-level risk measures, tau1 and
#' tau2, of the degree specified in the original call to \code{sdc_loglinear}.
#'
#' @param plotpath Directory to save plots. Plots are saved as \emph{jpeg} files
#' (quality = 100\%). If the directory does not exist, it is first created.
#' If \code{plotpath} is NULL (default), plots are not saved.
#' @param plotvar1 A vector of names of discrete variables for boxplots. If
#' none, boxplots are not produced.
#' @param plotvar2 A vector of names of continuous variables for scatterplots.
#' If none, scatterplots are not produced.
#' @export
plot.sdc_loglinear <- function(x,
plotpath=NULL,
plotvar1=character(0),
plotvar2=character(0),
...) {
dat <- x$data_with_statistics
# Input validation -----------------------------------------------------------
stopifnot(is.null(plotpath) ||
(is.character(plotpath) && length(plotpath) == 1))
stopifnot(is.character(plotvar1) && all(plotvar1 %in% names(dat)))
stopifnot(is.character(plotvar2) && all(plotvar2 %in% names(dat)))
# Set save_plots flag --------------------------------------------------------
save_plots <- !is.null(plotpath)
# Run this again without saving, so that plots also appear in R console.
if(save_plots)
plot(x, plotvar1=plotvar1, plotvar2=plotvar2)
# Helper functions -----------------------------------------------------------
get_plot_filename <- function(is_scatter, is_tau1, is_overall, variable) {
fname <- paste0(ifelse(is_scatter, "scatter", "box"),
"--",
ifelse(is_tau1, "tau1", "tau2"),
ifelse(is_overall, "_overallmean--", "_cellmean--"),
variable,
".jpeg")
return(file.path(plotpath, fname))
} # end 'get_plot_filename'
# ----------------------------------------------------------------------------
# Call before making any plot
plot_start <- function(is_scatter, is_tau1, is_overall, variable) {
if(save_plots)
jpeg(get_plot_filename(is_scatter,
is_tau1,
is_overall,
variable),
quality=100)
else
dev.new()
return(invisible(NULL))
} # end 'plot_start'
# ----------------------------------------------------------------------------
# Call after making any plot
plot_end <- function() {
if(save_plots)
dev.off()
return(invisible(NULL))
} # end 'plot_end'
# ----------------------------------------------------------------------------
make_plot <- function(plotvar, is_tau1, is_scatter, is_overall) {
plot_start(is_scatter, is_tau1, is_overall, plotvar)
yvar <- paste0(".tau",
ifelse(is_tau1, "1", "2"),
"_rec_",
ifelse(is_overall, "overall", "cell"),
"mean")
if(is_scatter) {
scatrplt <-
ggplot2::ggplot(dat,
mapping=ggplot2::aes_string(x=plotvar,
y=yvar)) +
ggplot2::geom_point()
plot(scatrplt)
} else {
boxplot_formula <- as.formula(paste(yvar, "~", plotvar))
boxplot(boxplot_formula, data=dat)
} # end if...else
plot_end()
return(invisible(NULL))
} # end 'make_plot'
# ----------------------------------------------------------------------------
# Create output directory (if any) if it does not already exist --------------
# Clean up directory name
if(!is.null(plotpath))
plotpath <- file.path(plotpath)
if(!is.null(plotpath) && !dir.exists(plotpath))
dir.create(plotpath, recursive=TRUE)
# Make plots -----------------------------------------------------------------
if(length(plotvar1) > 0) {
arg_grid_box <- expand.grid(is_tau1=c(TRUE, FALSE),
is_overall=c(TRUE, FALSE),
plotvar=plotvar1,
stringsAsFactors=FALSE)
do.call(mapply, c(make_plot, arg_grid_box, is_scatter=FALSE))
} # end if
if(length(plotvar2) > 0) {
arg_grid_scatter <- expand.grid(is_tau1=c(TRUE, FALSE),
is_overall=c(TRUE, FALSE),
plotvar=plotvar2,
stringsAsFactors=FALSE)
do.call(mapply, c(make_plot, arg_grid_scatter, is_scatter=TRUE))
} # end if
return(invisible(NULL))
} # end 'plot.sdc_loglinear'
# ------------------------------------------------------------------------------
sdc_loglinear_input_checks <- function(data,
weight,
varpool,
degree,
numiter,
epsilon,
blanks_as_missing,
output_filename) {
stopifnot(is.data.frame(data))
stopifnot(nrow(data) > 0)
stopifnot(is.character(weight))
stopifnot(length(weight) == 1)
stopifnot(weight %in% names(data))
stopifnot(is.character(varpool))
stopifnot(length(varpool) >= 1)
stopifnot(all(varpool %in% names(data)))
stopifnot(is.numeric(degree))
stopifnot(length(degree) == 1)
stopifnot(degree >= 1)
stopifnot(degree == round(degree)) # is integer?
stopifnot(degree <= length(varpool))
stopifnot(is.numeric(numiter))
stopifnot(length(numiter) == 1)
stopifnot(numiter >= 1)
stopifnot(numiter == round(numiter)) # is integer?
stopifnot(is.numeric(epsilon))
stopifnot(length(epsilon) == 1)
stopifnot(epsilon > 0)
stopifnot(is.logical(blanks_as_missing))
stopifnot(is.null(output_filename) ||
(is.character(output_filename) &&
length(output_filename) == 1))
# Check for missing values ---------------------------------------------------
missing_variables <- sdc_loglinear_check_missing(data,
varpool,
weight,
blanks_as_missing)
# Throw exception if any missing values found --------------------------------
if(length(missing_variables) > 0) {
msg <- paste0("sdc_loglinear does not support missing values. ",
"Missing values found among: ",
paste0(missing_variables, collapse=" "))
stop(msg)
}
return(invisible(NULL))
} # end 'sdc_loglinear_input_checks'
# ------------------------------------------------------------------------------
sdc_loglinear_check_missing <- function(data,
varpool,
weight,
blanks_as_missing) {
has_missing <- function(x)
return(any(is.na(x)) ||
(blanks_as_missing && any(trimws(x) == "")))
variables <- c(varpool, weight)
missing <- variables[sapply(c(varpool, weight),
function(var)
return(has_missing(data[,var])))]
return(missing)
} # end 'sdc_loglinear_check_missing'
# ------------------------------------------------------------------------------
get_average_cell_size <- function(data, varpool) {
num_of_cells <-
prod(sapply(varpool,
function(var)
length(unique(as.vector(na.omit(data[[var]]))))
)
)
return(nrow(data)/num_of_cells)
} # end 'get_average_cell_size'
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.