In mneunhoe/dpBootstrap: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(dpBootstrap)

Here is the data that we want to use in the example. It's a sample from a (standard)-lognormal distribution.

data <- (rnorm(1000))

First you need to define the differentially private mechanism that you want to apply to the data. Note that for this particular dp cdf mechanism you need to provide a lower and upper bound where you expect the data to be (without looking at the data!).

projected_dp_cdf_mechanism <- function(data, n_bins = 1000, rho = 0.5, lower_bound = -3, upper_bound = 3) {

  # This generates a projected dp cdf of the data. All functions are part of the
  # dpBootstrap package.
  projected_dp_cdf <-
    data |> bin_data(lower_bound = lower_bound,
                     upper_bound = upper_bound,
                     n_bins = n_bins) |> dp_cdf(rho = rho) |>
    project_dp_cdf()

  return(projected_dp_cdf)
}


projected_dp_cdf_mechanism(pop)

instantiated_mechanism <- function(data) {
    projected_dp_cdf_mechanism(data, rho = 10)
  }


get_synthetic_data_from_cdf <- function(P_tilde) {
  lower_bound <- P_tilde$x[2]
  upper_bound <- P_tilde$x[length(P_tilde$x)-1]
  # For the generic pipeline to work we want the output of the mechanism to be
  # synthetic data in the same format as the input data.
  M_inv <- get_inv_M(length(P_tilde$y))



  # Get counts per bin
  tmp_count <-
    rmultinom(1, P_tilde$n, M_inv %*% P_tilde$y)


  # Replicate each bin the number of counts (this will bias mean estimation on 
  # the synthetic data by approx. bin_width/2)
  synthetic_data <- NULL
  for (i in 1:nrow(tmp_count)) {
    synthetic_data <-
      c(synthetic_data, rep(P_tilde$x[i], tmp_count[i,]))
  }

  # Make sure that examples in the excess bins are outside the boundaries but
  # not -Inf or Inf (R won't count them otherwise).
  synthetic_data[synthetic_data == -Inf] <- lower_bound - 1
  synthetic_data[synthetic_data == Inf] <- upper_bound + 1

  return(synthetic_data)
}

What statistics of interest do you want to calculate on the private data? The output should be a named list.

statistics_of_interest <- function(P_tilde) {

  synthetic_data <- get_synthetic_data_from_cdf(P_tilde)


  list(median = median(synthetic_data),
       mean = mean(synthetic_data),
       variance = mean(synthetic_data^2) - mean(synthetic_data)^2,
       quantiles = unname(quantile(synthetic_data, c(0.1, 0.25, 0.75, 0.9))))
}


statistics_of_interest_data <- function(data) {


  list(median = median(data),
       mean = mean(data),
       variance = mean(data^2) - mean(data)^2,
       quantiles = quantile(data, c(0.1, 0.25, 0.75, 0.9)))
}

statistics_of_interest_binned_data <- function(data) {

  lower_bound <- -3
  upper_bound <- 3

  histogram <-
    data |> bin_data(lower_bound = lower_bound,
                     upper_bound = upper_bound,
                     n_bins = 1000)

  binned_data <- NULL
  for (i in 1:length(histogram$histogram)) {
    binned_data <-
      c(binned_data, rep(histogram$breaks[i], histogram$histogram[i]))
  }

  list(median = median(binned_data),
       mean = mean(binned_data),
       variance = mean(binned_data^2) - mean(binned_data)^2,
       quantiles = quantile(binned_data, c(0.1, 0.25, 0.75, 0.9)))
}

Now we have everything defined to run the dp_bootstrap function. Note that depending on the input data and mechanism all of the above functions need to be adapted to the application.

cdf_results <- dp_bootstrap(
  data = data,
  mechanism = projected_dp_cdf_mechanism,
  get_synthetic_data = get_synthetic_data_from_cdf,
  statistics_of_interest = statistics_of_interest,
  B = 10,
  store_P_tilde = TRUE,
  store_P_tilde_tilde = TRUE
)

cdf_results$P_tilde_tilde

Get Point Estimates and Confidence Intervals from results

get_results <- function(cdf_results, alpha = 0.05){
# get point estimates from first run
point_estimates <- cdf_results$theta_tilde

# And quantiles from bootstrap
confidence_intervals <- vector("list", length = length(point_estimates))
names(confidence_intervals) <- names(point_estimates)

for(n in names(confidence_intervals)){
  tmp <- sapply(cdf_results$theta_tilde_tilde, function(x) x[[paste0(n)]])

  if(is.null(dim(tmp))){
   res <- quantile(tmp, c(alpha/2,1 - alpha/2))
  } else {
    res <- apply(tmp, 1, quantile, c(alpha/2,1 - alpha/2))
  }

  confidence_intervals[[paste0(n)]] <- res

}

return(list(point_estimates = point_estimates,
            confidence_intervals = confidence_intervals))

}


get_results(cdf_results)

Monte Carlo repetitions

set.seed(220616)

mc_reps <- 100

mc_results <- vector("list", length = mc_reps)

for(mc_rep in 1:mc_reps){

data <- (rnorm(1000))


cdf_results <- dp_bootstrap(
  data = data,
  mechanism = projected_dp_cdf_mechanism,
  get_synthetic_data = get_synthetic_data_from_cdf,
  statistics_of_interest = statistics_of_interest,
  B = 10
)

mc_results[[mc_rep]] <- get_results(cdf_results)

cat("MC Repetition: ", mc_rep, "\n")

}

Analyze Coverage

true_values <- list(median = 0,
                    mean = 0,
                    variance = 1,
                    quantiles = qnorm(c(0.1, 0.25, 0.75, 0.9)))

coverage <- vector("list", length = length(true_values))

names(coverage) <- names(true_values)

for(n in names(true_values)){
  tmp <- lapply(mc_results, function(x) x$confidence_intervals[[paste0(n)]])
  true_tmp <- true_values[[paste0(n)]]

  if(length(true_tmp) == 1){
    coverage[[paste0(n)]] <- mean(sapply(tmp, function(x) x[1] < true_tmp & x[2] > true_tmp))


  } else {
    coverage[[paste0(n)]] <- apply(sapply(tmp, function(x) {
      res <- NULL
      for(i in 1:length(true_tmp)){
        res <- c(res, x[1,i] < true_tmp[i] & x[2,i] > true_tmp[i])
      }
      return(res)
    }), 1, mean)

  }

}



# Step 1: Run experiment

# Step 2: Calculate Confidence Intervals

# Step 3: Evaluate


# What's the true value 

#