# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This library implements the RAPPOR marginal decoding algorithms using LASSO.
library(glmnet)
library(Matrix)
# So we don't have to change pwd
source.rappor <- function(rel_path) {
abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
source(abs_path)
}
source.rappor('./R/alternative.R')
EstimateBloomCounts <- function(params, obs_counts) {
# Estimates the number of times each bit in each cohort was set in original
# Bloom filters.
#
# Input:
# params: a list of RAPPOR parameters:
# k - size of a Bloom filter
# h - number of hash functions
# m - number of cohorts
# p - P(IRR = 1 | PRR = 0)
# q - P(IRR = 1 | PRR = 1)
# f - Proportion of bits in the Bloom filter that are set randomly
# to 0 or 1 regardless of the underlying true bit value
# obs_counts: a matrix of size m by (k + 1). Column one contains sample
# sizes for each cohort. Other counts indicated how many times
# each bit was set in each cohort.
#
# Output:
# ests: a matrix of size m by k with estimated counts for the probability
# of each bit set to 1 in the true Bloom filter.
# stds: standard deviation of the estimates.
p <- params$p
q <- params$q
f <- params$f
m <- params$m
k <- params$k
stopifnot(m == nrow(obs_counts), k + 1 == ncol(obs_counts))
p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1
p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1
p2 <- p11 - p01 # == (1 - f) * (q - p)
# When m = 1, obs_counts does not have the right dimensions. Fixing this.
dim(obs_counts) <- c(m, k + 1)
ests <- apply(obs_counts, 1, function(cohort_row) {
N <- cohort_row[1] # sample size for the cohort -- first column is total
v <- cohort_row[-1] # counts for individual bits
(v - p01 * N) / p2 # unbiased estimator for individual bits'
# true counts. It can be negative or
# exceed the total.
})
# NOTE: When k == 1, rows of obs_counts have 2 entries. Then cohort_row[-1]
# is a singleton vector, and apply() returns a *vector*. When rows have 3
# entries, cohort_row[-1] is a vector of length 2 and apply() returns a
# *matrix*.
#
# Fix this by explicitly setting dimensions. NOTE: It's k x m, not m x k.
dim(ests) <- c(k, m)
total <- sum(obs_counts[,1])
variances <- apply(obs_counts, 1, function(cohort_row) {
N <- cohort_row[1]
v <- cohort_row[-1]
p_hats <- (v - p01 * N) / (N * p2) # expectation of a true 1
p_hats <- pmax(0, pmin(1, p_hats)) # clamp to [0,1]
r <- p_hats * p11 + (1 - p_hats) * p01 # expectation of a reported 1
N * r * (1 - r) / p2^2 # variance of the binomial
})
dim(variances) <- c(k, m)
# Transform counts from absolute values to fractional, removing bias due to
# variability of reporting between cohorts.
ests <- apply(ests, 1, function(x) x / obs_counts[,1])
stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1])
# Some estimates may be set to infinity, e.g. if f=1. We want to account for
# this possibility, and set the corresponding counts to 0.
ests[abs(ests) == Inf] <- 0
list(estimates = ests, stds = stds)
}
FitLasso <- function(X, Y, intercept = TRUE) {
# Fits a Lasso model to select a subset of columns of X.
#
# Input:
# X: a design matrix of size km by M (the number of candidate strings).
# Y: a vector of size km with estimated counts from EstimateBloomCounts().
# intercept: whether to fit with intercept or not.
#
# Output:
# a vector of size ncol(X) of coefficients.
# TODO(mironov): Test cv.glmnet instead of glmnet
mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept,
lower.limits = 0, # outputs are non-negative
# Cap the number of non-zero coefficients to 500 or
# 80% of the length of Y, whichever is less. The 500 cap
# is for performance reasons, 80% is to avoid overfitting.
pmax = min(500, length(Y) * .8)),
silent = TRUE)
# If fitting fails, return an empty data.frame.
if (class(mod)[1] == "try-error") {
coefs <- setNames(rep(0, ncol(X)), colnames(X))
} else {
coefs <- coef(mod)
coefs <- coefs[-1, ncol(coefs), drop = FALSE] # coefs[1] is the intercept
}
coefs
}
PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
m <- params$m
p <- params$p
q <- params$q
f <- params$f
h <- params$h
q2 <- .5 * f * (p + q) + (1 - f) * q
p2 <- .5 * f * (p + q) + (1 - f) * p
resid_var <- p2 * (1 - p2) * (N / m) / (q2 - p2)^2
# Total Sum of Squares (SS).
TSS <- sum((Y - mean(Y))^2)
# Error Sum of Squares (ESS).
ESS <- resid_var * nrow(X)
betas <- matrix(mod$coefs, ncol = 1)
# mod_var <- summary(mod$fit)$sigma^2
# betas_sd <- rep(sqrt(max(resid_var, mod_var) / (m * h)), length(betas))
#
# z_values <- betas / betas_sd
#
# # 1-sided t-test.
# p_values <- pnorm(z_values, lower = FALSE)
fit <- data.frame(string = colnames(X), Estimate = betas,
SD = mod$stds, # z_stat = z_values, pvalue = p_values,
stringsAsFactors = FALSE)
# if (correction == "FDR") {
# fit <- fit[order(fit$pvalue, decreasing = FALSE), ]
# ind <- which(fit$pvalue < (1:nrow(fit)) * alpha / nrow(fit))
# if (length(ind) > 0) {
# fit <- fit[1:max(ind), ]
# } else {
# fit <- fit[numeric(0), ]
# }
# } else {
# fit <- fit[fit$p < alpha, ]
# }
fit <- fit[order(fit$Estimate, decreasing = TRUE), ]
if (nrow(fit) > 0) {
str_names <- fit$string
str_names <- str_names[!is.na(str_names)]
if (length(str_names) > 0 && length(str_names) < nrow(X)) {
this_data <- as.data.frame(as.matrix(X[, str_names]))
Y_hat <- predict(lm(Y ~ ., data = this_data))
RSS <- sum((Y_hat - mean(Y))^2)
} else {
RSS <- NA
}
} else {
RSS <- 0
}
USS <- TSS - ESS - RSS
SS <- c(RSS, USS, ESS) / TSS
list(fit = fit, SS = SS, resid_sigma = sqrt(resid_var))
}
ComputePrivacyGuarantees <- function(params, alpha, N) {
# Compute privacy parameters and guarantees.
p <- params$p
q <- params$q
f <- params$f
h <- params$h
q2 <- .5 * f * (p + q) + (1 - f) * q
p2 <- .5 * f * (p + q) + (1 - f) * p
exp_e_one <- ((q2 * (1 - p2)) / (p2 * (1 - q2)))^h
if (exp_e_one < 1) {
exp_e_one <- 1 / exp_e_one
}
e_one <- log(exp_e_one)
exp_e_inf <- ((1 - .5 * f) / (.5 * f))^(2 * h)
e_inf <- log(exp_e_inf)
std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2)
detection_freq <- qnorm(1 - alpha) * std_dev_counts / N
privacy_names <- c("Effective p", "Effective q", "exp(e_1)",
"e_1", "exp(e_inf)", "e_inf", "Detection frequency")
privacy_vals <- c(p2, q2, exp_e_one, e_one, exp_e_inf, e_inf, detection_freq)
privacy <- data.frame(parameters = privacy_names,
values = privacy_vals)
privacy
}
FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
# Find a distribution over rows of map that approximates estimates_stds best
#
# Input:
# estimates_stds: a list of two m x k matrices, one for estimates, another
# for standard errors
# map : an (m * k) x S boolean matrix
#
# Output:
# a float vector of length S, so that a distribution over map's rows sampled
# according to this vector approximates estimates
S <- ncol(map) # total number of candidates
support_coefs <- 1:S
if (S > length(estimates_stds$estimates) * .8) {
# the system is close to being underdetermined
lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
# Select non-zero coefficients.
support_coefs <- which(lasso > 0)
if(!quiet)
cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
}
coefs <- setNames(rep(0, S), colnames(map))
if(length(support_coefs) > 0) { # LASSO may return an empty list
constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
estimates_stds)
coefs[support_coefs] <- constrained_coefs
}
coefs
}
Resample <- function(e) {
# Simulate resampling of the Bloom filter estimates by adding Gaussian noise
# with estimated standard deviation.
estimates <- matrix(mapply(function(x, y) x + rnorm(1, 0, y),
e$estimates, e$stds),
nrow = nrow(e$estimates), ncol = ncol(e$estimates))
stds <- e$stds * 2^.5
list(estimates = estimates, stds = stds)
}
# Private function
# Decode for Boolean RAPPOR inputs
# Returns a list with attribute fit only. (Inference and other aspects
# currently not incorporated because they're unnecessary for association.)
.DecodeBoolean <- function(counts, params, num_reports) {
# Boolean variables are reported without cohorts and to estimate counts,
# first sum up counts across all cohorts and then run EstimateBloomCounts
# with the number of cohorts set to 1.
params$m <- 1 # set number of cohorts to 1
summed_counts <- colSums(counts) # sum counts across cohorts
es <- EstimateBloomCounts(params, summed_counts) # estimate boolean counts
ests <- es$estimates[[1]]
std <- es$stds[[1]]
fit <- data.frame(
string = c("TRUE", "FALSE"),
estimate = c(ests * num_reports,
num_reports - ests * num_reports),
std_error = c(std * num_reports, std * num_reports),
proportion = c(ests, 1 - ests),
prop_std_error = c(std, std))
low_95 <- fit$proportion - 1.96 * fit$prop_std_error
high_95 <- fit$proportion + 1.96 * fit$prop_std_error
fit$prop_low_95 <- pmax(low_95, 0.0)
fit$prop_high_95 <- pmin(high_95, 1.0)
rownames(fit) <- fit$string
return(list(fit = fit))
}
CheckDecodeInputs <- function(counts, map, params) {
# Returns an error message, or NULL if there is no error.
if (nrow(map) != (params$m * params$k)) {
return(sprintf(
"Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d",
params$m * params$k, nrow(map)))
}
if ((ncol(counts) - 1) != params$k) {
return(sprintf(paste0(
"Dimensions of counts file do not match: m = %d, k = %d, ",
"nrow(counts) = %d, ncol(counts) = %d"), params$m, params$k,
nrow(counts), ncol(counts)))
}
# numerically correct comparison
if (isTRUE(all.equal((1 - params$f) * (params$p - params$q), 0))) {
return("Information is lost. Cannot decode.")
}
return(NULL) # no error
}
#' Decode
#'
#' This function decodes rappor reports.
#'
#' @param counts_file Counts
#' @param map_file Map
#' @param params_file Params
#' @param alpha Alpha. Defaults to 0.05.
#' @param correction Correction. Defaults to Bonferroni.
#' @param quiet Quiet. Defaults to FALSE.
#' @keywords rappor
#' @export
#'
#' Decode(counts_file, map_file, params_file)
Decode <- function(counts_file, map_file, params_file, alpha = 0.05,
correction = c("Bonferroni"), quiet = FALSE, ...) {
library(Matrix)
######## Read params file ########
params <- as.list(read.csv(params_file))
if (length(params) != 6) {
stop("There should be exactly 6 columns in the parameter file.")
}
if (any(names(params) != c("k", "h", "m", "p", "q", "f"))) {
stop("Parameter names must be k,h,m,p,q,f.")
}
cat("params:")
# cat(params)
######## Read counts file ########
counts <- as.matrix(read.csv(counts_file, header = FALSE))
# if (nrow(counts) != params$m) {
# stop(sprintf("Got %d rows in the counts file, expected m = %d",
# nrow(counts), params$m))
# }
# if ((ncol(counts) - 1) != params$k) {
# stop(paste0("Counts file: number of columns should equal to k + 1: ",
# ncol(counts)))
# }
# if (any(counts < 0)) {
# stop("Counts file: all counts must be positive.")
# }
cat("counts:")
# cat(counts)
######## Read map file ########
map_pos <- read.csv(map_file, header = FALSE, as.is = TRUE)
strs <- map_pos[, 1]
strs[strs == ""] <- "Empty"
# Remove duplicated strings.
ind <- which(!duplicated(strs))
strs <- strs[ind]
map_pos <- map_pos[ind, ]
n <- ncol(map_pos) - 1
if (n != (params$h * params$m)) {
stop(paste0("Map file: number of columns should equal hm + 1:",
n, "_", params$h * params$m))
}
row_pos <- unlist(map_pos[, -1], use.names = FALSE)
col_pos <- rep(1:nrow(map_pos), times = ncol(map_pos) - 1)
# TODO: When would this ever happen?
removed <- which(is.na(row_pos))
if (length(removed) > 0) {
Log("Removed %d entries", length(removed))
row_pos <- row_pos[-removed]
col_pos <- col_pos[-removed]
}
mapInt <- sparseMatrix(row_pos, col_pos,
dims = c(params$m * params$k, length(strs)))
colnames(mapInt) <- strs
map <- list(mapInt = mapInt, strs = strs, map_pos = map_pos)
cat("map:")
# cat(map)
########################
######## Decode ########
error_msg <- CheckDecodeInputs(counts, map, params)
if (!is.null(error_msg)) {
stop(error_msg)
}
k <- params$k
p <- params$p
q <- params$q
f <- params$f
h <- params$h
m <- params$m
S <- ncol(map) # total number of candidates
N <- sum(counts[, 1])
if (k == 1) {
return(.DecodeBoolean(counts, params, N))
}
filter_cohorts <- which(counts[, 1] != 0) # exclude cohorts with zero reports
# stretch cohorts to bits
filter_bits <- as.vector(matrix(1:nrow(map), ncol = m)[,filter_cohorts, drop = FALSE])
map_filtered <- map[filter_bits, , drop = FALSE]
es <- EstimateBloomCounts(params, counts)
estimates_stds_filtered <-
list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
stds = es$stds[filter_cohorts, , drop = FALSE])
coefs_all <- vector()
# Run the fitting procedure several times (5 seems to be sufficient and not
# too many) to estimate standard deviation of the output.
for(r in 1:5) {
if(r > 1)
e <- Resample(estimates_stds_filtered)
else
e <- estimates_stds_filtered
coefs_all <- rbind(coefs_all,
FitDistribution(e, map_filtered, quiet))
}
coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations
coefs_ave <- N * apply(coefs_all, 2, mean)
# Only select coefficients more than two standard deviations from 0. May
# inflate empirical SD of the estimates.
reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])
coefs_ave_zeroed <- coefs_ave
coefs_ave_zeroed[-reported] <- 0
residual <- as.vector(t(estimates_stds_filtered$estimates)) -
map_filtered %*% coefs_ave_zeroed / N
if (correction == "Bonferroni") {
alpha <- alpha / S
}
inf <- PerformInference(map_filtered[, reported, drop = FALSE],
as.vector(t(estimates_stds_filtered$estimates)),
N, mod, params, alpha,
correction)
fit <- inf$fit
# If this is a basic RAPPOR instance, just use the counts for the estimate
# (Check if the map is diagonal to tell if this is basic RAPPOR.)
if (sum(map) == sum(diag(map))) {
fit$Estimate <- colSums(counts)[-1]
}
# Estimates from the model are per instance so must be multipled by h.
# Standard errors are also adjusted.
fit$estimate <- floor(fit$Estimate)
fit$proportion <- fit$estimate / N
fit$std_error <- floor(fit$SD)
fit$prop_std_error <- fit$std_error / N
# 1.96 standard deviations gives 95% confidence interval.
low_95 <- fit$proportion - 1.96 * fit$prop_std_error
high_95 <- fit$proportion + 1.96 * fit$prop_std_error
# Clamp estimated proportion. pmin/max: vectorized min and max
fit$prop_low_95 <- pmax(low_95, 0.0)
fit$prop_high_95 <- pmin(high_95, 1.0)
fit <- fit[, c("string", "estimate", "std_error", "proportion",
"prop_std_error", "prop_low_95", "prop_high_95")]
allocated_mass <- sum(fit$proportion)
num_detected <- nrow(fit)
ss <- round(inf$SS, digits = 3)
explained_var <- ss[[1]]
missing_var <- ss[[2]]
noise_var <- ss[[3]]
noise_std_dev <- round(inf$resid_sigma, digits = 3)
# Compute summary of the fit.
parameters <-
c("Candidate strings", "Detected strings",
"Sample size (N)", "Discovered Prop (out of N)",
"Explained Variance", "Missing Variance", "Noise Variance",
"Theoretical Noise Std. Dev.")
values <- c(S, num_detected, N, allocated_mass,
explained_var, missing_var, noise_var, noise_std_dev)
res_summary <- data.frame(parameters = parameters, values = values)
privacy <- ComputePrivacyGuarantees(params, alpha, N)
params <- data.frame(parameters =
c("k", "h", "m", "p", "q", "f", "N", "alpha"),
values = c(k, h, m, p, q, f, N, alpha))
# This is a list of decode stats in a better format than 'summary'.
metrics <- list(sample_size = N,
allocated_mass = allocated_mass,
num_detected = num_detected,
explained_var = explained_var,
missing_var = missing_var)
return(list(fit = fit, summary = res_summary, privacy = privacy, params = params,
lasso = NULL, residual = as.vector(residual),
counts = counts[, -1], resid = NULL, metrics = metrics,
ests = es$estimates # ests needed by Shiny rappor-sim app
))
}
ComputeCounts <- function(reports, cohorts, params) {
# Counts the number of times each bit in the Bloom filters was set for
# each cohort.
#
# Args:
# reports: A list of N elements, each containing the
# report for a given report
# cohorts: A list of N elements, each containing the
# cohort number for a given report
# params: A list of parameters for the problem
#
# Returns:
# An mx(k+1) array containing the number of times each bit was set
# in each cohort.
# Check that the cohorts are evenly assigned. We assume that if there
# are m cohorts, each cohort should have approximately N/m reports.
# The constraint we impose here simply says that cohort bins should
# each have within N/m reports of one another. Since the most popular
# cohort is expected to have about O(logN/loglogN) reports (which we )
# approximate as O(logN) bins for practical values of N, a discrepancy of
# O(N) bins seems significant enough to alter expected behavior. This
# threshold can be changed to be more sensitive if desired.
N <- length(reports)
cohort_freqs <- table(factor(cohorts, levels = 1:params$m))
imbalance_threshold <- N / params$m
if ((max(cohort_freqs) - min(cohort_freqs)) > imbalance_threshold) {
cat("\nNote: You are using unbalanced cohort assignments, which can",
"significantly degrade estimation quality!\n\n")
}
# Count the times each bit was set, and add cohort counts to first column
counts <- lapply(1:params$m, function(i)
Reduce("+", reports[which(cohorts == i)]))
counts[which(cohort_freqs == 0)] <- data.frame(rep(0, params$k))
cbind(cohort_freqs, do.call("rbind", counts))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.