R/mechanism-stability.R

#' Stability mechanism
#'
#' @import methods
#' @export mechanismStability
#' @exportClass mechanismStability
#'
#' @include mechanism.R

mechanismStability <- setRefClass(
    Class = 'mechanismStability',
    contains = 'mechanism'
)

mechanismStability$methods(
    #' Stability Mechanism 
    #' 
    #' Differentially private evaluation of input function `fun` with sensitivity `sens` on input data `x` using 
    #' the stability mechanism. In general, the stability mechanism is one which takes advantage of `stable` 
    #' functions, i.e. ones where the function output is constant in some neighborhood around the input database. 
    #' In this library, the stability mechanism is implemented to be used specifically for the histogram statistic, 
    #' and should not be used for any other function except by users who are confident in their understanding of 
    #' the mechanism. See references for more information on the general stability mechanism.
    #' 
    #' For a histogram generated by the Stability mechanism, empty buckets will be removed and any
    #' buckets with a count below an accuracy threshold will also be removed. Removing these buckets based on
    #' low counts is what creates the added guarantee of privacy.
    #' 
    #' The accuracy threshold for removing bins with low counts is calculated by: \eqn{1 + (2 * ln(2 / \delta) / \epsilon)}.
    #' 
    #' @name Stability Mechanism
    #' @references
    #' S. Vadhan The Complexity of Differential Privacy, Section 3.3 Releasing Stable Values p.23-24. March 2017.
    #' 
    #' Also discussed in: 
    #' 
    #' A. Smith, A. Thakurta Differentially Private Model Selection via Stability Arguments and the Robustness of the Lasso. 2013.
    #' 
    #' C. Dwork, J. Lei Differential Privacy and Robust Statistics. November 2008.
    #' 
    #'
    #' @param fun function of input x to add Laplace noise to.
    #' @param x input that function fun will be evaluated on. 
    #' @param sens sensitivity of fun. Sensitivity is defined in above citation.
    #' @param ... any additional (optional) parameters
    #'
    #' @return result of post-processing on input function "fun" evaluated on database "x", assuming sensitivity of fun is "sens".
    #' 
    #' @examples 
    #' # the function in `statistic-histogram.R` that creates a histogram from input data
    #' histogram_function <- fun.hist 
    #' 
    #' # the data frame that holds the data we want to analyze, in this case the data is called "PUMS5extract10000"
    #' data <- data(PUMS5extract10000) 
    #' 
    #' # the variable for which we want a histogram
    #' variable <- "age"
    #' 
    #' # the sensitivity for the histogram, the default sensitivity for histograms is 2 
    #' sens <- 2 
    #' 
    #' # the post-processing function to use to format the histogram release correctly
    #' post_processing_function <- dpHistogram$postProcess 
    #' 
    #' stability_histogram <- mechanismStability$evaluate(histogram_function, data[, variable], sens, post_processing_function)
    #'
    evaluate = function(fun, x, sens, ...) {
        # before calculating the histogram statistic, confirm that delta is less than (1/n^2)
        # if delta is greater than or equal to (1/n^2), return an error message to the user
        if (.self$delta >= (1 / (.self$n)^2)) stop("A delta value on the order of 1/n^2 is a privacy risk, as it allows for additional data to leak beyond the privacy parameter epsilon. Choose a smaller value for delta to maintain your privacy guarantee.")
        
        # if the variable is numeric or integer and the stability mechanism is being used,
        # then the stability mechanism needs to determine the bins to maintain privacy.
        # Get the range of the data, then get the number of bins from the input number of
        # bins or the input granularity
        dataRange <- NULL
        numHistogramBins <- NULL
        imputationRange <- NULL
        histogramBins <- NULL
        if (.self$varType %in% c('numeric', 'integer')) {
            dataRange <- range(x)
            numHistogramBins <- ifelse(is.null(.self$nBins), .self$n / .self$granularity, .self$nBins)
            histogramBins <- seq(dataRange[1], dataRange[2], length.out=(numHistogramBins + 1))
            # set the imputation range to the detected data range to maintain privacy
            imputationRange <- dataRange
        }
        
        x <- censorData(x, .self$varType, dataRange, histogramBins, rngFormat)
        x <- fillMissing(x, .self$varType, imputeRng=imputationRange, categories=levels(x)) # levels(x) will be NULL for numeric variables, a vector of bins for character variables
        fun.args <- getFuncArgs(fun, inputList=list(bins=histogramBins), inputObject=.self)
        inputVals <- c(list(x=x), fun.args)
        trueVal <- do.call(fun, inputVals)  # Concern: are we confident that the environment this is happening in is getting erased.
        
        # remove empty bins before noise is added (per definition of stability mechanism)
        trueVal <- trueVal[trueVal > 0]
        
        scale <- sens / .self$epsilon
        release <- trueVal + dpNoise(n=length(trueVal), scale=scale, dist='laplace')
        
        # calculate the accuracy threshold, below which histogram buckets should be removed
        accuracyThreshold <- 1+2*log(2/.self$delta)/.self$epsilon
        # remove buckets below the threshold
        release <- release[release > accuracyThreshold]
        
        out <- list('release' = release)
        return(out)
    }
    
)
privacytoolsproject/PSI-Library documentation built on Feb. 17, 2020, 2:03 p.m.