PSI-Library: Differentially Private Statistical Releases for Privacy Preservation

Documented in amsweep checkAccuracyVals checkEpsilonDist coefficientRelease covarianceFormatRelease covariancePostLinearRegression extractIndices linearReg lowerTriangleSize mpinv

######### Statistic Initialization Helper Functions ##############################

#' Check validity of epsilonDist
#' 
#' Verifies that all percentages in epsilonDist are values between 0 and 1, sum to 1, and that epsilonDist
#' has expected length.
#'
#' @param epsilonDist Vector of percentages (valued 0 to 1) that describes how global epsilon \code{globalEps} 
#' should be split for each covariance calculation.
#'
#' @return epsilonDist or errors.
checkEpsilonDist <- function(epsilonDist, expectedLength=1){
    if (!all(epsilonDist > 0) || sum(epsilonDist) != 1){
        stop("All values in epsilonDist must be greater than 0 and be percentages that together sum to 1.")
    }
    else if (length(epsilonDist) != expectedLength){
        errorStr = paste("Epsilon parameter has improper length. Actual length is ", toString(actualLength),
                         " while expected length is ", toString(expectedLength),".")
        stop(errorStr)
    }
    else {
        return(epsilonDist)
    }
}


#' Check validity of accuracyVals
#' 
#' Verifies that accuracyVals has expected length and contains no values less than or equal to 0.
#'
#' @param accuracyVals Vector of accuracy values where the ith accuracy will be used for the ith covariance calculation in 
#'  flattened lower triangle of covariance matrix.
#' @param expectedLength Integer value. Specifies how long the output ought to be.
#'
#' @return accuracyVals or errors.
checkAccuracyVals <- function(accuracyVals, expectedLength){
    actualLength <- length(accuracyVals)
    if (actualLength != expectedLength){
        errorStr = paste("Parameter accuracyVals has improper length. Actual length is ", toString(actualLength),
                         " while expected length is ", toString(expectedLength),".")
        stop(errorStr)
    }
    else if (!all(accuracyVals > 0)){
        stop("All values specified in parameter accuracyVals must be greater than 0.")
    }
    else{
        return(accuracyVals)
    }
}

#' Size of the lower triangle of the covariance matrix that will be generated by covariance statistic.
#' 
#' Note that covariance matrices are symmetric and thus only the lower triangle needs to be calculated,
#' so this is equivalent to the number of values that will need to be differentially privately calculated.
#'
#' If the user input %n\% columns to the dpCovariance initialization, then the
#' output covariance matrix will have dimension %n \times n \% and thus the 
#' lower triangle of the matrix will have size %n(n+1)/2\%.
#'
#' @param columns Vector of column names, indicating columns of a data frame 
#' that the covariance matrix will be calculated over.
#'
#' @return Integer. Size of lower triangle of the covariance matrix that will be generated by covariance statistic. INCLUDING DIAGONAL
lowerTriangleSize <- function(columns){
    n <- length(columns)
    return(n*(n+1)/2)
}

distributeEpsilon <- function(globalEps, nCalcs, epsilonDist=NULL){
  if (is.null(epsilonDist)){
    eps <- rep(globalEps/nCalcs, nCalcs)
  }
  else {
    eps <- epsilonDist * globalEps
  }
  return(eps)
}

######### Linear Regression Functions ###########################################

#' Linear regression on covariance matrix
#' 
#' Function to extract regression coefficients from the covariance matrix via 
#'    the sweep operator.
#'
#' @param formula An object of class 'formula' containing the desired 
#'    regression formula.
#' @param release A numeric private release of the covariance matrix.
#' @param n A numeric vector of length one specifying the number of
#'    observations in in the data.
#' @param intercept A logical vector indicating whether the intercept is 
#'    included in \code{formula}.
#'    
#' @return A numeric vector of regression coefficients corresponding 
#'    to \code{formula}.
#' @rdname linearReg
linearReg <- function(formula, release, n, intercept) {
    eigenvals <- eigen(release)$values
    if (!all(eigenvals != 0)) {  # could do is.positive.definite() but that requires matrixcalc package
        coefs <- "The input matrix is not invertible"
        return(coefs)
    } else if (!all(eigenvals > 0)){
        coefs <- "The input covariance matrix is not positive definite due to noise addition so linear regression will not be run."
        return(coefs)
    } else {
        xyLocs <- extractIndices(as.formula(formula), release, intercept)
        xLoc <- xyLocs$xLoc
        yLoc <- xyLocs$yLoc
        locVec <- rep(FALSE, length(release))
        locVec[xLoc] <- TRUE
        sweep <- amsweep((as.matrix(release) / n), locVec)
        coefs <- sweep[yLoc, xLoc]
        se <- sqrt(sweep[yLoc, yLoc] * diag(solve(release[xLoc, xLoc])))
        coefs <- data.frame(cbind(coefs, se))
        coefs <- format(round(coefs, 5), nsmall=5)
        rownames(coefs) <- xyLocs$xLabel
        names(coefs) <- c('Estimate', 'Std. Error')
        return(coefs)
    }
}


#' Moore Penrose Inverse Function
#' 
#' @references Gill, Jeff, and Gary King. "What to do when your Hessian is not invertible: Alternatives to model respecification in nonlinear estimation." Sociological methods & research 33, no. 1 (2004): 54-87.
#' 
#' Generate the Moore-Penrose pseudoinverse matrix of \code{X}.
#' 
#' @param X A numeric, symmetric covariance matrix.
#' @param tol Convergence requirement. 

mpinv <- function(X, tol = sqrt(.Machine$double.eps)) {
    ## Moore-Penrose Inverse function (aka Generalized Inverse)
    ##   X:    symmetric matrix
    ##   tol:  convergence requirement
    s <- svd(X)
    e <- s$d
    e[e > tol] <- 1/e[e > tol]
    s$v %*% diag(e,nrow=length(e)) %*% t(s$u)
}


#' Sweep operator
#' 
#' General sweep operator citation:
#' @references Goodnight, James H. "A tutorial on the SWEEP operator." The American Statistician 33, no. 3 (1979): 149-158.
#' This implementation is from pseudocode from:
#' @references Schafer, Joseph L. Analysis of incomplete multivariate data. Chapman and Hall/CRC, 1997.
#' Code ported from:
#' @references Honaker, James, Gary King, and Matthew Blackwell. "Amelia II: A program for missing data." Journal of statistical software 45, no. 7 (2011): 1-47.
#' 
#' Sweeps a covariance matrix to extract regression coefficients.
#' 
#' @param g Each unit of a numeric, symmetric covariance matrix divided by
#'    the number of observations in the data the covariance matrix was 
#'    derived from.
#' @param m A logical vector of length equal to the number of rows in \code{g}
#'    in which the \code{TRUE} values correspond to the x values in the matrix
#'    and the \code{FALSE} values correspond to the y value in the matrix.
#' @param reverse Logical vector specifying whether the sign of the matrix 
#'    should be flipped. Default to \code{FALSE}.
#' 
#' @return The coefficients from \code{g}.
#' @rdname amsweep
amsweep <- function(g, m, reverse=FALSE) {
    if (identical(m, vector(mode='logical', length=length(m)))) {
        return(g)
    } else {
        p <- nrow(g)
        rowsm <- sum(m)
        if (rowsm == p) {
            h <- solve(g)
            h <- -h
        } else {
            kseq <- 1:p
            k <- kseq[m]
            kcompl <- kseq[-k]
            g11 <- g[k, k, drop=FALSE]
            g12 <- g[k, kcompl, drop=FALSE]
            g21 <- t(g12)
            g22 <- g[kcompl, kcompl, drop=FALSE]
            h11a <- try(solve(g11), silent=TRUE)
            if (inherits(h11a, "try-error")) {
                h11a <- mpinv(g11)
            }
            h11 <- as.matrix((-h11a))
            if (reverse) {sgn2 <- -1} else {sgn2 <- 1}
            h12 <- as.matrix(sgn2 * (h11a %*% g12))
            h21 <- as.matrix(t(h12))
            h22 <- g22 - (g21 %*% h11a %*% g12)
            hwo <- rbind(cbind(h11, h12), cbind(h21, h22))
            xordering <- c(k, kcompl)
            h <- matrix(0, p, p)
            h[xordering, xordering] <- hwo
        }
        return(h)
    }
}

#' Extract regression indices
#' 
#' Function to obtain indices in data frame for dependent & independent variables from a formula.
#'
#' @param formula An object of class 'formula' containing the desired 
#'    regression formula.
#' @param data A numeric data frame with at least two columns.
#' @param intercept A logical vector indicating whether the intercept is 
#'    included in \code{formula}.
#' 
#' @return A named list with names corresponding to labels and locations 
#'    (i.e., columns) for variables in the specification.
#' @examples
#'
#' y <- rnorm(100) * 2
#' x <- (y + rnorm(100)) > 0
#' data <- data.frame(cbind(y, x))
#' f <- as.formula('y ~ x')
#' extractIndices(f, data, FALSE)
#' @rdname extractIndices
#' @export
extractIndices <- function(formula, data, intercept) {
    t <- terms(formula, data=data)
    yLoc <- attr(t, 'response')
    xLoc <- which(names(data) %in% attr(t, 'term.labels'))
    xLabel <- names(data)[xLoc]
    if (intercept) {
        interceptLoc <- which(names(data) == 'intercept')
        xLoc <- c(interceptLoc, xLoc)
        xLabel <- append(xLabel, 'Intercept', after=(interceptLoc - 1))
        if (interceptLoc <= yLoc) { yLoc <- yLoc + 1 }
    }
    return(list('yLoc' = yLoc,
                'xLoc' = xLoc,
                'xLabel' = xLabel))
}


################# Post-processing functions ##################################################################

#' Function to convert unique private covariances into symmetric matrix
#'
#' @param release Differentially private release of elements in lower triangle 
#'    of covariance matrix.
#' @param columns A character vector indicating columns in the private 
#'    covariance to be included in the output. Length should be equal to the 
#'    number of columns the user wants to include.
#' @return A symmetric differentially private covariance matrix.
#'
#' This function is used by the mechanism in post-processing and not intended for interactive post-processing
covarianceFormatRelease <- function(release, columns) {
    outDim <- length(columns)
    outMatrix <- matrix(0, nrow=outDim, ncol=outDim)
    outMatrix[lower.tri(outMatrix, diag=TRUE)] <- release
    outMatrix[upper.tri(outMatrix, diag=FALSE)] <- t(outMatrix)[upper.tri(outMatrix, diag=FALSE)]
    release <- data.frame(outMatrix)
    rownames(release) <- names(release) <- columns
    return(release)
}

#' Function to perform linear regression using private release of covariance matrix
#'
#' @param release Differentially private release of elements in lower triangle 
#'    of covariance matrix.
#' @param n A numeric vector of length one specifying the number of
#'    observations in the data frame.
#' @param intercept Logical indicating whether the intercept is included in 
#'    \code{release}.
#' @param formula A list of the regression equations to be performed on the 
#'    covariance matrix.
#' @return Linear regression coefficients and standard errors for all specified
#'    \code{formula}.
covariancePostLinearRegression <- function(release, n, intercept, formula) {
    outSummaries <- vector('list', length(formula))
    for (f in 1:length(formula)) {
        outSummaries[[f]] <- linearReg(formula[[f]], release, n, intercept)
    }
    return(outSummaries)
}

#' Release additional model coefficients from DP covariance matrix
#' 
#' Function to extract regression coefficients using the differentially private covariance matrix 
#' via the sweep operator. This is the function to obtain coefficients for additional models
#' after the \code{covariance.release()} function has already been called. 
#' 
#' @examples 
#' \dontrun{
#' range.income <- c(-10000, 713000)
#' range.education <- c(1, 16)
#' range <- rbind(range.income, range.education)
#' dpCov <- dpCovariance$new(mechanism="mechanismLaplace",var.type = 'numeric', n = 10000,
#'                           epsilon = 1, columns = c("income", "educ"), rng = range, formula='income~educ')
#' out <- dpCov$release(PUMS5extract10000)
#' coefficient.release('educ~income', out$release, n=10000)
#' }
#' @param formula Formula, regression formula used on data
#' @param release Numeric, private release of covariance matrix
#' @param n Integer, indicating number of observations
#' @export

coefficientRelease <- function(formula, release, n) {
    intercept <- ifelse('intercept' %in% names(release), TRUE, FALSE)
    coefficients <- linearReg(formula, release, n, intercept)
    release <- list(name='Linear regression', 
                    n=n, 
                    formula=formula, 
                    coefficients=coefficients)
    return(release)
}

IQSS/PSI-Library documentation built on Feb. 15, 2020, 9:03 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

IQSS/PSI-Library
Differentially Private Statistical Releases for Privacy Preservation

R/utilities-covariance.R
In IQSS/PSI-Library: Differentially Private Statistical Releases for Privacy Preservation

Defines functions checkEpsilonDist checkAccuracyVals lowerTriangleSize distributeEpsilon linearReg mpinv amsweep extractIndices covarianceFormatRelease covariancePostLinearRegression coefficientRelease

Documented in amsweep checkAccuracyVals checkEpsilonDist coefficientRelease covarianceFormatRelease covariancePostLinearRegression extractIndices linearReg lowerTriangleSize mpinv

R Package Documentation

Browse R Packages

We want your feedback!

IQSS/PSI-Library Differentially Private Statistical Releases for Privacy Preservation

R/utilities-covariance.R In IQSS/PSI-Library: Differentially Private Statistical Releases for Privacy Preservation

Defines functions checkEpsilonDist checkAccuracyVals lowerTriangleSize distributeEpsilon linearReg mpinv amsweep extractIndices covarianceFormatRelease covariancePostLinearRegression coefficientRelease

Documented in amsweep checkAccuracyVals checkEpsilonDist coefficientRelease covarianceFormatRelease covariancePostLinearRegression extractIndices linearReg lowerTriangleSize mpinv

R Package Documentation

Browse R Packages

We want your feedback!

IQSS/PSI-Library
Differentially Private Statistical Releases for Privacy Preservation

R/utilities-covariance.R
In IQSS/PSI-Library: Differentially Private Statistical Releases for Privacy Preservation