R/utilities-covariance.R

Defines functions checkEpsilonDist checkAccuracyVals lowerTriangleSize distributeEpsilon linearReg mpinv amsweep extractIndices covarianceFormatRelease covariancePostLinearRegression coefficientRelease

Documented in amsweep checkAccuracyVals checkEpsilonDist coefficientRelease covarianceFormatRelease covariancePostLinearRegression extractIndices linearReg lowerTriangleSize mpinv

######### Statistic Initialization Helper Functions ##############################

#' Check validity of epsilonDist
#' 
#' Verifies that all percentages in epsilonDist are values between 0 and 1, sum to 1, and that epsilonDist
#' has expected length.
#'
#' @param epsilonDist Vector of percentages (valued 0 to 1) that describes how global epsilon \code{globalEps} 
#' should be split for each covariance calculation.
#'
#' @return epsilonDist or errors.
checkEpsilonDist <- function(epsilonDist, expectedLength=1){
    if (!all(epsilonDist > 0) || sum(epsilonDist) != 1){
        stop("All values in epsilonDist must be greater than 0 and be percentages that together sum to 1.")
    }
    else if (length(epsilonDist) != expectedLength){
        errorStr = paste("Epsilon parameter has improper length. Actual length is ", toString(actualLength),
                         " while expected length is ", toString(expectedLength),".")
        stop(errorStr)
    }
    else {
        return(epsilonDist)
    }
}


#' Check validity of accuracyVals
#' 
#' Verifies that accuracyVals has expected length and contains no values less than or equal to 0.
#'
#' @param accuracyVals Vector of accuracy values where the ith accuracy will be used for the ith covariance calculation in 
#'  flattened lower triangle of covariance matrix.
#' @param expectedLength Integer value. Specifies how long the output ought to be.
#'
#' @return accuracyVals or errors.
checkAccuracyVals <- function(accuracyVals, expectedLength){
    actualLength <- length(accuracyVals)
    if (actualLength != expectedLength){
        errorStr = paste("Parameter accuracyVals has improper length. Actual length is ", toString(actualLength),
                         " while expected length is ", toString(expectedLength),".")
        stop(errorStr)
    }
    else if (!all(accuracyVals > 0)){
        stop("All values specified in parameter accuracyVals must be greater than 0.")
    }
    else{
        return(accuracyVals)
    }
}

#' Size of the lower triangle of the covariance matrix that will be generated by covariance statistic.
#' 
#' Note that covariance matrices are symmetric and thus only the lower triangle needs to be calculated,
#' so this is equivalent to the number of values that will need to be differentially privately calculated.
#'
#' If the user input %n\% columns to the dpCovariance initialization, then the
#' output covariance matrix will have dimension %n \times n \% and thus the 
#' lower triangle of the matrix will have size %n(n+1)/2\%.
#'
#' @param columns Vector of column names, indicating columns of a data frame 
#' that the covariance matrix will be calculated over.
#'
#' @return Integer. Size of lower triangle of the covariance matrix that will be generated by covariance statistic. INCLUDING DIAGONAL
lowerTriangleSize <- function(columns){
    n <- length(columns)
    return(n*(n+1)/2)
}

distributeEpsilon <- function(globalEps, nCalcs, epsilonDist=NULL){
  if (is.null(epsilonDist)){
    eps <- rep(globalEps/nCalcs, nCalcs)
  }
  else {
    eps <- epsilonDist * globalEps
  }
  return(eps)
}

######### Linear Regression Functions ###########################################

#' Linear regression on covariance matrix
#' 
#' Function to extract regression coefficients from the covariance matrix via 
#'    the sweep operator.
#'
#' @param formula An object of class 'formula' containing the desired 
#'    regression formula.
#' @param release A numeric private release of the covariance matrix.
#' @param n A numeric vector of length one specifying the number of
#'    observations in in the data.
#' @param intercept A logical vector indicating whether the intercept is 
#'    included in \code{formula}.
#'    
#' @return A numeric vector of regression coefficients corresponding 
#'    to \code{formula}.
#' @rdname linearReg
linearReg <- function(formula, release, n, intercept) {
    eigenvals <- eigen(release)$values
    if (!all(eigenvals != 0)) {  # could do is.positive.definite() but that requires matrixcalc package
        coefs <- "The input matrix is not invertible"
        return(coefs)
    } else if (!all(eigenvals > 0)){
        coefs <- "The input covariance matrix is not positive definite due to noise addition so linear regression will not be run."
        return(coefs)
    } else {
        xyLocs <- extractIndices(as.formula(formula), release, intercept)
        xLoc <- xyLocs$xLoc
        yLoc <- xyLocs$yLoc
        locVec <- rep(FALSE, length(release))
        locVec[xLoc] <- TRUE
        sweep <- amsweep((as.matrix(release) / n), locVec)
        coefs <- sweep[yLoc, xLoc]
        se <- sqrt(sweep[yLoc, yLoc] * diag(solve(release[xLoc, xLoc])))
        coefs <- data.frame(cbind(coefs, se))
        coefs <- format(round(coefs, 5), nsmall=5)
        rownames(coefs) <- xyLocs$xLabel
        names(coefs) <- c('Estimate', 'Std. Error')
        return(coefs)
    }
}


#' Moore Penrose Inverse Function
#' 
#' @references Gill, Jeff, and Gary King. "What to do when your Hessian is not invertible: Alternatives to model respecification in nonlinear estimation." Sociological methods & research 33, no. 1 (2004): 54-87.
#' 
#' Generate the Moore-Penrose pseudoinverse matrix of \code{X}.
#' 
#' @param X A numeric, symmetric covariance matrix.
#' @param tol Convergence requirement. 

mpinv <- function(X, tol = sqrt(.Machine$double.eps)) {
    ## Moore-Penrose Inverse function (aka Generalized Inverse)
    ##   X:    symmetric matrix
    ##   tol:  convergence requirement
    s <- svd(X)
    e <- s$d
    e[e > tol] <- 1/e[e > tol]
    s$v %*% diag(e,nrow=length(e)) %*% t(s$u)
}


#' Sweep operator
#' 
#' General sweep operator citation:
#' @references Goodnight, James H. "A tutorial on the SWEEP operator." The American Statistician 33, no. 3 (1979): 149-158.
#' This implementation is from pseudocode from:
#' @references Schafer, Joseph L. Analysis of incomplete multivariate data. Chapman and Hall/CRC, 1997.
#' Code ported from:
#' @references Honaker, James, Gary King, and Matthew Blackwell. "Amelia II: A program for missing data." Journal of statistical software 45, no. 7 (2011): 1-47.
#' 
#' Sweeps a covariance matrix to extract regression coefficients.
#' 
#' @param g Each unit of a numeric, symmetric covariance matrix divided by
#'    the number of observations in the data the covariance matrix was 
#'    derived from.
#' @param m A logical vector of length equal to the number of rows in \code{g}
#'    in which the \code{TRUE} values correspond to the x values in the matrix
#'    and the \code{FALSE} values correspond to the y value in the matrix.
#' @param reverse Logical vector specifying whether the sign of the matrix 
#'    should be flipped. Default to \code{FALSE}.
#' 
#' @return The coefficients from \code{g}.
#' @rdname amsweep
amsweep <- function(g, m, reverse=FALSE) {
    if (identical(m, vector(mode='logical', length=length(m)))) {
        return(g)
    } else {
        p <- nrow(g)
        rowsm <- sum(m)
        if (rowsm == p) {
            h <- solve(g)
            h <- -h
        } else {
            kseq <- 1:p
            k <- kseq[m]
            kcompl <- kseq[-k]
            g11 <- g[k, k, drop=FALSE]
            g12 <- g[k, kcompl, drop=FALSE]
            g21 <- t(g12)
            g22 <- g[kcompl, kcompl, drop=FALSE]
            h11a <- try(solve(g11), silent=TRUE)
            if (inherits(h11a, "try-error")) {
                h11a <- mpinv(g11)
            }
            h11 <- as.matrix((-h11a))
            if (reverse) {sgn2 <- -1} else {sgn2 <- 1}
            h12 <- as.matrix(sgn2 * (h11a %*% g12))
            h21 <- as.matrix(t(h12))
            h22 <- g22 - (g21 %*% h11a %*% g12)
            hwo <- rbind(cbind(h11, h12), cbind(h21, h22))
            xordering <- c(k, kcompl)
            h <- matrix(0, p, p)
            h[xordering, xordering] <- hwo
        }
        return(h)
    }
}

#' Extract regression indices
#' 
#' Function to obtain indices in data frame for dependent & independent variables from a formula.
#'
#' @param formula An object of class 'formula' containing the desired 
#'    regression formula.
#' @param data A numeric data frame with at least two columns.
#' @param intercept A logical vector indicating whether the intercept is 
#'    included in \code{formula}.
#' 
#' @return A named list with names corresponding to labels and locations 
#'    (i.e., columns) for variables in the specification.
#' @examples
#'
#' y <- rnorm(100) * 2
#' x <- (y + rnorm(100)) > 0
#' data <- data.frame(cbind(y, x))
#' f <- as.formula('y ~ x')
#' extractIndices(f, data, FALSE)
#' @rdname extractIndices
#' @export
extractIndices <- function(formula, data, intercept) {
    t <- terms(formula, data=data)
    yLoc <- attr(t, 'response')
    xLoc <- which(names(data) %in% attr(t, 'term.labels'))
    xLabel <- names(data)[xLoc]
    if (intercept) {
        interceptLoc <- which(names(data) == 'intercept')
        xLoc <- c(interceptLoc, xLoc)
        xLabel <- append(xLabel, 'Intercept', after=(interceptLoc - 1))
        if (interceptLoc <= yLoc) { yLoc <- yLoc + 1 }
    }
    return(list('yLoc' = yLoc,
                'xLoc' = xLoc,
                'xLabel' = xLabel))
}


################# Post-processing functions ##################################################################

#' Function to convert unique private covariances into symmetric matrix
#'
#' @param release Differentially private release of elements in lower triangle 
#'    of covariance matrix.
#' @param columns A character vector indicating columns in the private 
#'    covariance to be included in the output. Length should be equal to the 
#'    number of columns the user wants to include.
#' @return A symmetric differentially private covariance matrix.
#'
#' This function is used by the mechanism in post-processing and not intended for interactive post-processing
covarianceFormatRelease <- function(release, columns) {
    outDim <- length(columns)
    outMatrix <- matrix(0, nrow=outDim, ncol=outDim)
    outMatrix[lower.tri(outMatrix, diag=TRUE)] <- release
    outMatrix[upper.tri(outMatrix, diag=FALSE)] <- t(outMatrix)[upper.tri(outMatrix, diag=FALSE)]
    release <- data.frame(outMatrix)
    rownames(release) <- names(release) <- columns
    return(release)
}

#' Function to perform linear regression using private release of covariance matrix
#'
#' @param release Differentially private release of elements in lower triangle 
#'    of covariance matrix.
#' @param n A numeric vector of length one specifying the number of
#'    observations in the data frame.
#' @param intercept Logical indicating whether the intercept is included in 
#'    \code{release}.
#' @param formula A list of the regression equations to be performed on the 
#'    covariance matrix.
#' @return Linear regression coefficients and standard errors for all specified
#'    \code{formula}.
covariancePostLinearRegression <- function(release, n, intercept, formula) {
    outSummaries <- vector('list', length(formula))
    for (f in 1:length(formula)) {
        outSummaries[[f]] <- linearReg(formula[[f]], release, n, intercept)
    }
    return(outSummaries)
}

#' Release additional model coefficients from DP covariance matrix
#' 
#' Function to extract regression coefficients using the differentially private covariance matrix 
#' via the sweep operator. This is the function to obtain coefficients for additional models
#' after the \code{covariance.release()} function has already been called. 
#' 
#' @examples 
#' \dontrun{
#' range.income <- c(-10000, 713000)
#' range.education <- c(1, 16)
#' range <- rbind(range.income, range.education)
#' dpCov <- dpCovariance$new(mechanism="mechanismLaplace",var.type = 'numeric', n = 10000,
#'                           epsilon = 1, columns = c("income", "educ"), rng = range, formula='income~educ')
#' out <- dpCov$release(PUMS5extract10000)
#' coefficient.release('educ~income', out$release, n=10000)
#' }
#' @param formula Formula, regression formula used on data
#' @param release Numeric, private release of covariance matrix
#' @param n Integer, indicating number of observations
#' @export

coefficientRelease <- function(formula, release, n) {
    intercept <- ifelse('intercept' %in% names(release), TRUE, FALSE)
    coefficients <- linearReg(formula, release, n, intercept)
    release <- list(name='Linear regression', 
                    n=n, 
                    formula=formula, 
                    coefficients=coefficients)
    return(release)
}
IQSS/PSI-Library documentation built on Feb. 15, 2020, 9:03 p.m.