######### Statistic Initialization Helper Functions ##############################
#' Check validity of epsilonDist
#'
#' Verifies that all percentages in epsilonDist are values between 0 and 1, sum to 1, and that epsilonDist
#' has expected length.
#'
#' @param epsilonDist Vector of percentages (valued 0 to 1) that describes how global epsilon \code{globalEps}
#' should be split for each covariance calculation.
#'
#' @return epsilonDist or errors.
checkEpsilonDist <- function(epsilonDist, expectedLength=1){
if (!all(epsilonDist > 0) || sum(epsilonDist) != 1){
stop("All values in epsilonDist must be greater than 0 and be percentages that together sum to 1.")
}
else if (length(epsilonDist) != expectedLength){
errorStr = paste("Epsilon parameter has improper length. Actual length is ", toString(actualLength),
" while expected length is ", toString(expectedLength),".")
stop(errorStr)
}
else {
return(epsilonDist)
}
}
#' Check validity of accuracyVals
#'
#' Verifies that accuracyVals has expected length and contains no values less than or equal to 0.
#'
#' @param accuracyVals Vector of accuracy values where the ith accuracy will be used for the ith covariance calculation in
#' flattened lower triangle of covariance matrix.
#' @param expectedLength Integer value. Specifies how long the output ought to be.
#'
#' @return accuracyVals or errors.
checkAccuracyVals <- function(accuracyVals, expectedLength){
actualLength <- length(accuracyVals)
if (actualLength != expectedLength){
errorStr = paste("Parameter accuracyVals has improper length. Actual length is ", toString(actualLength),
" while expected length is ", toString(expectedLength),".")
stop(errorStr)
}
else if (!all(accuracyVals > 0)){
stop("All values specified in parameter accuracyVals must be greater than 0.")
}
else{
return(accuracyVals)
}
}
#' Size of the lower triangle of the covariance matrix that will be generated by covariance statistic.
#'
#' Note that covariance matrices are symmetric and thus only the lower triangle needs to be calculated,
#' so this is equivalent to the number of values that will need to be differentially privately calculated.
#'
#' If the user input %n\% columns to the dpCovariance initialization, then the
#' output covariance matrix will have dimension %n \times n \% and thus the
#' lower triangle of the matrix will have size %n(n+1)/2\%.
#'
#' @param columns Vector of column names, indicating columns of a data frame
#' that the covariance matrix will be calculated over.
#'
#' @return Integer. Size of lower triangle of the covariance matrix that will be generated by covariance statistic. INCLUDING DIAGONAL
lowerTriangleSize <- function(columns){
n <- length(columns)
return(n*(n+1)/2)
}
distributeEpsilon <- function(globalEps, nCalcs, epsilonDist=NULL){
if (is.null(epsilonDist)){
eps <- rep(globalEps/nCalcs, nCalcs)
}
else {
eps <- epsilonDist * globalEps
}
return(eps)
}
######### Linear Regression Functions ###########################################
#' Linear regression on covariance matrix
#'
#' Function to extract regression coefficients from the covariance matrix via
#' the sweep operator.
#'
#' @param formula An object of class 'formula' containing the desired
#' regression formula.
#' @param release A numeric private release of the covariance matrix.
#' @param n A numeric vector of length one specifying the number of
#' observations in in the data.
#' @param intercept A logical vector indicating whether the intercept is
#' included in \code{formula}.
#'
#' @return A numeric vector of regression coefficients corresponding
#' to \code{formula}.
#' @rdname linearReg
linearReg <- function(formula, release, n, intercept) {
eigenvals <- eigen(release)$values
if (!all(eigenvals != 0)) { # could do is.positive.definite() but that requires matrixcalc package
coefs <- "The input matrix is not invertible"
return(coefs)
} else if (!all(eigenvals > 0)){
coefs <- "The input covariance matrix is not positive definite due to noise addition so linear regression will not be run."
return(coefs)
} else {
xyLocs <- extractIndices(as.formula(formula), release, intercept)
xLoc <- xyLocs$xLoc
yLoc <- xyLocs$yLoc
locVec <- rep(FALSE, length(release))
locVec[xLoc] <- TRUE
sweep <- amsweep((as.matrix(release) / n), locVec)
coefs <- sweep[yLoc, xLoc]
se <- sqrt(sweep[yLoc, yLoc] * diag(solve(release[xLoc, xLoc])))
coefs <- data.frame(cbind(coefs, se))
coefs <- format(round(coefs, 5), nsmall=5)
rownames(coefs) <- xyLocs$xLabel
names(coefs) <- c('Estimate', 'Std. Error')
return(coefs)
}
}
#' Moore Penrose Inverse Function
#'
#' @references Gill, Jeff, and Gary King. "What to do when your Hessian is not invertible: Alternatives to model respecification in nonlinear estimation." Sociological methods & research 33, no. 1 (2004): 54-87.
#'
#' Generate the Moore-Penrose pseudoinverse matrix of \code{X}.
#'
#' @param X A numeric, symmetric covariance matrix.
#' @param tol Convergence requirement.
mpinv <- function(X, tol = sqrt(.Machine$double.eps)) {
## Moore-Penrose Inverse function (aka Generalized Inverse)
## X: symmetric matrix
## tol: convergence requirement
s <- svd(X)
e <- s$d
e[e > tol] <- 1/e[e > tol]
s$v %*% diag(e,nrow=length(e)) %*% t(s$u)
}
#' Sweep operator
#'
#' General sweep operator citation:
#' @references Goodnight, James H. "A tutorial on the SWEEP operator." The American Statistician 33, no. 3 (1979): 149-158.
#' This implementation is from pseudocode from:
#' @references Schafer, Joseph L. Analysis of incomplete multivariate data. Chapman and Hall/CRC, 1997.
#' Code ported from:
#' @references Honaker, James, Gary King, and Matthew Blackwell. "Amelia II: A program for missing data." Journal of statistical software 45, no. 7 (2011): 1-47.
#'
#' Sweeps a covariance matrix to extract regression coefficients.
#'
#' @param g Each unit of a numeric, symmetric covariance matrix divided by
#' the number of observations in the data the covariance matrix was
#' derived from.
#' @param m A logical vector of length equal to the number of rows in \code{g}
#' in which the \code{TRUE} values correspond to the x values in the matrix
#' and the \code{FALSE} values correspond to the y value in the matrix.
#' @param reverse Logical vector specifying whether the sign of the matrix
#' should be flipped. Default to \code{FALSE}.
#'
#' @return The coefficients from \code{g}.
#' @rdname amsweep
amsweep <- function(g, m, reverse=FALSE) {
if (identical(m, vector(mode='logical', length=length(m)))) {
return(g)
} else {
p <- nrow(g)
rowsm <- sum(m)
if (rowsm == p) {
h <- solve(g)
h <- -h
} else {
kseq <- 1:p
k <- kseq[m]
kcompl <- kseq[-k]
g11 <- g[k, k, drop=FALSE]
g12 <- g[k, kcompl, drop=FALSE]
g21 <- t(g12)
g22 <- g[kcompl, kcompl, drop=FALSE]
h11a <- try(solve(g11), silent=TRUE)
if (inherits(h11a, "try-error")) {
h11a <- mpinv(g11)
}
h11 <- as.matrix((-h11a))
if (reverse) {sgn2 <- -1} else {sgn2 <- 1}
h12 <- as.matrix(sgn2 * (h11a %*% g12))
h21 <- as.matrix(t(h12))
h22 <- g22 - (g21 %*% h11a %*% g12)
hwo <- rbind(cbind(h11, h12), cbind(h21, h22))
xordering <- c(k, kcompl)
h <- matrix(0, p, p)
h[xordering, xordering] <- hwo
}
return(h)
}
}
#' Extract regression indices
#'
#' Function to obtain indices in data frame for dependent & independent variables from a formula.
#'
#' @param formula An object of class 'formula' containing the desired
#' regression formula.
#' @param data A numeric data frame with at least two columns.
#' @param intercept A logical vector indicating whether the intercept is
#' included in \code{formula}.
#'
#' @return A named list with names corresponding to labels and locations
#' (i.e., columns) for variables in the specification.
#' @examples
#'
#' y <- rnorm(100) * 2
#' x <- (y + rnorm(100)) > 0
#' data <- data.frame(cbind(y, x))
#' f <- as.formula('y ~ x')
#' extractIndices(f, data, FALSE)
#' @rdname extractIndices
#' @export
extractIndices <- function(formula, data, intercept) {
t <- terms(formula, data=data)
yLoc <- attr(t, 'response')
xLoc <- which(names(data) %in% attr(t, 'term.labels'))
xLabel <- names(data)[xLoc]
if (intercept) {
interceptLoc <- which(names(data) == 'intercept')
xLoc <- c(interceptLoc, xLoc)
xLabel <- append(xLabel, 'Intercept', after=(interceptLoc - 1))
if (interceptLoc <= yLoc) { yLoc <- yLoc + 1 }
}
return(list('yLoc' = yLoc,
'xLoc' = xLoc,
'xLabel' = xLabel))
}
################# Post-processing functions ##################################################################
#' Function to convert unique private covariances into symmetric matrix
#'
#' @param release Differentially private release of elements in lower triangle
#' of covariance matrix.
#' @param columns A character vector indicating columns in the private
#' covariance to be included in the output. Length should be equal to the
#' number of columns the user wants to include.
#' @return A symmetric differentially private covariance matrix.
#'
#' This function is used by the mechanism in post-processing and not intended for interactive post-processing
covarianceFormatRelease <- function(release, columns) {
outDim <- length(columns)
outMatrix <- matrix(0, nrow=outDim, ncol=outDim)
outMatrix[lower.tri(outMatrix, diag=TRUE)] <- release
outMatrix[upper.tri(outMatrix, diag=FALSE)] <- t(outMatrix)[upper.tri(outMatrix, diag=FALSE)]
release <- data.frame(outMatrix)
rownames(release) <- names(release) <- columns
return(release)
}
#' Function to perform linear regression using private release of covariance matrix
#'
#' @param release Differentially private release of elements in lower triangle
#' of covariance matrix.
#' @param n A numeric vector of length one specifying the number of
#' observations in the data frame.
#' @param intercept Logical indicating whether the intercept is included in
#' \code{release}.
#' @param formula A list of the regression equations to be performed on the
#' covariance matrix.
#' @return Linear regression coefficients and standard errors for all specified
#' \code{formula}.
covariancePostLinearRegression <- function(release, n, intercept, formula) {
outSummaries <- vector('list', length(formula))
for (f in 1:length(formula)) {
outSummaries[[f]] <- linearReg(formula[[f]], release, n, intercept)
}
return(outSummaries)
}
#' Release additional model coefficients from DP covariance matrix
#'
#' Function to extract regression coefficients using the differentially private covariance matrix
#' via the sweep operator. This is the function to obtain coefficients for additional models
#' after the \code{covariance.release()} function has already been called.
#'
#' @examples
#' \dontrun{
#' range.income <- c(-10000, 713000)
#' range.education <- c(1, 16)
#' range <- rbind(range.income, range.education)
#' dpCov <- dpCovariance$new(mechanism="mechanismLaplace",var.type = 'numeric', n = 10000,
#' epsilon = 1, columns = c("income", "educ"), rng = range, formula='income~educ')
#' out <- dpCov$release(PUMS5extract10000)
#' coefficient.release('educ~income', out$release, n=10000)
#' }
#' @param formula Formula, regression formula used on data
#' @param release Numeric, private release of covariance matrix
#' @param n Integer, indicating number of observations
#' @export
coefficientRelease <- function(formula, release, n) {
intercept <- ifelse('intercept' %in% names(release), TRUE, FALSE)
coefficients <- linearReg(formula, release, n, intercept)
release <- list(name='Linear regression',
n=n,
formula=formula,
coefficients=coefficients)
return(release)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.