R/rr.R

Defines functions print.summary.rrreg print.rrreg summary.rrreg predict.rrreg coef.rrreg vcov.rrreg rrcd summarize.design rrreg logistic

Documented in predict.rrreg rrreg

logistic <- function(x) exp(x)/(1+exp(x))

#' Randomized Response Regression
#' 
#' \code{rrreg} is used to conduct multivariate regression analyses of survey
#' data using randomized response methods.
#' 
#' This function allows users to perform multivariate regression analysis on
#' data from the randomized response technique.  Four standard designs are
#' accepted by this function: mirrored question, forced response, disguised
#' response, and unrelated question. The method implemented by this function is
#' the Maximum Likelihood (ML) estimation for the Expectation-Maximization (EM)
#' algorithm.
#' 
#' @usage rrreg(formula, p, p0, p1, q, design, data, start = NULL, 
#' h = NULL, group = NULL, matrixMethod = "efficient",
#' maxIter = 10000, verbose = FALSE, optim = FALSE, em.converge = 10^(-8), 
#' glmMaxIter = 10000, solve.tolerance = .Machine$double.eps)
#' @param formula An object of class "formula": a symbolic description of the
#' model to be fitted.
#' @param p The probability of receiving the sensitive question (Mirrored
#' Question Design, Unrelated Question Design); the probability of answering
#' truthfully (Forced Response Design); the probability of selecting a red card
#' from the 'yes' stack (Disguised Response Design). For "mirrored" and
#' "disguised" designs, p cannot equal .5.
#' @param p0 The probability of forced 'no' (Forced Response Design).
#' @param p1 The probability of forced 'yes' (Forced Response Design).
#' @param q The probability of answering 'yes' to the unrelated question, which
#' is assumed to be independent of covariates (Unrelated Question Design).
#' @param design One of the four standard designs: "forced-known", "mirrored",
#' "disguised", or "unrelated-known".
#' @param data A data frame containing the variables in the model.
#' @param start Optional starting values of coefficient estimates for the
#' Expectation-Maximization (EM) algorithm.
#' @param h Auxiliary data functionality. Optional named numeric vector with 
#' length equal to number of groups. Names correspond to group labels and 
#' values correspond to auxiliary moments.
#' @param group Auxiliary data functionality. Optional character vector of 
#' group labels with length equal to number of observations.
#' @param matrixMethod Auxiliary data functionality. Procedure for estimating 
#' optimal weighting matrix for generalized method of moments. One of 
#' "efficient" for two-step feasible and "cue" for continuously updating. 
#' Default is "efficient". Only relevant if \code{h} and \code{group} are 
#' specified.
#' @param maxIter Maximum number of iterations for the Expectation-Maximization
#' algorithm. The default is \code{10000}.
#' @param verbose A logical value indicating whether model diagnostics counting
#' the number of EM iterations are printed out.  The default is \code{FALSE}.
#' @param optim A logical value indicating whether to use the quasi-Newton
#' "BFGS" method to calculate the variance-covariance matrix and standard
#' errors. The default is \code{FALSE}.
#' @param em.converge A value specifying the satisfactory degree of convergence
#' under the EM algorithm. The default is \code{10^(-8)}.
#' @param glmMaxIter A value specifying the maximum number of iterations to run
#' the EM algorithm. The default is \code{10000}.
#' @param solve.tolerance When standard errors are calculated, this option
#' specifies the tolerance of the matrix inversion operation solve.
#' @return \code{rrreg} returns an object of class "rrreg".  The function
#' \code{summary} is used to obtain a table of the results.  The object
#' \code{rrreg} is a list that contains the following components (the inclusion
#' of some components such as the design parameters are dependent upon the
#' design used):
#' 
#' \item{est}{Point estimates for the effects of covariates on the randomized
#' response item.} \item{vcov}{Variance-covariance matrix for the effects of
#' covariates on the randomized response item.} \item{se}{Standard errors for
#' estimates of the effects of covariates on the randomized response item.}
#' \item{data}{The \code{data} argument.} \item{coef.names}{Variable names as
#' defined in the data frame.} \item{x}{The model matrix of covariates.}
#' \item{y}{The randomized response vector.} \item{design}{Call of standard
#' design used: "forced-known", "mirrored", "disguised", or "unrelated-known".}
#' \item{p}{The \code{p} argument.} \item{p0}{The \code{p0} argument.}
#' \item{p1}{The \code{p1} argument.} \item{q}{The \code{q} argument.}
#' \item{call}{The matched call.}
#' @seealso \code{\link{predict.rrreg}} for predicted probabilities.
#' @references Blair, Graeme, Kosuke Imai and Yang-Yang Zhou. (2014) "Design
#' and Analysis of the Randomized Response Technique." Working Paper. Available
#' at \url{http://imai.princeton.edu/research/randresp.html}.
#' @keywords regression
#' @examples
#' 
#' data(nigeria)
#' 
#' set.seed(1)
#' 
#' ## Define design parameters
#' p <- 2/3  # probability of answering honestly in Forced Response Design
#' p1 <- 1/6 # probability of forced 'yes'
#' p0 <- 1/6 # probability of forced 'no'
#' 
#' ## Fit linear regression on the randomized response item of whether 
#' ## citizen respondents had direct social contacts to armed groups
#' 
#' rr.q1.reg.obj <- rrreg(rr.q1 ~ cov.asset.index + cov.married + 
#'                     I(cov.age/10) + I((cov.age/10)^2) + cov.education + cov.female,   
#'                     data = nigeria, p = p, p1 = p1, p0 = p0, 
#'                     design = "forced-known")
#'   
#' summary(rr.q1.reg.obj)
#' 
#' ## Replicates Table 3 in Blair, Imai, and Zhou (2014)
#' 
#' @export
#' 
#' @importFrom magic adiag
#' @importFrom stats pchisq
rrreg <- function(formula, p, p0, p1, q, design, data, start = NULL, 
                  h = NULL, group = NULL, matrixMethod = "efficient", 
                  maxIter = 10000, verbose = FALSE, 
                  optim = FALSE, em.converge = 10^(-8), 
                  glmMaxIter = 10000, solve.tolerance = .Machine$double.eps) {
  
    df <- model.frame(formula, data, na.action = na.omit)
    x1 <- model.matrix.default(formula, df)
    bscale <- c(1, unname(apply((as.matrix(x1[,-1])), 2, sd)))
    x <- sweep(x1, 2, bscale, `/`) #rescale/standardize each covar, not including intercept
    y <- model.response(df)
    n <- length(y)

    # Get c, d
    cd <- rrcd(p, p0, p1, q, design)
    c <- cd$c
    d <- cd$d

    if(missing(design)) {
        stop("Missing design specification, see documentation")
    } else if(design != "forced-known" & design != "mirrored" & design != "disguised" & design != "unrelated-known"
              ) {
        stop(paste("The design you specified,", design, "is not implemented in this version of the software."))
    }

    obs.llik <- function(par, y, x, c, d, gx = NULL, bayesglm = FALSE) {
        if (is.null(gx))
            gx <- logistic(x %*% par)
        llik <- sum( y * log( c * gx + d) + (1-y) * log(1 - c * gx - d))
        ##if(bayesglm = TRUE)
        ##    llik <- llik + sum(dcauchy(x = par, scale = rep(2.5, length(par), log = TRUE))
        return(llik)
    }
    
    # auxiliary data functionality -- check conditions
    if (!is.null(h) & is.null(group)) {
      stop("Need to specify character vector of group assignments. See documentation")
    } else if (is.null(h) & !is.null(group)) {
      stop("Need to specify named vector of group moments. See documentation")
    } 

    aux.check <- !(is.null(h) | is.null(group))    

    ## NOW THE ALGORITHM

    ## set some starting values
    if(!missing(start) & (length(start) != ncol(x)))
        stop("The starting values are not the right dimensions. start should be a vector of length the number of variables, including the intercept if applicable.")
        
    if(!is.null(start))
        par <- start
    else
        par <- rnorm(ncol(x))
    gx <- logistic(x %*% par)

    ## start off with an infinitely negative log likelihood
    pllik <- -Inf

    ## calculate the log likelihood at the starting values
    llik <- obs.llik(par, y = y, x = x, c = c, d = d, gx = gx)

    ## set a counter to zero, which you will iterate
    ## (this just allows to you stop after a maximum # of iterations if you want
    ## e.g. 10k

    counter <- 0

    ## begin the while loop, which goes until the log likelihood it calculates
    ## is very very close to the log likelihood at the last iteration (the
    ## difference is less than 10^(-8) different

    while (((llik - pllik) > em.converge) & (counter < maxIter)) {

        ## first the E step is run each time, from the parameters at the end of the
        ## last iteration, or the starting values in the first iteration
        w1 <- (c*gx) /  (c*gx + d)
        w0 <- c*(1-gx) / (1-c*gx - d)

        w <- rep(NA, n)
        w[y == 0] <- w0[y==0]
        w[y == 1] <- w1[y==1]

        ##if(bayesglm == FALSE) {
        lfit <- glm(cbind(y, 1-y) ~ x - 1, family = binomial("logit"), weights = w, start = par,
                    control = list(maxit = glmMaxIter))
        ##} else {
        ##    lfit <- bayesglm(as.formula(paste("cbind(", y.var, ", 1-", y.var, ") ~ ",
        ##                                           paste(x.vars.ceiling, collapse=" + "))),
        ##                          weights = dtmpC$w, family = binomial(logit),
        ##                          start = coef.qufit.start, data = dtmpC,
        ##                          control = glm.control(maxit = maxIter), scaled = F)
        ##}
        par <- coef(lfit)
        ## update gx
        gx <- logistic(x %*% par)

        pllik <- llik

        if(verbose==T)
            cat(paste(counter, round(llik, 4), "\n"))

        ## calculate the new log likelihood with the parameters you just
        ## estimated in the M step
        llik <- obs.llik(par, y = y, x = x, c = c, d = d, gx = gx)

        counter <- counter + 1

        ## stop for obvious reasons
        if (llik < pllik)
            stop("log-likelihood is not monotonically increasing.")

        if(counter == (maxIter-1))
            warning("number of iterations exceeded maximum in ML")

    } ## end of while loop
    
    # Rescale coefficients
    bpar <- c(par/bscale)
    
        if(optim) {
        MLEfit <- optim(par = par, fn = obs.llik, method = "BFGS",
                        y = y, x = x,
                        hessian = TRUE,
                        control = list(maxit = 0, fnscale = -1),
                        c = c, d = d)
        vcov <- solve(-MLEfit$hessian, tol = solve.tolerance)
        vcov <- vcov/(bscale %*% t(bscale))
        
        se <- sqrt(diag(vcov))
    } else { 
         score.rr <- as.vector(y * ( c * gx + d )^(-1) * c * gx * (1-gx)) * x +
           as.vector((1-y) * ( 1 - c * gx - d )^(-1) * (-c) * gx * (1-gx)) * x

         information.rr <- crossprod(score.rr) / nrow(x)
         vcov <- solve(information.rr, tol = solve.tolerance) / nrow(x)
         vcov <- vcov/(bscale %*% t(bscale))
         
         se <- sqrt(diag(vcov))
    }

    # auxiliary data functionality  
    if (aux.check) {     

        dlogistic.coef <- function(x) exp(x) / (1 + exp(x))^2

        score <- function (beta, y, x, w = NULL, c, d ) {
            n <- length(y)
            if (missing(w)) w <- rep(1, n)
            coef <- (c * y / (c * logistic(x %*% beta) + d) - 
                c * (1 - y) / (1 - c * logistic(x %*% beta) - d)) * c(dlogistic.coef(x %*% beta))
            w.coef <- c(coef) * w
            colSums(w.coef * x)/sum(w)
        }

        jbn <- function (beta, y, x, w = NULL, c, d ) {
            n <- length(y)
            if (missing(w)) w <- rep(1, n)
            coef1 <- (y * c/(c * logistic(x %*% beta) + d) -
                (1 - y) * c /(1 - c * logistic(x %*% beta) - d)) *
                    exp(x %*% beta) * (1 - exp(2 * x %*% beta)) / (1 + exp(x %*% beta))^4
            coef2 <- (y * c^2 / (c * logistic(x %*% beta) + d)^2 + 
                (1 - y) * c^2/(1 - c * logistic(x %*% beta) - d)^2) *
                    exp(2 * x %*% beta)/(1 + exp(x %*% beta))^4
            t(w * c(coef1 - coef2) * x) %*% x / sum(w)
        }

        # gmm objective function
        gmm <- function (beta, y, x, w = NULL, c, d, W = NULL, h, group) {
            n <- length(y)
            if (missing(w)) w <- rep(1, n)
            m1 <- score(beta, y, x, w, c, d)
            g <- c()
            group.labels <- names(h)
            for (i in 1:length(h)){
                wg <- w[group == group.labels[i]]
                g[i] <- sum(c(h[i]) - logistic(x[group == group.labels[i], , drop = FALSE] %*% beta))/sum(w)
            }
            if (missing(W)) W <- diag(length(c(m1, g)))
            return(t(c(m1, g)) %*% W %*% c(m1, g))
        }

        # gmm gradient
        grad <- function (beta, y, x, w = NULL, c, d , W = NULL, h, group) {
            n <- length(y)
            if (missing(w)) w <- rep(1, n)
            score <- score(beta, y, x, w, c, d)
            jbn <- jbn(beta, y, x, w, c, d)
            g <- c()
            group.labels <- names(h)
            for (i in 1:length(h)){
                wg <- w[group == group.labels[i]]
                g[i] <- sum(c(h[i]) - logistic(x[group == group.labels[i], , drop = FALSE] %*% beta))/sum(w)
                grad.iter <- colSums(-c(dlogistic.coef(x[group == group.labels[i], , drop = FALSE] %*% beta)) * x[group == group.labels[i], , drop = FALSE])/sum(w)
                jbn <- rbind(jbn, grad.iter)
            } 
            if (missing(W)) W <- diag(length(c(score, g)))
            t(2 * c(score, g)) %*% W %*% jbn
        }

        # gmm weighting matrix
        weight.matrix <- function (beta, y, x, w = NULL, c, d, h, group, matrixMethod) {
            n <- length(y)
            if (missing(w)) w <- rep(1, n)
            coef <- (c * y / (c * logistic(x %*% beta) + d) - 
                c * (1 - y) / (1 - c * logistic(x %*% beta) - d)) * c(dlogistic.coef(x %*% beta))
            w.coef <- c(coef) * w
            dg <- c()
            group.labels <- names(h)
            for (i in 1:length(group.labels)){
                wg <- w[group == group.labels[i]]
                dg[i] <- sum(c(h[i]) - logistic(x[group == group.labels[i], , drop = FALSE] %*% beta)^2)/sum(w)
            }
            matrix1 <- t(w.coef * x) %*% (w.coef * x)/sum(w)
            matrix2 <- diag(dg, nrow = length(dg))
            efficient.W <- solve(adiag(matrix1, matrix2))
            if (matrixMethod == "efficient" | matrixMethod == "cue") {
              efficient.W
            } else if (matrixMethod == "princomp") {
              decomp <- eigen(efficient.W)
              decomp$values * t(decomp$vectors) %*% decomp$vectors
            }
        }

        # continuously updating objective function
        gmm.cue <- function (beta, y, x, w = NULL, c, d, h, group, matrixMethod) {
            n <- length(y)
            if (missing(w)) w <- rep(1, n)
            m1 <- score(beta, y, x, w, c, d)
            g <- c()
            group.labels <- names(h)
            for (i in 1:length(h)){
                wg <- w[group == group.labels[i]]
                g[i] <- sum(c(h[i]) - logistic(x[group == group.labels[i], , drop = FALSE] %*% beta))/sum(w)
            }
            W <- weight.matrix(beta = beta, y = y, x = x, w = w, c = c, d = d, 
              h = h, group = group, matrixMethod = matrixMethod)
            t(c(m1, g)) %*% W %*% c(m1, g)
        }


        # coefs with auxiliary information
        rr.true.coefs <- function (beta, y, x, w = NULL, c, d , W = NULL, h = NULL, group = NULL, matrixMethod) {
            n <- length(y)
            if (missing(w)) w <- rep(1, n)
            if (missing(W) & missing(h)) {
                W <- diag(length(ncol(x)))
            } else if (missing(W)) {
                W <- diag(length(c(ncol(x), h)))
            }
            if (matrixMethod == "efficient" | matrixMethod == "princomp") {
              weight.matrix <- weight.matrix(beta = beta, y = y, x = x, h = h, group = group, c = c, d = d, matrixMethod = matrixMethod)
              fit1 <- optim(par = beta, fn = gmm, gr = grad, method = "BFGS", 
                control = list(maxit = 500), y = y, x = x, h = h, group = group, 
                c = c, d = d, W = weight.matrix)
              newpar <- fit1$par
              weight.matrix <- weight.matrix(beta = newpar, y = y, x = x, h = h, group = group, c = c, d = d, matrixMethod = matrixMethod)
              fit2 <- optim(par = newpar, fn = gmm, gr = grad, method = "BFGS", 
                control = list(maxit = 500), y = y, x = x, h = h, group = group, 
                c = c, d = d, W = weight.matrix)
              fit2             
            } else {
              fit <- optim(par = beta, fn = gmm.cue, method = "BFGS", 
                control = list(maxit = 500), y = y, x = x, h = h, group = group, 
                c = c, d = d, matrixMethod = matrixMethod)
              fit
            }
        }

        # vcov with auxiliary information
        rr.true.vcov <- function(beta, y, x, w, c, d, h = NULL, group = NULL, matrixMethod) {
            n <- length(y)
            if (missing(w)) w <- rep(1, n)
            jbn <- jbn(beta, y, x, w, c, d)
            group.labels <- names(h)
            for (i in 1:length(group.labels)) {
                wg <- w[group == group.labels[i]]
                grad.iter <- t(w[group == group.labels[i]]) %*% (-c(dlogistic.coef(x[group == group.labels[i], , drop = FALSE] %*% beta)) * x[group == group.labels[i], , drop = FALSE])/sum(w)
                jbn <- rbind(jbn, grad.iter)
            }
            weight.matrix <- weight.matrix(beta = beta, y = y, x = x, h = h, group = group, c = c, d = d, matrixMethod = matrixMethod)
            if (matrixMethod != "princomp") {
              solve(t(jbn) %*% weight.matrix %*% jbn)/(sum(w))
            } else {
              V <- solve(weight.matrix(beta = beta, y = y, x = x, h = h, group = group, c = c, d = d, matrixMethod = "efficient"))
              solve(t(jbn) %*% weight.matrix %*% jbn) %*% t(jbn) %*% weight.matrix %*% V %*% t(weight.matrix) %*% jbn %*% solve(t(jbn) %*% weight.matrix %*% jbn)/sum(w)
            }
        }



        true.fit <- rr.true.coefs(beta = bpar, y = y, x = x1, c = c, d = d, h = h, group = group, matrixMethod = matrixMethod)
        bpar <- true.fit$par
        vcov <- rr.true.vcov(beta = bpar, y = y, x = x1, c = c, d = d, h = h, group = group, matrixMethod = matrixMethod)
        se <- sqrt(diag(vcov))

        # Sargan-Hansen overidentification test
        J.stat <- true.fit$val * n
        overid.p <- round(1 - pchisq(J.stat, df = length(h)), 4)

    }

    return.object <- list(est = bpar, 
                          vcov = vcov,
                          se = se,
                          data = df,
                          coef.names = colnames(x),
                          x = x1, #unscaled data
                          y = y,
                          design = design,
                          call = match.call())

    if(design == "forced-known") {
      return.object$p <- p
      return.object$p1 <- p1
      return.object$p0 <- p0
    }
    
    if(design == "mirrored" | design == "disguised") {
      return.object$p <- p
    }
    
    if(design == "unrelated-known") {
      return.object$p <- p
      return.object$q <- q
    }
    
    # auxiliary data functionality -- set up return object
    return.object$aux <- aux.check

    if (aux.check) {
      return.object$nh <- length(h)
      return.object$wm <- ifelse(matrixMethod == "cue", "continuously updating", 
        ifelse(matrixMethod == "princomp", "principal components", "efficient"))
      return.object$J.stat <- round(J.stat, 4)
      return.object$overid.p <- overid.p
    }

    class(return.object) <- "rrreg"

    return(return.object)

}

summarize.design <- function(design, p, p0, p1, q) {
    
    design.short <- strsplit(design, "-")[[1]][1]
    if(design.short == "forced")
        param <- paste("p = ", round(p,2), ", p0 = ", round(p0,2), ", and p1 = ", round(p1, 2), sep = "")
    
    return(cat("Randomized response ", design.short, " design with ", param, ".", sep = ""))
    
}
    
rrcd <- function(p, p0, p1, q, design) {
   if(missing(design)) {
        stop("Missing design specification, see documentation")
    } else if(design != "forced-known" & design != "mirrored" & design != "disguised" & design != "unrelated-known") {
        stop(paste("The design you specified,", design, "is not implemented in this version of the software."))
    }
  
  if (design == "forced-known") {
    if(missing(p) & !missing(p1))
      stop("for the forced design with known probabilities, you must specify p")
    if(missing(p1) & !missing(p))
      stop("for the forced design with known probabilities, you must specify p1")
    if(missing(p) & missing(p1))
      stop("for the forced design with known probabilities, you must specify p and p1")
    
    if (any((p > 1) | (p < 0))) {
      stop("p must be a probability")
    }
    if (any((p1 > 1) | (p1 < 0))) {
      stop("p1 must be a probability")
    }
    if (any((p0 > 1) | (p0 < 0))) {
      stop("p0 must be a probability")
    }
    if (any( round(p + p0 + p1, 10) !=1)) {
      stop("p, p0, and p1 must sum to 1")
    }
    
    c <- p
    d <- p1
    
  } else if (design == "mirrored" | design == "disguised") {
   
    if(missing(p))
      stop(paste("for the", design, "design, you must specify p"))
    if (any((p > 1) | (p < 0)) | p == .5) {
      stop("p must be a probability that is not .5")
    }
    
    c <- 2 * p - 1
    d <- 1 - p
    
  } else if (design == "unrelated-known") {
    
    warning("q, the probability of answering 'yes' to the unrelated question, is assumed to be independent of covariates")
    
    if(missing(p) & !missing(q))
      stop(paste("for the", design, "design, you must specify p"))
    if(missing(q) & !missing(p))
      stop(paste("for the", design, "design, you must specify q"))
    if(missing(p) & missing(q))
      stop(paste("for the", design, "design, you must specify p and q"))
    
    if (any((p > 1) | (p < 0))) {
      stop("p must be a probability")
    }
    
    if (any((q > 1) | (q < 0))) {
      stop("q must be a probability")
    }
    
    c <- p
    d <- (1 - p) * q
  
  } 
  
  return(list(c = c,
              d = d))
}

#' @export
vcov.rrreg <- function(object, ...){

  vcov <- object$vcov

  rownames(vcov) <- colnames(vcov) <- object$coef.names

  return(vcov)

}

#' @export
coef.rrreg <- function(object, ...){

  coef <- object$est

  names(coef) <- object$coef.names

  return(coef)

}



#' Predicted Probabilities for Randomized Response Regression
#' 
#' \code{predict.rrreg} is used to generate predicted probabilities from a
#' multivariate regression object of survey data using randomized response
#' methods.
#' 
#' This function allows users to generate predicted probabilities for the
#' randomized response item given an object of class "rrreg" from the
#' \code{rrreg()} function. Four standard designs are accepted by this
#' function: mirrored question, forced response, disguised response, and
#' unrelated question. The design, already specified in the "rrreg" object, is
#' then directly inputted into this function.
#' 
#' @usage \method{predict}{rrreg}(object, given.y = FALSE, alpha = .05, n.sims =
#' 1000, avg = FALSE, newdata = NULL, quasi.bayes = FALSE, keep.draws = FALSE,
#' ...)
#' @param object An object of class "rrreg" generated by the \code{rrreg()}
#' function.
#' @param given.y Indicator of whether to use "y" the response vector to
#' calculate the posterior prediction of latent responses. Default is
#' \code{FALSE}, which simply generates fitted values using the logistic
#' regression.
#' @param alpha Confidence level for the hypothesis test to generate upper and
#' lower confidence intervals. Default is \code{ .05}.
#' @param n.sims Number of sampled draws for quasi-bayesian predicted
#' probability estimation. Default is \code{1000}.
#' @param avg Whether to output the mean of the predicted probabilities and
#' uncertainty estimates. Default is \code{FALSE}.
#' @param newdata Optional new data frame of covariates provided by the user.
#' Otherwise, the original data frame from the "rreg" object is used.
#' @param quasi.bayes Option to use Monte Carlo simulations to generate
#' uncertainty estimates for predicted probabilities. Default is \code{FALSE}.
#' @param keep.draws Option to return the Monte Carlos draws of the quantity of
#' interest, for use in calculating differences for example.
#' @param ... Further arguments to be passed to \code{predict.rrreg()} command.
#' 
#' @return \code{predict.rrreg} returns predicted probabilities either for each
#' observation in the data frame or the average over all observations. The
#' output is a list that contains the following components:
#' 
#' \item{est}{Predicted probabilities for the randomized response item
#' generated either using fitted values, posterior predictions, or
#' quasi-Bayesian simulations. If \code{avg} is set to \code{TRUE}, the output
#' will only include the mean estimate.} \item{se}{Standard errors for the
#' predicted probabilities of the randomized response item generated using
#' Monte Carlo simulations. If \code{quasi.bayes} is set to \code{FALSE}, no
#' standard errors will be outputted.} \item{ci.lower}{Estimates for the lower
#' confidence interval. If \code{quasi.bayes} is set to \code{FALSE}, no
#' confidence interval estimate will be outputted.} \item{ci.upper}{Estimates
#' for the upper confidence interval. If \code{quasi.bayes} is set to
#' \code{FALSE}, no confidence interval estimate will be outputted.}
#' \item{qoi.draws}{Monte Carlos draws of the quantity of interest, returned
#' only if \code{keep.draws} is set to \code{TRUE}.}
#' @seealso \code{\link{rrreg}} to conduct multivariate regression analyses in
#' order to generate predicted probabilities for the randomized response item.
#' @references Blair, Graeme, Kosuke Imai and Yang-Yang Zhou. (2014) "Design
#' and Analysis of the Randomized Response Technique."  \emph{Working Paper.}
#' Available at \url{http://imai.princeton.edu/research/randresp.html}.
#' @keywords predicted probabilities fitted values
#' @examples
#' 
#' data(nigeria)
#' 
#' set.seed(1)
#' 
#' ## Define design parameters
#' p <- 2/3  # probability of answering honestly in Forced Response Design
#' p1 <- 1/6 # probability of forced 'yes'
#' p0 <- 1/6 # probability of forced 'no'
#' 
#' ## Fit linear regression on the randomized response item of 
#' ## whether citizen respondents had direct social contacts to armed groups
#' 
#' rr.q1.reg.obj <- rrreg(rr.q1 ~ cov.asset.index + cov.married + I(cov.age/10) + 
#'                       I((cov.age/10)^2) + cov.education + cov.female,   
#'                       data = nigeria, p = p, p1 = p1, p0 = p0, 
#'                       design = "forced-known")
#' 
#' ## Generate the mean predicted probability of having social contacts to 
#' ## armed groups across respondents using quasi-Bayesian simulations. 
#' 
#' rr.q1.reg.pred <- predict(rr.q1.reg.obj, given.y = FALSE, 
#'                                 avg = TRUE, quasi.bayes = TRUE, 
#'                                 n.sims = 10000)
#' 
#' ## Replicates Table 3 in Blair, Imai, and Zhou (2014)
#' 
#' @importFrom MASS mvrnorm
#' 
#' @export
predict.rrreg <- function(object, given.y = FALSE, alpha = .05, 
                          n.sims = 1000, avg = FALSE, newdata = NULL, 
                          quasi.bayes = FALSE, keep.draws = FALSE, ...) {
  
  z.post <- function(object = NULL, x = NULL, y = NULL, coef = NULL) {
    
    if(is.null(coef))
      coef <- object$est
    
    if(object$design == "forced-known") {

        pred <- ((object$p + object$p1)^y * object$p0^(1 - y) * logistic(x %*% coef)) /
            (object$p * logistic(x %*% coef)^y * (1 - logistic(x %*% coef))^(1 - y) +
                 object$p1^y * object$p0^(1 - y))
      
    } else if (object$design == "disguised") {

        pred <- ( logistic(x %*% coef) * object$p^y * (1 - object$p)^(1 - y) ) /
            ( logistic(x %*% coef) * object$p^y * (1 - object$p)^(1 - y) +
                 (1 - logistic(x %*% coef)) * (1 - object$p)^y * object$p^(1 - y) )

    } else if (object$design == "mirrored") {

        pred <- ( object$p^y * (1 - object$p)^(1 - y) * logistic(x %*% coef) ) /
            ( object$p^y * (1 - object$p)^(1 - y) * logistic(x %*% coef) +
                 object$p^(1 - y) * (1 - object$p)^y * (1 - logistic(x %*% coef)) )
          
    } else if (object$design == "unrelated-known") {

        pred <- ( (object$p * y + (1 - object$p) * object$q^y * (1 - object$q)^(1 - y))
                 * logistic(x %*% coef) ) /
        ( (object$p * logistic(x %*% coef)^y * (1 - logistic(x %*% coef))^(1 - y)) +
           ((1 - object$p) * object$q^y * (1 - object$q)^(1 - y)) )

    }
    
    return(pred)
    
  }
  
  if(missing(newdata)) {
    xvar <- object$x
    y <- object$y
  } else {
    if(nrow(newdata)==0)
      stop("No data in the provided data frame.")
    xvar <- model.matrix(as.formula(paste("~", c(object$call$formula[[3]]))), newdata)
    
    if(given.y == TRUE){
    newdata1 <- model.frame(object$call$formula, newdata, na.action = na.omit)
    xvar <- model.matrix(as.formula(paste("~", c(object$call$formula[[3]]))), newdata1)
    y <- model.response(newdata1)
    }
  }
  
  if(quasi.bayes == FALSE) {
    
    if(given.y == TRUE) {
      pred <- z.post(object, x = xvar, y = y)
    } else {
      pred <- logistic(xvar %*% coef(object))
    }
    
    if(avg == TRUE)
      pred <- t(as.matrix(apply(pred, 2, mean)))
    
    return.object <- list(est = pred)
    
  } else {
    
    draws <- mvrnorm(n = n.sims, mu = object$est, Sigma = vcov(object))
    
    if(given.y == TRUE) {
      pred <- z.post(object, x = xvar, y = y, coef = t(draws))
    } else {
      pred <- logistic(xvar %*% t(draws))
    }
    
    if(avg == TRUE)
      pred <- t(as.matrix(apply(pred, 2, mean)))
    
    est <- apply(pred, 1, mean)
    lower.ci <- apply(pred, 1, function(x, a = alpha) quantile(x, a/2))  
    upper.ci <- apply(pred, 1, function(x, a = alpha) quantile(x, 1-a/2)) 
    se <- apply(pred, 1, sd)
    
    return.object <- list(est = est, 
                          se = se, 
                          ci.lower = lower.ci, 
                          ci.upper = upper.ci)
    
    if(keep.draws == TRUE)
      return.object$qoi.draws <- pred
  }
  
  return(return.object)
  
}

#' @export
summary.rrreg <- function(object, ...) {
  structure(object, class = c("summary.rrreg", class(object)))
}

#' @export
print.rrreg <- function(x, ...){

  cat("\nRandomized Response Technique Regression \n\nCall: ")

  dput(x$call)

  cat("\n")

  tb <- matrix(NA, ncol = 2, nrow = length(x$est))
  colnames(tb) <- c("est", "se")
  rownames(tb) <- colnames(x$x)

  tb[,1] <- x$est
  tb[,2] <- x$se

  print(as.matrix(round(tb,5)))

  cat("\n")

  summarize.design(x$design, x$p, x$p0, x$p1, x$q)

  cat("\n")

  # auxiliary data functionality -- print details
  if (x$aux) cat("Incorporating ", x$nh, " auxiliary moment(s). Weighting method: ", x$wm, ".\n", 
    "The overidentification test statistic was: ", x$J.stat, " (p < ", x$overid.p, ")", ".\n", sep = "")

  invisible(x)

}

#' @export
print.summary.rrreg <- function(x, ...){

  cat("\nRandomized Response Technique Regression \n\nCall: ")

  dput(x$call)

  cat("\n")

  tb <- matrix(NA, ncol = 2, nrow = length(x$est))
  colnames(tb) <- c("Est.", "S.E.")
  rownames(tb) <- colnames(x$x)

  tb[,1] <- x$est
  tb[,2] <- x$se

  print(as.matrix(round(tb,5)))

  cat("\n")

  summarize.design(x$design, x$p, x$p0, x$p1, x$q)

  cat("\n")

  # auxiliary data functionality -- print details
  if (x$aux) cat("Incorporating ", x$nh, " auxiliary moment(s). Weighting method: ", x$wm, ".\n", 
    "The overidentification test statistic was: ", x$J.stat, " (p < ", x$overid.p, ")", ".\n", sep = "")

  invisible(x)

}

Try the rr package in your browser

Any scripts or data that you put into this service are public.

rr documentation built on May 24, 2022, 5:05 p.m.