pstest: Specification Tests for Parametric Propensity Score Models

Documented in pstest

#' pstest: Tests for the Propensity Score
#'
#' \emph{pstest} computes Kolmogorov-Smirnov and Cramer-von Mises type tests
#' for the null hypothesis that a parametric model for the propensity score is
#' is correctly specified. For details of the testing procedure, see
#' Sant'Anna and Song (2019),'Specification Tests for the Propensity Score'  <doi:10.1016/j.jeconom.2019.02.002>.
#'
#'@param d a vector containing the binary treatment indicator.
#'@param pscore a vector containing the estimated propensity scores.
#'@param xpscore a matrix (or data frame) containing the covariates (and their
#'               transformations) included in the propensity score
#'               estimation. It should also include the constant term.
#'@param model  a description of the functional form (link function) used
#'              to estimated propensity score. The alternatives are:
#'              'logit' (default), 'probit', and het.probit
#' @param pscore.model in case you you set model="het.probit", pscore.model is the entire hetglm object.
#'                      Default for pscore.model is NULL.
#'@param w a description of which weight function the projection is based on.
#'            The alternatives are 'ind' (default), which set \eqn{w(q,u)=1(q<=u)},
#'            'exp', which set \eqn{w(q,u)=exp(qu)}, 'logistic', which set
#'            \eqn{w(q,u)=1/[1+exp(1-qu)]}, 'sin', which set \eqn{w(q,u)=sin(qu)}, and
#'            'sincos', which set \eqn{w(q,u)=sin(qu)+cos(qu)}.
#'@param dist a description of which distribution to use during the bootstrap.
#'            The alternatives are 'Mammen' (default), and 'Rademacher'.
#'@param nboot number of bootstrap replicates to perform. Default is 1,000.
#'@param cores number of cores to use during the bootstrap. Default is 1.
#'              If cores is greater than 1, the bootstrap is conducted using
#'              parLapply, instead of lapply type call.
#'@param chunk a value that determine the size of each 'tile'. Such argument is used
#'              to split the original data into chunks, saving memory.
#'              Default value is 1,000. If the \emph{pstest} function throw a
#'              memory error, you should choose a smaller value for \emph{chunk}.
#'
#'
#'@return a list containing the Kolmogorov-Smirnov and Cramer-von Mises test
#'        statistics for the null hypothesis of correctly specified propensity
#'        score model (kstest and cvmtest, respectively), and their associate
#'        bootstrapped p-values, pvks and pvcvm, respectively. All inputs are also
#'        returned.
#'
#'@references
#'       Sant'Anna, Pedro H. C, and Song, Xiaojun (2019), \emph{Specification Tests for the Propensity Score},
#'       Journal of Econometrics, vol. 210 (2), p. 379-404,  <doi:10.1016/j.jeconom.2019.02.002>.
#'
#'@examples
#' # Example based on simulation data
#' # Simulate vector of covariates
#' set.seed(1234)
#' x1 <- runif(100)
#' x2 <- rt(100, 5)
#' x3 <- rpois(100, 3)
#' # generate treatment status score based on Probit Specification
#' treat <- (x1 + x2 + x3 >= rnorm(100, 4, 5))
#' # estimate correctly specified propensity score based on Probit
#' pscore <- stats::glm(treat ~ x1 + x2 + x3, family = binomial(link = "probit"),
#'               x = TRUE)
#' # Test the correct specification of estimated propensity score, using
#' # the weight function 'ind', and bootstrap based on 'Mammen'.
#' pstest(d = pscore$y, pscore = pscore$fit, xpscore = pscore$x,
#'        model = "probit", w = "ind", dist = "Mammen")
#' # Alternatively, one can use the 'sin' weight function
#' pstest(d = pscore$y, pscore = pscore$fit, xpscore = pscore$x,
#'        model = "probit", w = "sin", dist = "Mammen")
#'
#'@export
#'
#'@importFrom stats binomial rbinom runif glm
#'@importFrom parallel makeCluster parLapply stopCluster nextRNGStream
#'@importFrom glmx hetglm.fit
#'@importFrom MASS ginv
#-------------------------------------------------------------------------------
pstest = function(d, pscore, xpscore, pscore.model = NULL,
                  model = "logit",
                  w = "ind",
                  dist = "Mammen",
                  nboot = 1000, cores = 1, chunk = 1000) {
    #-----------------------------------------------------------------------------
    # Define some underlying variables
    n <- length(d)
    xx <- as.matrix(xpscore)
    pscore.fit <- pscore
    uhat <- d - pscore.fit
    #-----------------------------------------------------------------------------
    # Run some tests
    if( !is.element(model,c("logit", "probit", "het.probit"))) {
      stop("model must be either 'logit', 'probit' or 'het.probit' ")
    }
    if( !is.element(dist,c("Mammen", "Rademacher"))) {
      stop("dist must be either 'Mammen', or 'Rademacher' ")
    }
    if( !is.element(w,c("ind", "exp", "logistic", "sin", "sincos"))) {
      stop("w must be either 'ind', 'exp', 'logistic', 'sin', or 'sincos' ")
    }
    #-----------------------------------------------------------------------------
    # #Define the score variables for the projection
    if (model == "logit") {
        g <- pscore.fit * (1 - pscore.fit) * xx
    }
    if (model == "probit") {
        beta.x <- stats::qnorm(pscore.fit)
        g <- stats::dnorm(beta.x) * xx
        rm(beta.x)
    }
    if (model == "het.probit") {
      if(is.null(pscore.model)){
        stop(" You must provide the entire hetglm model if you are using het. probit")
      }
      if(!class(pscore.model)=="hetglm"){
        stop(" pscore.model must be estimated using the hetglm function. See glmx package")
      }
      if(is.null(pscore.model$x$scale)){
        stop(" You must include the option x=T in your glmx model")
      }

      pp <- pscore.model
      index.mean <- as.numeric(pp$x$mean %*% pp$coefficients$mean)
      index.scale <- as.numeric(pp$x$scale %*% (pp$coefficients$scale))

      #beta.x <- stats::qnorm(pscore.fit)
      index <- index.mean * exp(-index.scale)
      g <- cbind(stats::dnorm(index) * exp(-index.scale) *pp$x$mean,
                 - stats::dnorm(index)*index.mean*exp(-index.scale)*pp$x$scale)


      xx <- as.matrix(cbind(pp$x$mean,pp$x$scale))
      #rm(pp,xx.scale,index.mean,index.scale,index )
    }

    gg <- crossprod(g)
    #-----------------------------------------------------------------------------
    # Define variables to be used in the loop
    # Number of covariates
    k.dim = dim(xx)[2]

    # unique pscores
    #un.pscores <- unique(pscore.fit)
    un.pscores <- (pscore.fit)
    n.unique <- length(un.pscores)

    # Initialize `beta` matrix (K coefficients for each of n.unique responses)
    beta <- matrix(0, k.dim, n.unique)

    # Initialize `Rw` row vector (n.unique dimension)
    Rw <- matrix(0, 1, n.unique)

    # We split n columns into l tiles, each with chunk columns
    l <- floor(n.unique/chunk) + 1

    # Initialize the bootststrap test statistics vectors
    ksb1 <- matrix(0, nboot, 1)
    cvmb1 <- matrix(0, nboot, 1)
    #-----------------------------------------------------------------------------
    # Let's define some parameters for the bootstrap
    # Better to define these outside the loop that will follow.

    if (dist == "Mammen"){
      # Use the Mammen(1993) binary V's
      k1 <- 0.5 * (1 - 5^0.5)
      k2 <- 0.5 * (1 + 5^0.5)
      pkappa <- 0.5 * (1 + 5^0.5)/(5^0.5)
    }

    if (dist == "Rademacher"){
      # Use the Rademacher V's
      k1 <- 1
      k2 <- -1
      pkappa <- 0.5
    }

    # function for the bootstrap
    bootapply <- function(nn, n, pkappa, k1, k2, uhat, w1.temp, Seed) {
        # to make each run fully reproducible, we set the seed
        seed.run <- Seed[nn, ]
        set.seed(seed.run, "L'Ecuyer-CMRG")
        v <- stats::rbinom(n, 1, pkappa)
        v <- ifelse(v == 1, k1, k2)
        # Bootstrapped emprirical process
        Rwb <- colSums(uhat * v * w1.temp)/n
        # KS test
        ksb <- sqrt(n) * max(abs(Rwb))
        # Cramer-von Mises test
        cvmb <- sum(Rwb^2)
        # Return both tests
        return(cbind(ksb, cvmb))
    }
    #-----------------------------------------------------------------------------
    # Define seeds: Guarantee reproducibility
    ss <- floor(stats::runif(1) * 10000)
    seed.temp <- gather.ps(nboot, seed = ss)

    Seed <- matrix(nrow = nboot, ncol = 6)
    for (i in 1:nboot) {
      Seed[i, ] <- seed.temp[[i]][2:7]
    }
    #-----------------------------------------------------------------------------
    # If we are going to use paralell coding, initialize the cores
    if (cores > 1) {
        cl <- parallel::makeCluster(cores)
    }
    #-----------------------------------------------------------------------------
    # Start the loop to compute the tests (this is more memory efficient)
    # we do a loop for each weight function, to avoid loss in speed
    # indicator weight
    if (w == "ind"){
        for (i in 1:l) {
          start <- min(chunk * (i - 1) + 1, n.unique)
          end <- min(chunk * i, n.unique)
          w.temp <- outer(pscore.fit, un.pscores[start:end], "<=")
          Gw <- crossprod(g, w.temp)

          beta[, start:end] <- MASS::ginv(crossprod(g)) %*% Gw
          #beta[, start:end] <- solve(gg, Gw)
          w1.temp <- (w.temp - g %*% beta[, start:end])
          Rw[start:end] <- colSums(uhat * w1.temp)/n
          # Now the bootstrapped test in the chunk
          if (cores == 1) {
              boot.chunk <- lapply(1:nboot, bootapply, n, pkappa, k1, k2,
                                   uhat, w1.temp, Seed)
          }
          if (cores > 1) {
              boot.chunk <- parallel::parLapply(cl, 1:nboot, bootapply, n,
                                                pkappa, k1, k2, uhat, w1.temp, Seed)
          }
          # Put the Bootstrap resuls in a matrix
          boot.chunk <- t(matrix(unlist(boot.chunk), 2, nboot))
          # Compute the KSb and CvMb over chunks
          if (1000 * (i - 1) + 1 <= n.unique) {
            ksb1 <- pmax(ksb1, boot.chunk[, 1])
            cvmb1 <- cvmb1 + boot.chunk[, 2]
          }
        }
    }
    #-----------------------------------------------------------------------------
    # exponential weight
    if (w == "exp"){
      for (i in 1:l) {
        start <- min(chunk * (i - 1) + 1, n.unique)
        end <- min(chunk * i, n.unique)
        w.temp <- tcrossprod(pscore.fit, un.pscores[start:end])
        w.temp <- exp(w.temp)
        Gw <- crossprod(g, w.temp)
        beta[, start:end] <- solve(gg, Gw)
        w1.temp <- (w.temp - g %*% beta[, start:end])
        Rw[start:end] <- colSums(uhat * w1.temp)/n
        # Now the bootstrapped test in the chunk
        if (cores == 1) {
          boot.chunk <- lapply(1:nboot, bootapply, n, pkappa, k1, k2,
                               uhat, w1.temp, Seed)
        }
        if (cores > 1) {
          boot.chunk <- parallel::parLapply(cl, 1:nboot, bootapply, n,
                                            pkappa, k1, k2, uhat, w1.temp, Seed)
        }
        # Put the Bootstrap resuls in a matrix
        boot.chunk <- t(matrix(unlist(boot.chunk), 2, nboot))
        # Compute the KSb and CvMb over chunks
        if (1000 * (i - 1) + 1 <= n.unique) {
          ksb1 <- pmax(ksb1, boot.chunk[, 1])
          cvmb1 <- cvmb1 + boot.chunk[, 2]
        }
      }
    }
    #-----------------------------------------------------------------------------
    # Logistic weight
    if (w == "logistic"){
      for (i in 1:l) {
        start <- min(chunk * (i - 1) + 1, n.unique)
        end <- min(chunk * i, n.unique)
        w.temp <- tcrossprod(pscore.fit, un.pscores[start:end])
        w.temp <- 1/(1+exp(1-w.temp))
        Gw <- crossprod(g, w.temp)
        beta[, start:end] <- solve(gg, Gw)
        w1.temp <- (w.temp - g %*% beta[, start:end])
        Rw[start:end] <- colSums(uhat * w1.temp)/n
        # Now the bootstrapped test in the chunk
        if (cores == 1) {
          boot.chunk <- lapply(1:nboot, bootapply, n, pkappa, k1, k2,
                               uhat, w1.temp, Seed)
        }
        if (cores > 1) {
          boot.chunk <- parallel::parLapply(cl, 1:nboot, bootapply, n,
                                            pkappa, k1, k2, uhat, w1.temp, Seed)
        }
        # Put the Bootstrap resuls in a matrix
        boot.chunk <- t(matrix(unlist(boot.chunk), 2, nboot))
        # Compute the KSb and CvMb over chunks
        if (1000 * (i - 1) + 1 <= n.unique) {
          ksb1 <- pmax(ksb1, boot.chunk[, 1])
          cvmb1 <- cvmb1 + boot.chunk[, 2]
        }
      }
    }
    #-----------------------------------------------------------------------------
    # Sine weight
    if (w == "sin"){
      for (i in 1:l) {
        start <- min(chunk * (i - 1) + 1, n.unique)
        end <- min(chunk * i, n.unique)
        w.temp <- tcrossprod(pscore.fit, un.pscores[start:end])
        w.temp <- sin(w.temp)
        Gw <- crossprod(g, w.temp)
        beta[, start:end] <- solve(gg, Gw)
        w1.temp <- (w.temp - g %*% beta[, start:end])
        Rw[start:end] <- colSums(uhat * w1.temp)/n
        # Now the bootstrapped test in the chunk
        if (cores == 1) {
          boot.chunk <- lapply(1:nboot, bootapply, n, pkappa, k1, k2,
                               uhat, w1.temp, Seed)
        }
        if (cores > 1) {
          boot.chunk <- parallel::parLapply(cl, 1:nboot, bootapply, n,
                                            pkappa, k1, k2, uhat, w1.temp, Seed)
        }
        # Put the Bootstrap resuls in a matrix
        boot.chunk <- t(matrix(unlist(boot.chunk), 2, nboot))
        # Compute the KSb and CvMb over chunks
        if (1000 * (i - 1) + 1 <= n.unique) {
          ksb1 <- pmax(ksb1, boot.chunk[, 1])
          cvmb1 <- cvmb1 + boot.chunk[, 2]
        }
      }
    }
    #-----------------------------------------------------------------------------
    # Sine and cosine weight
    if (w == "sincos"){
      for (i in 1:l) {
        start <- min(chunk * (i - 1) + 1, n.unique)
        end <- min(chunk * i, n.unique)
        w.temp <- tcrossprod(pscore.fit, un.pscores[start:end])
        w.temp <- sin(w.temp)+cos(w.temp)
        Gw <- crossprod(g, w.temp)
        beta[, start:end] <- solve(gg, Gw)
        w1.temp <- (w.temp - g %*% beta[, start:end])
        Rw[start:end] <- colSums(uhat * w1.temp)/n
        # Now the bootstrapped test in the chunk
        if (cores == 1) {
          boot.chunk <- lapply(1:nboot, bootapply, n, pkappa, k1, k2,
                               uhat, w1.temp, Seed)
        }
        if (cores > 1) {
          boot.chunk <- parallel::parLapply(cl, 1:nboot, bootapply, n,
                                            pkappa, k1, k2, uhat, w1.temp, Seed)
        }
        # Put the Bootstrap resuls in a matrix
        boot.chunk <- t(matrix(unlist(boot.chunk), 2, nboot))
        # Compute the KSb and CvMb over chunks
        if (1000 * (i - 1) + 1 <= n.unique) {
          ksb1 <- pmax(ksb1, boot.chunk[, 1])
          cvmb1 <- cvmb1 + boot.chunk[, 2]
        }
      }
    }
    #-----------------------------------------------------------------------------
    # close the clusters, if we used paralell
    if (cores > 1) {
        parallel::stopCluster(cl)
    }
    #-----------------------------------------------------------------------------
    # Compute our test statistics
    cvmtest1 <- sum(Rw^2)
    kstest1 <- sqrt(n) * max(abs(Rw))
    #-----------------------------------------------------------------------------
    # Put the bootstrap tests in a matrix
    boottest <- matrix(0, nboot, 2)
    boottest[, 1] <- ksb1
    boottest[, 2] <- cvmb1
    #-----------------------------------------------------------------------------
    # Name the Columns
    colnames(boottest) <- c("ksb", "cvmb")
    #-----------------------------------------------------------------------------
    # compute the Bootstrap P-value
    pvksb <- sum((boottest[, 1] > kstest1))/nboot
    pvcvmb <- sum((boottest[, 2] > cvmtest1))/nboot
    #---------------------------------------------------------------------
    # record the call
    call.param <- match.call()
    # Record all arguments used in the function
    argu <- mget(names(formals()),sys.frame(sys.nframe()))
    argu <- list(model = argu$model, w = argu$w, dist = argu$dist, nboot = argu$nboot )
    # Return these variables
    ret <- list(kstest = kstest1, cvmtest = cvmtest1, pvks = pvksb, pvcvm = pvcvmb,
                call.param = call.param, argu = argu)
    # Define a new class
    class(ret) <- "pstest"
    # return the list
    return(ret)
}