kmte: An R Package for Treatment Effects with Censored Outcomes

Documented in zcate

#' Tests for Zero Conditional Average Treatment Effects
#'
#' \emph{zcate} computes Kolmogorov-Smirnov and Cramer-von Mises type tests
#' for the null hypothesis of zero conditional average treatment effects.
#' The test is suitable for both censored and uncensored outcomes, and relies on
#' the unconfoundedness assumption. For details of the testing procedure, see
#' Sant'Anna (2016b),'Nonparametric Tests for Treatment Effect Heterogeneity with
#' Duration Outcomes'.
#'
#'@param out vector containing the outcome of interest
#'@param delta vector containing the censoring indicator (1 if observed, 0 if censored)
#'@param treat vector containing the treatment indicator (1 if treated, 0 if control)
#'@param xvector matrix (or data frame) containing the conditioning covariates
#'@param xpscore matrix (or data frame) containing the covariates (and their
#'               transformations) to be included in the propensity score estimation
#'@param b number of bootstrap draws
#'@param cores number of cores to use during the bootstrap (default is 1).
#'        If cores>1, the bootstrap is conducted using parLapply, instead
#'        of lapply type call.
#'
#'@return a list containing the Kolmogorov-Smirnov test statistic (kstest),
#'        the Cramer-von Mises test statistic (cvmtest), and their associated
#'        bootstrapped p-values, pvks and pvcvm, respectively.
#'@export
#'@importFrom stats binomial ecdf glm rbinom runif
#'@importFrom MASS ginv
#'@importFrom parallel makeCluster parLapply stopCluster
#'@importFrom harvestr gather



zcate <- function(out, delta, treat, xvector, xpscore, b, cores = 1) {
    # first, we merge all the data into a single datafile
    fulldata <- data.frame(cbind(out, delta, treat, xvector, xpscore))
    # Compute Kaplan-Meier Weigths - data is now sorted!
    fulldata <- kmweight(1, 2, fulldata)
    # Dimension of data matrix fulldata
    dim.all <- dim(fulldata)[2]
    # Next, we rename the variable in xpscore to avoid problems
    # Dimension of Xvector
    dimx <- dim(xvector)[2]
    # Where it ends
    dimxe <- 3 + dimx
    xpscore1 <- fulldata[, ((dimxe + 1):(dim.all - 1))]
    datascore <- data.frame(y = fulldata[, 3], xpscore1)
    # estimate the propensity score
    pscore <- stats::glm(y ~ ., data = datascore,
                  family = binomial("logit"), x = T)
    fulldata$pscore <- pscore$fit

    # Create id to help on ordering
    fulldata$id <- 1:length(fulldata[, 1])

    # Dimension of data matrix fulldata
    dim.all <- dim(fulldata)[2]

    # sample size
    n.total <- as.numeric(length(fulldata[, 1]))
    # subset of treated individuals
    data.treat <- subset(fulldata, fulldata[, 3] == 1)
    # subset of not-treated individuals
    data.control <- subset(fulldata, fulldata[, 3] == 0)

    # Compute Kaplan-Meier weigth for treated
    data.treat <- kmweight(1, 2, data.treat)
    n.treat <- as.numeric(length(data.treat[, 1]))
    data.treat$w <- data.treat$w * (n.treat/n.total)

    # Compute Kaplan-Meier weigth for control
    data.control <- kmweight(1, 2, data.control)
    n.control <- as.numeric(length(data.control[, 1]))
    data.control$w <- data.control$w * (n.control/n.total)

    # Let's put everything in a single data
    #correct KM weigths
    #First, the datasets
    fulldata <- data.frame(rbind(data.treat, data.control))
    # Sort wrt id
    fulldata <- fulldata[order(as.numeric(fulldata[, dim.all])), ]


    # Compute the indicators needed for the computation of the test
    indx <- outer(fulldata[, 4], fulldata[, 4], "<=")

    if (dimx > 1) {
        for (i in 5:dimxe) {
            indx <- indx * outer(fulldata[, i], fulldata[, i], "<=")
        }
    }
    ind <- indx
    rm(indx)

    # We compute the test statistic
    testdist <- (((fulldata[, 3]/fulldata[, (dim.all - 1)]) -
                    ((1 - fulldata[, 3])/(1 - fulldata[,
        (dim.all - 1)]))) * fulldata[, 1] * fulldata[, (dim.all - 2)])

    testdist <- matrix(rep(testdist, n.total), n.total)
    testdist <- testdist * ind
    testdist <- colSums(testdist)

    kstest <- (n.total^0.5) * max(abs(testdist))
    cvmtest <- sum(testdist^2)

    # Necessary ingridients for the linear represenation and
    # the bootstrap
    # Compute linear representation for these sub-samples
    linrep.treat <- lr.cate.treated(fulldata = fulldata,
                                    subdata = data.treat,
                                    dimx = dimx)
    linrep.control <- lr.cate.control(fulldata = fulldata,
                                      subdata = data.control,
                                      dimx = dimx)

    # Let's put the linear represenations in a single matrix
    linrep.tau <- rbind(linrep.treat, -linrep.control)
    # sort linrep.merged wrt id
    linrep.tau <- linrep.tau[order(as.numeric(
                  abs(linrep.tau[, 1]))), ]
    linrep.tau <- linrep.tau[, -1]

    # Remove what we won't need
    rm(linrep.treat, linrep.control)
    # Now, the estimation effect Prepare the matrix
    matest <- pscore$x
    # X'X
    mat1 <- (t(matest)) %*% matest
    mat1 <- mat1/n.total
    # (X'X)^-1
    mat1inv <- MASS::ginv(mat1)

    # Compute the terms in the 'Y'
    wt <- fulldata[, 3]/(fulldata[, (dim.all - 1)]^2)
    wc <- (1 - fulldata[, 3])/((1 - fulldata[, (dim.all - 1)])^2)

    yest <- (wt + wc) * fulldata[, (dim.all - 2)] * fulldata[, 1]
    yest <- matrix(rep(yest, n.total), n.total)
    yest <- yest * ind

    # X'Y
    xy <- (t(matest)) %*% yest

    # Beta1=(X'X)^-1 (X'Y)
    beta1 <- mat1inv %*% xy

    # Compute the 'prediction) X*Beta1
    esteff1 <- matest %*% beta1
    # Now, compute the estimation effect of the pscore
    esteff <- fulldata[, 3] - fulldata[, (dim.all - 1)]
    esteff <- matrix(rep(esteff, n.total), n.total)
    esteff <- esteff * esteff1

    # Remove what we won't need
    rm(esteff1, yest)
    # Now, we plug in everything to get the linear represenation
    #that we will bootstrap!
    taudist <- (linrep.tau - esteff)/n.total
    # taudist=(linrep.tau)/n.total Remove what I won't use
    rm(esteff)
    # Now, the bootstrap Number of bootstrap draws
    nboot <- b

    if (cores==1){
      tests <- b.km(n.total = n.total, taudist = taudist,
                    nboot = nboot, kstest = kstest, cvmtest = cvmtest)
    }
    if (cores>1){
      tests <- b.km.mult(n.total = n.total, taudist = taudist,
                         nboot = nboot, kstest = kstest,
                         cvmtest = cvmtest, cores = cores)
    }
    # remove what I do not need
    rm(taudist)

    # Return these
    list(kstest = tests$kstest, cvmtest = tests$cvmtest,
         pvks = tests$pvks, pvcvm = tests$pvcvm)
}