R/testdata.R

#' Generate test distribution 1
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1 or 2, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are m0 ~ 0 + u and m1 ~ 1 + u. All unobservables
#' u are integrated out.
#' @param subN integer, default set to 5. This is the number of
#'     individuals possessing each value of the instrument. So the
#'     total number of observations is subN * 2.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @return a data.frame.
gendist1 <- function(subN = 5, p1 = 0.4, p2 = 0.6) {
    dt <- data.frame(i  = rep(seq(1, subN), 2),
                     z  = rep(c(1, 2), each = subN),
                     p = rep(c(p1, p2), each = subN))
    ## Assign treatment
    dt$d <- 0
    dt[dt$z == 1 & dt$i <= (p1 * subN), "d"] <- 1
    dt[dt$z == 2 & dt$i <= (p2 * subN), "d"] <- 1
    ## Now construct Y as E[Y | X, D, Z]
    ## m0 = 0 + 6 * u
    ## m1 = 7 + 8 * u
    plist <- polyparse(~ u, data = dt)
    plist0 <- genGamma(plist,
                       lb = dt$p,
                       ub = 1,
                       multiplier = 1 / (1 - dt$p),
                       means = FALSE)
    plist1 <- genGamma(plist,
                       lb = 0,
                       ub = dt$p,
                       multiplier = 1 / dt$p,
                       means = FALSE)
    m0coef <- c(0, 6)
    m1coef <- c(7, 8)
    dt$ey0 <- plist0 %*% m0coef
    dt$ey1 <- plist1 %*% m1coef
    dt$ey  <-  dt$d * dt$ey1 + (1 - dt$d) * dt$ey0
    ## Return the data set
    return(dt)
}
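
## Example (illustrative sketch, not run; as an internal helper this would
## typically be called via ivmte:::gendist1 from outside the package):
## dt1 <- gendist1(subN = 10, p1 = 0.4, p2 = 0.6)
## table(dt1$z, dt1$d)  ## treated counts per z should equal p1 * subN and p2 * subN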

#' Generate test distribution 1 with errors
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1 or 2, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are m0 ~ 0 + u and m1 ~ 1 + u.
#' @param N integer, default set to 100. Total number of observations
#'     in the data.
#' @param subN numeric, default set to 0.5. This is the probability the
#'     agent will have Z = 1.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @param v0.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 0}
#' @param v1.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 1}
#' @return a data.frame.
gendist1e <- function(N = 100, subN = 0.5, p1 = 0.4, p2 = 0.6, v0.sd = 0.5,
                      v1.sd = 0.75) {
    dt <- data.frame(z = runif(N),
                     u = runif(N))
    ## First construct the instruments independently from U
    dt$z  <- as.integer(dt$z > subN) + 1
    dt$d <- 0
    dt[dt$z == 1 & dt$u <= p1, "d"] <- 1
    dt[dt$z == 2 & dt$u <= p2, "d"] <- 1
    ## Now construct the error terms. The error terms are normally
    ## distributed with mean zero; the commented lines below would
    ## instead center them at the draw of u if D = 0, and at -u if
    ## D = 1.
    ## dt$v0 <- sapply(dt[, "u"], rnorm, n = 1, sd = v0.sd)
    ## dt$v1 <- sapply(-dt[, "u"], rnorm, n = 1, sd = v1.sd)
    dt$v0 <- rnorm(n = N, mean = 0, sd = v0.sd)
    dt$v1 <- rnorm(n = N, mean = 0, sd = v1.sd)
    ## Now construct the Y values
    ## m0 = 0 + 6 * u
    ## m1 = 7 + 8 * u
    dt[dt$d == 0, "y"] <- 0 + 6 * dt[dt$d == 0, "u"] + dt[dt$d == 0, "v0"]
    dt[dt$d == 1, "y"] <- 7 + 8 * dt[dt$d == 1, "u"] + dt[dt$d == 1, "v1"]
    ## Label the types
    dt[dt$d == 1 & dt$z == 1, "type"] <- 1
    dt[dt$d == 0 & dt$z == 1, "type"] <- 2
    dt[dt$d == 1 & dt$z == 2, "type"] <- 3
    dt[dt$d == 0 & dt$z == 2, "type"] <- 4
    return(dt)
}
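
## Example (illustrative sketch, not run): simulate a noisy sample and check
## that the share treated within each instrument value is close to p1 and p2.
## set.seed(1)
## dt1e <- gendist1e(N = 1000)
## aggregate(d ~ z, data = dt1e, FUN = mean)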

#' Generate test distribution 2
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1, 2, or 3, and the
#' distribution of the values for the instrument is
#' uniform. The MTRs are m0 ~ 1 + u and m1 ~ 1 + u. All unobservables
#' u are integrated out.
#' @param subN integer, default set to 5. This is the number of
#'     individuals possessing each value of the instrument. So the
#'     total number of observations is subN * 3.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @param p3 the probability of treatment for those with the
#'     instrument Z = 3.
#' @return a data.frame.
gendist2 <- function(subN = 5, p1 = 0.4, p2 = 0.6, p3 = 0.8) {
    dt <- data.frame(i  = rep(seq(1, subN), 3),
                     z  = rep(c(1, 2, 3), each = subN),
                     p = rep(c(p1, p2, p3), each = subN))
    ## Assign treatment
    dt$d <- 0
    dt[dt$z == 1 & dt$i <= p1 * subN, "d"] <- 1
    dt[dt$z == 2 & dt$i <= p2 * subN, "d"] <- 1
    dt[dt$z == 3 & dt$i <= p3 * subN, "d"] <- 1
    ## Now construct Y as E[Y | X, D, Z]
    ## m0 = 0 + 6 * u
    ## m1 = 7 + 8 * u
    plist <- polyparse(~ u, data = dt)
    plist0 <- genGamma(plist,
                       lb = dt$p,
                       ub = 1,
                       multiplier = 1 / (1 - dt$p),
                       means = FALSE)
    plist1 <- genGamma(plist,
                       lb = 0,
                       ub = dt$p,
                       multiplier = 1 / dt$p,
                       means = FALSE)
    m0coef <- c(0, 6)
    m1coef <- c(7, 8)
    dt$ey0 <- plist0 %*% m0coef
    dt$ey1 <- plist1 %*% m1coef
    dt$ey  <- dt$d * dt$ey1 + (1 - dt$d) * dt$ey0
    ## Return the data set
    return(dt)
}


#' Generate test distribution 3
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1 and 2, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are m0 ~ 1 and m1 ~ 1. All unobservables u are
#' integrated out.
#' @param subN integer, default set to 5. This is the number of
#'     individuals possessing each value of the instrument. So the
#'     total number of observations is subN * 2.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @return a data.frame.
gendist3 <- function(subN = 5, p1 = 0.4, p2 = 0.6) {
    dt <- data.frame(i  = rep(seq(1, subN), 2),
                     z  = rep(c(1, 2), each = subN),
                     p = rep(c(p1, p2), each = subN))
    ## Assign treatment
    dt$d <- 0
    dt[dt$z == 1 & dt$i <= (p1 * subN), "d"] <- 1
    dt[dt$z == 2 & dt$i <= (p2 * subN), "d"] <- 1
    ## Now construct Y as E[Y | X, D, Z]
    plist <- polyparse(~ 0 + 1, data = dt)
    plist0 <- genGamma(plist,
                       lb = dt$p,
                       ub = 1,
                       multiplier = 1 / (1 - dt$p),
                       means = FALSE)
    plist1 <- genGamma(plist,
                       lb = 0,
                       ub = dt$p,
                       multiplier = 1 / dt$p,
                       means = FALSE)
    m0coef <- c(3)
    m1coef <- c(9)
    dt$ey0 <- plist0 %*% m0coef
    dt$ey1 <- plist1 %*% m1coef
    dt$ey  <-  dt$d * dt$ey1 + (1 - dt$d) * dt$ey0
    ## Return the data set
    return(dt)
}


#' Generate test distribution 3 with errors
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1 or 2, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are m0 ~ 1 and m1 ~ 1.
#' @param N integer, default set to 100. Total number of observations
#'     in the data.
#' @param subN numeric, default set to 0.5. This is the probability the
#'     agent will have Z = 1.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @param v0.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 0}
#' @param v1.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 1}
#' @return a data.frame.
gendist3e <- function(N = 100, subN = 0.5, p1 = 0.4, p2 = 0.6, v0.sd = 0.5,
                      v1.sd = 0.75) {
    dt <- data.frame(z = runif(N),
                     u = runif(N))
    ## First construct the instruments independently from U
    dt$z  <- as.integer(dt$z > subN) + 1
    dt$d <- 0
    dt[dt$z == 1 & dt$u <= p1, "d"] <- 1
    dt[dt$z == 2 & dt$u <= p2, "d"] <- 1
    ## Now construct the error terms. The error terms are normally
    ## distributed, centered at the draw of u if D = 0, and -u if D =
    ## 1.
    dt$v0 <- sapply(dt[, "u"], rnorm, n = 1, sd = v0.sd)
    dt$v1 <- sapply(-dt[, "u"], rnorm, n = 1, sd = v1.sd)
    ## Now construct the Y values
    ## m0 = 3
    ## m1 = 9
    dt[dt$d == 0, "y"] <- 3 + dt[dt$d == 0, "v0"]
    dt[dt$d == 1, "y"] <- 9 + dt[dt$d == 1, "v1"]
    ## Label the types
    dt[dt$d == 1 & dt$z == 1, "type"] <- 1
    dt[dt$d == 0 & dt$z == 1, "type"] <- 2
    dt[dt$d == 1 & dt$z == 2, "type"] <- 3
    dt[dt$d == 0 & dt$z == 2, "type"] <- 4
    return(dt)
}


#' Generate test distribution 4
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1, 2, and 3, and the
#' distribution of the values for the instrument is
#' uniform. The MTRs are m0 ~ 1 and m1 ~ 1. All unobservables u are
#' integrated out.
#' @param subN integer, default set to 5. This is the number of
#'     individuals possessing each value of the instrument. So the
#'     total number of observations is subN * 3.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @param p3 the probability of treatment for those with the
#'     instrument Z = 3.
#' @return a data.frame.
gendist4 <- function(subN = 5, p1 = 0.4, p2 = 0.6, p3 = 0.8) {
    dt <- data.frame(i  = rep(seq(1, subN), 3),
                     z  = rep(c(1, 2, 3), each = subN),
                     p = rep(c(p1, p2, p3), each = subN))
    ## Assign treatment
    dt$d <- 0
    dt[dt$z == 1 & dt$i <= (p1 * subN), "d"] <- 1
    dt[dt$z == 2 & dt$i <= (p2 * subN), "d"] <- 1
    dt[dt$z == 3 & dt$i <= (p3 * subN), "d"] <- 1
    ## Now construct Y as E[Y | X, D, Z]
    plist <- polyparse(~ 0 + 1, data = dt)
    plist0 <- genGamma(plist,
                       lb = dt$p,
                       ub = 1,
                       multiplier = 1 / (1 - dt$p),
                       means = FALSE)
    plist1 <- genGamma(plist,
                       lb = 0,
                       ub = dt$p,
                       multiplier = 1 / dt$p,
                       means = FALSE)
    m0coef <- c(3)
    m1coef <- c(9)
    dt$ey0 <- plist0 %*% m0coef
    dt$ey1 <- plist1 %*% m1coef
    dt$ey  <-  dt$d * dt$ey1 + (1 - dt$d) * dt$ey0
    ## Return the data set
    return(dt)
}



#' Generate test distribution 5 (has errors and a covariate)
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1 or 2, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are both of the form m ~ 1 + x + u.
#' @param N integer, default set to 100. Total number of observations
#'     in the data.
#' @param subN numeric, default set to 0.5. This is the probability the
#'     agent will have Z = 1.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @param v0.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 0}
#' @param v1.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 1}
#' @return a data.frame.
gendist5e <- function(N = 100, subN = 0.5, p1 = 0.4, p2 = 0.6, v0.sd = 1,
                      v1.sd = 1.55) {
    dt <- data.frame(z = runif(N),
                     x = rnorm(N, mean = 1),
                     u = runif(N))
    ## First construct the instruments independently from U
    dt$z  <- as.integer(dt$z > subN) + 1
    dt$d <- 0
    dt[dt$z == 1 & dt$u <= p1, "d"] <- 1
    dt[dt$z == 2 & dt$u <= p2, "d"] <- 1
    ## Now construct the error terms. The error terms are normally
    ## distributed, centered at the draw of u if D = 0, and -u if D =
    ## 1.
    dt$v0 <- sapply(dt[, "u"], rnorm, n = 1, sd = v0.sd)
    dt$v1 <- sapply(-dt[, "u"], rnorm, n = 1, sd = v1.sd)
    ## Now construct the Y values
    dt[dt$d == 0, "y"] <- 3 + 0.4 * dt[dt$d == 0, "x"] - 1.3 *
        dt[dt$d == 0, "u"] + dt[dt$d == 0, "v0"]
    dt[dt$d == 1, "y"] <- 9 + 0.3 * dt[dt$d == 1, "x"] - 0.9 *
        dt[dt$d == 1, "u"] + dt[dt$d == 1, "v1"]
    ## Label the types
    dt[dt$d == 1 & dt$z == 1, "type"] <- 1
    dt[dt$d == 0 & dt$z == 1, "type"] <- 2
    dt[dt$d == 1 & dt$z == 2, "type"] <- 3
    dt[dt$d == 0 & dt$z == 2, "type"] <- 4
    return(dt)
}
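
## Example (illustrative sketch, not run): the covariate x enters both MTRs,
## so outcome means can be compared across (z, d) cells.
## set.seed(2)
## dt5e <- gendist5e(N = 2000)
## aggregate(y ~ z + d, data = dt5e, FUN = mean)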


#' Generate test distribution 6 (has errors and a covariate)
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that is uniformly distributed over [0, 1]. The
#' MTRs are both of the form m ~ 1 + x + x:u.
#' @param N integer, default set to 100. Total number of observations
#'     in the data.
#' @param v0.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 0}
#' @param v1.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 1}
#' @return a data.frame.
gendist6e <- function(N = 100, v0.sd = 1,
                      v1.sd = 1.55) {
    dt <- data.frame(z = runif(N),
                     x = rnorm(N, mean = 1),
                     u = runif(N))
    ## Assign treatment: D = 1 when u <= z
    dt$d <- 0
    dt[dt$u <= dt$z, "d"] <- 1
    ## Now construct the error terms.
    dt$v0 <- rnorm(n = N, mean = 0, sd = v0.sd)
    dt$v1 <- rnorm(n = N, mean = 0, sd = v1.sd)
    ## Now construct the Y values
    dt[dt$d == 0, "y"] <- 3 + 0.4 * dt[dt$d == 0, "x"] - 1.3 *
        dt[dt$d == 0, "x"] * dt[dt$d == 0, "u"] + dt[dt$d == 0, "v0"]
    dt[dt$d == 1, "y"] <- 9 + 0.3 * dt[dt$d == 1, "x"] - 0.9 *
        dt[dt$d == 1, "x"] * dt[dt$d == 1, "u"] + dt[dt$d == 1, "v1"]
    return(dt)
}
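
## Example (illustrative sketch, not run): with the continuous instrument,
## P[D = 1 | Z = z] = z, which can be checked by binning z.
## set.seed(3)
## dt6e <- gendist6e(N = 5000)
## aggregate(d ~ cut(z, breaks = seq(0, 1, by = 0.25)), data = dt6e, FUN = mean)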

#' Generate mosquito data set
#'
#' This code generates the population level data in Mogstad, Santos,
#' Torgovitsky (2018), i.e. the mosquito data set used as the running
#' example.
#'
#' @return data.frame.
gendistMosquito <- function() {
    set.seed(1)
    subN <- 100
    p1 <- 0.12
    p2 <- 0.29
    p3 <- 0.48
    p4 <- 0.78
    dt <- data.frame(i  = rep(seq(1, subN), 4),
                     z  = rep(c(1, 2, 3, 4), each = subN),
                     pz = rep(c(p1, p2, p3, p4), each = subN))
    ## Assign treatment
    dt$d <- 0
    dt[dt$z == 1 & dt$i <= 12, "d"] <- 1
    dt[dt$z == 2 & dt$i <= 29, "d"] <- 1
    dt[dt$z == 3 & dt$i <= 48, "d"] <- 1
    dt[dt$z == 4 & dt$i <= 78, "d"] <- 1
    ## Now construct Y as E[Y | X, D, Z]
    plist <- polyparse(~ u + I(u ^ 2), data = dt)
    plist0 <- genGamma(plist,
                           lb = dt$pz,
                           ub = 1,
                           multiplier = 1 / (1 - dt$pz),
                           means = FALSE)
    plist1 <- genGamma(plist,
                           lb = 0,
                           ub = dt$pz,
                           multiplier = 1 / dt$pz,
                           means = FALSE)
    m0coef <- c(0.9, -1.1, 0.3)
    m1coef <- c(0.35, -0.3, -0.05)
    dt$ey0 <- plist0 %*% m0coef
    dt$ey1 <- plist1 %*% m1coef
    dt$ey  <- dt$d * dt$ey1 + (1 - dt$d) * dt$ey0
    ## Convert data back into data.frame
    dtm <- data.frame(dt)
    return(dtm)
}
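
## Example (illustrative sketch, not run): reproduce the population data for
## the running example and tabulate treatment shares by instrument value.
## dtm <- gendistMosquito()
## aggregate(d ~ z, data = dtm, FUN = mean)  ## should give 0.12, 0.29, 0.48, 0.78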

#' Generate test data set with covariates
#'
#' This code generates population level data to test the estimation
#' function. This data includes covariates. The data generated will
#' have already integrated over the unobservable terms U, where U | X,
#' Z ~ Unif[0, 1].
#'
#' @return a list of two data.frame objects. One is the distribution
#'     of the simulated data, the other is the full simulated data
#'     set.
gendistCovariates <- function() {
    set.seed(1)
    supp_x1 <- c(0, 1)
    supp_x2 <- c(1, 2, 3)
    supp_z1 <- c(0, 1)
    supp_z2 <- c(1, 2, 3)
    dtc <- data.frame(expand.grid(supp_x1, supp_x2,
                                  supp_z1, supp_z2))
    colnames(dtc) <- c("x1", "x2", "z1", "z2")
    ## Generate propensity scores
    g0 <- -0.5
    g1 <- -0.4
    g2 <- 0.2
    g3 <- -1
    g4 <- 0.3
    dtc$latent <- g0 + g1 * dtc$x1 + g2 * dtc$x2 +
        g3 * dtc$z1 + g4 * dtc$z2
    dtc$p <- round(1 / (1 + exp(-dtc$latent)), 2)
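    ## For example (an illustrative check of the formula above): at
    ## x1 = 1, x2 = 2, z1 = 0, z2 = 1 the latent index is
    ## -0.5 - 0.4 + 0.4 + 0 + 0.3 = -0.2, so
    ## p = round(1 / (1 + exp(0.2)), 2) = 0.45.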
    ## Generate the counterfactual outcomes. Note: the code is written
    ## such that the polynomials are all in terms of u, so terms whose
    ## u components have the same power are all grouped together.
    ##
    ## So I use the following specifications
    ## m0 = 0.3 + 0.4 * x1 - 0.1 * x2 * u - 0.2 * x2 * u^2
    ## m1 = 0.5 + 0.2 * x1 - 0.1 * x1 * x2 - 0.02 * u +
    ##        0.3 * x1 * u - 0.05 * x2 * u^2
    plist0 <- polyparse(~ x1 + I(x2 * u) + I(x2 * u^2),
                            data = dtc)
    plist1 <- polyparse(~ x1 + I(x1 * x2) + u + I(x1 * u) + I(x2 * u^2),
                            data = dtc)
    glist0 <- genGamma(plist0,
                           lb = dtc$p,
                           ub = 1,
                           multiplier = 1 / (1 - dtc$p),
                           means = FALSE)
    glist1 <- genGamma(plist1,
                           lb = 0,
                           ub = dtc$p,
                           multiplier = 1 / dtc$p,
                           means = FALSE)
    g0coef <- c(0.3, 0.4, -0.1, -0.2)
    g1coef <- c(0.5, 0.2, -0.1, -0.02, 0.3, -0.05)
    dtc$ey0 <- glist0 %*% g0coef
    dtc$ey1 <- glist1 %*% g1coef

    ## Generate distribution: build the distributions marginally,
    ## allowing for correlations to occur, and then normalize the
    ## distribution.
    dtc$f <- 0
    dtc[dtc$x1 == 0, "f"] <- dtc[dtc$x1 == 0, "f"] + 0.1
    dtc[dtc$x1 == 1, "f"] <- dtc[dtc$x1 == 1, "f"] + 0.13
    dtc[dtc$x2 == 1, "f"] <- dtc[dtc$x2 == 1, "f"] + 0.05
    dtc[dtc$x2 == 2, "f"] <- dtc[dtc$x2 == 2, "f"] + 0.1
    dtc[dtc$x2 == 3, "f"] <- dtc[dtc$x2 == 3, "f"] + 0.01
    dtc[dtc$x1 == 0 & dtc$z1 == 1, "f"] <-
        dtc[dtc$x1 == 0 & dtc$z1 == 1, "f"] - 0.03
    dtc[dtc$x1 == 0 & dtc$z2 == 2, "f"] <-
        dtc[dtc$x1 == 0 & dtc$z2 == 2, "f"] - 0.01
    dtc[dtc$x1 == 0 & dtc$z2 == 3, "f"] <-
        dtc[dtc$x1 == 0 & dtc$z2 == 3, "f"] - 0.02
    dtc[dtc$z1 == 1 & dtc$z2 == 2, "f"] <-
        dtc[dtc$z1 == 1 & dtc$z2 == 2, "f"] + 0.02
    dtc[dtc$z1 == 1 & dtc$z2 == 3, "f"] <-
        dtc[dtc$z1 == 1 & dtc$z2 == 3, "f"] + 0.01
    dtc[dtc$z2 == 2, "f"] <- dtc[dtc$z2 == 2, "f"] + 0.05
    dtc[dtc$z2 == 3, "f"] <- dtc[dtc$z2 == 3, "f"] + 0.01
    dtc[dtc$x2 == 2 & dtc$z2 == 2, "f"] <-
        dtc[dtc$x2 == 2 & dtc$z2 == 2, "f"] + 0.01
    dtc[dtc$x2 == 2 & dtc$z2 == 3, "f"] <-
        dtc[dtc$x2 == 2 & dtc$z2 == 3, "f"] + 0.01
    dtc[dtc$x2 == 3 & dtc$z2 == 2, "f"] <-
        dtc[dtc$x2 == 3 & dtc$z2 == 2, "f"] + 0.02
    dtc[dtc$x2 == 3 & dtc$z2 == 3, "f"] <-
        dtc[dtc$x2 == 3 & dtc$z2 == 3, "f"] + 0.04
    dtc$f <- dtc$f / sum(dtc$f)

    ## Check distribution (requires data.table)
    ## length(unique(dtc$f))
    ## dtc[, sum(f), by = x1]
    ## dtc[, sum(f), by = x2]
    ## dtc[, sum(f), by = z1]
    ## dtc[, sum(f), by = z2]

    ## Since distributions were arbitrarily designed, round their
    ## densities, and ensure they sum to 1.
    dtc$f <- round(dtc$f, 2)
    sum(dtc$f) ## Sum of rounded densities is 1.02. Adjust the
               ## densities so they sum to 1.
    dtc[dtc$x1 == 1 & dtc$x2 == 1 & dtc$z1 == 0 & dtc$z2 == 1, "f"] <-
        dtc[dtc$x1 == 1 & dtc$x2 == 1 & dtc$z1 == 0 & dtc$z2 == 1, "f"] - 0.01
    dtc[dtc$x1 == 1 & dtc$x2 == 3 & dtc$z1 == 1 & dtc$z2 == 3, "f"] <-
        dtc[dtc$x1 == 1 & dtc$x2 == 3 & dtc$z1 == 1 & dtc$z2 == 3, "f"] - 0.01
    dtc$f <- round(dtc$f, 2)
    ## Expand data set according to distribution: each row is
    ## replicated 100 * (100 * f) times. The first factor of 100
    ## allows treatment and control to be assigned within each group
    ## according to the propensity score p. The factor (100 * f)
    ## makes the distribution of covariates match f.
    dtc[, "multiplier"] <- 100 * 100 * dtc$f
    sum(dtc$multiplier)
    dtcf <- dtc[rep(seq_len(nrow(dtc)), dtc$multiplier), ]

    ## Check if the distribution of this data set matches that of what we
    ## generated above (requires data.table)
    ## round(dtcf[, .N/nrow(dtcf), by = x1], 2) == round(dtc[, sum(f), by = x1], 2)
    ## round(dtcf[, .N/nrow(dtcf), by = x2], 2) == round(dtc[, sum(f), by = x2], 2)
    ## round(dtcf[, .N/nrow(dtcf), by = z1], 2) == round(dtc[, sum(f), by = z1], 2)
    ## round(dtcf[, .N/nrow(dtcf), by = z2], 2) == round(dtc[, sum(f), by = z2], 2)

    ## Now assign treatment
    dtcf$d <- 0
    for (ix1 in supp_x1) {
        for (ix2 in supp_x2) {
            for (iz1 in supp_z1) {
                for (iz2 in supp_z2) {
                    N <- nrow(dtcf[dtcf$x1 == ix1 &
                                   dtcf$x2 == ix2 &
                                   dtcf$z1 == iz1 &
                                   dtcf$z2 == iz2, ])
                    dtcf[dtcf$x1 == ix1 &
                         dtcf$x2 == ix2 &
                         dtcf$z1 == iz1 &
                         dtcf$z2 == iz2, "i"] <- seq(1, N)
                    dtcf[dtcf$x1 == ix1 &
                         dtcf$x2 == ix2 &
                         dtcf$z1 == iz1 &
                         dtcf$z2 == iz2, "dcut"] <-
                        round(dtcf[dtcf$x1 == ix1 &
                                   dtcf$x2 == ix2 &
                                   dtcf$z1 == iz1 &
                                   dtcf$z2 == iz2, "p"] * N)
                }
            }
        }
    }
    dtcf[dtcf$i <= dtcf$dcut, "d"] <- 1

    ## Check if empirical distribution differs from population
    ## (requires data.table)
    ## failed <- which(dtcf[, mean(d), by = .(x1, x2, z1, z2)]$V1 !=
    ##                 dtcf[, mean(p), by = .(x1, x2, z1, z2)]$V1)
    ## print(failed)
    ## We want this list to be empty (i.e. a failure occurs when the
    ## empirical distribution of treatment within group differs from
    ## what is specified in the population).

    ## Assign outcomes
    dtcf$ey <- dtcf$d * dtcf$ey1 + (1 - dtcf$d) * dtcf$ey0
    return(list(data.full = dtcf,
                data.dist = dtc))
}
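
## Example (illustrative sketch, not run): the densities in data.dist should
## sum to 1, and data.full should contain 100 * 100 * f rows per cell.
## dtcList <- gendistCovariates()
## sum(dtcList$data.dist$f)  ## should be 1
## nrow(dtcList$data.full)   ## should be 10000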

#' Generate test data set with splines
#'
#' This code generates population level data to test the
#' estimation function. This data set incorporates splines in the
#' MTRs.
#'
#' The distribution of the data is as follows
#'
#'        |     Z
#'  X/Z   |  0     1
#' _______|___________
#'     -1 | 0.1   0.1
#'        |
#'  X   0 | 0.2   0.3
#'        |
#'      1 | 0.1   0.2
#'
#' The data presented below will have already integrated over the
#' unobservable terms U, and U | X, Z ~ Unif[0, 1].
#'
#' The propensity scores are generated according to the model
#'
#' p(x, z) = 0.5 - 0.1 * x + 0.2 * z
#'
#'        |     Z
#' p(X,Z) |  0     1
#' _______|___________
#'     -1 | 0.6   0.8
#'        |
#'  X   0 | 0.5   0.7
#'        |
#'      1 | 0.4   0.6
#'
#' The lowest common multiple of the first table is 12. The lowest
#' common multiple of the second table is 84. It turns out that 840 *
#' 5 = 4200 observations is enough to generate the population data
#' set, such that each group has a whole number of observations.
#'
#' The MTRs are defined as follows:
#'
#' y1 ~ beta0 + beta1 * x + uSpline(degree = 2,
#'                                 knots  = c(0.3, 0.6),
#'                                 intercept = FALSE)
#'
#' The coefficients (beta0, beta1), and the coefficients on the
#' splines, will be defined below.
#'
#' y0 = x : uSpline(degree = 0,
#'                  knots  = c(0.2, 0.5, 0.8),
#'                  intercept = TRUE)
#'      + uSpline(degree = 1,
#'                knots  = c(0.4),
#'                intercept = TRUE)
#'      + beta3 * I(u ^ 2)
#'
#' The coefficient beta3, and the coefficients on the splines, will be
#' defined below.
#'
#' @return a list of two data.frame objects. One is the distribution
#'     of the simulated data, the other is the full simulated data
#'     set.
gendistSplines <- function() {
    set.seed(10L)
    ## Declare coefficients
    pCoef    <- c(0.5, -0.1, 0.2) ## Propensity score coefficients
    y1Coef   <- c(30, 35) ## Beta coefficients for y1
    y0Coef   <- c(20) ## Beta coefficients for y0
    u1s1Coef <- c(-75, 45, 75, 55) ## Coefficients for spline in y1
    u0s1Coef <- c(25, 60, 45, 30) ## Coefficients for first spline in y0
    u0s2Coef <- c(65, 70, 40) ## Coefficients for second spline in y0
    ## Generate distribution
    distr <- data.frame(group = seq(1, 6),
                        x = rep(c(-1, 0, 1), times = 2),
                        z = rep(c(0, 1), each = 3),
                        f = c(0.1, 0.2, 0.1, 0.1, 0.3, 0.2))
    distr$p <- pCoef[1] + pCoef[2] * distr$x + pCoef[3] * distr$z
    distr$ey1 <- y1Coef[1] * distr$p +
        y1Coef[2] * distr$x * distr$p +
        t(sapply(X = distr$p,
                 FUN = splineInt,
                 lb = 0,
                 degree = 2,
                 knots = c(0.3, 0.6),
                 intercept = FALSE)) %*% u1s1Coef
    distr$ey0 <- distr$x * t(sapply(X = distr$p,
                                    FUN = splineInt,
                                    ub = 1,
                                    degree = 0,
                                    knots = c(0.2, 0.5, 0.8),
                                    intercept = TRUE)) %*% u0s1Coef +
                           t(sapply(X = distr$p,
                                    FUN = splineInt,
                                    ub = 1,
                                    degree = 1,
                                    knots = c(0.4),
                                    intercept = TRUE)) %*% u0s2Coef +
                           y0Coef[1] / 3 * (1 - distr$p ^ 3)
    ## Expand distribution into data set
    N <- 4200
    distr$multiplier <- N * distr$f
    distr$controls <- distr$multiplier * (1 - distr$p)
    dtsf <- distr[rep(seq_len(nrow(distr)), distr$multiplier), c(1:7, 9)]
    rownames(dtsf) <- NULL
    ## Construct observed outcome
    dtsf$d <- 1
    dtsf$count <- 0
    for (g in unique(distr$group)) {
        dtsf[dtsf$group == g, "count"] <- seq(1, nrow(dtsf[dtsf$group == g, ]))
    }
    dtsf[round(dtsf$count) <= round(dtsf$controls), "d"] <- 0
    dtsf$ey <- (1 - dtsf$d) * dtsf$ey0 + dtsf$d * dtsf$ey1
    ## Check if the distribution is as planned
    ## for (z in c(0, 1)) {
    ##     for (x in c(-1, 0, 1)) {
    ##         a <- nrow(dtsf[dtsf$x == x & dtsf$z == z & dtsf$d == 1, ])
    ##         b <- nrow(dtsf[dtsf$x == x & dtsf$z == z, ])
    ##         cat("X:", x, ", Z:", z, ", a:", a, ", b:", b,
    ##             ", Prob of treat:", a / b, "\n")
    ##     }
    ## }
    ## Check if treatment effects are consistent
    as.numeric(t(distr$ey1 - distr$ey0) %*% distr$f)
    round(as.numeric(t(distr$ey1 - distr$ey0) %*% distr$f), 7) ==
        round(mean(dtsf$ey1 - dtsf$ey0), 7)
    mean(dtsf[dtsf$d == 1, "ey"]) - mean(dtsf[dtsf$d == 0, "ey"])
    ## Drop unnecessary columns
    dtsf[, c("group", "controls", "count")] <- NULL
    return(list(data.full = dtsf,
                data.dist = distr))
}
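
## Example (illustrative sketch, not run): the full data set should contain
## 4200 rows, with group-level treatment shares equal to the propensity
## scores in data.dist.
## dtsList <- gendistSplines()
## nrow(dtsList$data.full)  ## should be 4200
## aggregate(d ~ x + z, data = dtsList$data.full, FUN = mean)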

#' Generate basic data set for testing
#'
#' This code generates population level data to test the estimation
#' function. This is a simpler dataset, one in which we can more
#' easily estimate a correctly specified model.  The data presented
#' below will have already integrated over the # unobservable terms
#' U, where U | X, Z ~ Unif[0, 1].
#'
#' @return a list of two data.frame objects. One is the distribution
#'     of the simulated data, the other is the full simulated data
#'     set.
gendistBasic <- function() {
    set.seed(10L)
    supp_x <- c(1, 2, 3)
    supp_z <- c(1, 2, 3, 4)
    dtb <- data.frame(expand.grid(supp_x, supp_z))
    colnames(dtb) <- c("x", "z")
    ## Generate propensity scores
    g0 <- 0.2
    g1 <- -0.10
    g2 <- 0.2
    dtb$p <- g0 + g1 * dtb$x + g2 * dtb$z
    ## Generate the counterfactual outcomes.
    ## m0 = 2 + x + 2 * u
    ## m1 = 6 + 5 * u^2
    plist0 <- polyparse(~ x + u,
                            data = dtb)
    plist1 <- polyparse(~ I(u^2),
                            data = dtb)
    ## Remember that you are generating E[m | D = 0] and E[m | D = 1],
    ## which are analogous to E[m | u > p(X, Z)] and E[m | u < p(X,
    ## Z)]. This is why you include those multipliers: You're
    ## integrating with respect to a conditional distribution.
    glist0 <- genGamma(plist0,
                           lb = dtb$p,
                           ub = 1,
                           multiplier = 1 / (1 - dtb$p),
                           means = FALSE)
    glist1 <- genGamma(plist1,
                           lb = 0,
                           ub = dtb$p,
                           multiplier = 1 / dtb$p,
                           means = FALSE)
    g0coef <- c(2, 1, 2)
    g1coef <- c(6, 5)
    dtb$ey0 <- glist0 %*% g0coef
    dtb$ey1 <- glist1 %*% g1coef
    ## Generate distribution and expand data set
    p <- matrix(runif(12), ncol = 3)
    p <- p / sum(p)
    pmat <- matrix(c(0.02, 0.07, 0.02, 0.19,
                     0.09, 0.10, 0.17, 0.15,
                     0.01, 0.01, 0.16, 0.01),
                   ncol = 4)
    dtb$f <- c(pmat)
    dtb$multiplier <- dtb$f * 1000

    ## Check treatment effect (requires data.table)
    ## sum(dtb$ey1 * dtb$f) - sum(dtb$ey0 * dtb$f)
    ## sum(dtb[dtb$ey0 > dtb$ey1, f])

    ## Assign treatment
    dtbf <- dtb[rep(seq_len(nrow(dtb)), dtb$multiplier), ]
    dtbf$d <- 0
    for (ix in supp_x) {
        for (iz in supp_z) {
            N <- nrow(dtbf[dtbf$x == ix &
                           dtbf$z == iz, ])
            dtbf[dtbf$x == ix &
                 dtbf$z == iz, "i"] <- seq(1, N)
            dtbf[dtbf$x == ix &
                 dtbf$z == iz, "dcut"] <-
                round(dtbf[dtbf$x == ix &
                           dtbf$z == iz, "p"] * N)
        }
    }
    dtbf[dtbf$i <= dtbf$dcut, "d"] <- 1
    dtbf$dcut <- NULL

    ## Check if empirical distribution differs from population
    ## failed <- which(round(dtbf[, mean(d), by = .(x, z)]$V1, 2) !=
    ##                 round(dtbf[, mean(p), by = .(x, z)]$V1, 2))
    ## print(failed)

    ## Assign outcomes
    dtbf$ey <- dtbf$d * dtbf$ey1 + (1 - dtbf$d) * dtbf$ey0
    return(list(data.full = dtbf,
                data.dist = dtb))
}
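
## Example (illustrative sketch, not run): the population ATE implied by the
## specification above is sum((ey1 - ey0) * f) over data.dist, and treatment
## shares in data.full should match the propensity scores.
## dtbList <- gendistBasic()
## with(dtbList$data.dist, sum((ey1 - ey0) * f))
## aggregate(d ~ x + z, data = dtbList$data.full, FUN = mean)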
