# R/testdata.R In ivmte: Instrumental Variables: Extrapolation by Marginal Treatment Effects

#### Documented in gendist1gendist1egendist2gendist3gendist3egendist4gendist5egendist6egendistBasicgendistCovariatesgendistMosquitogendistSplines

```#' Generate test distribution 1
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1 or 2, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are m0 ~ 0 + u and m1 ~ 1 + u. All unobservables
#' u are integrated out.
#' @param subN integer, default set to 5. This is the number of
#'     individuals possessing each value of the instrument. So the
#'     total number of observations is subN * 2.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @return a data.frame.
gendist1 <- function(subN = 5, p1 = 0.4, p2 = 0.6) {
dt <- data.frame(i  = rep(seq(1, subN), 2),
z  = rep(c(1, 2), each = subN),
p = rep(c(p1, p2), each = subN))
## Assign treatment
dt\$d <- 0
dt[dt\$z == 1 & dt\$i <= (p1 * subN), "d"] <- 1
dt[dt\$z == 2 & dt\$i <= (p2 * subN), "d"] <- 1
## Now construct Y as E[Y | X, D, Z]
## m0 = 1 + u
## m1 = 1 + u
plist <- polyparse(~ u, data = dt)
plist0 <- genGamma(plist,
lb = dt\$p,
ub = 1,
multiplier = 1 / (1 - dt\$p),
means = FALSE)
plist1 <- genGamma(plist,
lb = 0,
ub = dt\$p,
multiplier = 1 / dt\$p,
means = FALSE)
m0coef <- c(0, 6)
m1coef <- c(7, 8)
dt\$ey0 <- plist0 %*% m0coef
dt\$ey1 <- plist1 %*% m1coef
dt\$ey  <-  dt\$d * dt\$ey1 + (1 - dt\$d) * dt\$ey0
## Convert data back into data.frame
return(dt)
}

#' Generate test distribution 1 with errors
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1 or 2, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are m0 ~ 0 + u and m1 ~ 1 + u.
#'@param N integer, default set to 100. Total number of observations
#'     in the data.
#' @param subN , default set to 0.5. This is the probability the agent
#'     will have Z = 1.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @param v0.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 0}
#' @param v1.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 1}
#' @return a data.frame.
gendist1e <- function(N = 100, subN = 0.5, p1 = 0.4, p2 = 0.6, v0.sd = 0.5,
v1.sd = 0.75) {
dt <- data.frame(z = runif(N),
u = runif(N))
## First construct the instruments independently from U
dt\$z  <- as.integer(dt\$z > subN) + 1
dt\$d <- 0
dt[dt\$z == 1 & dt\$u <= p1, "d"] <- 1
dt[dt\$z == 2 & dt\$u <= p2, "d"] <- 1
## Now construct the error terms. The error terms are normally
## distributed, centered at the draw of u if D = 0, and -u if D =
## 1.
## dt\$v0 <- sapply(dt[, "u"], rnorm, n = 1, sd = v0.sd)
## dt\$v1 <- sapply(-dt[, "u"], rnorm, n = 1, sd = v1.sd)
dt\$v0 <- rnorm(n = N, mean = 0, sd = v0.sd)
dt\$v1 <- rnorm(n = N, mean = 0, sd = v1.sd)
## Now construct the Y values
## m0 = 0 + u
## m1 = 1 + u
dt[dt\$d == 0, "y"] <- 0 + 6 * dt[dt\$d == 0, "u"] + dt[dt\$d == 0, "v0"]
dt[dt\$d == 1, "y"] <- 7 + 8 * dt[dt\$d == 1, "u"] + dt[dt\$d == 1, "v1"]
## Label the types
dt[dt\$d == 1 & dt\$z == 1, "type"] <- 1
dt[dt\$d == 0 & dt\$z == 1, "type"] <- 2
dt[dt\$d == 1 & dt\$z == 2, "type"] <- 3
dt[dt\$d == 0 & dt\$z == 2, "type"] <- 4
return(dt)
}

#' Generate test distribution 2
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1, 2, or 3, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are m0 ~ 1 + u and m1 ~ 1 + u. All unobservables
#' u are integrated out.
#' @param subN integer, default set to 5. This is the number of
#'     individuals possessing each value of the instrument. So the
#'     total number of observations is subN * 2.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @param p3 the probability of treatment for those with the
#'     instrument Z = 3.
#' @return a data.frame.
gendist2 <- function(subN = 5, p1 = 0.4, p2 = 0.6, p3 = 0.8) {
dt <- data.frame(i  = rep(seq(1, subN), 3),
z  = rep(c(1, 2, 3), each = subN),
p = rep(c(p1, p2, p3), each = subN))
## Assign treatment
dt\$d <- 0
dt[dt\$z == 1 & dt\$i <= p1 * subN, "d"] <- 1
dt[dt\$z == 2 & dt\$i <= p2 * subN, "d"] <- 1
dt[dt\$z == 3 & dt\$i <= p3 * subN, "d"] <- 1
## Now construct Y as E[Y | X, D, Z]
## m0 = 1 + u
## m1 = 1 + u
plist <- polyparse(~ u, data = dt)
plist0 <- genGamma(plist,
lb = dt\$p,
ub = 1,
multiplier = 1 / (1 - dt\$p),
means = FALSE)
plist1 <- genGamma(plist,
lb = 0,
ub = dt\$p,
multiplier = 1 / dt\$p,
means = FALSE)
m0coef <- c(0, 6)
m1coef <- c(7, 8)
dt\$ey0 <- plist0 %*% m0coef
dt\$ey1 <- plist1 %*% m1coef
dt\$ey  <- dt\$d * dt\$ey1 + (1 - dt\$d) * dt\$ey0
## Convert data back into data.frame
return(dt)
}

#' Generate test distribution 3
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1 and 2, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are m0 ~ 1 and m1 ~ 1. All unobservables u are
#' integrated out.
#' @param subN integer, default set to 5. This is the number of
#'     individuals possessing each value of the instrument. So the
#'     total number of observations is subN * 2.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @return a data.frame.
gendist3 <- function(subN = 5, p1 = 0.4, p2 = 0.6) {
dt <- data.frame(i  = rep(seq(1, subN), 2),
z  = rep(c(1, 2), each = subN),
p = rep(c(p1, p2), each = subN))
## Assign treatment
dt\$d <- 0
dt[dt\$z == 1 & dt\$i <= (p1 * subN), "d"] <- 1
dt[dt\$z == 2 & dt\$i <= (p2 * subN), "d"] <- 1
## Now construct Y as E[Y | X, D, Z]
plist <- polyparse(~ 0 + 1, data = dt)
plist0 <- genGamma(plist,
lb = dt\$p,
ub = 1,
multiplier = 1 / (1 - dt\$p),
means = FALSE)
plist1 <- genGamma(plist,
lb = 0,
ub = dt\$p,
multiplier = 1 / dt\$p,
means = FALSE)
m0coef <- c(3)
m1coef <- c(9)
dt\$ey0 <- plist0 %*% m0coef
dt\$ey1 <- plist1 %*% m1coef
dt\$ey  <-  dt\$d * dt\$ey1 + (1 - dt\$d) * dt\$ey0
## Convert data back into data.frame
return(dt)
}

#' Generate test distribution 3 with errors
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1 or 2, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are m0 ~ 0 + u and m1 ~ 1 + u.
#'@param N integer, default set to 100. Total number of observations
#'     in the data.
#' @param subN , default set to 0.5. This is the probability the agent
#'     will have Z = 1.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @param v0.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 0}
#' @param v1.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 1}
#' @return a data.frame.
gendist3e <- function(N = 100, subN = 0.5, p1 = 0.4, p2 = 0.6, v0.sd = 0.5,
v1.sd = 0.75) {
dt <- data.frame(z = runif(N),
u = runif(N))
## First construct the instruments independently from U
dt\$z  <- as.integer(dt\$z > subN) + 1
dt\$d <- 0
dt[dt\$z == 1 & dt\$u <= p1, "d"] <- 1
dt[dt\$z == 2 & dt\$u <= p2, "d"] <- 1
## Now construct the error terms. The error terms are normally
## distributed, centered at the draw of u if D = 0, and -u if D =
## 1.
dt\$v0 <- sapply(dt[, "u"], rnorm, n = 1, sd = v0.sd)
dt\$v1 <- sapply(-dt[, "u"], rnorm, n = 1, sd = v1.sd)
## Now construct the Y values
## m0 = 3
## m1 = 9
dt[dt\$d == 0, "y"] <- 3 + dt[dt\$d == 0, "v0"]
dt[dt\$d == 1, "y"] <- 9 + dt[dt\$d == 1, "v1"]
## Label the types
dt[dt\$d == 1 & dt\$z == 1, "type"] <- 1
dt[dt\$d == 0 & dt\$z == 1, "type"] <- 2
dt[dt\$d == 1 & dt\$z == 2, "type"] <- 3
dt[dt\$d == 0 & dt\$z == 2, "type"] <- 4
return(dt)
}

#' Generate test distribution 4
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1, 2, and 3, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are m0 ~ 1 and m1 ~ 1. All unobservables u are
#' integrated out.
#' @param subN integer, default set to 5. This is the number of
#'     individuals possessing each value of the instrument. So the
#'     total number of observations is subN * 2.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @param p3 the probability of treatment for those with the
#'     instrument Z = 3.
#' @return a data.frame.
gendist4 <- function(subN = 5, p1 = 0.4, p2 = 0.6, p3 = 0.8) {
dt <- data.frame(i  = rep(seq(1, subN), 3),
z  = rep(c(1, 2, 3), each = subN),
p = rep(c(p1, p2, p3), each = subN))
## Assign treatment
dt\$d <- 0
dt[dt\$z == 1 & dt\$i <= (p1 * subN), "d"] <- 1
dt[dt\$z == 2 & dt\$i <= (p2 * subN), "d"] <- 1
dt[dt\$z == 3 & dt\$i <= (p3 * subN), "d"] <- 1
## Now construct Y as E[Y | X, D, Z]
plist <- polyparse(~ 0 + 1, data = dt)
plist0 <- genGamma(plist,
lb = dt\$p,
ub = 1,
multiplier = 1 / (1 - dt\$p),
means = FALSE)
plist1 <- genGamma(plist,
lb = 0,
ub = dt\$p,
multiplier = 1 / dt\$p,
means = FALSE)
m0coef <- c(3)
m1coef <- c(9)
dt\$ey0 <- plist0 %*% m0coef
dt\$ey1 <- plist1 %*% m1coef
dt\$ey  <-  dt\$d * dt\$ey1 + (1 - dt\$d) * dt\$ey0
## Convert data back into data.frame
return(dt)
}

#' Generate test distribution 5 (has errors and a covariate)
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that takes on values of 1 or 2, and the
#' distribution of the values for the binary instrument is
#' uniform. The MTRs are both of the form m ~ 1 + x + u.
#'@param N integer, default set to 100. Total number of observations
#'     in the data.
#' @param subN , default set to 0.5. This is the probability the agent
#'     will have Z = 1.
#' @param p1 the probability of treatment for those with the
#'     instrument Z = 1.
#' @param p2 the probability of treatment for those with the
#'     instrument Z = 2.
#' @param v0.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 0}
#' @param v1.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 1}
#' @return a data.frame.
gendist5e <- function(N = 100, subN = 0.5, p1 = 0.4, p2 = 0.6, v0.sd = 1,
v1.sd = 1.55) {
dt <- data.frame(z = runif(N),
x = rnorm(N, mean = 1),
u = runif(N))
## First construct the instruments independently from U
dt\$z  <- as.integer(dt\$z > subN) + 1
dt\$d <- 0
dt[dt\$z == 1 & dt\$u <= p1, "d"] <- 1
dt[dt\$z == 2 & dt\$u <= p2, "d"] <- 1
## Now construct the error terms. The error terms are normally
## distributed, centered at the draw of u if D = 0, and -u if D =
## 1.
dt\$v0 <- sapply(dt[, "u"], rnorm, n = 1, sd = v0.sd)
dt\$v1 <- sapply(-dt[, "u"], rnorm, n = 1, sd = v1.sd)
## Now construct the Y values
dt[dt\$d == 0, "y"] <- 3 + 0.4 * dt[dt\$d == 0, "x"] - 1.3 *
dt[dt\$d == 0, "u"] + dt[dt\$d == 0, "v0"]
dt[dt\$d == 1, "y"] <- 9 + 0.3 * dt[dt\$d == 1, "x"] - 0.9 *
dt[dt\$d == 1, "u"] + dt[dt\$d == 1, "v1"]
## Label the types
dt[dt\$d == 1 & dt\$z == 1, "type"] <- 1
dt[dt\$d == 0 & dt\$z == 1, "type"] <- 2
dt[dt\$d == 1 & dt\$z == 2, "type"] <- 3
dt[dt\$d == 0 & dt\$z == 2, "type"] <- 4
return(dt)
}

#' Generate test distribution 6 (has errors and a covariate)
#'
#' This function generates a data set for testing purposes. There is a
#' single instrument that is uniformly distributed over [0, 1]. The
#' MTRs are both of the form m ~ 1 + x + x:u.
#'@param N integer, default set to 100. Total number of observations
#'     in the data.
#' @param v0.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 0}
#' @param v1.sd numeric, standard deviation of error term for
#'     counterfactual \code{D = 1}
#' @return a data.frame.
gendist6e <- function(N = 100, v0.sd = 1,
v1.sd = 1.55) {
dt <- data.frame(z = runif(N),
x = rnorm(N, mean = 1),
u = runif(N))
## First construct the instruments independently from U
dt\$d <- 0
dt[dt\$u <= dt\$z, "d"] <- 1
## Now construct the error terms.
dt\$v0 <- rnorm(n = N, mean = 0, sd = v0.sd)
dt\$v1 <- rnorm(n = N, mean = 0, sd = v1.sd)
## Now construct the Y values
dt[dt\$d == 0, "y"] <- 3 + 0.4 * dt[dt\$d == 0, "x"] - 1.3 *
dt[dt\$d == 0, "x"] * dt[dt\$d == 0, "u"] + dt[dt\$d == 0, "v0"]
dt[dt\$d == 1, "y"] <- 9 + 0.3 * dt[dt\$d == 1, "x"] - 0.9 *
dt[dt\$d == 1, "x"] * dt[dt\$d == 1, "u"] + dt[dt\$d == 1, "v1"]
return(dt)
}

#' Generate mosquito data set
#'
#' This code generates the population level data in Mogstad, Santos,
#' Torgovitsky (2018), i.e. the mosquito data set used as the running
#' example.
#'
#' @return data.frame.
gendistMosquito <- function() {
set.seed(1)
subN <- 100
p1 <- 0.12
p2 <- 0.29
p3 <- 0.48
p4 <- 0.78
dt <- data.frame(i  = rep(seq(1, subN), 4),
z  = rep(c(1, 2, 3, 4), each = subN),
pz = rep(c(p1, p2, p3, p4), each = subN))
## Assign treatment
dt\$d <- 0
dt[dt\$z == 1 & dt\$i <= 12, "d"] <- 1
dt[dt\$z == 2 & dt\$i <= 29, "d"] <- 1
dt[dt\$z == 3 & dt\$i <= 48, "d"] <- 1
dt[dt\$z == 4 & dt\$i <= 78, "d"] <- 1
## Now construct Y as E[Y | X, D, Z]
plist <- polyparse(~ u + I(u ^ 2), data = dt)
plist0 <- genGamma(plist,
lb = dt\$pz,
ub = 1,
multiplier = 1 / (1 - dt\$pz),
means = FALSE)
plist1 <- genGamma(plist,
lb = 0,
ub = dt\$pz,
multiplier = 1 / dt\$pz,
means = FALSE)
m0coef <- c(0.9, -1.1, 0.3)
m1coef <- c(0.35, -0.3, -0.05)
dt\$ey0 <- plist0 %*% m0coef
dt\$ey1 <- plist1 %*% m1coef
dt\$ey  <- dt\$d * dt\$ey1 + (1 - dt\$d) * dt\$ey0
## Convert data back into data.frame
dtm <- data.frame(dt)
return(dtm)
}

#' Generate test data set with covariates
#'
#' This code generates population level data to test the estimation
#' function. This data includes covariates. The data generated will
#' have already integrated over the unobservable terms U, where U | X,
#' Z ~ Unif[0, 1].
#'
#' @return a list of two data.frame objects. One is the distribution
#'     of the simulated data, the other is the full simulated data
#'     set.
gendistCovariates <- function() {
set.seed(1)
supp_x1 <- c(0, 1)
supp_x2 <- c(1, 2, 3)
supp_z1 <- c(0, 1)
supp_z2 <- c(1, 2, 3)
dtc <- data.frame(expand.grid(supp_x1, supp_x2,
supp_z1, supp_z2))
colnames(dtc) <- c("x1", "x2", "z1", "z2")
## Generate propensity scores
g0 <- -0.5
g1 <- -0.4
g2 <- 0.2
g3 <- -1
g4 <- 0.3
dtc\$latent <- g0 + g1 * dtc\$x1 + g2 * dtc\$x2 +
g3 * dtc\$z1 + g4 * dtc\$z2
dtc\$p <- round(1 / (1 + exp(-dtc\$latent)), 2)
## Generate the counterfactual outcomes Note: the code is prepared
## such that the polynomials are all in terms of u. Thus, terms which
## have u components with the same power are all grouped together.
##
## So I use the following specifications
## m0 = 0.3 + 0.4 * x1 - 0.1 * x2 * u - 0.2 * x2 * u^2
## m1 = 0.5 + 0.2 * x1 - 0.1 * x1 * x2 - 0.02 * u +
##        0.3 * x1 * u - 0.05 * x2 * u^2
plist0 <- polyparse(~ x1 + I(x2 * u) + I(x2 * u^2),
data = dtc)
plist1 <- polyparse(~ x1 + I(x1 * x2) + u + I(x1 * u) + I(x2 * u^2),
data = dtc)
glist0 <- genGamma(plist0,
lb = dtc\$p,
ub = 1,
multiplier = 1 / (1 - dtc\$p),
means = FALSE)
glist1 <- genGamma(plist1,
lb = 0,
ub = dtc\$p,
multiplier = 1 / dtc\$p,
means = FALSE)
g0coef <- c(0.3, 0.4, -0.1, -0.2)
g1coef <- c(0.5, 0.2, -0.1, -0.02, 0.3, -0.05)
dtc\$ey0 <- glist0 %*% g0coef
dtc\$ey1 <- glist1 %*% g1coef

## Generate distribution: build the distributions marginally,
## allowing for correlations to occur, and then normalize the
## distribution.
dtc\$f <- 0
dtc[dtc\$x1 == 0, "f"] <- dtc[dtc\$x1 == 0, "f"] + 0.1
dtc[dtc\$x1 == 1, "f"] <- dtc[dtc\$x1 == 1, "f"] + 0.13
dtc[dtc\$x2 == 1, "f"] <- dtc[dtc\$x2 == 1, "f"] + 0.05
dtc[dtc\$x2 == 2, "f"] <- dtc[dtc\$x2 == 2, "f"] + 0.1
dtc[dtc\$x2 == 3, "f"] <- dtc[dtc\$x2 == 3, "f"] + 0.01
dtc[dtc\$x1 == 0 & dtc\$z1 == 1, "f"] <-
dtc[dtc\$x1 == 0 & dtc\$z1 == 1, "f"] - 0.03
dtc[dtc\$x1 == 0 & dtc\$z2 == 2, "f"] <-
dtc[dtc\$x1 == 0 & dtc\$z2 == 2, "f"] - 0.01
dtc[dtc\$x1 == 0 & dtc\$z2 == 3, "f"] <-
dtc[dtc\$x1 == 0 & dtc\$z2 == 3, "f"] - 0.02
dtc[dtc\$z1 == 1 & dtc\$z2 == 2, "f"] <-
dtc[dtc\$z1 == 1 & dtc\$z2 == 2, "f"] + 0.02
dtc[dtc\$z1 == 1 & dtc\$z2 == 3, "f"] <-
dtc[dtc\$z1 == 1 & dtc\$z2 == 3, "f"] + 0.01
dtc[dtc\$z2 == 2, "f"] <- dtc[dtc\$z2 == 2, "f"] + 0.05
dtc[dtc\$z2 == 3, "f"] <- dtc[dtc\$z2 == 3, "f"] + 0.01
dtc[dtc\$x2 == 2 & dtc\$z2 == 2, "f"] <-
dtc[dtc\$x2 == 2 & dtc\$z2 == 2, "f"] + 0.01
dtc[dtc\$x2 == 2 & dtc\$z2 == 3, "f"] <-
dtc[dtc\$x2 == 2 & dtc\$z2 == 3, "f"] + 0.01
dtc[dtc\$x2 == 3 & dtc\$z2 == 2, "f"] <-
dtc[dtc\$x2 == 3 & dtc\$z2 == 2, "f"] + 0.02
dtc[dtc\$x2 == 3 & dtc\$z2 == 3, "f"] <-
dtc[dtc\$x2 == 3 & dtc\$z2 == 3, "f"] + 0.04
dtc\$f <- dtc\$f / sum(dtc\$f)

## Check distribution (requires data.table)
## length(unique(dtc\$f))
## dtc[, sum(f), by = x1]
## dtc[, sum(f), by = x2]
## dtc[, sum(f), by = z1]
## dtc[, sum(f), by = z2]

## Since distributions were arbitrarily designed, round their
## densities, and ensure they sum to 1.
dtc\$f <- round(dtc\$f, 2)
sum(dtc\$f) ## Sum of rounded densities is 1.02. Adjust the
## densities so they sum to 1.
dtc[dtc\$x1 == 1 & dtc\$x2 == 1 & dtc\$z1 == 0 & dtc\$z2 == 1, "f"] <-
dtc[dtc\$x1 == 1 & dtc\$x2 == 1 & dtc\$z1 == 0 & dtc\$z2 == 1, "f"] - 0.01
dtc[dtc\$x1 == 1 & dtc\$x2 == 3 & dtc\$z1 == 1 & dtc\$z2 == 3, "f"] <-
dtc[dtc\$x1 == 1 & dtc\$x2 == 3 & dtc\$z1 == 1 & dtc\$z2 == 3, "f"] - 0.01
dtc\$f <- round(dtc\$f, 2)
## Expand data set according to distribution: We multiply each row
## by 100 * (100 * f). The first 100 is so that we assign the
## probability of treatment and control according to the variable
## p. The (100 * f) is so that we can get the distribution of
## covariates to be according to f.
dtc[, "multiplier"] <- 100 * 100 * dtc\$f
sum(dtc\$multiplier)
dtcf <- dtc[rep(seq_len(nrow(dtc)), dtc\$multiplier), ]

## Check if the distribution of this data set matches that of what we
## generated above (requires data.table)
## round(dtcf[, .N/nrow(dtcf), by = x1], 2) == round(dtc[, sum(f), by = x1], 2)
## round(dtcf[, .N/nrow(dtcf), by = x2], 2) == round(dtc[, sum(f), by = x2], 2)
## round(dtcf[, .N/nrow(dtcf), by = z1], 2) == round(dtc[, sum(f), by = z1], 2)
## round(dtcf[, .N/nrow(dtcf), by = z2], 2) == round(dtc[, sum(f), by = z2], 2)

## Now assign treatment
dtcf\$d <- 0
for (ix1 in supp_x1) {
for (ix2 in supp_x2) {
for (iz1 in supp_z1) {
for (iz2 in supp_z2) {
N <- nrow(dtcf[dtcf\$x1 == ix1 &
dtcf\$x2 == ix2 &
dtcf\$z1 == iz1 &
dtcf\$z2 == iz2, ])
dtcf[dtcf\$x1 == ix1 &
dtcf\$x2 == ix2 &
dtcf\$z1 == iz1 &
dtcf\$z2 == iz2, "i"] <- seq(1, N)
dtcf[dtcf\$x1 == ix1 &
dtcf\$x2 == ix2 &
dtcf\$z1 == iz1 &
dtcf\$z2 == iz2, "dcut"] <-
round(dtcf[dtcf\$x1 == ix1 &
dtcf\$x2 == ix2 &
dtcf\$z1 == iz1 &
dtcf\$z2 == iz2, "p"] * N)
}
}
}
}
dtcf[dtcf\$i <= dtcf\$dcut, "d"] <- 1

## Check if empirical distribution differs from population
## (requires data.table)
## failed <- which(dtcf[, mean(d), by = .(x1, x2, z1, z2)]\$V1 !=
##                 dtcf[, mean(p), by = .(x1, x2, z1, z2)]\$V1)
## print(failed)
## We want this list to be empty (i.e. a failure occurs when the
## empirical distribution of treatment within group differs from
## what is specified in the population).

## Assign outcomes
dtcf\$ey <- dtcf\$d * dtcf\$ey1 + (1 - dtcf\$d) * dtcf\$ey0
return(list(data.full = dtcf,
data.dist = dtc))
}

#' Generate test data set with splines
#'
#' This code generates population level data to test the
#' estimation function. This data set incorporates splines in the
#' MTRs.
#'
#' The distribution of the data is as follows
#'
#'        |     Z
#'  X/Z   |  0     1
#' _______|___________
#'     -1 | 0.1   0.1
#'        |
#'  X   0 | 0.2   0.2
#'        |
#'      1 | 0.1   0.2
#'
#' The data presented below will have already integrated over the
#' unobservable terms U, and U | X, Z ~ Unif[0, 1].
#'
#' The propensity scores are generated according to the model
#'
#' p(x, z) = 0.5 - 0.1 * x + 0.2 * z
#'
#'        |     Z
#' p(X,Z) |  0     1
#' _______|___________
#'     -1 | 0.6   0.8
#'        |
#'  X   0 | 0.5   0.7
#'        |
#'      1 | 0.4   0.6
#'
#' The lowest common multiple of the first table is 12. The lowest
#' common multiple of the second table is 84. It turns out that 840 *
#' 5 = 4200 observations is enough to generate the population data
#' set, such that each group has a whole-number of observations.
#'
#' The MTRs are defined as follows:
#'
#' y1 ~ beta0 + beta1 * x + uSpline(degree = 2,
#'                                 knots  = c(0.3, 0.6),
#'                                 intercept = FALSE)
#'
#' The coefficients (beta1, beta2), and the coefficients on the
#' splines, will be defined below.
#'
#' y0 = x : uSpline(degree = 0,
#'                  knots  = c(0.2, 0.5, 0.8),
#'                  intercept = TRUE)
#'      + uSpline(degree = 1,
#'                knots  = c(0.4),
#'                intercept = TRUE)
#'      + beta3 * I(u ^ 2)
#'
#' The coefficient beta3, and the coefficients on the splines, will be
#' defined below.
#'
#' @return a list of two data.frame objects. One is the distribution
#'     of the simulated data, the other is the full simulated data
#'     set.
gendistSplines <- function() {
set.seed(10L)
## Declare coefficients
pCoef    <- c(0.5, -0.1, 0.2) ## Propensity score coefficients
y1Coef   <- c(30, 35) ## Beta coefficients for y1
y0Coef   <- c(20) ## Beta coefficients for y0
u1s1Coef <- c(-75, 45, 75, 55) ## Coefficients for spline in y1
u0s1Coef <- c(25, 60, 45, 30) ## Coefficients for first spline in y0
u0s2Coef <- c(65, 70, 40) ## Coefficients for second spline in y0
## Generate distribution
distr <- data.frame(group = seq(1, 6),
x = rep(c(-1, 0, 1), times = 2),
z = rep(c(0, 1), each = 3),
f = c(0.1, 0.2, 0.1, 0.1, 0.3, 0.2))
distr\$p <- pCoef[1] + pCoef[2] * distr\$x + pCoef[3] * distr\$z
distr\$ey1 <- y1Coef[1] * distr\$p +
y1Coef[2] * distr\$x * distr\$p +
t(sapply(X = distr\$p,
FUN = splineInt,
lb = 0,
degree = 2,
knots = c(0.3, 0.6),
intercept = FALSE)) %*% u1s1Coef
distr\$ey0 <- distr\$x * t(sapply(X = distr\$p,
FUN = splineInt,
ub = 1,
degree = 0,
knots = c(0.2, 0.5, 0.8),
intercept = TRUE)) %*% u0s1Coef +
t(sapply(X = distr\$p,
FUN = splineInt,
ub = 1,
degree = 1,
knots = c(0.4),
intercept = TRUE)) %*% u0s2Coef +
y0Coef[1] / 3 * (1 - distr\$p ^ 3)
## Expand distribution into data set
N <- 4200
distr\$multiplier <- N * distr\$f
distr\$controls <- distr\$multiplier * (1 - distr\$p)
dtsf <- distr[rep(seq_len(nrow(distr)), distr\$multiplier), c(1:7, 9)]
rownames(dtsf) <- NULL
## Construct observed outcome
dtsf\$d <- 1
dtsf\$count <- 0
for (g in unique(distr\$group)) {
dtsf[dtsf\$group == g, "count"] <- seq(1, nrow(dtsf[dtsf\$group == g, ]))
}
dtsf[round(dtsf\$count) <= round(dtsf\$controls), "d"] <- 0
dtsf\$ey <- (1 - dtsf\$d) * dtsf\$ey0 + dtsf\$d * dtsf\$ey1
## Check if the distribution is as planned
## for (z in c(0, 1)) {
##     for (x in c(-1, 0, 1)) {
##         a <- nrow(dtsf[dtsf\$x == x & dtsf\$z == z & dtsf\$d == 1, ])
##         b <- nrow(dtsf[dtsf\$x == x & dtsf\$z == z, ])
##         cat("X:", x, ", Z:", z, ", a:", a, ", b:", b,
##             ", Prob of treat:", a / b, "\n")
##     }
## }
## Check if treatment effects are consistent
as.numeric(t(distr\$ey1 - distr\$ey0) %*% distr\$f)
round(as.numeric(t(distr\$ey1 - distr\$ey0) %*% distr\$f), 7) ==
round(mean(dtsf\$ey1 - dtsf\$ey0), 7)
mean(dtsf[dtsf\$d == 1, "ey"]) - mean(dtsf[dtsf\$d == 0, "ey"])
## Drop unnecssary columns
dtsf[, c("group", "controls", "count")] <- NULL
return(list(data.full = dtsf,
data.dist = distr))
}

#' Generate basic data set for testing
#'
#' This code generates population level data to test the estimation
#' function. This is a simpler dataset, one in which we can more
#' easily estimate a correctly specified model.  The data presented
#' below will have already integrated over the # unobservable terms
#' U, where U | X, Z ~ Unif[0, 1].
#'
#' @return a list of two data.frame objects. One is the distribution
#'     of the simulated data, the other is the full simulated data
#'     set.
gendistBasic <- function() {
set.seed(10L)
supp_x <- c(1, 2, 3)
supp_z <- c(1, 2, 3, 4)
dtb <- data.frame(expand.grid(supp_x, supp_z))
colnames(dtb) <- c("x", "z")
## Generate propensity scores
g0 <- 0.2
g1 <- -0.10
g2 <- 0.2
dtb\$p <- g0 + g1 * dtb\$x + g2 * dtb\$z
## Generate the counterfactual outcomes.
## m0 = 2 + x + 2 * u
## m1 = 6 + 5 * u^2
plist0 <- polyparse(~ x + u,
data = dtb)
plist1 <- polyparse(~ I(u^2),
data = dtb)
## Remember that you are generating E[m | D = 0] and E[m | D = 1],
## which are analogous to E[m | u > p(X, Z)] and E[m | u < p(X,
## Z)]. This is why you include those multipliers: You're
## integrating with respect to a conditional distribution.
glist0 <- genGamma(plist0,
lb = dtb\$p,
ub = 1,
multiplier = 1 / (1 - dtb\$p),
means = FALSE)
glist1 <- genGamma(plist1,
lb = 0,
ub = dtb\$p,
multiplier = 1 / dtb\$p,
means = FALSE)
g0coef <- c(2, 1, 2)
g1coef <- c(6, 5)
dtb\$ey0 <- glist0 %*% g0coef
dtb\$ey1 <- glist1 %*% g1coef
## Generate distribution and expand data set
p <- matrix(runif(12), ncol = 3)
p <- p / sum(p)
pmat <- matrix(c(0.02, 0.07, 0.02, 0.19,
0.09, 0.10, 0.17, 0.15,
0.01, 0.01, 0.16, 0.01),
ncol = 4)
dtb\$f <- c(pmat)
dtb\$multiplier <- dtb\$f * 1000

## Check treatment effect (requires data.table)
## sum(dtb\$ey1 * dtb\$f) - sum(dtb\$ey0 * dtb\$f)
## sum(dtb[dtb\$ey0 > dtb\$ey1, f])

## Assign treatment
dtbf <- dtb[rep(seq_len(nrow(dtb)), dtb\$multiplier), ]
dtbf\$d <- 0
for (ix in supp_x) {
for (iz in supp_z) {
N <- nrow(dtbf[dtbf\$x == ix &
dtbf\$z == iz, ])
dtbf[dtbf\$x == ix &
dtbf\$z == iz, "i"] <- seq(1, N)
dtbf[dtbf\$x == ix &
dtbf\$z == iz, "dcut"] <-
round(dtbf[dtbf\$x == ix &
dtbf\$z == iz, "p"] * N)
}
}
dtbf[dtbf\$i <= dtbf\$dcut, "d"] <- 1
dtbf\$dcut <- NULL

## Check if empirical distribution differs from population
## failed <- which(round(dtbf[, mean(d), by = .(x, z)]\$V1, 2) !=
##                 round(dtbf[, mean(p), by = .(x, z)]\$V1, 2))
## print(failed)

## Assign outcomes
dtbf\$ey <- dtbf\$d * dtbf\$ey1 + (1 - dtbf\$d) * dtbf\$ey0
return(list(data.full = dtbf,
data.dist = dtb))
}
```

## Try the ivmte package in your browser

Any scripts or data that you put into this service are public.

ivmte documentation built on Sept. 17, 2021, 5:06 p.m.