In opmc2/dHM2013R:

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

# library(dHM2013R)
load_all()

In this vignette I replicate the simulation exercise from @dhaultfoeuille_inference_2013 to test my procedure for implementing their estimation strategy.

Simulating the data

The model they simulate is as follows: $$ Y_{0i} = x_{2i}\beta_{02} + x_{3i} \beta_{03} + \eta_{0i} + \nu_{0i} \ Y_{1i} = x_{1i}\beta_{11} + x_{3i} \beta_{13} + \eta_{1i} + \nu_{1i} \ D_i = \mathbb{1}{-\delta_0 + x_{1i}(\beta_{11} - \gamma_{01}) + x_{2i}(-\beta_{02} - \gamma_{02}) + x_{3i}(\beta_{13} - \beta_{03} - \gamma_{03}) + \eta_{1i} -\eta_{0i} > 0 }. $$

The true values of the parameters are $\beta_{02} = \beta_{03} = 1, \, \beta_{11} = 2, \, \beta_{13} = 0.5, \, \gamma_{01} = -0.5, \, \gamma_{02} = 0.5, \, \gamma_{03} = -0.8, \, \delta = 0.8$, so that the instruments / exclusion restrictions are $\beta_{01} = 0$ and $\beta_{12} = 0$. $x_1, x_2 \overset{iid}{\sim} U(0,4)$, $x_3 \sim Bernoulli(0.5)$. The unobserved heterogeneity terms are jointly normal, $(\eta_0, \eta_1)' \sim N(\mu, \Sigma)$, with $\mu = (0,1)'$ and $\Sigma_{11} = \Sigma_{22} = 1, \, \Sigma_{12} = \Sigma_{21} = 0.5$. The other error terms are from a heteroskedastic normal distribution, with zero mean and conditional matrix variance: $$ \Omega(x) = \begin{pmatrix} \exp{\frac{x_2}{5}} & 0.5\sqrt{\exp{\left(\frac{x_2}{5} + \frac{x_1}{5}\right)}} \ 0.5\sqrt{\exp{\left(\frac{x_2}{5} + \frac{x_1}{5}\right)}} & \exp{\frac{x_1}{5}} \end{pmatrix} $$

Run estimation using function `progDHM2013()`

listDt <- list()
res <- list()
for (i in 1:500) {
  listDt[[i]] <- simulateData()
  res[[i]] <- progDHM2013(
    dt = listDt[[i]], 
    varList = paste0("x", 1:3), 
    ivY1 = "x2", ivY0 = "x1"
  )
}

Results

Parameter estimates

for (i in 1:length(res)) {
  res[[i]][, replication := i]
}

dtRes <- rbindlist(res)

dtRes <- melt(
  dtRes, 
  id.vars = c("varName", "replication"),
  variable.name = "coefficient"
)

dtLabels <- dtRes[, .(
  maxValue = max(value), meanValue = mean(value), medianValue = median(value)
), by = .(varName, coefficient)]

library(ggplot2)
library(ggrepel)

ggplot(data = dtRes, 
       mapping = aes(x = varName, y = value, colour = coefficient)) +
  geom_boxplot() + 
  # geom_label(data = dtLabels[coefficient == "zeta0"], 
  #           aes(x = varName, y = maxValue, label = coefficient, colour = coefficient),
  #           position = position_nudge(x = -.3, y = .15)) +
  # geom_label(data = dtLabels[coefficient == "beta0"], 
  #           aes(x = varName, y = maxValue, label = coefficient, colour = coefficient),
  #           position = position_nudge(x = -.15, y = .15)) +
  # geom_label(data = dtLabels[coefficient == "beta1"], 
  #           aes(x = varName, y = maxValue, label = coefficient, colour = coefficient),
  #           position = position_nudge(x = .15, y = .15)) +
  # geom_label(data = dtLabels[coefficient == "gamma0"], 
  #           aes(x = varName, y = maxValue, label = coefficient, colour = coefficient),
  #           position = position_nudge(x = .3, y = .15)) +
  scale_x_discrete(name = "Variable") +
  theme_classic() +
  guides(colour = "none")

Distribution of non-pecuniary factors

# dtSim <- simulateData()
# resSim <- progDHM2013(
#   dt = dtSim, 
#   varList = paste0("x", 1:3), 
#   ivY1 = "x2", ivY0 = "x1"
# )

gamma0 <- resSim$dtEstimates[, gamma0]
names(gamma0) <- resSim$dtEstimates[, varName]
gamma0 <- gamma0[colnames(resSim$x[, 2:(length(gamma0)+1)])]
delta0 <- resSim$dtEstimates[, unique(delta0)]

# calculate np "value"
npValue <- colSums(t(resSim$x) * c(delta0, gamma0))

# plot density
plot(density(-npValue))
d <- dtSim[, d]

# plot density including d
npDensity <- density(-npValue)
plot(npDensity$x, npDensity$y*2.2, type = "l")
points(-npValue, d)

ex ante wage returns

library(np)

alpha0 <- resSim$dtEstimates[, unique(alpha0)]
delta0 <- resSim$dtEstimates[, unique(delta0)]

zeta0 <- resSim$dtEstimates[, zeta0]
names(zeta0) <- resSim$dtEstimates[, varName]
zeta0 <- zeta0[colnames(resSim$x[, 2:(length(zeta0)+1)])]

d <- as.numeric(dtSim[, d])


DeltaIndex <- colSums(t(resSim$x) * c(delta0, zeta0))
negDeltaIndex <- -DeltaIndex

DeltaModel <- np::npreg(1-d ~ DeltaIndex)

plot(DeltaModel)

beta0 <- resSim$dtEstimates[, beta0]
names(beta0) <- resSim$dtEstimates[, varName]
beta0 <- beta0[colnames(resSim$x[, 2:(length(beta0)+1)])]
beta1 <- resSim$dtEstimates[, beta1]
names(beta1) <- resSim$dtEstimates[, varName]
beta1 <- beta1[colnames(resSim$x[, 2:(length(beta1)+1)])]

Tx <- colSums(t(resSim$x[, 2:(length(zeta0)+1)]) * (beta0 - beta1))
Gx <- DeltaIndex - Tx

Fdelta <- npreg(txdat = DeltaIndex, tydat = 1-d, exdat = rep(Gx, each = length(Tx))+Tx)

Fd <- colSums(matrix(Fdelta$mean, nrow = length(Tx))) / length(Tx)

plot(x = -(Gx+Tx), y = Fd)
plot(deltaModel)