In egeminiani/GJRM: Generalised Joint Regression Modelling

knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  collapse = TRUE,
  comment = "#>"
)

Generate data with a non-random sample selection mechanism and exclusion restriction

library(GJRM)
set.seed(0)
n <- 2000
Sigma <- matrix(0.5, 2, 2); diag(Sigma) <- 1
u <- rMVN(n, rep(0,2), Sigma)
SigmaC <- matrix(0.5, 3, 3); diag(SigmaC) <- 1
cov <- rMVN(n, rep(0,3), SigmaC)
cov <- pnorm(cov)
bi <- round(cov[,1]); x1 <- cov[,2]; x2 <- cov[,3]
f11 <- function(x) -0.7*(4*x + 2.5*x^2 + 0.7*sin(5*x) + cos(7.5*x))
f12 <- function(x) -0.4*( -0.3 - 1.6*x + sin(5*x))
f21 <- function(x) 0.6*(exp(x) + sin(2.9*x))
ys <- 0.58 + 2.5*bi + f11(x1) + f12(x2) + u[, 1] > 0
y <- -0.68 - 1.5*bi + f21(x1) + + u[, 2] > 0
yo <- y*(ys > 0)
dataSim <- data.frame(y, ys, yo, bi, x1, x2)

Testing the hypothesis of absence of non-random sample selection...

LM.bpm(list(ys ~ bi + s(x1) + s(x2), yo ~ bi + s(x1)), dataSim, Model = "BSS")

The p-value suggests presence of sample selection, hence fit a bivariate model.

For a semiparametric sample selection bivariate probit model, the first equation MUST be the selection equation.

out <- gjrm(list(ys ~ bi + s(x1) + s(x2),
                 yo ~ bi + s(x1)),
            data = dataSim, Model = "BSS",
            margins = c("probit", "probit"))
conv.check(out)
gt.bpm(out)

Let us compare the two summary outputs. The second output produces a summary of the results obtained when selection bias is not accounted for.

summary(out)
summary(out$gam2)

Corrected predicted probability that 'yo' is equal to 1.

mb(ys, yo, Model = "BSS")
prev(out, hd.plot = TRUE)
prev(out, type = "univariate", hd.plot = TRUE)

The estimated smooth function plots are as follows. The red line is the true curve, whereas the blue line is the univariate model curve not accounting for selection bias.

x1.s   <- sort(x1[dataSim$ys>0])
f21.x1 <- f21(x1.s)[order(x1.s)]-mean(f21(x1.s))

plot(out, eq = 2, ylim = c(-1.65,0.95)); lines(x1.s, f21.x1, col="red")
plot(out$gam2, se = FALSE, col = "blue", ylim = c(-1.65,0.95),
     ylab = "", rug = FALSE)

Try a Clayton copula model...

outC <- gjrm(list(ys ~ bi + s(x1) + s(x2),
                  yo ~ bi + s(x1)),
             data = dataSim, Model = "BSS", BivD = "C0",
             margins = c("probit", "probit"))
conv.check(outC)
summary(outC)
prev(outC)

Impute using Mice

library(mice)
ys <- dataSim$ys
dataSim$yo[dataSim$ys == FALSE] <- NA
dataSim <- dataSim[, -c(1:2)]

imp <- mice(dataSim, m = 1, method = c("copulaSS", "", "", ""),
            mice.formula = outC$mice.formula, 
            margins = outC$margins, 
            BivD = outC$BivD, 
            maxit = 1)

comp.yo <- dataSim$yo
comp.yo[ys == 0] <- imp$imp$yo[[1]]
mean(comp.yo)