In egeminiani/GJRM: Generalised Joint Regression Modelling

knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  collapse = TRUE,
  comment = "#>"
)

Joint models with binary and continuous margins with sample selection.

Generate data. Correlation between the two equations and covariate correlation 0.5. Sample size 2000

library(GJRM)

set.seed(0)

n <- 2000

rh <- 0.5      

sigmau <- matrix(c(1, rh, rh, 1), 2, 2)
u      <- rMVN(n, rep(0,2), sigmau)

sigmac <- matrix(rh, 3, 3); diag(sigmac) <- 1
cov    <- rMVN(n, rep(0,3), sigmac)
cov    <- pnorm(cov)

bi <- round(cov[,1]); x1 <- cov[,2]; x2 <- cov[,3]

f11 <- function(x) -0.7*(4*x + 2.5*x^2 + 0.7*sin(5*x) + cos(7.5*x))
f12 <- function(x) -0.4*( -0.3 - 1.6*x + sin(5*x))  
f21 <- function(x) 0.6*(exp(x) + sin(2.9*x)) 

ys <-  0.58 + 2.5*bi + f11(x1) + f12(x2) + u[, 1] > 0
y  <- -0.68 - 1.5*bi + f21(x1) +           u[, 2]
yo <- y*(ys > 0)

dataSim <- data.frame(ys, yo, bi, x1, x2)

Classic sample selection model

The first equation MUST be the selection equation.

resp.check(yo[ys > 0], "N")

out <- gjrm(list(ys ~ bi + x1 + x2, 
                 yo ~ bi + x1), 
                 data = dataSim, Model = "BSS",
                 margins = c("probit", "N"))
conv.check(out)
post.check(out)
summary(out)

AIC(out)
BIC(out)

Semiparametric sample selection model

out <- gjrm(list(ys ~ bi + s(x1) + s(x2), 
                 yo ~ bi + s(x1)), 
                 data = dataSim, Model = "BSS",
                 margins = c("probit", "N"))
conv.check(out) 
post.check(out)
AIC(out)

Compare the two summary outputs. The second output produces a summary of the results obtained when only the outcome equation is fitted, i.e. selection bias is not accounted for.

summary(out)
summary(out$gam2)

Estimated smooth function plots. The red line is the true curve, whereas the blue line is the naive curve not accounting for selection bias.

x1.s   <- sort(x1[dataSim$ys>0])
f21.x1 <- f21(x1.s)[order(x1.s)] - mean(f21(x1.s))

plot(out, eq = 2, ylim = c(-1, 0.8)); lines(x1.s, f21.x1, col = "red")
plot(out$gam2, se = FALSE, lty = 3, lwd = 2, ylim = c(-1, 0.8), 
     ylab = "", rug = FALSE)

Impute missing values

n.m <- 10
res <- imputeSS(out, n.m)
bet <- NA

for(i in 1:n.m){

  dataSim$yo[dataSim$ys == 0] <- res[[i]]
  outg <- gamlss(list(yo ~ bi + s(x1)), data = dataSim)
  bet[i] <- coef(outg)["bi"]
  print(i)
}

mean(bet)

Semiparametric sample selection model with association and dispersion parameters depending on covariates as well.

eq.mu.1   <- ys ~ bi + s(x1) + s(x2)
eq.mu.2   <- yo ~ bi + s(x1)
eq.sigma  <-    ~ bi
eq.theta  <-    ~ bi + x1

fl <- list(eq.mu.1, eq.mu.2, eq.sigma, eq.theta)

out <- gjrm(fl, data = dataSim, Model = "BSS",
                 margins = c("probit", "N"))
conv.check(out)   
post.check(out)
summary(out)
out$sigma
# out$theta

jc.probs(out, 0, 0.3, intervals = TRUE)[1:4,]

outC0 <- gjrm(fl, data = dataSim, BivD = "C0", Model = "BSS",
              margins = c("probit", "N"))
conv.check(outC0)
post.check(outC0)
AIC(out, outC0)
BIC(out, outC0)

Impute missing values.

n.m <- 10
res <- imputeSS(outC0, n.m)

or using MICE:

library(mice)

ys <- dataSim$ys

dataSim$yo[dataSim$ys == FALSE] <- NA  
dataSim <- dataSim[, -1]

imp <- mice(dataSim, m = 1, method = c("copulaSS", "", "", ""),  
            mice.formula = outC0$mice.formula, margins = outC0$margins, 
            BivD = outC0$BivD, maxit = 1)

comp.yo <- dataSim$yo
comp.yo[ys == 0] <- imp$imp$yo[[1]]
mean(comp.yo)