knitr::opts_chunk$set( warning = FALSE, message = FALSE, collapse = TRUE, comment = "#>" )
Generate data. Correlation between the two equations and covariate correlation 0.5. Sample size 2000
library(GJRM) set.seed(0) n <- 2000 rh <- 0.5 sigmau <- matrix(c(1, rh, rh, 1), 2, 2) u <- rMVN(n, rep(0,2), sigmau) sigmac <- matrix(rh, 3, 3); diag(sigmac) <- 1 cov <- rMVN(n, rep(0,3), sigmac) cov <- pnorm(cov) bi <- round(cov[,1]); x1 <- cov[,2]; x2 <- cov[,3] f11 <- function(x) -0.7*(4*x + 2.5*x^2 + 0.7*sin(5*x) + cos(7.5*x)) f12 <- function(x) -0.4*( -0.3 - 1.6*x + sin(5*x)) f21 <- function(x) 0.6*(exp(x) + sin(2.9*x)) ys <- 0.58 + 2.5*bi + f11(x1) + f12(x2) + u[, 1] > 0 y <- -0.68 - 1.5*bi + f21(x1) + u[, 2] yo <- y*(ys > 0) dataSim <- data.frame(ys, yo, bi, x1, x2)
The first equation MUST be the selection equation.
resp.check(yo[ys > 0], "N") out <- gjrm(list(ys ~ bi + x1 + x2, yo ~ bi + x1), data = dataSim, Model = "BSS", margins = c("probit", "N")) conv.check(out) post.check(out) summary(out) AIC(out) BIC(out)
out <- gjrm(list(ys ~ bi + s(x1) + s(x2), yo ~ bi + s(x1)), data = dataSim, Model = "BSS", margins = c("probit", "N")) conv.check(out) post.check(out) AIC(out)
Compare the two summary outputs. The second output produces a summary of the results obtained when only the outcome equation is fitted, i.e. selection bias is not accounted for.
summary(out) summary(out$gam2)
Estimated smooth function plots. The red line is the true curve, whereas the blue line is the naive curve not accounting for selection bias.
x1.s <- sort(x1[dataSim$ys>0]) f21.x1 <- f21(x1.s)[order(x1.s)] - mean(f21(x1.s)) plot(out, eq = 2, ylim = c(-1, 0.8)); lines(x1.s, f21.x1, col = "red") plot(out$gam2, se = FALSE, lty = 3, lwd = 2, ylim = c(-1, 0.8), ylab = "", rug = FALSE)
Impute missing values
n.m <- 10 res <- imputeSS(out, n.m) bet <- NA for(i in 1:n.m){ dataSim$yo[dataSim$ys == 0] <- res[[i]] outg <- gamlss(list(yo ~ bi + s(x1)), data = dataSim) bet[i] <- coef(outg)["bi"] print(i) } mean(bet)
Semiparametric sample selection model with association and dispersion parameters depending on covariates as well.
eq.mu.1 <- ys ~ bi + s(x1) + s(x2) eq.mu.2 <- yo ~ bi + s(x1) eq.sigma <- ~ bi eq.theta <- ~ bi + x1 fl <- list(eq.mu.1, eq.mu.2, eq.sigma, eq.theta) out <- gjrm(fl, data = dataSim, Model = "BSS", margins = c("probit", "N")) conv.check(out) post.check(out) summary(out) out$sigma # out$theta jc.probs(out, 0, 0.3, intervals = TRUE)[1:4,] outC0 <- gjrm(fl, data = dataSim, BivD = "C0", Model = "BSS", margins = c("probit", "N")) conv.check(outC0) post.check(outC0) AIC(out, outC0) BIC(out, outC0)
Impute missing values.
n.m <- 10 res <- imputeSS(outC0, n.m)
or using MICE
:
library(mice) ys <- dataSim$ys dataSim$yo[dataSim$ys == FALSE] <- NA dataSim <- dataSim[, -1] imp <- mice(dataSim, m = 1, method = c("copulaSS", "", "", ""), mice.formula = outC0$mice.formula, margins = outC0$margins, BivD = outC0$BivD, maxit = 1) comp.yo <- dataSim$yo comp.yo[ys == 0] <- imp$imp$yo[[1]] mean(comp.yo)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.