vignettes/sim-CL.R
In sandwich: Robust Covariance Matrix Estimators

## Simulation infrastructure -----------------------------------------------------------------------

## data generating process (dgp)
dgp <- function(nid = 100L, nround = 5L,
  coef = c(0, 0.85, 0.5, 0.7), rho = 0.5, xrho = 0.5,
  dist = "gaussian", type = "copula", link = NULL, ...)
{
  ## match distribution and type of correlation
  dist <- match.arg(dist, c("gaussian", "poisson", "zeroinfl", "hurdle", "betareg", "binomial(logit)", "zerotrunc"))
  type <- match.arg(type, c("copula", "copula-ar1", "ranef"))
  
  ## sample size
  n <- nid * nround

  ## experimental design variables
  d <- data.frame(
    id = rep(1L:nid, each = nround),
    round = rep(1L:nround, nid)
  )

  ## subject covariates plus random effect
  make_x <- function(corr) {
      rnorm(nid, mean = 0, sd = sqrt(xrho))[d$id] +
      rnorm(n, mean = 0, sd = sqrt(1 - xrho))
  }
  d$x1 <- make_x(corr)
  d$x2 <- rnorm(nid, mean = 0, sd = 1)[d$id]
  d$x3 <- rnorm(n, mean = 0, sd = 1)
  d$ranef <- if(type == "ranef") rnorm(nid, mean = 0, sd = sqrt(rho/(1 - rho)))[d$id] else 0

  ## draw from a normal copula
  if(type == "copula") {
    nc <- copula::normalCopula(rho, dim = nround)
    rcopula <- function(n) {
      rval <- copula::rCopula(n/nround, nc)
      as.vector(t(rval))
    }
  }

  ## draw from a normal copula with AR(1) structure
  if(type == "copula-ar1") {
    nc <- copula::normalCopula(rho, dim = nround, dispstr = "ar1")
    rcopula <- function(n) {
      rval <- copula::rCopula(n/nround, nc)
      as.vector(t(rval))
    }
  }
        
  ## response distribution and link function
  if(is.character(dist)) {
    switch(dist,
      "gaussian" = {
        dist <- if(type == "ranef") {
	  function(n, mu, ...) rnorm(n, mean = mu, ...)
	} else {
	  function(n, mu, ...) qnorm(rcopula(n), mean = mu, ...)
	}
        if(is.null(link)) link <- "identity"
      }, 
      
      "poisson" = {
        dist <- if(type == "ranef") {
	  function(n, mu, ...) rpois(n, lambda = mu)
	} else {
	  function(n, mu, ...) qpois(rcopula(n), lambda = mu)
	}
        if(is.null(link)) link <- "log"
      },
      
      "zeroinfl" = {
        dist <- if(type == "ranef") {
	  function(n, mu, ...) countreg::rzipois(n, lambda = mu, pi = 0.3)
	} else {
	  function(n, mu, ...) countreg::qzipois(rcopula(n), lambda = mu, pi = 0.3)
	}
        if(is.null(link)) link <- "log"
      },

      "hurdle" = {
        dist <- if(type == "ranef") {
	  function(n, mu, ...) countreg::rhpois(n, lambda = mu, pi = 0.7)
	} else {
	  function(n, mu, ...) countreg::qhpois(rcopula(n), lambda = mu, pi = 0.7)
	}
        if(is.null(link)) link <- "log"
      },

      "betareg" = {
        dist <- if(type == "ranef") {
	  function(n, mu, ...) rbeta(n, shape1 = mu * 10, shape2 = (1 - mu) * 10)
	} else {
	  function(n, mu, ...) qbeta(rcopula(n), shape1 = mu * 10, shape2 = (1 - mu) * 10)
	}
        if(is.null(link)) link <- "logit"
      },

      "binomial(logit)" = {
        dist <- if(type == "ranef") {
	  function(n, mu, ...) rbinom(n, size = 1, prob = mu)
	} else {
	  function(n, mu, ...) qbinom(rcopula(n), size = 1, prob = mu)
	}
        if(is.null(link)) link <- "logit"
      },

      "zerotrunc" = {
        dist <- if(type == "ranef") {
	  function(n, mu, ...) countreg::rztpois(n, lambda = mu)
	} else {
	  function(n, mu, ...) countreg::qztpois(rcopula(n), lambda = mu)
	}
        if(is.null(link)) link <- "log"
      }

  )}

  if(is.null(link)) link <- "identity"
  if(is.character(link)) link <- make.link(link)
  if(inherits(link, "link-glm")) link <- link$linkinv

  ## compute linear predictor, expectation mu, and response
  d$eta <- coef[1L] + coef[2L] * d$x1 + coef[3L] * d$x2 + coef[4L] * d$x3 + d$ranef
  d$mu <- link(d$eta)
  d$response <- dist(n, d$mu, ...)
  
  ## categorical variables
  d <- transform(d,
    id = factor(id),
    round = factor(round)
  )
  
  ## store coefficients for future reference
  names(coef) <- c("(Intercept)", "x1", "x2", "x3")
  attr(d, "coef") <- coef
  
  return(d)
}
 

## model fitting and covariances
fit <- function(data,
  formula = response ~ x1 + x2 + x3,
  dist = c("gaussian", "poisson", "zeroinfl", "hurdle", "betareg", "binomial(logit)", "zerotrunc"),
  vcov = c("standard", "basic", "HC1", "HC2", "HC3", "CL-0", "CL-1", "CL-2", "CL-3", "fixed", "random", "gee", "PL", "PC", "BS"),
  level = 0.95)
{
  ## assure formula to be in current environment
  environment(formula) <- environment(response ~ x1 + x2 + x3)
  
  ## response distributions and vcov types
  dist <- match.arg(dist)
  
  ## pooled model
  m <- switch(dist,
    "gaussian" = lm(formula, data = data),
    "poisson" = glm(formula, data = data, family = poisson),
    "zeroinfl" = countreg::zeroinfl(formula, data = data),
    "hurdle" = countreg::hurdle(formula, data = data),
    "betareg" = betareg::betareg(formula, data = data, phi = FALSE),
    "binomial(logit)" = glm(formula, data = data, family = binomial),
    "zerotrunc" = countreg::zerotrunc(formula, data = data, dist = "poisson")
  )
  ## fixed effects
  if("fixed" %in% vcov) {
    formula_fe <- update(formula, . ~ . + id)
    m_fe <- switch(dist,
      "gaussian" = lm(formula_fe, data = data),
      "poisson" = glm(formula_fe, data = data, family = poisson),
      "zeroinfl" = countreg::zeroinfl(formula_fe, data = data),
      "hurdle" = countreg::hurdle(formula_fe, data = data),
      "betareg" = betareg::betareg(formula_fe, data = data, phi = FALSE),
      "binomial(logit)" = glm(formula_fe, data, family = binomial),
      "zerotrunc" = countreg::zerotrunc(formula_fe, data = data, dist = "poisson") 
    )
  } else {
    m_fe <- NULL
  }
  ## random effects
  if("random" %in% vcov) {
    formula_re <- update(formula, . ~ . + (1 | id))
    m_re <- switch(dist,
      "gaussian" = lme4::lmer(formula_re, data = data, REML = FALSE),
      "poisson" = lme4::glmer(formula_re, data = data, family = poisson),
      "zeroinfl" = NULL,
      "hurdle" = NULL,
      "betareg" = NULL,
      "binomial(logit)" = lme4::glmer(formula_re, data = data, family = binomial),
      "zerotrunc" = NULL
    )
  } else {
    m_re <- NULL
  }
  ## GEE
  if("gee" %in% vcov) {
    m_gee <- switch(dist,
      "gaussian" = geepack::geeglm(formula, data = data, id = id, corstr = "exchangeable", family = gaussian),
      "poisson" = geepack::geeglm(formula, data = data, id = id, corstr = "exchangeable", family = poisson),
      "zeroinfl" = NULL,
      "hurdle" = NULL,
      "betareg" = NULL,
      "binomial(logit)" = geepack::geeglm(formula, data = data, id = id, corstr = "exchangeable", family = binomial("logit")),
      "zerotrunc" = NULL
    )
  } else {
    m_gee <- NULL
  }
    
  ## return value: collect coefficients and standard errors
  rval <- data.frame(coef = numeric(0), se = numeric(0), par = character(0),
      vcov = character(0), stringsAsFactors = FALSE)

  if("standard" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(vcov(m))), par = names(coef(m)),
      vcov = "standard", stringsAsFactors = FALSE))
  }
  if("basic" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(sandwich(m))), par = names(coef(m)),
      vcov = "basic", stringsAsFactors = FALSE))
  }
  if("HC1" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(vcovHC(m, type = "HC1"))), par = names(coef(m)),
      vcov = "HC1", stringsAsFactors = FALSE))
  }
  if("HC2" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(vcovHC(m, type = "HC2"))), par = names(coef(m)),
      vcov = "HC2", stringsAsFactors = FALSE))
  }
  if("HC3" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(vcovHC(m, type = "HC3"))), par = names(coef(m)),
      vcov = "HC3", stringsAsFactors = FALSE))
  }
  if("CL-0" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(vcovCL(m, cluster = data$id, type = "HC0"))), par = names(coef(m)),
      vcov = "CL-0", stringsAsFactors = FALSE))
  }
  if("CL-1" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(vcovCL(m, cluster = data$id, type = "HC1"))), par = names(coef(m)),
      vcov = "CL-1", stringsAsFactors = FALSE))
  }
  if("CL-2" %in% vcov & dist != "zeroinfl") {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(vcovCL(m, cluster = data$id, type = "HC2"))), par = names(coef(m)),
      vcov = "CL-2", stringsAsFactors = FALSE))
  }
  if("CL-3" %in% vcov & dist != "zeroinfl") {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(vcovCL(m, cluster = data$id, type = "HC3"))), par = names(coef(m)),
      vcov = "CL-3", stringsAsFactors = FALSE))
  }
  if("fixed" %in% vcov) {
    k <- length(coef(m))
    rval <- rbind(rval, data.frame(
      coef = coef(m_fe)[1L:k], se = sqrt(diag(vcov(m_fe)))[1L:k], par = names(coef(m_fe))[1L:k],
      vcov = "fixed", stringsAsFactors = FALSE))
  }
  if("random" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = fixef(m_re), se = sqrt(diag(vcov(m_re))), par = names(fixef(m_re)),
      vcov = "random", stringsAsFactors = FALSE))
  }
  if("gee" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = coef(m_gee), se = sqrt(diag(m_gee$geese$vbeta)), par = names(coef(m_gee)),
      vcov = "gee", stringsAsFactors = FALSE))
  }
  if("PL" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(vcovPL(m, cluster = data$id, lag = "NW1987", adjust = FALSE))), par = names(coef(m)),
      vcov = "PL", stringsAsFactors = FALSE))
  }
  if("PC" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(vcovPC(m, cluster = data$id, order.by = data$round))), par = names(coef(m)),
      vcov = "PC", stringsAsFactors = FALSE))
  }
  if("BS" %in% vcov) {
    rval <- rbind(rval, data.frame(
      coef = coef(m), se = sqrt(diag(vcovBS(m, cluster = data$id))), par = names(coef(m)),
      vcov = "BS", stringsAsFactors = FALSE))
  }
    
  ## reorder columns
  rownames(rval) <- NULL
  rval <- rval[, c(4, 3, 1, 2)]

  ## FIXME
  rval$par <- gsub("count_", "", rval$par, fixed = TRUE)

  ## further outcomes
  cr <- qnorm((1 - level)/2, lower.tail = FALSE)
  cf <- attr(data, "coef")[rval$par]
  rval$bias <- rval$coef - cf
  rval$mad <- abs(rval$coef - cf)
  rval$power <- as.numeric(abs(rval$coef/rval$se) > cr)
  rval$coverage <- as.numeric(abs(cf - rval$coef)/rval$se < cr)
  return(rval)
}

## loop over simulation scenarios
sim <- function(nrep = 1000, nid = 100L, nround = 5L,
  dist = "gaussian", rho = 0.5, xrho = 0.5,
  coef = c(0, 0.85, 0.5, 0.7), formula = response ~ x1 + x2 + x3,
  vcov = c("standard", "basic", "HC1", "HC2", "HC3", "CL-0", "CL-1", "CL-2", "CL-3", "fixed", "random", "gee", "PL", "PC", "BS"),
  ...,
  cores = NULL)
{
  ## parallelization support
  applyfun <- if(is.null(cores)) {
    lapply
  } else {
    function(X, FUN, ...) parallel::mclapply(X, FUN, ..., mc.cores = cores)
  }

  ## all factorial combinations of experimental conditions
  par <- expand.grid(nid = nid, nround = nround, dist = dist, rho = rho, xrho = xrho,
    stringsAsFactors = FALSE)

  ## conduct all simulations
  rval <- lapply(1L:nrow(par), function(i) {
      rvali <- applyfun(1L:nrep, function(j) {      
      d <- dgp(nid = par$nid[i], nround = par$nround[i], dist = par$dist[i],
               rho = par$rho[i], xrho = par$xrho[i], coef = coef, ...)
      ff <- formula
      try(fit(d, formula = ff, dist = par$dist[i], vcov = vcov))
    })
    rvali <- rvali[sapply(rvali, class) == "data.frame"]
    rvali[[1L]][, -(1L:2L)] <- Reduce("+", lapply(rvali, "[", , -(1:2)))/length(rvali)
    rvali <- rvali[[1L]]
    rvali$nid <- par$nid[i]
    rvali$nround <- par$nround[i]
    rvali$dist <- par$dist[i]
    rvali$rho <- par$rho[i]
    rvali$xrho <- par$xrho[i]
    return(rvali)
  })
  rval <- do.call("rbind", rval)

  ## turn all experimental condition variables into factors
  rval$dist <- factor(rval$dist)
  rval$vcov <- factor(rval$vcov)
  rval$par <- factor(rval$par)
  rval$nid <- factor(rval$nid)
  rval$nround <- factor(rval$nround)
  rval$rho <- factor(rval$rho)
  rval$xrho <- factor(rval$xrho)

  return(rval)
}


## Bootstrap for InstInnovation hurdle model -------------------------------------------------------

library("sandwich")
library("pscl")

data("InstInnovation", package = "sandwich")
h_innov <- hurdle(
  cites ~ institutions + log(capital/employment) + log(sales),
  data = InstInnovation, dist = "negbin")

suppressWarnings(RNGversion("3.5.0"))
set.seed(0)
vc_innov <- list(
  "standard" = vcov(h_innov),
  "basic" = sandwich(h_innov),
  "CL-1" = vcovCL(h_innov, cluster = InstInnovation$company),
  "boot" = vcovBS(h_innov, cluster = InstInnovation$company)
)


## Simulation study --------------------------------------------------------------------------------

library("copula")
library("lme4")
library("geepack")
library("countreg")
library("betareg")

RNGkind(kind = "L'Ecuyer-CMRG")

set.seed(1)
s01 <- sim(nrep = 10000, nid = 100, nround = 5,
           dist = "gaussian", rho = seq(0, 0.9, by = 0.1), xrho = 0.25,
           coef = c(0, 0.85, 0.5, 0.7), formula = response ~ x1 + x2 + x3,
           vcov = c("standard", "basic", "CL-0", "random", "gee", "PC", "PL", "BS"),
           type = "copula", cores = 16)

set.seed(2)
s02 <- sim(nrep = 10000, nid = 100, nround = 5,
           dist = c("gaussian", "binomial(logit)", "poisson"),
	   rho = seq(0, 0.9, by = 0.1), xrho = 0.25,
           coef = c(0, 0.85, 0, 0), formula = response ~ x1,
           vcov = c("standard", "basic", "CL-0", "random", "gee", "PC", "PL", "BS"),
           type = "copula", cores = 16)

set.seed(3)
s03 <- sim(nrep = 10000, nid = 100, nround = 5,
           dist = c("zerotrunc", "zeroinfl", "betareg"),
	   rho = seq(0, 0.9, by = 0.1), xrho = 0.25,
           coef = c(0, 0.85, 0, 0), formula = response ~ x1,
           vcov = c("standard", "basic", "CL-0"), ## BS separately below (s33)
           type = "copula", cores = 16)

set.seed(4)
s04 <- sim(nrep = 10000, nid = c(10, seq(50, 250, by = 50)), nround = 5,
           dist = c("gaussian","poisson", "binomial(logit)"),
	   rho = 0.25, xrho = 0.25,
           coef = c(0, 0.85, 0, 0), formula = response ~ x1,
           vcov = c("CL-0", "CL-1", "CL-2", "CL-3", "BS"),
           type = "copula", cores = 16)

set.seed(6)
s06 <- sim(nrep = 10000, nround = c(5, 10, 20, 50), nid = 100,
           dist = "gaussian", rho = 0.25, xrho = 0.25,
           coef = c(0, 0.85, 0.5, 0.7), formula = response ~ x1 + x2 + x3,
           vcov = c("CL-0", "PC", "PL"),
	   type = "copula", cores = 16)

set.seed(7)
s07 <- sim(nrep = 10000, nround = c(5, 10, 20, 50), nid = 100,
           dist = "gaussian", rho = 0.25, xrho = 0.25,
           coef = c(0, 0.85, 0.5, 0.7), formula = response ~ x1 + x2 + x3,
           vcov = c("CL-0", "PC", "PL"),
	   type = "copula-ar1", cores = 16)

set.seed(8)
s08 <- sim(nrep = 10000, nround = c(5, 10, 20, 50), nid = 100,
           dist = c("binomial(logit)", "poisson"), rho = 0.25, xrho = 0.25,
           coef = c(0, 0.85, 0.5, 0.7), formula = response ~ x1 + x2 + x3,
           vcov = c("CL-0", "PC", "PL"),
	   type = "copula-ar1", cores = 16)

set.seed(33)
s33 <- sim(nrep = 10000, nid = 100, nround = 5,
           dist = c("zerotrunc", "zeroinfl", "betareg"),
	   rho = seq(0, 0.9, by = 0.1), xrho = 0.25,
           coef = c(0, 0.85, 0, 0), formula = response ~ x1,
           vcov = "BS",
           type = "copula", cores = 16)


s06$copula <- factor(rep.int("copula",     nrow(s06)), levels = c("copula-ar1", "copula"), labels = c("AR(1)", "Exchangeable"))
s07$copula <- factor(rep.int("copula-ar1", nrow(s07)), levels = c("copula-ar1", "copula"), labels = c("AR(1)", "Exchangeable"))
s0607 <- rbind(s06, s07)

s03$vcov <- as.character(s03$vcov)
s33$vcov <- as.character(s33$vcov)
s33 <- rbind(s03, s33)
s33$vcov <- factor(s33$vcov)

save(s01, s02, s03, s04, s06, s07, s0607, s08, vc_innov, s33, file = "sim-CL.rda")

## -------------------------------------------------------------------------------------------------