tests/testthat/test-lm-robust-fes.R

context("Estimator - lm_robust, fixed effects")

set.seed(43)
N <- 20
dat <- data.frame(
  Y = rnorm(N),
  Y2 = rnorm(N),
  Z = rbinom(N, 1, .5),
  X = rnorm(N),
  B = factor(rep(1:2, times = c(8, 12))),
  B2 = factor(rep(1:4, times = c(3, 3, 4, 10))),
  cl = sample(1:4, size = N, replace = T),
  w = runif(N)
)
dat$Xdup <- dat$X
dat$Bdup <- dat$B

test_that("FE matches lm_robust with dummies", {
  ## One FE, one covar
  for (se_type in se_types) {
    ro <- tidy(lm_robust(Y ~ Z + factor(B), data = dat, se_type = se_type))
    rfo <- tidy(lm_robust(Y ~ Z, fixed_effects = ~ B, data = dat, se_type = se_type))

    expect_equivalent(
      ro[ro$term %in% c("Z"), ],
      rfo[rfo$term %in% c("Z"), ]
    )
  }

  for (se_type in cr_se_types) {
    ro <- tidy(lm_robust(Y ~ Z + factor(B), clusters = cl, data = dat, se_type = se_type))
    rfo <- tidy(lm_robust(Y ~ Z, fixed_effects = ~ B, clusters = cl, data = dat, se_type = se_type))

    expect_equivalent(
      ro[ro$term %in% c("Z"), ],
      rfo[rfo$term %in% c("Z"), ]
    )
  }
})

test_that("FE matches with multiple FEs and covars", {
  ## Multiple FEs, multiple covars
  for (se_type in se_types) {
    ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(B2), data = dat, se_type = se_type))
    rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, data = dat, se_type = se_type))

    expect_equivalent(
      ro[ro$term %in% c("Z"), ],
      rfo[rfo$term %in% c("Z"), ]
    )

    # Weights
    ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(B2), data = dat, weights = w, se_type = se_type))
    rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, data = dat, weights = w, se_type = se_type))

    expect_equivalent(
      ro[ro$term %in% c("Z", "X"), ],
      rfo[rfo$term %in% c("Z", "X"), ]
    )
  }

  for (se_type in cr_se_types) {
    ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(B2), clusters = cl, data = dat, se_type = se_type))
    rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = dat, se_type = se_type))

    expect_equivalent(
      ro[ro$term %in% c("Z"), ],
      rfo[rfo$term %in% c("Z"), ]
    )

    # Weights
    if (se_type %in% c("CR2", "CR3")) {
      expect_error(
        rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = dat, weights = w, se_type = se_type)),
        "Cannot use `fixed_effects` with weighted"
      )

      # ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(B2), clusters = cl, data = dat, weights = w, se_type = "CR2"))
      #
      # expect_equivalent(
      #   ro[ro$term %in% c("Z", "X"), ],
      #   rfo[rfo$term %in% c("Z", "X"), ]
      # )
    } else {
      ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(B2), clusters = cl, data = dat, weights = w, se_type = se_type))
      rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = dat, weights = w, se_type = se_type))

      expect_equivalent(
        ro[ro$term %in% c("Z", "X"), ],
        rfo[rfo$term %in% c("Z", "X"), ]
      )
    }

  }
  ## HC3

  # Uncomment for perfect fits which reveal problems for our estimators
  # set.seed(41)
  # N <- 10
  # dat <- data.frame(
  #   Y = rnorm(N),
  #   Z = rbinom(N, 1, .5),
  #   X = rnorm(N),
  #   B = factor(rep(1:2, times = c(4, 6))),
  #   B2 = factor(rep(1:3, times = c(3, 3, 4))),
  #   cl = sample(1:4, size = N, replace = T)
  # )
  # lmo <- lm(Y ~ Z + X + factor(B) + factor(B2), data = dat)
  # summary(lmo)
  # sandwich::vcovHC(lmo, "HC3")
})

test_that("FEs work with multiple outcomes", {
  ## Multiple Outcomes
  for (se_type in se_types) {
    ro <- lm_robust(cbind(Y, Y2) ~ Z + X + factor(B) + factor(B2), data = dat, se_type = se_type)
    rfo <- lm_robust(cbind(Y, Y2) ~ Z + X, fixed_effects = ~ B + B2, data = dat, se_type = se_type)

    tro <- tidy(ro)
    trfo <- tidy(rfo)

    expect_equivalent(
      tro[tro$term %in% c("Z", "X"), ],
      trfo[trfo$term %in% c("Z", "X"), ]
    )

    expect_equivalent(
      ro$fitted.values,
      rfo$fitted.values
    )

    expect_equal(
      ro[c("r.squared", "adj.r.squared")],
      rfo[c("r.squared", "adj.r.squared")]
    )

    # Weights
    ro <- lm_robust(cbind(Y, Y2) ~ Z + X + factor(B) + factor(B2), data = dat, weights = w, se_type = se_type)
    rfo <- lm_robust(cbind(Y, Y2) ~ Z + X, fixed_effects = ~ B + B2, data = dat, weights = w, se_type = se_type)

    tro <- tidy(ro)
    trfo <- tidy(rfo)

    expect_equivalent(
      tro[tro$term %in% c("Z", "X"), ],
      trfo[trfo$term %in% c("Z", "X"), ]
    )

    expect_equivalent(
      ro$fitted.values,
      rfo$fitted.values
    )

    expect_equal(
      ro[c("r.squared", "adj.r.squared")],
      rfo[c("r.squared", "adj.r.squared")]
    )
  }

  # clusters
  for (se_type in cr_se_types) {
    ro <- lm_robust(cbind(Y, Y2) ~ Z + X + factor(B) + factor(B2), clusters = cl, data = dat, se_type = se_type)
    rfo <- lm_robust(cbind(Y, Y2) ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = dat, se_type = se_type)

    tro <- tidy(ro)
    trfo <- tidy(rfo)

    expect_equivalent(
      tro[tro$term %in% c("Z", "X"), ],
      trfo[trfo$term %in% c("Z", "X"), ]
    )

    expect_equivalent(
      ro$fitted.values,
      rfo$fitted.values
    )

    expect_equal(
      ro[c("r.squared", "adj.r.squared")],
      rfo[c("r.squared", "adj.r.squared")]
    )

    # Weights
    if (se_type %in% c("CR2", "CR3")) {
      expect_error(
        rfo <- tidy(lm_robust(cbind(Y, Y2) ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = dat, weights = w, se_type = se_type)),
        "Cannot use `fixed_effects` with weighted"
      )
    } else {
      ro <- lm_robust(cbind(Y, Y2) ~ Z + X + factor(B) + factor(B2), data = dat, clusters = cl, weights = w, se_type = se_type)
      rfo <- lm_robust(cbind(Y, Y2) ~ Z + X, fixed_effects = ~ B + B2, data = dat, clusters = cl, weights = w, se_type = se_type)

      tro <- tidy(ro)
      trfo <- tidy(rfo)

      expect_equivalent(
        tro[tro$term %in% c("Z", "X"), ],
        trfo[trfo$term %in% c("Z", "X"), ]
      )

      expect_equivalent(
        ro$fitted.values,
        rfo$fitted.values
      )

      expect_equal(
        ro[c("r.squared", "adj.r.squared")],
        rfo[c("r.squared", "adj.r.squared")]
      )
    }
  }
})


test_that("FEs work with missingness", {

  skip_if_not_installed("sandwich")
  # In outcome
  datmiss <- dat
  datmiss$Y[5] <- NA
  datmiss$B[1] <- NA

  for (se_type in se_types) {

    expected_warning <- "Some observations have missingness in the fixed_effects variable(s) but not in the outcome or covariates. These observations have been dropped."
    ro <- lm_robust(Y ~ Z + X + factor(B) + factor(B2), data = datmiss, se_type = se_type)
    expect_warning(
      rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, data = datmiss, se_type = se_type),
      expected_warning,
      fixed = TRUE
    )

    expect_equivalent(
      tidy(ro)[ro$term %in% c("Z", "X"), ],
      tidy(rfo)[rfo$term %in% c("Z", "X"), ]
    )

    expect_equal(
      ro[c("r.squared", "adj.r.squared")],
      rfo[c("r.squared", "adj.r.squared")]
    )
  }

  # Check to make sure with only one FE when missingness works
  expect_warning(
    lm_robust(Y ~ Z + X, fixed_effects = ~ B, data = datmiss, se_type = "HC2"),
    expected_warning,
    fixed = TRUE
  )

  expect_equivalent(
    tidy(ro)[ro$term %in% c("Z", "X"), ],
    tidy(rfo)[rfo$term %in% c("Z", "X"), ]
  )

  expect_equal(
    ro[c("r.squared", "adj.r.squared")],
    rfo[c("r.squared", "adj.r.squared")]
  )

  ## HC3
  ro <- lm_robust(Y ~ Z + X + factor(B) + factor(B2), data = datmiss, se_type = "HC3")
  expect_warning(
    rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, data = datmiss, se_type = "HC3"),
    expected_warning,
    fixed = TRUE
  )
  lfo <- lm(Y ~ Z + X + factor(B) + factor(B2), data = datmiss)

  expect_equivalent(
    rfo$std.error[rfo$term %in% c("Z", "X")],
    sqrt(diag(sandwich::vcovHC(lfo, type = "HC3"))[2:3])
  )

  for (se_type in cr_se_types) {
    ro <- lm_robust(Y ~ Z + X + factor(B) + factor(B2), clusters = cl, data = datmiss, se_type = se_type)
    expect_warning(
      rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ B + B2, clusters = cl, data = datmiss, se_type = se_type),
      expected_warning,
      fixed = TRUE
    )

    expect_equivalent(
      tidy(ro)[ro$term %in% c("Z", "X"), ],
      tidy(rfo)[rfo$term %in% c("Z", "X"), ]
    )

    expect_equal(
      ro[c("r.squared", "adj.r.squared")],
      rfo[c("r.squared", "adj.r.squared")]
    )

  }
})

test_that("FEs handle collinear FEs", {
  ## Collinear factors
  for (se_type in se_types) {
    ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(Bdup) + factor(B2), data = dat, se_type = se_type))
    rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + Bdup + B2, data = dat, se_type = se_type))

    expect_equivalent(
      ro$estimate[ro$term %in% c("Z", "X")],
      rfo$estimate[rfo$term %in% c("Z", "X")]
    )


    if (se_type %in% c("HC2", "HC3")) {
      # HC2 or HC3 work because we can get the collinearity in the FEs for free as we have to invert
      # UtU anyways (where U is cbind(X, FE_dummy_mat))
      expect_equivalent(
        ro[ro$term %in% c("Z", "X"), ],
        rfo[rfo$term %in% c("Z", "X"), ]
      )
    } else {
      # DoF is wrong because we count the FEs incorrectly for the finite sample correction with collinearity
      expect_false(
        all(
          ro$df[ro$term %in% c("Z", "X")] ==
            rfo$df[rfo$term %in% c("Z", "X")]
        )
      )
      if (se_type == "HC0") {
        # But std errors work here bc no DoF correction
        expect_equivalent(
          ro$std.error[ro$term %in% c("Z", "X")],
          rfo$std.error[rfo$term %in% c("Z", "X")]
        )
      } else {
        expect_false(
          all(
            ro$std.error[ro$term %in% c("Z", "X")] ==
              rfo$std.error[rfo$term %in% c("Z", "X")]
          )
        )
      }
    }
  }

  ## Collinear factors
  for (se_type in cr_se_types) {
    ro <- tidy(lm_robust(Y ~ Z + X + factor(B) + factor(Bdup) + factor(B2), clusters = cl, data = dat, se_type = se_type))
    rfo <- tidy(lm_robust(Y ~ Z + X, fixed_effects = ~ B + Bdup + B2, clusters = cl, data = dat, se_type = se_type))

    # DoF for CR0/CR3 works, unlike HC0, because our DoF for CR0/CR3 is N_clust - 1, not N - total_rank
    if (se_type %in% c("CR0", "CR2", "CR3")) {
      expect_equivalent(
        ro[ro$term %in% c("Z", "X"), ],
        rfo[rfo$term %in% c("Z", "X"), ]
      )
    } else {
      expect_equivalent(
        ro$estimate[ro$term %in% c("Z", "X")],
        rfo$estimate[rfo$term %in% c("Z", "X")]
      )
      expect_false(
        all(
          ro$std.error[ro$term %in% c("Z", "X")] ==
            rfo$std.error[rfo$term %in% c("Z", "X")]
        )
      )
    }

  }
})

test_that("FEs work with collinear covariates", {
  ## Classical

  sum(dat$X^2) * 1e-4 * 1e-8
  sum(dat$Xdup^2)
  for (se_type in se_types) {
    ro <- tidy(lm_robust(Y ~ Z + X + Xdup + factor(B) + factor(B2), data = dat, se_type = se_type))
    rfo <- tidy(lm_robust(Y ~ Z + X + Xdup, fixed_effects = ~ B + B2, data = dat, se_type = se_type))

    expect_equivalent(
      ro[ro$term %in% c("Z", "X", "Xdup"), ],
      rfo[rfo$term %in% c("Z", "X", "Xdup"), ]
    )
  }

  # Clustered methods
  for (se_type in cr_se_types) {
    ro <- tidy(lm_robust(Y ~ Z + X + Xdup + factor(B) + factor(B2), clusters = cl, data = dat, se_type = se_type))
    rfo <- tidy(lm_robust(Y ~ Z + X + Xdup, fixed_effects = ~ B + B2, clusters = cl, data = dat, se_type = se_type))

    expect_equivalent(
      ro[ro$term %in% c("Z", "X", "Xdup"), ],
      rfo[rfo$term %in% c("Z", "X", "Xdup"), ]
    )
  }
})


test_that("test matches stata absorb", {

  # write.csv(mtcars,
  #           file = 'tests/testthat/mtcars.csv',
  #           row.names = F)

  stata_ests <- read.table(
    "stata-fe-ests.txt",
    col.names = c("model", "var", "fstat"),
    stringsAsFactors = FALSE
  )

  mtcars$w <- mtcars$drat / 5

  estimatr_mat <- matrix(NA, 6, 1)

  rfo <- lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, se_type = "classical") # areg mpg hp, absorb(carb)
  estimatr_mat[1, ] <- c(rfo$std.error ^ 2)

  rfo <- tidy(lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, se_type = "HC1")) # areg mpg hp, absorb(carb) rob
  estimatr_mat[2, ] <- c(rfo$std.error ^ 2)

  rfo <- tidy(lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, clusters = cyl, se_type = "stata")) # areg mpg hp, absorb(carb) cl(cyl)
  estimatr_mat[3, ] <- c(rfo$std.error ^ 2)


  rfo <- tidy(lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, weights = w, se_type = "classical")) # areg mpg hp [aweight=w], absorb(carb)
  estimatr_mat[4, ] <- c(rfo$std.error ^ 2)

  rfo <- tidy(lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, weights = w, se_type = "HC1")) # areg mpg hp [aweight=w], absorb(carb) rob
  estimatr_mat[5, ] <- c(rfo$std.error ^ 2)

  rfo <- tidy(lm_robust(mpg ~ hp, mtcars, fixed_effects = ~ carb, weights = w, clusters = cyl, se_type = "stata")) # areg mpg hp [aweight=w], absorb(carb) cl(cyl)
  estimatr_mat[6, ] <- c(rfo$std.error ^ 2)

  expect_equal(
    estimatr_mat[, 1],
    stata_ests[, 2]
  )

})

test_that("FE matches lm_robust with one block", {

  skip_if_not_installed("sandwich")

  # In outcome
  datmiss <- dat
  datmiss$Y[5] <- NA
  datmiss$oneB <- as.factor("A")

  ## Classical
  ro <- lm_robust(Y ~ Z + X, data = datmiss, se_type = "classical")
  rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, data = datmiss, se_type = "classical")

  expect_equivalent(
    tidy(ro)[ro$term %in% c("Z", "X"), ],
    tidy(rfo)[rfo$term %in% c("Z", "X"), ]
  )

  expect_equal(
    ro[c("r.squared", "adj.r.squared")],
    rfo[c("r.squared", "adj.r.squared")]
  )

  ## HC0
  ro <- lm_robust(Y ~ Z + X, data = datmiss, se_type = "HC0")
  rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, data = datmiss, se_type = "HC0")

  expect_equivalent(
    tidy(ro)[ro$term %in% c("Z", "X"), ],
    tidy(rfo)[rfo$term %in% c("Z", "X"), ]
  )

  expect_equal(
    ro[c("r.squared", "adj.r.squared")],
    rfo[c("r.squared", "adj.r.squared")]
  )

  ## HC1
  ro <- lm_robust(Y ~ Z + X, data = datmiss, se_type = "HC1")
  rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, data = datmiss, se_type = "HC1")

  expect_equivalent(
    tidy(ro)[ro$term %in% c("Z", "X"), ],
    tidy(rfo)[rfo$term %in% c("Z", "X"), ]
  )

  expect_equal(
    ro[c("r.squared", "adj.r.squared")],
    rfo[c("r.squared", "adj.r.squared")]
  )

  ## HC2
  ro <- lm_robust(Y ~ Z + X, data = datmiss, se_type = "HC2")
  rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, data = datmiss, se_type = "HC2")

  expect_equivalent(
    tidy(ro)[ro$term %in% c("Z", "X"), ],
    tidy(rfo)[rfo$term %in% c("Z", "X"), ]
  )

  expect_equal(
    ro[c("r.squared", "adj.r.squared")],
    rfo[c("r.squared", "adj.r.squared")]
  )

  ## HC3
  ro <- lm_robust(Y ~ Z + X, data = datmiss, se_type = "HC3")
  rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, data = datmiss, se_type = "HC3")
  lfo <- lm(Y ~ Z + X, data = datmiss)

  expect_equivalent(
    tidy(ro)[ro$term %in% c("Z", "X"), ],
    tidy(rfo)[rfo$term %in% c("Z", "X"), ]
  )

  expect_equivalent(
    rfo$std.error[rfo$term %in% c("Z", "X")],
    sqrt(diag(sandwich::vcovHC(lfo, type = "HC3"))[2:3])
  )

  expect_equal(
    ro[c("r.squared", "adj.r.squared")],
    rfo[c("r.squared", "adj.r.squared")]
  )

  ## CR0
  ro <- lm_robust(Y ~ Z + X, clusters = cl, data = datmiss, se_type = "CR0")
  rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, clusters = cl, data = datmiss, se_type = "CR0")

  expect_equivalent(
    tidy(ro)[ro$term %in% c("Z", "X"), ],
    tidy(rfo)[rfo$term %in% c("Z", "X"), ]
  )

  expect_equal(
    ro[c("r.squared", "adj.r.squared")],
    rfo[c("r.squared", "adj.r.squared")]
  )

  ## CR stata
  ro <- lm_robust(Y ~ Z + X, clusters = cl, data = datmiss, se_type = "stata")
  rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, clusters = cl, data = datmiss, se_type = "stata")

  expect_equivalent(
    tidy(ro)[ro$term %in% c("Z", "X"), ],
    tidy(rfo)[rfo$term %in% c("Z", "X"), ]
  )

  expect_equal(
    ro[c("r.squared", "adj.r.squared")],
    rfo[c("r.squared", "adj.r.squared")]
  )

  ## CR2
  ro <- lm_robust(Y ~ Z + X, clusters = cl, data = datmiss, se_type = "CR2")
  rfo <- lm_robust(Y ~ Z + X, fixed_effects = ~ oneB, clusters = cl, data = datmiss, se_type = "CR2")

  expect_equivalent(
    tidy(ro)[ro$term %in% c("Z", "X"), ],
    tidy(rfo)[rfo$term %in% c("Z", "X"), ]
  )

  expect_equal(
    ro[c("r.squared", "adj.r.squared")],
    rfo[c("r.squared", "adj.r.squared")]
  )

  ## Error when combined with other blocks
  expect_error(
    lm_robust(Y ~ Z + X, fixed_effects = ~ oneB + B, data = datmiss),
    "Can't have a fixed effect with only one group AND multiple fixed effect variables"
  )
})

test_that("FEs handle collinear covariates", {

  mtcars2 <- mtcars
  mtcars2$cyl2 <- mtcars2$cyl
  for (se_type in se_types) {
    lmo <- lm_robust(mpg ~ cyl + cyl2 + factor(gear), data = mtcars2)
    lmfo <- lm_robust(mpg ~ cyl + cyl2, fixed_effects = ~ gear, data = mtcars2)

    expect_equivalent(
      tidy(lmo)[grepl("cyl", lmo$term), ],
      tidy(lmfo)[grepl("cyl", lmfo$term), ]
    )
  }

  for (se_type in cr_se_types) {
    lmo <- lm_robust(mpg ~ cyl + cyl2 + factor(gear), clusters = am, data = mtcars2)
    lmfo <- lm_robust(mpg ~ cyl + cyl2, fixed_effects = ~ gear,  clusters = am, data = mtcars2)

    expect_equivalent(
      tidy(lmo)[grepl("cyl", lmo$term), ],
      tidy(lmfo)[grepl("cyl", lmfo$term), ]
    )
  }

})

test_that("FEs handle large numbers", {
  df <- data.frame(
    i = rep(1:100,100),
    x = rnorm(10000) * 1000000 + 10000000,
    y = rnorm(10000)
  )

  fefit <- tidy(lm_robust(y ~ x, data = df, fixed_effects = ~i, se_type = "HC1"))
  lmfit <- tidy(lm_robust(y ~ x - 1 + factor(i), data = df, se_type = "HC1"))

  expect_equal(
    fefit[fefit$term == "x", ],
    lmfit[lmfit$term == "x", ],
  )

})

test_that("Handle perfect fits appropriately", {

  skip_on_os("solaris")

  dat$Bsingle <- c(1, 2, rep(3:4, each = 9))
  rfo <- lm_robust(Y ~ X, fixed_effects = ~ Bsingle, data = dat)
  ro <- lm_robust(Y ~ X + factor(Bsingle), data = dat)

  expect_equivalent(
    tidy(rfo)[rfo$term == "X", ],
    tidy(ro)[ro$term == "X", ]
  )
})
graemeblair/DDestimate documentation built on April 2, 2024, 2:07 p.m.