tests/testthat/test-get_data.R

skip_on_os("mac")

test_that("retrieve from same environment", {
  foo <- data.frame(x = 1:10, y = 2:11)
  fit <- lm(y ~ x, data = foo)

  expect_no_warning({
    cols <- names(get_data(fit))
  })
  expect_setequal(cols, c("x", "y"))
})

test_that("retrieve from correct environment", {
  foo <- function() {
    foo <- data.frame(x = 1:10, y = 2:11)

    return(lm(y ~ x, data = foo))
  }

  # There should be no warning about "Could not recover model data from
  # environment"
  expect_no_warning({
    cols <- names(get_data(foo()))
  })
  expect_setequal(cols, c("x", "y"))
})

test_that("fetch from local, not global, environment", {
  # See #760. If the local environment has a modified version of data also in
  # the global environment, we should find the local version first, not the
  # global version.

  foo <- function() {
    mtcars$cylinders <- factor(mtcars$cyl)

    return(lm(mpg ~ cylinders + disp, data = mtcars))
  }

  expect_setequal(
    names(get_data(foo())),
    c("mpg", "disp", "cylinders")
  )
})

test_that("retrieve from call formula environment", {
  skip_if_not_installed("AER")

  foo <- function() {
    d <- data.frame(
      y = rnorm(100),
      x = rnorm(100)
    )

    # find_formula(fit)$conditional happens to not have an environment for tobit
    # models, so get_data() should check environment(get_call(fit)$formula). See
    # #666
    return(AER::tobit(y ~ x, data = d, right = 1.5))
  }

  expect_setequal(
    names(get_data(foo())),
    c("x", "y")
  )
})

test_that("lme", {
  skip_if_not_installed("nlme")
  data("Orthodont", package = "nlme")
  m <- nlme::lme( # a model of variance only
    distance ~ 1,
    data = Orthodont, # grand mean
    weights = nlme::varConstPower(form = ~ age | Sex)
  )
  expect_identical(dim(get_data(m, source = "mf")), c(108L, 3L))
  expect_identical(colnames(get_data(m, source = "mf")), c("distance", "age", "Sex"))
})


test_that("lme4", {
  skip_if_not_installed("lme4")
  data("cbpp", package = "lme4")
  set.seed(123)
  cbpp$cont <- rnorm(nrow(cbpp))
  m <- lme4::glmer(cbind(incidence, size - incidence) ~ poly(cont, 2) + (1 | herd),
    data = cbpp, family = binomial
  )
  expect_s3_class(get_data(m), "data.frame")
})


test_that("additional_variables = TRUE", {
  k <- mtcars
  k$qsec[1:10] <- NA
  k <- k
  mod <- lm(mpg ~ hp, k)
  n1 <- nrow(k)
  n2 <- nrow(insight::get_data(mod))
  n3 <- nrow(insight::get_data(mod, additional_variables = TRUE))
  expect_identical(n1, n2)
  expect_identical(n1, n3)
})


test_that("lm", {
  set.seed(1023)
  x <- rnorm(1000, sd = 4)
  y <- cos(x) + rnorm(1000)

  dat <- data.frame(x, y)
  mod1 <- lm(y ~ x, data = dat)
  mod2 <- lm(y ~ cos(x), data = dat)
  expect_equal(get_data(mod1), get_data(mod2), ignore_attr = TRUE)
  expect_equal(get_data(mod1)$x, dat$x, ignore_attr = TRUE)
  expect_equal(get_data(mod2)$x, dat$x, ignore_attr = TRUE)
})


test_that("get_data lavaan", {
  skip_if_not_installed("lavaan")
  data(PoliticalDemocracy, package = "lavaan")
  model <- "
    # latent variable definitions
      ind60 =~ x1 + x2 + x3
      dem60 =~ y1 + a*y2 + b*y3 + c*y4
      dem65 =~ y5 + a*y6 + b*y7 + c*y8

    # regressions
      dem60 ~ ind60
      dem65 ~ ind60 + dem60

    # residual correlations
      y1 ~~ y5
      y2 ~~ y4 + y6
      y3 ~~ y7
      y4 ~~ y8
      y6 ~~ y8
  "
  m <- lavaan::sem(model, data = PoliticalDemocracy)
  expect_s3_class(get_data(m, verbose = FALSE), "data.frame")
  expect_equal(head(get_data(m, verbose = FALSE)), head(PoliticalDemocracy), ignore_attr = TRUE, tolerance = 1e-3)

  # works when data not in environment
  holz_data <<- lavaan::HolzingerSwineford1939
  HS.model <- " visual  =~ x1 + x2 + x3
                textual =~ x4 + x5 + x6
                speed   =~ x7 + x8 + x9 "
  m_holz <- lavaan::lavaan(HS.model,
    data = holz_data, auto.var = TRUE, auto.fix.first = TRUE,
    auto.cov.lv.x = TRUE
  )

  skip_on_os(c("mac", "linux"))
  out1 <- get_data(m_holz)
  expect_named(
    out1,
    c(
      "id", "sex", "ageyr", "agemo", "school", "grade", "x1", "x2",
      "x3", "x4", "x5", "x6", "x7", "x8", "x9"
    )
  )
  expect_identical(nrow(out1), 301L)

  # rm(holz_data)
  # out2 <- get_data(m_holz)
  # expect_named(
  #   out2,
  #   c("x1", "x2","x3", "x4", "x5", "x6", "x7", "x8", "x9")
  # )
  # expect_identical(nrow(out2), 301L)
})


test_that("get_data include weights, even if ones", {
  set.seed(123)
  y <- rnorm(100)
  x <- rnorm(100)
  wn <- runif(100)
  w1 <- rep(1, 100)

  # Model with nonuniform weights
  fn <- lm(y ~ x, weights = wn)
  expect_identical(colnames(get_data(fn, verbose = FALSE)), c("y", "x", "(weights)", "wn"))

  # Model with weights equal to 1
  f1 <- lm(y ~ x, weights = w1)
  expect_identical(colnames(get_data(f1, verbose = FALSE)), c("y", "x", "(weights)", "w1"))

  # Model with no weights
  f0 <- lm(y ~ x)
  expect_identical(colnames(get_data(f0, verbose = FALSE)), c("y", "x"))

  # check get_weights still works
  expect_null(get_weights(f0))
  expect_identical(get_weights(f0, null_as_ones = TRUE), w1)
})


test_that("lm with transformations", {
  d <- data.frame(
    time = as.factor(c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5)),
    group = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2),
    sum = c(0, 5, 10, 15, 20, 0, 20, 25, 45, 50, 0, 5, 10, 15, 20, 0, 20, 25, 45, 50, 0, 5, 10, 15, 20, 0, 20, 25, 45, 50)
  )
  m <- lm(log(sum + 1) ~ as.numeric(time) * group, data = d)
  expect_identical(colnames(get_data(m)), c("sum", "time", "group"))
})


test_that("lm with poly and NA in response", {
  d <- iris
  d[1:25, "Sepal.Length"] <- NA
  d2 <- d
  m <- lm(Sepal.Length ~ Species / poly(Petal.Width, 2), data = d2)
  expect_equal(get_data(m), iris[26:150, c("Sepal.Length", "Species", "Petal.Width")], ignore_attr = TRUE)
})


test_that("mgcv", {
  skip_if_not_installed("mgcv")

  # mgcv::gam() deliberately does not keep its environment, so get_data() has to
  # fall back to the model frame. See
  # https://github.com/cran/mgcv/blob/a4e69cf44a49c84a41a42e90c86995a843733968/R/mgcv.r#L2156-L2159
  d <- iris
  d$NewFac <- rep(c(1, 2), length.out = 150)
  model <- mgcv::gam(Sepal.Length ~ s(Petal.Length, by = interaction(Species, NewFac)), data = d)

  # There should be two warnings: One for failing to get the data from the
  # environment, and one for not recovering interaction() accurately
  expect_warning(expect_warning({
    model_data <- get_data(model)
  }))
  expect_equal(
    head(model_data),
    head(d[c("Sepal.Length", "Petal.Length", "Species", "NewFac")]),
    ignore_attr = TRUE
  )
})

test_that("lm with poly and NA in response", {
  s1 <- summary(iris$Sepal.Length)
  model <- lm(Petal.Length ~ log(Sepal.Width) + Sepal.Length,
    data = iris
  )
  # Same min-max
  s2 <- summary(insight::get_data(model)$Sepal.Length)

  model <- lm(Petal.Length ~ log(1 + Sepal.Width) + Sepal.Length,
    data = iris
  )
  s3 <- summary(insight::get_data(model)$Sepal.Length)

  model <- lm(Petal.Length ~ log(Sepal.Width + 1) + Sepal.Length,
    data = iris
  )
  s4 <- summary(insight::get_data(model)$Sepal.Length)

  model <- lm(Petal.Length ~ log1p(Sepal.Width) + Sepal.Length,
    data = iris
  )
  s5 <- summary(insight::get_data(model)$Sepal.Length)

  expect_equal(s1, s2, tolerance = 1e-4)
  expect_equal(s1, s3, tolerance = 1e-4)
  expect_equal(s1, s4, tolerance = 1e-4)
  expect_equal(s1, s5, tolerance = 1e-4)
  expect_equal(s2, s3, tolerance = 1e-4)
  expect_equal(s2, s4, tolerance = 1e-4)
  expect_equal(s2, s5, tolerance = 1e-4)
  expect_equal(s3, s4, tolerance = 1e-4)
  expect_equal(s3, s5, tolerance = 1e-4)
  expect_equal(s4, s5, tolerance = 1e-4)
})


mod <- lm(mpg ~ as.logical(am) + factor(cyl) + as.factor(gear), mtcars)
out <- get_data(mod)
test_that("logicals", {
  expect_equal(out$am, mtcars$am, ignore_attr = TRUE)
})


# See #689
test_that("get_data() log transform", {
  set.seed(123)
  x <- abs(rnorm(100, sd = 5)) + 5
  y <- exp(2 + 0.3 * x + rnorm(100, sd = 0.4))
  dat <- data.frame(y, x)

  mod <- lm(log(y) ~ log(x), data = dat)
  expect_equal(
    head(insight::get_data(mod)),
    head(dat),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  expect_identical(find_response(mod), "y")
  expect_identical(find_response(mod, combine = FALSE), "y")

  mod <- lm(log(y) ~ x, data = dat)
  expect_equal(
    head(insight::get_data(mod)),
    head(dat),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  expect_identical(find_response(mod), "y")

  mod <- lm(y ~ log(x), data = dat)
  expect_equal(
    head(insight::get_data(mod)),
    head(dat),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  expect_identical(find_response(mod), "y")

  mod <- lm(y ~ log(1 + x), data = dat)
  expect_equal(
    head(insight::get_data(mod)[c("y", "x")]),
    head(dat),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  expect_identical(find_response(mod), "y")

  mod <- lm(y ~ log(x + 1), data = dat)
  expect_equal(
    head(insight::get_data(mod)),
    head(dat),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  mod <- lm(log(y) ~ log(1 + x), data = dat)
  expect_equal(
    head(insight::get_data(mod)[c("y", "x")]),
    head(dat),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  mod <- lm(log(y) ~ log(x + 1), data = dat)
  expect_equal(
    head(insight::get_data(mod)),
    head(dat),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )

  mod <- lm(log(1 + y) ~ log(1 + x), data = dat)
  expect_equal(
    head(insight::get_data(mod)),
    head(dat),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
  expect_identical(find_response(mod), "y")
  expect_identical(find_response(mod, combine = FALSE), "y")

  mod <- lm(log(y + 1) ~ log(x + 1), data = dat)
  expect_equal(
    head(insight::get_data(mod)),
    head(dat),
    tolerance = 1e-3,
    ignore_attr = TRUE
  )
})

skip_on_cran()

m <- lm(Sepal.Length ~ Sepal.Width, data = iris)
out <- get_data(m)
test_that("subsets", {
  expect_identical(colnames(out), c("Sepal.Length", "Sepal.Width"))
  expect_identical(nrow(out), 150L)
})

m <- lm(Sepal.Length ~ Sepal.Width, data = iris, subset = Species == "versicolor")
out <- get_data(m)
test_that("subsets", {
  expect_identical(colnames(out), c("Sepal.Length", "Sepal.Width", "Species"))
  expect_identical(nrow(out), 50L)
})

# d <- iris
# m <- lm(Petal.Length ~ poly(Sepal.Length), data = d)
# d <<- mtcars
# expect_warning(expect_warning(out <- get_data(m)))
# expect_equal(colnames(out), c("Petal.Length", "Sepal.Length"))

test_that("log", {
  m <- lm(log(Sepal.Length) ~ sqrt(Sepal.Width), data = iris)
  out <- get_data(m)
  expect_equal(out, iris[c("Sepal.Length", "Sepal.Width")], ignore_attr = TRUE)
})

test_that("log II", {
  m <- lm(log(Sepal.Length) ~ scale(Sepal.Width), data = iris)
  out <- get_data(m)
  expect_equal(out, iris[c("Sepal.Length", "Sepal.Width")], ignore_attr = TRUE)
})


test_that("workaround bug in estimatr", {
  skip_if_not_installed("ivreg")
  skip_if_not_installed("estimatr")
  data("CigaretteDemand", package = "ivreg")
  m <- estimatr::iv_robust(
    log(packs) ~ log(rprice) + log(rincome) | salestax + log(rincome),
    data = CigaretteDemand
  )
  out <- get_data(m)
  expect_equal(
    head(out$packs),
    c(101.08543, 111.04297, 71.95417, 56.85931, 82.58292, 79.47219),
    tolerance = 1e-3
  )
  expect_equal(
    colnames(out),
    c("packs", "rprice", "rincome", "salestax"),
    tolerance = 1e-3
  )
})


test_that("get_data colnames", {
  skip_on_os("windows")
  skip_if_not(getRversion() >= "4.0.0")
  skip_if_not_installed("brms")
  m <- suppressMessages(suppressWarnings(brms::brm(mpg ~ hp + mo(cyl), data = mtcars, refresh = 0, iter = 200, chains = 1)))
  out <- get_data(m)
  expect_type(out$cyl, "double")
  expect_true(all(colnames(out) %in% c("mpg", "hp", "cyl")))
  out <- get_data(m, additional_variables = TRUE)
  expect_true("qsec" %in% colnames(out))
})


test_that("get_data works for fixest inside functions", {
  skip_if_not_installed("fixest")
  data(mtcars)

  # fit within function
  fixest_wrapper1 <- function(data) {
    data$cylinders <- factor(data$cyl)
    fit <- fixest::feglm(mpg ~ cylinders * disp + hp, data = data)
    return(fit)
  }
  global_fixest1 <- fixest_wrapper1(data = mtcars)
  data <- mtcars[, c("mpg", "disp")]
  expect_named(
    get_data(global_fixest1),
    c("mpg", "cylinders", "disp", "hp")
  )

  # fit within function, subset
  fixest_wrapper2 <- function(data) {
    data$cylinders <- factor(data$cyl)
    fit <- fixest::feglm(mpg ~ cylinders * disp + hp, data = data)
    return(fit)
  }
  data <- mtcars
  global_fixest2 <- fixest_wrapper2(data = data[1:20, ])
  expect_identical(nrow(get_data(global_fixest2)), 20L)
  expect_named(
    get_data(global_fixest2),
    c("mpg", "cylinders", "disp", "hp")
  )

  data(mtcars)
  d_cyl <- mtcars
  d_cyl$cylinders <- factor(d_cyl$cyl)
  global_fixest3 <- fixest::feglm(mpg ~ cylinders * disp + hp, data = d_cyl)
  expect_named(
    get_data(global_fixest3),
    c("mpg", "cylinders", "disp", "hp")
  )

  # regular example
  data(iris)
  res <- fixest::feglm(Sepal.Length ~ Sepal.Width + Petal.Length | Species, iris, "poisson")
  expect_named(
    get_data(res),
    c("Sepal.Length", "Sepal.Width", "Petal.Length", "Species")
  )
})

Try the insight package in your browser

Any scripts or data that you put into this service are public.

insight documentation built on Nov. 26, 2023, 5:08 p.m.