tests/testthat/test-build_model_matrix.R

test_that("build_model_matrix returns expected structure without interactions", {
  withr::local_options(contrasts = c("contr.treatment", "contr.poly"))

  set.seed(1)
  df <- data.frame(
    x_num = rnorm(10),
    x_fac = factor(sample(c("A", "B"), 10, replace = TRUE), levels = c("A", "B"))
  )
  aux <- c("x_num", "x_fac")

  res <- build_model_matrix(df, auxiliary_vars = aux, check_twoway_int = FALSE)

  expect_type(res, "list")
  expect_true(all(c("X", "formula_str") %in% names(res)))
  expect_identical(res$formula_str, "~ x_num + x_fac -1")

  expect_true(is.matrix(res$X))
  expect_equal(nrow(res$X), nrow(df))

  cn <- colnames(res$X)
  expect_true("x_num" %in% cn)

  # Factor main effects: columns that start with x_fac AND have no ':'
  f_cols <- grep("^x_fac", cn, value = TRUE)
  f_cols <- f_cols[!grepl(":", f_cols)]
  # With -1, expect either k or (k-1) depending on contrasts; here we pinned treatment, so k
  expect_true(length(f_cols) %in% c(nlevels(df$x_fac) - 1, nlevels(df$x_fac)))

  expect_false("(Intercept)" %in% cn)
})

test_that("build_model_matrix returns expected structure with two-way interactions", {
  withr::local_options(contrasts = c("contr.treatment", "contr.poly"))

  set.seed(2)
  df <- data.frame(
    x1 = rnorm(12),
    x2 = rnorm(12),
    g  = factor(sample(c("G1", "G2"), 12, replace = TRUE), levels = c("G1", "G2"))
  )
  aux <- c("x1", "x2", "g")

  res <- build_model_matrix(df, auxiliary_vars = aux, check_twoway_int = TRUE)
  expect_identical(res$formula_str, "~ (x1 + x2 + g)^2 -1")

  X <- res$X
  cn <- colnames(X)
  expect_true(is.matrix(X))

  # Main effects (exclude any column with ':')
  expect_true("x1" %in% cn)
  expect_true("x2" %in% cn)
  g_cols <- grep("^g", cn, value = TRUE)
  g_cols <- g_cols[!grepl(":", g_cols)]
  expect_true(length(g_cols) %in% c(nlevels(df$g) - 1, nlevels(df$g)))

  # Interactions: numeric:factor -> expect k-1 or k columns
  x1g <- grep("^(x1:g|g:x1)", cn, value = TRUE)
  x2g <- grep("^(x2:g|g:x2)", cn, value = TRUE)
  expect_true(length(unique(x1g)) %in% c(nlevels(df$g) - 1, nlevels(df$g)))
  expect_true(length(unique(x2g)) %in% c(nlevels(df$g) - 1, nlevels(df$g)))

  # numeric:numeric -> one column, either order
  expect_true(any(c("x1:x2", "x2:x1") %in% cn))

  expect_false("(Intercept)" %in% cn)
  expect_equal(nrow(X), nrow(df))
})

test_that("build_model_matrix handles single auxiliary variable", {
  withr::local_options(contrasts = c("contr.treatment", "contr.poly"))

  df <- data.frame(z = rnorm(7))
  aux <- "z"

  res <- build_model_matrix(df, auxiliary_vars = aux, check_twoway_int = FALSE)
  expect_identical(res$formula_str, "~ z -1")
  expect_true(is.matrix(res$X))
  expect_identical(colnames(res$X), "z")
  expect_equal(nrow(res$X), nrow(df))
})

test_that("build_model_matrix errors when an auxiliary variable is missing in df", {
  withr::local_options(contrasts = c("contr.treatment", "contr.poly"))

  df <- data.frame(a = rnorm(5))
  aux <- c("a", "b_missing")

  expect_error(
    build_model_matrix(df, auxiliary_vars = aux, check_twoway_int = FALSE),
    regexp = "object 'b_missing' not found|undefined columns selected"
  )
})

test_that("column set contains a reasonable number of interactions for factor:factor combos", {
  withr::local_options(contrasts = c("contr.treatment", "contr.poly"))

  set.seed(3)
  df <- data.frame(
    f1 = factor(sample(c("L1", "L2"), 20, TRUE), levels = c("L1", "L2")),
    f2 = factor(sample(c("M1", "M2", "M3"), 20, TRUE), levels = c("M1", "M2", "M3"))
  )
  aux <- c("f1", "f2")

  res <- build_model_matrix(df, auxiliary_vars = aux, check_twoway_int = TRUE)
  cn <- colnames(res$X)

  # Main effects: only columns without ':' and starting with f1 / f2
  f1_cols <- grep("^f1", cn, value = TRUE)
  f1_cols <- f1_cols[!grepl(":", f1_cols)]
  f2_cols <- grep("^f2", cn, value = TRUE)
  f2_cols <- f2_cols[!grepl(":", f2_cols)]

  # Allow either k or k-1 columns depending on contrasts
  expect_true(length(f1_cols) %in% c(nlevels(df$f1) - 1, nlevels(df$f1)))
  expect_true(length(f2_cols) %in% c(nlevels(df$f2) - 1, nlevels(df$f2)))

  # Interactions between f1 and f2: columns containing BOTH f1 and f2 with a ':'
  int_cols <- grep(":", cn, value = TRUE)
  int_cols <- int_cols[grepl("^f1", int_cols) | grepl("^f2", int_cols)]
  int_cols <- int_cols[grepl("f1", int_cols) & grepl("f2", int_cols)]
  int_cols <- unique(int_cols)

  # Lower bound (treatment contrasts): (k1-1)*(k2-1)
  lower_bound <- (nlevels(df$f1) - 1) * (nlevels(df$f2) - 1)
  # Upper bound (no contrasts / full dummies): k1 * k2
  upper_bound <- nlevels(df$f1) * nlevels(df$f2)

  expect_gte(length(int_cols), lower_bound)
  expect_lte(length(int_cols), upper_bound)
  expect_gt(length(int_cols), 0)
})

test_that("build_model_matrix correctly handles factor variables", {
  # Sample dataframe with a factor variable
  df <- data.frame(
    stype = factor(c("E", "M", "H", "E", "M")),
    age = c(25, 30, 35, 40, 45),
    gender = factor(c("Male", "Female", "Female", "Male", "Female"))
  )

  # Auxiliary variables (including the factor variable "stype")
  auxiliary_vars <- c("stype", "age", "gender")

  # Call build_model_matrix
  mm <- build_model_matrix(df, auxiliary_vars, check_twoway_int = TRUE)

  # Check the structure of the model matrix
  X <- mm$X
  colnames_X <- colnames(X)

  # Ensure that dummy variables for "stype" are included (i.e., stypeE, stypeM, stypeH)
  expect_true(any(grepl("^stype", colnames_X)))

  # Ensure all main effects for factors are captured correctly
  expect_true("stype" %in% auxiliary_vars) # should keep "stype"
})

test_that("must_have_vars correctly includes all dummy variables for factor variables", {
  # Sample dataframe with factor variables
  df <- data.frame(
    stype = factor(c("E", "M", "H", "E", "M")),
    age = c(25, 30, 35, 40, 45),
    gender = factor(c("Male", "Female", "Female", "Male", "Female"))
  )

  auxiliary_vars <- c("stype", "age", "gender")

  # User inputs "stype" as must-have variable
  must_have_vars <- c("stype")

  # Build model matrix
  mm <- build_model_matrix(df, auxiliary_vars, check_twoway_int = TRUE)
  X <- mm$X
  colnames_X <- colnames(X)

  # Ensure that all dummy variables for "stype" (stypeE, stypeM, stypeH) are in the model
  must_have_idx <- grep("stype", must_have_vars)
  for (i in must_have_idx) {
    # Check that all columns corresponding to factor "stype" are included
    factor_cols <- grep(paste0("^", must_have_vars[i]), colnames_X)
    expect_true(length(factor_cols) > 0) # should include all dummy variables for "stype"
  }
})

Try the auxvecLASSO package in your browser

Any scripts or data that you put into this service are public.

auxvecLASSO documentation built on Aug. 28, 2025, 9:09 a.m.