test-select_auxiliary_variables_lasso_cv.R
In auxvecLASSO: LASSO Auxiliary Variable Selection and Auxiliary Vector Diagnostics

test_that("returns expected structure and names for two binary outcomes", {
  set.seed(1)
  n <- 80
  df <- data.frame(
    y1 = factor(rbinom(n, 1, 0.5)),
    y2 = factor(rbinom(n, 1, 0.6)),
    x1 = rnorm(n),
    x2 = rnorm(n),
    g  = factor(sample(c("A", "B"), n, TRUE))
  )
  outcome_vars <- c("y1", "y2")
  auxiliary_vars <- c("x1", "x2", "g")

  res <- select_auxiliary_variables_lasso_cv(
    df = df,
    outcome_vars = outcome_vars,
    auxiliary_vars = auxiliary_vars,
    must_have_vars = NULL,
    check_twoway_int = FALSE,
    nfolds = 3, # keep tests quick
    verbose = FALSE,
    standardize = TRUE,
    return_models = FALSE,
    parallel = FALSE
  )

  # top-level
  expect_s3_class(res, "select_auxiliary_variables_lasso_cv")
  expect_true(all(c(
    "selected_variables", "by_outcome", "selected_lambdas",
    "penalty_factors", "models", "goodness_of_fit", "interaction_metadata"
  ) %in% names(res)))

  # names & lengths
  expect_true(all(names(res$by_outcome) %in% outcome_vars))
  expect_true(all(names(res$selected_lambdas) %in% outcome_vars))
  expect_type(res$penalty_factors, "double")
  expect_true(is.list(res$goodness_of_fit))
  expect_true(length(res$models) == 0) # return_models = FALSE

  # goodness_of_fit structure per outcome
  for (y in outcome_vars) {
    gf <- res$goodness_of_fit[[y]]
    expect_true(all(c("cross_validated", "full_data") %in% names(gf)))
    expect_true(all(c("cv_error", "cv_error_sd") %in% names(gf$cross_validated)))
    expect_true(all(c("deviance_explained", "auc", "accuracy", "brier_score", "raw_coefs", "abs_coefs")
    %in% names(gf$full_data)))
    expect_gte(gf$full_data$accuracy, 0)
    expect_lte(gf$full_data$accuracy, 1)
  }

  # interaction metadata when interactions are off
  expect_identical(res$interaction_metadata$interaction_terms, character(0))
})

test_that("includes main effects for selected interactions and must-have vars in selected_variables", {
  set.seed(2)
  n <- 70
  df <- data.frame(
    y = factor(rbinom(n, 1, 0.55)),
    x = rnorm(n),
    z = rnorm(n),
    f = factor(sample(c("L", "H"), n, TRUE))
  )
  outcome_vars <- "y"
  auxiliary_vars <- c("x", "z", "f")

  # Force a must-have that is present in X; with interactions on, its interactions should be zero-penalty
  must_have_vars <- "x"

  res <- select_auxiliary_variables_lasso_cv(
    df = df,
    outcome_vars = outcome_vars,
    auxiliary_vars = auxiliary_vars,
    must_have_vars = must_have_vars,
    check_twoway_int = TRUE,
    nfolds = 3,
    verbose = FALSE,
    standardize = TRUE,
    return_models = FALSE,
    parallel = FALSE
  )

  # selected_lambdas named correctly
  expect_true(all(names(res$selected_lambdas) == "y"))

  # must-have appears in selected_variables even if not selected by glmnet (function appends them)
  expect_true("x" %in% res$selected_variables)

  # If any interaction made it into combined, its main effects must also be included
  ints <- grep(":", res$selected_variables, value = TRUE)
  if (length(ints) > 0) {
    mains <- unique(unlist(strsplit(ints, ":"), use.names = FALSE))
    expect_true(all(mains %in% res$selected_variables))
  }

  # interaction metadata should reflect whatever ended up in combined
  expect_true(all(res$interaction_metadata$interaction_terms %in% res$selected_variables))
})

test_that("return_models switches models list on/off", {
  set.seed(3)
  n <- 60
  df <- data.frame(
    y1 = factor(rbinom(n, 1, 0.5)),
    y2 = factor(rbinom(n, 1, 0.5)),
    a = rnorm(n),
    b = rnorm(n)
  )
  aux <- c("a", "b")
  outs <- c("y1", "y2")

  res_off <- select_auxiliary_variables_lasso_cv(
    df, outs, aux,
    check_twoway_int = FALSE,
    nfolds = 3,
    return_models = FALSE,
    verbose = FALSE
  )
  expect_true(is.list(res_off$models))
  expect_length(res_off$models, 0)

  res_on <- select_auxiliary_variables_lasso_cv(
    df, outs, aux,
    check_twoway_int = FALSE,
    nfolds = 3,
    return_models = TRUE,
    verbose = FALSE
  )
  expect_true(is.list(res_on$models))
  expect_true(all(names(res_on$models) %in% outs))
  # model objects may be NULL if an outcome fails, but generally should be cv.glmnet
  for (m in res_on$models) {
    expect_true(is.null(m) || inherits(m, "cv.glmnet"))
  }
})

test_that("early exits: zero-row df or no auxiliary vars return empty structures", {
  # zero-row df
  df0 <- data.frame(y = factor(), x = numeric())
  res0 <- select_auxiliary_variables_lasso_cv(
    df = df0,
    outcome_vars = "y",
    auxiliary_vars = "x",
    check_twoway_int = FALSE,
    nfolds = 3,
    verbose = FALSE
  )
  expect_identical(res0$selected_variables, character(0))
  expect_true(is.list(res0$by_outcome))
  expect_true(length(res0$by_outcome) == 1) # names match supplied outcome_vars
  expect_true(all(names(res0$by_outcome) == "y"))

  # no auxiliary vars
  set.seed(4)
  df <- data.frame(y = factor(rbinom(10, 1, 0.5)), x = rnorm(10))
  res1 <- select_auxiliary_variables_lasso_cv(
    df = df,
    outcome_vars = "y",
    auxiliary_vars = character(0),
    check_twoway_int = FALSE,
    nfolds = 3,
    verbose = FALSE
  )
  expect_identical(res1$selected_variables, character(0))
})

test_that("penalty_factors is a named numeric vector aligned with model matrix columns", {
  set.seed(5)
  n <- 50
  df <- data.frame(
    y = factor(rbinom(n, 1, 0.5)),
    x1 = rnorm(n),
    g = factor(sample(c("A", "B"), n, TRUE))
  )
  aux <- c("x1", "g")

  res <- select_auxiliary_variables_lasso_cv(
    df, "y", aux,
    must_have_vars = "x1",
    check_twoway_int = TRUE,
    nfolds = 3,
    verbose = FALSE
  )

  pf <- res$penalty_factors
  expect_type(pf, "double")
  expect_false(is.null(names(pf)))
  # ensure the must-have got zero penalty
  expect_true(pf["x1"] == 0)
})

test_that("parallel path creates/uses a cluster and returns sensible results (skipped if doParallel missing)", {
  if (!requireNamespace("doParallel", quietly = TRUE)) {
    skip("doParallel is not installed; skipping parallel path test.")
  }

  set.seed(6)
  n <- 60
  df <- data.frame(
    y  = factor(rbinom(n, 1, 0.5)),
    x1 = rnorm(n),
    x2 = rnorm(n)
  )

  res <- select_auxiliary_variables_lasso_cv(
    df, "y", c("x1", "x2"),
    check_twoway_int = FALSE,
    nfolds = 3,
    verbose = FALSE, # avoid message assertions
    return_models = FALSE,
    parallel = TRUE
  )

  # Always reset foreach backend
  if (requireNamespace("foreach", quietly = TRUE)) foreach::registerDoSEQ()

  # Structure checks
  expect_s3_class(res, "select_auxiliary_variables_lasso_cv")
  expect_true(all(c(
    "selected_variables", "by_outcome", "selected_lambdas",
    "penalty_factors", "models", "goodness_of_fit", "interaction_metadata"
  ) %in% names(res)))

  # Lambda may be NA if glmnet bails, but usually numeric
  expect_true(is.numeric(res$selected_lambdas[["y"]]) || is.na(res$selected_lambdas[["y"]]))

  # Only assert GOF if it exists (parallel backends can be finicky on CI)
  gf <- res$goodness_of_fit[["y"]]
  if (is.null(gf) || !all(c("cross_validated", "full_data") %in% names(gf))) {
    skip("cv.glmnet did not return goodness_of_fit under parallel (environment/backend quirk).")
  }

  expect_true(all(c("cv_error", "cv_error_sd") %in% names(gf$cross_validated)))
  expect_true(all(c("deviance_explained", "auc", "accuracy", "brier_score", "raw_coefs", "abs_coefs")
  %in% names(gf$full_data)))
  expect_gte(gf$full_data$accuracy, 0)
  expect_lte(gf$full_data$accuracy, 1)
})

test_that("select_auxiliary_variables_lasso_cv drops rows with NA outcomes and returns results", {
  set.seed(42)
  n <- 120

  # Build data: X has NO missing values, outcomes may have NAs
  df <- data.frame(
    x1 = rnorm(n),
    x2 = rnorm(n),
    x3 = rnorm(n)
  )
  # two binary outcomes
  df$y1 <- factor(rbinom(n, 1, 0.55))
  df$y2 <- factor(rbinom(n, 1, 0.50))

  # Inject NA only into outcomes (10 each, disjoint indices)
  na_y1 <- 1:10
  na_y2 <- 21:30
  df$y1[na_y1] <- NA
  df$y2[na_y2] <- NA

  # Sanity: auxiliary vars have no missing values
  expect_false(anyNA(df[c("x1", "x2", "x3")]))

  # Run with verbose = TRUE so we can assert the drop messages
  msgs <- testthat::capture_messages({
    res <- select_auxiliary_variables_lasso_cv(
      df = df,
      outcome_vars = c("y1", "y2"),
      auxiliary_vars = c("x1", "x2", "x3"),
      must_have_vars = "x1",
      check_twoway_int = FALSE,
      nfolds = 3,
      verbose = TRUE,
      standardize = TRUE,
      return_models = FALSE,
      parallel = FALSE
    )
    # Basic structure assertions inside the capture to keep timing realistic
    expect_s3_class(res, "select_auxiliary_variables_lasso_cv")
    expect_true(all(c("y1", "y2") %in% names(res$by_outcome)))
    expect_true(all(c("y1", "y2") %in% names(res$selected_lambdas)))
    expect_true(is.numeric(res$selected_lambdas[["y1"]]) || is.na(res$selected_lambdas[["y1"]]))
    expect_true(is.numeric(res$selected_lambdas[["y2"]]) || is.na(res$selected_lambdas[["y2"]]))

    # must-have should be in the combined selection even if glmnet didn't pick it
    expect_true("x1" %in% res$selected_variables)

    # Goodness-of-fit structure present for each outcome
    for (y in c("y1", "y2")) {
      gf <- res$goodness_of_fit[[y]]
      expect_true(all(c("cross_validated", "full_data") %in% names(gf)))
      expect_true(all(c("cv_error", "cv_error_sd") %in% names(gf$cross_validated)))
      expect_true(all(c("deviance_explained", "auc", "accuracy", "brier_score", "raw_coefs", "abs_coefs")
      %in% names(gf$full_data)))
      expect_gte(gf$full_data$accuracy, 0)
      expect_lte(gf$full_data$accuracy, 1)
      expect_gte(gf$full_data$auc, 0)
      expect_lte(gf$full_data$auc, 1)
    }
  })

  # Messages should indicate rows dropped and rows used per outcome
  msgs_c <- paste(msgs, collapse = " ")
  expect_true(grepl("Outcome 'y1': dropping 10 row\\(s\\) with NA\\.", msgs_c))
  expect_true(grepl("Outcome 'y2': dropping 10 row\\(s\\) with NA\\.", msgs_c))
  # after dropping, both should fit "on 110 rows."
  expect_true(grepl("Fitting LASSO for binary outcome 'y1' on 110 rows\\.", msgs_c))
  expect_true(grepl("Fitting LASSO for binary outcome 'y2' on 110 rows\\.", msgs_c))
})

test_that("select_auxiliary_variables_lasso_cv correctly includes factor variables in must_have_vars", {
  # Sample dataframe with factor variables
  df <- data.frame(
    stype = factor(c(
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M"
    )),
    age = c(
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45
    ),
    gender = factor(c(
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female"
    )),
    outcome1 = c(
      1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
      1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
      1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
      1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
      1, 0, 1, 0, 1, 1, 0, 1, 0, 1
    ),
    outcome2 = c(
      0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
      0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
      0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
      0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
      0, 1, 0, 1, 0, 0, 1, 0, 1, 0
    )
  )

  # Auxiliary and must-have variables
  auxiliary_vars <- c("stype", "age", "gender")
  must_have_vars <- c("stype")

  # Simulate the selection of auxiliary variables via LASSO with cross-validation
  result <- select_auxiliary_variables_lasso_cv(
    df = df,
    outcome_vars = c("outcome1", "outcome2"),
    auxiliary_vars = auxiliary_vars,
    must_have_vars = must_have_vars,
    check_twoway_int = TRUE,
    nfolds = 5,
    verbose = FALSE,
    standardize = TRUE,
    return_models = FALSE,
    parallel = FALSE
  )

  # List of expected dummy variables for the "stype" factor
  expected_stype_dummies <- c("stypeE", "stypeM", "stypeH")

  # Ensure that the expected dummy variables for "stype" are included in the selected variables
  selected_variables <- result$selected_variables
  expect_true(all(expected_stype_dummies %in% selected_variables))
})



test_that("Interaction terms are included when check_twoway_int is TRUE", {
  # Sample dataframe with factor variables
  df <- data.frame(
    stype = factor(c(
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M"
    )),
    age = c(
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45
    ),
    gender = factor(c(
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female"
    )),
    outcome1 = c(
      1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
      1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
      1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
      1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
      1, 0, 1, 0, 1, 1, 0, 1, 0, 1
    ),
    outcome2 = c(
      0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
      0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
      0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
      0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
      0, 1, 0, 1, 0, 0, 1, 0, 1, 0
    )
  )

  auxiliary_vars <- c("stype", "age", "gender")

  # Call select_auxiliary_variables_lasso_cv with check_twoway_int = TRUE
  result <- select_auxiliary_variables_lasso_cv(
    df = df,
    outcome_vars = "outcome1",
    auxiliary_vars = auxiliary_vars,
    must_have_vars = NULL,
    check_twoway_int = TRUE,
    nfolds = 5,
    verbose = FALSE,
    standardize = TRUE,
    return_models = FALSE,
    parallel = FALSE
  )

  # Extract the interaction terms
  interaction_terms <- result$interaction_metadata$interaction_terms

  # Check if interaction terms are included in the selected variables
  expect_true(length(interaction_terms) > 0) # Should include interaction terms such as "stype:age", "stype:gender"
})

test_that("select_auxiliary_variables_lasso_cv handles continuous outcome correctly and computes additional metrics", {
  # Sample dataframe with continuous outcome variable
  df <- data.frame(
    stype = factor(c(
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M",
      "E", "M", "H", "E", "M", "E", "M", "H", "E", "M"
    )),
    age = c(
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45,
      25, 30, 35, 40, 45, 25, 30, 35, 40, 45
    ),
    gender = factor(c(
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female",
      "Male", "Female", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Female"
    )),
    outcome1 = c(
      3.2, 5.5, 4.8, 7.1, 5.6, 3.0, 6.2, 4.5, 6.8, 5.4,
      4.1, 5.8, 4.9, 6.3, 5.2, 3.3, 6.5, 4.7, 6.6, 5.7,
      3.9, 5.7, 4.6, 6.0, 5.1, 3.5, 6.1, 4.4, 6.2, 5.3,
      4.0, 5.4, 4.7, 6.1, 5.5, 3.7, 6.3, 4.6, 6.0, 5.2,
      4.2, 5.6, 4.8, 6.4, 5.5, 3.8, 6.2, 4.5, 6.3, 5.0
    ), # Continuous outcome
    outcome2 = c(
      2.5, 3.2, 4.5, 5.7, 3.1, 2.8, 4.0, 4.3, 5.6, 3.8,
      2.9, 3.4, 4.2, 5.0, 3.3, 2.7, 4.1, 4.0, 5.2, 3.9,
      3.0, 3.5, 4.3, 5.1, 3.2, 2.6, 4.0, 3.7, 5.4, 3.6,
      3.1, 3.6, 4.0, 5.3, 3.4, 2.8, 4.2, 3.9, 5.0, 3.7,
      2.8, 3.1, 4.4, 5.2, 3.3, 2.7, 4.0, 3.8, 5.5, 3.6
    ) # Continuous outcome
  )

  # Auxiliary and must-have variables
  auxiliary_vars <- c("stype", "age", "gender")
  must_have_vars <- c("stype")

  # Run the function
  result <- select_auxiliary_variables_lasso_cv(
    df = df,
    outcome_vars = c("outcome1", "outcome2"),
    auxiliary_vars = auxiliary_vars,
    must_have_vars = must_have_vars,
    check_twoway_int = TRUE,
    nfolds = 5,
    verbose = FALSE,
    standardize = TRUE,
    return_models = FALSE,
    parallel = FALSE
  )

  # Check that the goodness metrics for continuous outcomes are present
  expect_true("rss" %in% names(result$goodness_of_fit$outcome1$full_data))
  expect_true("r_squared" %in% names(result$goodness_of_fit$outcome1$full_data))
  expect_true("mse" %in% names(result$goodness_of_fit$outcome1$full_data))
  expect_true("rmse" %in% names(result$goodness_of_fit$outcome1$full_data))
  expect_true("mae" %in% names(result$goodness_of_fit$outcome1$full_data))

  expect_true("rss" %in% names(result$goodness_of_fit$outcome2$full_data))
  expect_true("r_squared" %in% names(result$goodness_of_fit$outcome2$full_data))
  expect_true("mse" %in% names(result$goodness_of_fit$outcome2$full_data))
  expect_true("rmse" %in% names(result$goodness_of_fit$outcome2$full_data))
  expect_true("mae" %in% names(result$goodness_of_fit$outcome2$full_data))
})
Any scripts or data that you put into this service are public.
auxvecLASSO documentation built on Aug. 28, 2025, 9:09 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
auxvecLASSO
LASSO Auxiliary Variable Selection and Auxiliary Vector Diagnostics

tests/testthat/test-select_auxiliary_variables_lasso_cv.R
In auxvecLASSO: LASSO Auxiliary Variable Selection and Auxiliary Vector Diagnostics

Try the auxvecLASSO package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

auxvecLASSO LASSO Auxiliary Variable Selection and Auxiliary Vector Diagnostics

tests/testthat/test-select_auxiliary_variables_lasso_cv.R In auxvecLASSO: LASSO Auxiliary Variable Selection and Auxiliary Vector Diagnostics

Try the auxvecLASSO package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

auxvecLASSO
LASSO Auxiliary Variable Selection and Auxiliary Vector Diagnostics

tests/testthat/test-select_auxiliary_variables_lasso_cv.R
In auxvecLASSO: LASSO Auxiliary Variable Selection and Auxiliary Vector Diagnostics