tests/testthat/test-collinear.R

testthat::test_that("`collinear()` works", {
  testthat::skip_on_cran()
  #load data
  data(
    vi_smol,
    vi_predictors_numeric,
    vi_predictors_categorical,
    vi_responses
  )

  #DEFAULT CALL ----
  #Error: collinear::collinear(): argument 'df' cannot be NULL
  testthat::expect_error(
    x <- collinear(),
    regexp = "'df' cannot be NULL"
  )

  #DF ONLY ----

  ##fewer than 3 rows ----
  testthat::expect_error(
    x <- collinear(
      df = vi_smol[1:2, ],
      predictors = vi_predictors_numeric
    ),
    regexp = "has fewer than 3 rows"
  ) |>
    suppressMessages() |>
    suppressWarnings()

  ##fewer than 10 rows ----
  testthat::expect_message(
    x <- collinear(
      df = vi_smol[1:9, ],
      predictors = vi_predictors_numeric
    ),
    regexp = "has fewer than 10 rows"
  ) |>
    suppressMessages() |>
    suppressWarnings()

  x <- collinear(
    df = vi_smol[1:9, ],
    predictors = vi_predictors_numeric,
    max_cor = 0.7,
    max_vif = NULL,
    quiet = TRUE
  )

  ##fewer than 30 rows ----
  testthat::expect_message(
    x <- collinear(
      df = vi_smol[1:29, ],
      predictors = vi_predictors_numeric
    ),
    regexp = "has fewer than 30 rows"
  ) |>
    suppressMessages() |>
    suppressWarnings()

  ##more than 30 rows ----

  #max_cor and max_vif
  testthat::expect_message(
    x <- collinear(
      df = vi_smol,
      predictors = vi_predictors_numeric,
      max_cor = NULL,
      max_vif = NULL
    ),
    regexp = "setting 'max_cor' to"
  ) |>
    suppressMessages()

  testthat::expect_message(
    x <- collinear(
      df = vi_smol,
      predictors = vi_predictors_numeric,
      max_cor = NULL,
      max_vif = NULL
    ),
    regexp = "setting 'max_vif' to"
  ) |>
    suppressMessages()

  #max_cor and max_vif invalid
  f_test <- function() {
    collinear(
      df = vi_smol,
      predictors = vi_predictors_numeric,
      max_cor = 2,
      max_vif = 20
    )
  }

  testthat::expect_message(
    x <- f_test(),
    regexp = "argument 'max_cor' is outside its valid range"
  ) |>
    suppressMessages()

  testthat::expect_message(
    x <- f_test(),
    regexp = "argument 'max_vif' is outside its valid range"
  ) |>
    suppressMessages()

  #RESPONSE ----

  ##response only ----
  testthat::expect_message(
    x <- collinear(
      df = vi_smol[, c(vi_responses[1:2], vi_predictors_numeric)],
      responses = vi_responses[1:2],
      encoding_method = NULL,
      preference_order = NULL,
      f = NULL,
      quiet = FALSE
    ),
    regexp = "processing response 'vi_numeric'"
  ) |>
    suppressMessages()

  testthat::expect_true(
    all(vi_responses[1:2] %in% names(x))
  )

  #check that the given response is never in selections
  for (i in vi_responses[1:2]) {
    testthat::expect_true(
      !i %in% x[[i]]$selection
    )
  }

  #PREDICTORS ----

  ##numeric predictors ----
  testthat::expect_message(
    x <- collinear(
      df = vi,
      responses = NULL,
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = NULL,
      f = NULL,
      quiet = FALSE
    ),
    regexp = "selected predictors"
  ) |>
    suppressMessages()

  testthat::expect_true(
    inherits(x = x, what = "collinear_output")
  )

  #test lack of formulas because there is no response
  testthat::expect_true(
    !any(c("formula", "formulas") %in% names(x))
  )

  ##categorical predictors ----
  testthat::expect_message(
    x <- collinear(
      df = vi_smol,
      responses = NULL,
      predictors = vi_predictors_categorical[1:4],
      encoding_method = NULL,
      preference_order = NULL,
      f = NULL,
      quiet = FALSE
    ),
    regexp = "'predictors' from lower to higher multicollinearity"
  ) |>
    suppressMessages()

  testthat::expect_true(
    inherits(x = x, what = "collinear_output")
  )

  #test lack of formulas because there is no response
  testthat::expect_true(
    !any(c("formula", "formulas") %in% names(x))
  )

  ##mixed predictors ----
  x <- collinear(
    df = vi_smol,
    responses = NULL,
    predictors = c(vi_predictors_numeric[1:3], vi_predictors_categorical[1:2]),
    encoding_method = NULL,
    preference_order = NULL,
    f = NULL,
    quiet = TRUE
  ) |>
    suppressWarnings()

  testthat::expect_true(
    inherits(x = x, what = "collinear_output")
  )

  #test lack of formulas because there is no response
  testthat::expect_true(
    !any(c("formula", "formulas") %in% names(x))
  )

  #PREDICTORS + RESPONSE ----

  ##numeric numeric ----
  testthat::expect_message(
    x <- collinear(
      df = vi,
      responses = "vi_numeric",
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = NULL,
      f = NULL,
      quiet = FALSE
    ),
    regexp = "'predictors' from lower to higher multicollinearity"
  ) |>
    suppressMessages()

  testthat::expect_true(
    inherits(x = x, what = "collinear_output")
  )

  testthat::expect_true(
    all(c("linear", "smooth") %in% names(x$vi_numeric$formulas))
  )

  ##categorical numeric ----
  testthat::expect_message(
    x <- collinear(
      df = vi,
      responses = "vi_categorical",
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = NULL,
      f = NULL,
      quiet = FALSE
    ),
    regexp = "'predictors' from lower to higher multicollinearity"
  ) |>
    suppressMessages()

  testthat::expect_true(
    inherits(x = x, what = "collinear_output")
  )

  ##categorical categorical ----
  testthat::expect_message(
    x <- collinear(
      df = vi_smol,
      responses = "vi_categorical",
      predictors = vi_predictors_categorical[1:4],
      encoding_method = NULL,
      preference_order = NULL,
      f = NULL,
      quiet = FALSE
    ),
    regexp = "'predictors' from lower to higher multicollinearity"
  ) |>
    suppressMessages()

  testthat::expect_true(
    inherits(x = x, what = "collinear_output")
  )

  ##multiple responses ----
  #all selections are the same
  testthat::expect_message(
    x <- collinear(
      df = vi_smol,
      responses = vi_responses,
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = NULL,
      f = NULL,
      quiet = FALSE
    ),
    regexp = "processing response"
  ) |>
    suppressMessages()

  #check that the given response does not appear in the selections
  for (i in names(x)) {
    testthat::expect_true(
      !i %in% x[[i]]$selection
    )
  }

  #checking formulas
  testthat::expect_true(
    all(c("linear", "smooth") %in% names(x$vi_numeric$formulas))
  )

  testthat::expect_true(
    all(c("linear", "smooth") %in% names(x$vi_counts$formulas))
  )

  testthat::expect_true(
    all(c("linear", "smooth") %in% names(x$vi_binomial$formulas))
  )

  testthat::expect_true(
    all(c("classification") %in% names(x$vi_factor$formulas))
  )

  testthat::expect_true(
    all(c("classification") %in% names(x$vi_categorical$formulas))
  )

  #TARGET ENCODING ----
  f_test <- function() {
    collinear(
      df = vi_smol,
      responses = c("vi_numeric", "vi_categorical"),
      predictors = vi_predictors_categorical[1:4],
      encoding_method = "loo",
      preference_order = NULL,
      f = NULL,
      quiet = FALSE
    )
  }

  testthat::expect_message(
    x <- f_test(),
    regexp = "using response 'vi_numeric' to encode these categorical predictors"
  ) |>
    suppressMessages()

  testthat::expect_message(
    x <- f_test(),
    regexp = "processing response 'vi_categorical'"
  ) |>
    suppressMessages()

  #check that a categorical predictor was converted to numeric for "vi_numeric"
  testthat::expect_true(
    !is.numeric(vi_smol[["soil_type"]]) &&
      !is.numeric(x$vi_categorical$df[["soil_type"]]) &&
      is.numeric(x$vi_numeric$df[["soil_type"]])
  )

  # PREFERENCE ORDER ----

  ## no target encoding ----

  ### invalid character vector ----
  preference_order <- c(
    "hola",
    "adios"
  )

  testthat::expect_message(
    x <- collinear(
      df = vi_smol,
      responses = "vi_numeric",
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = preference_order,
      f = NULL,
      max_cor = 0.75,
      max_vif = 5,
      quiet = FALSE
    ),
    regexp = "invalid values in argument 'preference_order'"
  ) |>
    suppressMessages()

  testthat::expect_message(
    x <- collinear(
      df = vi_smol,
      responses = "vi_numeric",
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = preference_order,
      f = NULL,
      max_cor = 0.75,
      max_vif = 5,
      quiet = FALSE
    ),
    regexp = "'predictors' from lower to higher multicollinearity"
  ) |>
    suppressMessages()

  testthat::expect_true(
    !all(preference_order %in% x$vi_numeric$selection)
  )

  testthat::expect_true(
    all(vi_predictors_numeric %in% x$vi_numeric$preference_order$predictor)
  )

  ### valid character vector ----

  #uncorrelated, must show up in selection
  preference_order <- c(
    "soil_soc",
    "growing_season_temperature"
  )

  testthat::expect_message(
    x <- collinear(
      df = vi_smol,
      responses = "vi_numeric",
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = preference_order,
      f = NULL,
      max_cor = 0.75,
      max_vif = 5,
      quiet = FALSE
    ),
    regexp = "'predictors' from lower to higher multicollinearity"
  ) |>
    suppressMessages()

  testthat::expect_true(
    all(preference_order %in% x$vi_numeric$selection)
  )

  testthat::expect_true(
    all(vi_predictors_numeric %in% x$vi_numeric$preference_order$predictor)
  )

  ### valid dataframe ----
  preference_df <- preference_order(
    df = vi_smol,
    responses = c("vi_numeric", "vi_categorical"),
    predictors = vi_predictors_numeric,
    f = f_auto,
    quiet = TRUE
  )

  x <- collinear(
    df = vi_smol,
    responses = c("vi_numeric", "vi_categorical"),
    predictors = vi_predictors_numeric,
    encoding_method = NULL,
    preference_order = preference_df,
    f = NULL,
    max_cor = 0.75,
    max_vif = 5,
    quiet = TRUE
  )

  testthat::expect_true(
    unique(x$vi_categorical$preference_order$response) == "vi_categorical"
  )

  ### invalid data frame ----
  preference_df <- data.frame(
    response = "hola",
    f = NA
  )

  testthat::expect_error(
    x <- collinear(
      df = vi_smol,
      responses = "vi_numeric",
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = preference_df,
      f = NULL,
      max_cor = 0.75,
      max_vif = 5,
      quiet = TRUE
    ),
    regexp = "dataframe 'preference_order' must have these columns"
  )

  ### NULL response ----
  #same as above!
  preference <- c(
    "soil_soc",
    "growing_season_temperature"
  )

  testthat::expect_message(
    x <- collinear(
      df = vi_smol,
      responses = NULL,
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = preference,
      f = NULL,
      max_cor = 0.75,
      max_vif = 5,
      quiet = FALSE
    ),
    regexp = "'predictors' from lower to higher multicollinearity"
  ) |>
    suppressMessages()

  testthat::expect_true(
    all(preference %in% x$result$selection)
  )

  testthat::expect_true(
    preference[1] == x$result$preference_order$predictor[1]
  )

  testthat::expect_true(
    preference[2] == x$result$preference_order$predictor[2]
  )

  ### f_auto ----
  testthat::expect_message(
    x <- collinear(
      df = vi_smol,
      responses = "vi_numeric",
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = NULL,
      f = f_auto,
      max_cor = 0.75,
      max_vif = 5,
      quiet = FALSE
    ),
    regexp = "selected function 'f_numeric_glm"
  ) |>
    suppressMessages()

  testthat::expect_true(
    x$vi_numeric$selection[1] == x$vi_numeric$preference$predictor[1]
  )

  testthat::expect_true(
    x$vi_numeric$preference$f[1] == "f_numeric_glm"
  )

  ### f_numeric_rf ----
  x <- collinear(
    df = vi_smol,
    responses = "vi_numeric",
    predictors = vi_predictors_numeric,
    encoding_method = NULL,
    preference_order = NULL,
    f = f_numeric_rf,
    max_cor = 0.75,
    max_vif = 5,
    quiet = TRUE
  )

  testthat::expect_true(
    x$vi_numeric$selection[1] == x$vi_numeric$preference$predictor[1]
  )

  testthat::expect_true(
    x$vi_numeric$preference_order$f[1] == "f_numeric_rf"
  )

  ### bad function name ----
  my_f <- function() {
    return(NULL)
  }

  testthat::expect_error(
    x <- collinear(
      df = vi_smol,
      responses = "vi_numeric",
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = NULL,
      f = my_f,
      max_cor = 0.75,
      max_vif = 5,
      quiet = TRUE
    ),
    regexp = "the function 'f' must have the argument 'df'"
  )

  ### character function name ----
  testthat::expect_error(
    x <- collinear(
      df = vi_smol,
      responses = "vi_numeric",
      predictors = vi_predictors_numeric,
      encoding_method = NULL,
      preference_order = NULL,
      f = "my_f",
      max_cor = 0.75,
      max_vif = 5,
      quiet = TRUE
    ),
    regexp = "must be a uquoted function name"
  )

  #passing a correlation matrix
  time_no_matrix <- system.time(
    expr = {
      x <- collinear(
        df = vi_smol,
        responses = "vi_numeric",
        predictors = vi_predictors_categorical[1:10],
        encoding_method = NULL,
        preference_order = NULL,
        f = NULL,
        max_cor = 0.75,
        max_vif = 5,
        quiet = TRUE
      )
    }
  )

  m <- cor_matrix(
    df = vi_smol,
    predictors = vi_predictors_categorical[1:10],
    quiet = TRUE
  )

  time_matrix <- system.time(
    expr = {
      x <- collinear(
        df = vi_smol,
        responses = "vi_numeric",
        predictors = vi_predictors_categorical[1:10],
        encoding_method = NULL,
        preference_order = NULL,
        f = NULL,
        max_cor = 0.75,
        max_vif = 5,
        quiet = TRUE,
        m = m
      )
    }
  )

  testthat::expect_true(
    time_no_matrix[3] > time_matrix[3]
  )

  #losing one column
  df <- vi_smol
  df$logical <- TRUE

  testthat::expect_message(
    x <- collinear(
      df = df,
      responses = "vi_numeric",
      predictors = c("logical", vi_predictors_numeric),
      encoding_method = NULL,
      preference_order = NULL,
      f = NULL,
      max_cor = 0.75,
      max_vif = 5,
      quiet = FALSE
    ),
    regexp = "invalid logical variables due to constant values"
  ) |>
    suppressMessages()

  testthat::expect_true(
    !"logical" %in% colnames(x$vi_numeric$df)
  )

  #cross validation
  time_no_cv <- system.time(
    expr = {
      x <- collinear(
        df = vi_smol,
        responses = "vi_numeric",
        predictors = vi_predictors_numeric,
        encoding_method = NULL,
        preference_order = NULL,
        f = f_auto,
        max_cor = 0.75,
        max_vif = 5,
        quiet = TRUE
      )
    }
  )

  time_cv <- system.time(
    expr = {
      x <- collinear(
        df = vi_smol,
        responses = "vi_numeric",
        predictors = vi_predictors_numeric,
        encoding_method = NULL,
        preference_order = NULL,
        f = f_auto,
        max_cor = 0.75,
        max_vif = 5,
        quiet = TRUE,
        cv_iterations = 100,
        cv_training_fraction = 0.5
      )
    }
  )

  testthat::expect_true(
    time_cv[3] > time_no_cv[3]
  )
})

Try the collinear package in your browser

Any scripts or data that you put into this service are public.

collinear documentation built on Dec. 8, 2025, 5:06 p.m.