test-choice_data.R
In choicedata: Working with Choice Data

test_that("choice_data can be defined", {

  ### long format (all columns)
  expect_true(
    choice_data(
      data_frame = travel_mode_choice,
      format = "long",
      column_choice = "choice",
      column_decider = "individual",
      column_occasion = NULL,
      column_alternative = "mode",
      delimiter = "_",
      cross_section = TRUE
    ) |> is.choice_data()
  )

  ### long format (selected columns)
  expect_true(
    choice_data(
      data_frame = travel_mode_choice,
      format = "long",
      column_choice = "choice",
      column_decider = "individual",
      column_occasion = NULL,
      column_alternative = "mode",
      column_ac_covariates = "income",
      column_as_covariates = "wait",
      delimiter = "_",
      cross_section = TRUE
    ) |> is.choice_data()
  )

  ### wide format
  expect_true(
    choice_data(
      data_frame = train_choice,
      format = "wide",
      column_choice = "choice",
      column_decider = "deciderID",
      column_occasion = "occasionID",
      column_alternative = NULL,
      column_ac_covariates = NULL,
      column_as_covariates = NULL,
      delimiter = "_",
      cross_section = FALSE
    ) |> is.choice_data()
  )

  ### wide format (selected columns)
  expect_true(
    choice_data(
      data_frame = train_choice,
      format = "wide",
      column_choice = "choice",
      column_decider = "deciderID",
      column_occasion = "occasionID",
      column_alternative = NULL,
      column_ac_covariates = NULL,
      column_as_covariates = "price",
      delimiter = "_",
      cross_section = FALSE
    ) |> is.choice_data()
  )

})

test_that("multi-character delimiters are supported", {

  df <- data.frame(
    deciderID = c(1, 1),
    occasionID = c(1, 2),
    choice = c("bus", "train"),
    cost__bus = c(10, 12),
    cost__train = c(9, 11),
    time__bus = c(5, 4),
    time__train = c(3, 2)
  )

  multi_wide <- choice_data(
    data_frame = df,
    format = "wide",
    column_choice = "choice",
    column_decider = "deciderID",
    column_occasion = "occasionID",
    delimiter = "__",
    cross_section = FALSE
  )

  expect_true(is.choice_data(multi_wide))
  expect_identical(attr(multi_wide, "delimiter"), "__")

  multi_long <- wide_to_long(
    data_frame = df,
    column_choice = "choice",
    column_alternative = "alternative",
    delimiter = "__",
    choice_type = "discrete"
  )

  expect_true(is.data.frame(multi_long))
  expect_identical(
    sort(unique(multi_long$alternative)),
    c("bus", "train")
  )
  expect_true(all(c("cost", "time") %in% names(multi_long)))
})

test_that("simulation of probit choice data works for wide covariates", {

  set.seed(1)

  choice_effects <- choice_effects(
    choice_formula = choice_formula(
      formula = choice ~ price | income,
      error_term = "probit",
      random_effects = c("price" = "cn")
    ),
    choice_alternatives = choice_alternatives(J = 3)
  )

  choice_identifiers <- generate_choice_identifiers(
    N = 5,
    Tp = c(2, 1, 3, 1, 2)
  )

  choice_covariates <- generate_choice_covariates(
    choice_effects = choice_effects,
    choice_identifiers = choice_identifiers
  )

  choice_parameters <- generate_choice_parameters(
    choice_effects = choice_effects
  )

  choice_preferences <- generate_choice_preferences(
    choice_parameters = choice_parameters,
    choice_effects = choice_effects,
    choice_identifiers = choice_identifiers
  )

  simulated_data <- generate_choice_data(
    choice_effects = choice_effects,
    choice_identifiers = choice_identifiers,
    choice_covariates = choice_covariates,
    choice_parameters = choice_parameters,
    choice_preferences = choice_preferences,
    column_choice = "choice"
  )

  expect_true(is.choice_data(simulated_data))
  expect_identical(attr(simulated_data, "format"), "wide")
  expect_equal(nrow(simulated_data), nrow(choice_covariates))
  checkmate::expect_subset(
    unique(simulated_data[[attr(simulated_data, "column_choice")]]),
    as.character(attr(choice_effects, "choice_alternatives"))
  )
})

test_that("generate_choice_data joins responses and covariates without reordering", {

  set.seed(1)

  choice_effects <- choice_effects(
    choice_formula = choice_formula(
      formula = choice ~ price | income,
      error_term = "probit",
      random_effects = c("price" = "cn")
    ),
    choice_alternatives = choice_alternatives(J = 3)
  )

  choice_identifiers <- generate_choice_identifiers(
    N = 4,
    Tp = c(2, 1, 2, 1)
  )

  choice_covariates <- generate_choice_covariates(
    choice_effects = choice_effects,
    choice_identifiers = choice_identifiers
  )

  shuffled_covariates <- choice_covariates[rev(seq_len(nrow(choice_covariates))), ]

  choice_parameters <- generate_choice_parameters(
    choice_effects = choice_effects
  )

  choice_preferences <- generate_choice_preferences(
    choice_parameters = choice_parameters,
    choice_effects = choice_effects,
    choice_identifiers = choice_identifiers
  )

  simulated_data <- generate_choice_data(
    choice_effects = choice_effects,
    choice_identifiers = choice_identifiers,
    choice_covariates = shuffled_covariates,
    choice_parameters = choice_parameters,
    choice_preferences = choice_preferences,
    column_choice = "choice"
  )

  expect_s3_class(simulated_data, "tbl_df")

})

test_that("generate_choice_data aborts when joins drop identifiers", {

  set.seed(1)

  choice_effects <- choice_effects(
    choice_formula = choice_formula(
      formula = choice ~ price | time,
      error_term = "logit"
    ),
    choice_alternatives = choice_alternatives(J = 3)
  )

  choice_identifiers <- generate_choice_identifiers(N = 4, Tp = rep(1L, 4L))

  choice_covariates <- generate_choice_covariates(
    choice_effects = choice_effects,
    choice_identifiers = choice_identifiers
  )

  incomplete_covariates <- choice_covariates[-1, ]

  choice_parameters <- generate_choice_parameters(choice_effects = choice_effects)

  expect_error(
    generate_choice_data(
      choice_effects = choice_effects,
      choice_identifiers = choice_identifiers,
      choice_covariates = incomplete_covariates,
      choice_parameters = choice_parameters,
      column_choice = "choice"
    ),
    "Missing rows",
    fixed = TRUE
  )
})

test_that("alternative names can be guessed from wide format", {

  ### with column_choice available
  expect_identical(
    guess_alternatives_wide(
      data_frame = train_choice,
      column_choice = "choice",
      delimiter = "_"
    ),
    c("A", "B")
  )

  ### without column_choice available
  expect_identical(
    guess_alternatives_wide(
      data_frame = train_choice,
      column_choice = NULL,
      delimiter = "_"
    ),
    c("A", "B")
  )

  wide_with_delimiter <- data.frame(
    travel_time_car = c(10, 12),
    travel_time_bus = c(15, 18),
    travel_cost_car = c(3, 4),
    travel_cost_bus = c(2, 2)
  )
  expect_identical(
    guess_alternatives_wide(
      data_frame = wide_with_delimiter,
      column_choice = NULL,
      delimiter = "_"
    ),
    c("bus", "car")
  )
})

test_that("data can be transformed between long and wide format", {

  ### from long format to wide format
  expect_identical(
    long_to_wide(
      data_frame = travel_mode_choice,
      column_as_covariates = character(), # ignore as covariates
      column_choice = "choice",
      column_alternative = "mode",
      column_decider = "individual",
      column_occasion = NULL
    ) |> colnames(),
    c("individual", "income", "size", "choice")
  )
  expect_identical(
    long_to_wide(
      data_frame = travel_mode_choice,
      column_alternative = "mode",
      column_decider = "individual"
    ) |> colnames(),
    c("individual", "income", "size", "wait_plane", "wait_train",
      "wait_bus", "wait_car", "cost_plane", "cost_train", "cost_bus",
      "cost_car", "travel_plane", "travel_train", "travel_bus", "travel_car",
      "choice")
  )

  ### from wide format to long format
  expect_identical(
    wide_to_long(
      data_frame = train_choice[, 1:3]
    ) |> colnames(),
    c("deciderID", "occasionID", "choice", "alternative")
  )
  expect_identical(
    wide_to_long(
      data_frame = train_choice
    ) |> colnames(),
    c("deciderID", "occasionID", "choice", "alternative", "price",
      "time", "change", "comfort")
  )

  ### from wide format to long format without alternatives
  expect_identical(
    wide_to_long(
      data_frame = train_choice[, -3], column_choice = NULL
    ) |> colnames(),
    c("deciderID", "occasionID", "alternative", "price",
      "time", "change", "comfort")
  )

})

test_that("alternative-specific covariates can be detected", {

  ### long format (trivial case)
  expect_identical(
    check_as_covariates(
      data_frame = travel_mode_choice[, c("individual", "mode")],
      format = "long",
      column_choice = NULL,
      column_decider = "individual",
      column_occasion = NULL,
      column_alternative = "mode"
    ),
    list(
      column_ac_covariates = character(0),
      column_as_covariates = character(0),
      column_as_covariates_wide = character(0)
    )
  )

  ### long format (all columns)
  expect_identical(
    check_as_covariates(
      data_frame = travel_mode_choice,
      format = "long",
      column_choice = "choice",
      column_decider = "individual",
      column_occasion = NULL,
      column_alternative = "mode"
    ),
    list(
      column_ac_covariates = c("income", "size"),
      column_as_covariates = c("wait", "cost", "travel"),
      column_as_covariates_wide = c(
        "wait_bus", "cost_bus", "travel_bus", "wait_car", "cost_car",
        "travel_car", "wait_plane", "cost_plane", "travel_plane", "wait_train",
        "cost_train", "travel_train"
      )
    )
  )

  ### long format (selected columns)
  expect_identical(
    check_as_covariates(
      data_frame = travel_mode_choice,
      format = "long",
      column_choice = "choice",
      column_decider = "individual",
      column_alternative = "mode",
      column_ac_covariates = "size",
      column_as_covariates = "wait"
    ),
    list(
      column_ac_covariates = "size",
      column_as_covariates = "wait",
      column_as_covariates_wide = c(
        "wait_bus", "wait_car", "wait_plane", "wait_train"
      )
    )
  )
  expect_error(
    check_as_covariates(
      data_frame = travel_mode_choice,
      format = "long",
      column_choice = "choice",
      column_decider = "individual",
      column_alternative = "mode",
      column_ac_covariates = "unknown"
    ),
    "Unknown"
  )
  expect_error(
    check_as_covariates(
      data_frame = travel_mode_choice,
      format = "long",
      column_choice = "choice",
      column_decider = "individual",
      column_alternative = "mode",
      column_as_covariates = "unknown"
    ),
    "Unknown"
  )
  expect_error(
    check_as_covariates(
      data_frame = travel_mode_choice,
      format = "long",
      column_choice = "choice",
      column_decider = "individual",
      column_alternative = "mode",
      column_ac_covariates = "wait"
    ),
    "Found varying"
  )
  expect_error(
    check_as_covariates(
      data_frame = travel_mode_choice,
      format = "long",
      column_choice = "choice",
      column_decider = "individual",
      column_alternative = "mode",
      column_as_covariates = "size"
    ),
    "Found constant"
  )

  ### wide format (trivial case)
  expect_identical(
    check_as_covariates(
      data_frame = train_choice[, c("deciderID", "occasionID")],
      format = "wide",
      column_choice = NULL,
      column_decider = "deciderID",
      column_occasion = "occasionID",
    ),
    list(
      column_ac_covariates = character(0),
      column_as_covariates = character(0),
      column_as_covariates_wide = character(0)
    )
  )

  ### wide format (all columns)
  expect_identical(
    check_as_covariates(
      data_frame = train_choice,
      format = "wide",
      column_choice = "choice",
      column_decider = "deciderID",
      column_occasion = "occasionID",
      delimiter = "_"
    ),
    list(
      column_ac_covariates = character(0),
      column_as_covariates = c(
        "change", "comfort", "price", "time"
      ),
      column_as_covariates_wide = c(
        "price_A", "time_A", "change_A", "comfort_A",
        "price_B", "time_B", "change_B", "comfort_B"
      )
    )
  )

  ### wide format (selected columns)
  expect_identical(
    check_as_covariates(
      data_frame = train_choice,
      format = "wide",
      column_choice = "choice",
      column_decider = "deciderID",
      column_occasion = "occasionID",
      column_ac_covariates = character(),
      column_as_covariates = "price",
      delimiter = "_"
    ),
    list(
      column_ac_covariates = character(0),
      column_as_covariates = "price",
      column_as_covariates_wide = c("price_A", "price_B")
    )
  )
  expect_error(
    check_as_covariates(
      data_frame = train_choice,
      format = "wide",
      column_choice = "choice",
      column_decider = "deciderID",
      column_occasion = "occasionID",
      column_ac_covariates = "unknown",
      delimiter = "_"
    ),
    "Unknown"
  )
  expect_error(
    check_as_covariates(
      data_frame = train_choice,
      format = "wide",
      column_choice = "choice",
      column_decider = "deciderID",
      column_occasion = "occasionID",
      column_as_covariates = "unknown",
      delimiter = "_"
    ),
    "missing"
  )
  expect_error(
    check_as_covariates(
      data_frame = train_choice,
      format = "wide",
      column_choice = "choice",
      column_decider = "deciderID",
      column_occasion = "occasionID",
      column_ac_covariates = "price",
      delimiter = "_"
    ),
    "Unknown"
  )
})

test_that("ranked choice data can be indexed", {
  ranked_df <- data.frame(
    deciderID = rep(1:2, each = 3),
    alternative = rep(c("A", "B", "C"), times = 2),
    choice = c(1, 2, 3, 2, 1, 3),
    stringsAsFactors = FALSE
  )

  ch_data <- choice_data(
    data_frame = ranked_df,
    format = "long",
    column_choice = "choice",
    column_decider = "deciderID",
    column_alternative = "alternative",
    choice_type = "ranked"
  )
  effects <- choice_effects(
    choice_formula = choice_formula(
      formula = choice ~ 0 | 0 | 0,
      error_term = "probit"
    ),
    choice_alternatives = choice_alternatives(
      J = 3,
      alternatives = c("A", "B", "C")
    )
  )

  indices <- extract_choice_indices(ch_data, effects)
  expect_equal(indices[[1]], c(1L, 2L, 3L))
  expect_equal(indices[[2]], c(2L, 1L, 3L))
})

test_that("ranked choice data round-trips between long and wide", {
  ranked_df <- data.frame(
    deciderID = rep(1:2, each = 3),
    alternative = rep(c("A", "B", "C"), times = 2),
    choice = c(1, 2, 3, 2, 1, 3),
    stringsAsFactors = FALSE
  )

  wide_ranked <- long_to_wide(
    data_frame = ranked_df,
    column_choice = "choice",
    column_alternative = "alternative",
    column_decider = "deciderID",
    choice_type = "ranked"
  )

  expect_setequal(
    grep("^choice_", names(wide_ranked), value = TRUE),
    paste0("choice_", c("A", "B", "C"))
  )
  expect_equal(wide_ranked$choice, c("A", "B"))

  long_ranked <- wide_to_long(
    data_frame = wide_ranked,
    column_choice = "choice",
    column_alternative = "alternative",
    alternatives = c("A", "B", "C"),
    choice_type = "ranked"
  )

  long_ranked <- long_ranked[order(long_ranked$deciderID, long_ranked$alternative), ]
  ranked_df <- ranked_df[order(ranked_df$deciderID, ranked_df$alternative), ]
  expect_equal(long_ranked$choice, ranked_df$choice)
  expect_equal(long_ranked$alternative, ranked_df$alternative)
  expect_equal(long_ranked$deciderID, ranked_df$deciderID)
})

test_that("ordered choice data preserves categories", {
  ordered_df <- data.frame(
    deciderID = 1:4,
    choice = factor(c("low", "medium", "high", "medium"), ordered = TRUE)
  )

  ch_data <- choice_data(
    data_frame = ordered_df,
    format = "wide",
    column_choice = "choice",
    column_decider = "deciderID",
    choice_type = "ordered"
  )
  effects <- choice_effects(
    choice_formula = choice_formula(
      formula = choice ~ 0 | 0 | 0,
      error_term = "probit"
    ),
    choice_alternatives = choice_alternatives(
      J = 3,
      alternatives = c("high", "low", "medium"),
      ordered = TRUE
    )
  )

  indices <- extract_choice_indices(ch_data, effects)
  expect_equal(unlist(indices), c(2L, 3L, 1L, 3L))
})

test_that("is.choice_data validates inputs and reports variable names", {
  cd <- choice_data(
    data_frame = train_choice,
    format = "wide",
    column_choice = "choice",
    column_decider = "deciderID",
    column_occasion = "occasionID"
  )
  expect_true(is.choice_data(cd))
  expect_error(
    is.choice_data(train_choice, var_name = "train_choice"),
    "train_choice",
    fixed = TRUE
  )
  expect_false(is.choice_data(train_choice, error = FALSE))
})

test_that("choice_data respects custom delimiters in long format", {

  custom_delimiter <- "-"
  long_data <- data.frame(
    individual = rep(1:2, each = 2),
    mode = rep(c("car", "bus"), times = 2),
    choice = c(1, 0, 0, 1),
    income = rep(c(50, 60), each = 2),
    wait = c(5, 10, 3, 6),
    stringsAsFactors = FALSE
  )

  choice_obj <- choice_data(
    data_frame = long_data,
    format = "long",
    column_choice = "choice",
    column_decider = "individual",
    column_occasion = NULL,
    column_alternative = "mode",
    column_ac_covariates = "income",
    column_as_covariates = "wait",
    delimiter = custom_delimiter,
    cross_section = TRUE
  )

  expect_true(is.choice_data(choice_obj))

  round_trip_wide <- long_to_wide(
    data_frame = as.data.frame(choice_obj),
    column_ac_covariates = attr(choice_obj, "column_ac_covariates"),
    column_as_covariates = attr(choice_obj, "column_as_covariates"),
    column_choice = attr(choice_obj, "column_choice"),
    column_alternative = attr(choice_obj, "column_alternative"),
    column_decider = attr(choice_obj, "column_decider"),
    column_occasion = attr(choice_obj, "column_occasion"),
    delimiter = custom_delimiter
  )

  expected_wide_columns <- c(
    "individual",
    "income",
    paste0("wait", custom_delimiter, c("car", "bus")),
    "choice"
  )

  expect_setequal(names(round_trip_wide), expected_wide_columns)
  expect_setequal(attr(choice_obj, "column_as_covariates_wide"),
    paste0("wait", custom_delimiter, c("car", "bus"))
  )
})

test_that("ordered simulations propagate the choice type", {

  for (error_term in c("logit", "probit")) {
    set.seed(if (error_term == "logit") 1 else 2)

    choice_effects <- choice_effects(
      choice_formula = choice_formula(
        formula = choice ~ income | 0,
        error_term = error_term
      ),
      choice_alternatives = choice_alternatives(
        J = 3,
        alternatives = c("low", "medium", "high"),
        ordered = TRUE
      )
    )

    choice_identifiers <- generate_choice_identifiers(
      N = 4,
      Tp = rep(1, 4)
    )

    choice_covariates <- generate_choice_covariates(
      choice_effects = choice_effects,
      choice_identifiers = choice_identifiers
    )

    choice_parameters <- generate_choice_parameters(
      choice_effects = choice_effects
    )

    choice_preferences <- generate_choice_preferences(
      choice_parameters = choice_parameters,
      choice_effects = choice_effects,
      choice_identifiers = choice_identifiers
    )

    simulated_data <- generate_choice_data(
      choice_effects = choice_effects,
      choice_identifiers = choice_identifiers,
      choice_covariates = choice_covariates,
      choice_parameters = choice_parameters,
      choice_preferences = choice_preferences,
      column_choice = "choice"
    )

    expect_true(is.choice_data(simulated_data), info = error_term)
    expect_identical(attr(simulated_data, "choice_type"), "ordered")
    checkmate::expect_subset(
      unique(simulated_data[[attr(simulated_data, "column_choice")]]),
      as.character(attr(choice_effects, "choice_alternatives"))
    )
  }
})

test_that("ranked simulations support logit and probit error terms", {

  for (error_term in c("logit", "probit")) {
    set.seed(if (error_term == "logit") 11 else 12)

    choice_effects <- choice_effects(
      choice_formula = choice_formula(
        formula = choice ~ price | time,
        error_term = error_term
      ),
      choice_alternatives = choice_alternatives(J = 4)
    )

    choice_identifiers <- generate_choice_identifiers(
      N = 3,
      Tp = rep(1L, 3L)
    )

    choice_covariates <- generate_choice_covariates(
      choice_effects = choice_effects,
      choice_identifiers = choice_identifiers
    )

    choice_parameters <- generate_choice_parameters(
      choice_effects = choice_effects
    )

    choice_preferences <- generate_choice_preferences(
      choice_parameters = choice_parameters,
      choice_effects = choice_effects,
      choice_identifiers = choice_identifiers
    )

    simulated_data <- generate_choice_data(
      choice_effects = choice_effects,
      choice_identifiers = choice_identifiers,
      choice_covariates = choice_covariates,
      choice_parameters = choice_parameters,
      choice_preferences = choice_preferences,
      column_choice = "choice",
      choice_type = "ranked"
    )

    expect_true(is.choice_data(simulated_data), info = error_term)
    expect_identical(attr(simulated_data, "choice_type"), "ranked")

    alt_names <- as.character(attr(choice_effects, "choice_alternatives"))
    rank_cols <- paste0("choice_", alt_names)
    expect_true(all(rank_cols %in% names(simulated_data)), info = error_term)
    expect_true(all(vapply(rank_cols, function(col) {
      is.integer(simulated_data[[col]])
    }, logical(1))), info = error_term)

    ranking_matrix <- as.matrix(simulated_data[rank_cols])
    expect_true(all(ranking_matrix >= 1 &
      ranking_matrix <= length(alt_names)), info = error_term)

    inferred_top <- alt_names[max.col(-ranking_matrix, ties.method = "first")]
    expect_equal(simulated_data$choice, inferred_top, info = error_term)
  }
})