Nothing
#' Prepare choice data for estimation
#'
#' @description
#' This function prepares choice data for estimation.
#'
#' @details
#' Requirements for the \code{data.frame} \code{choice_data}:
#' \itemize{
#' \item It **must** contain a column named \code{id} which contains unique
#' identifier for each decision maker.
#' \item It **can** contain a column named \code{idc} which contains unique
#' identifier for each choice situation of each decision maker.
#' If this information is missing, these identifier are generated
#' automatically by the appearance of the choices in the data set.
#' \item It **can** contain a column named \code{choice} with the observed
#' choices, where \code{choice} must match the name of the dependent
#' variable in \code{form}.
#' Such a column is required for model fitting but not for prediction.
#' \item It **must** contain a numeric column named *p_j* for each alternative
#' specific covariate *p* in \code{form} and each choice alternative *j*
#' in \code{alternatives}.
#' \item It **must** contain a numeric column named *q* for each covariate *q*
#' in \code{form} that is constant across alternatives.
#' }
#'
#' In the ordered case (\code{ordered = TRUE}), the column \code{choice} must
#' contain the full ranking of the alternatives in each choice occasion as a
#' character, where the alternatives are separated by commas, see the examples.
#'
#' See [the vignette on choice data](https://loelschlaeger.de/RprobitB/articles/v02_choice_data.html)
#' for more details.
#'
#' @param choice_data \[`data.frame`\]\cr
#' Choice data in wide format, where each row represents one choice occasion.
#'
#' @param id \[`character(1)`\]\cr
#' The name of the column in \code{choice_data} that contains unique identifier
#' for each decision maker.
#'
#' @param idc \[`character(1)`\]\cr
#' The name of the column in \code{choice_data} that contains
#' unique identifier for each choice situation of each decision maker.
#' By default, these identifier are generated by the order of appearance.
#'
#' @inheritParams check_form
#' @inheritParams RprobitB_data
#' @inheritParams missing_covariates
#'
#' @return
#' An object of class \code{RprobitB_data}.
#'
#' @examples
#' data <- prepare_data(
#' form = choice ~ price + time + comfort + change | 0,
#' choice_data = train_choice,
#' re = c("price", "time"),
#' id = "deciderID",
#' idc = "occasionID",
#' standardize = c("price", "time")
#' )
#'
#' ### ranked case
#' choice_data <- data.frame(
#' "id" = 1:3, "choice" = c("A,B,C", "A,C,B", "B,C,A"), "cov" = 1
#' )
#' data <- prepare_data(
#' form = choice ~ 0 | cov + 0,
#' choice_data = choice_data,
#' ranked = TRUE
#' )
#'
#' @export
#'
#' @seealso
#' \itemize{
#' \item [check_form()] for checking the model formula
#' \item [overview_effects()] for an overview of the model effects
#' \item [create_lagged_cov()] for creating lagged covariates
#' \item [as_cov_names()] for re-labeling alternative-specific covariates
#' \item [simulate_choices()] for simulating choice data
#' \item [train_test()] for splitting choice data into a train and test subset
#' }
prepare_data <- function(
form, choice_data, re = NULL, alternatives = NULL, ordered = FALSE,
ranked = FALSE, base = NULL, id = "id", idc = NULL, standardize = NULL,
impute = "complete_cases"
) {
### check 'form'
oeli::input_check_response(
check = oeli::check_missing(form),
var_name = "form"
)
check_form_out <- check_form(form = form, re = re, ordered = ordered)
form <- check_form_out$form
choice <- check_form_out$choice
re <- check_form_out$re
vars <- check_form_out$vars
ASC <- check_form_out$ASC
### check other inputs
oeli::input_check_response(
check = checkmate::check_flag(ordered),
var_name = "ordered"
)
oeli::input_check_response(
check = checkmate::check_flag(ranked),
var_name = "ranked"
)
if (isTRUE(ordered) && isTRUE(ranked)) {
stop("'ordered' and 'ranked' cannot both be TRUE.", call. = FALSE)
}
### check 'choice_data'
oeli::input_check_response(
check = checkmate::check_data_frame(choice_data),
var_name = "choice_data"
)
oeli::input_check_response(
check = checkmate::check_string(id),
var_name = "id"
)
if (!id %in% colnames(choice_data)) {
stop(
paste0(
"Decider identification column '", id, "' not found in 'choice_data'."
),
call. = FALSE
)
}
if (!is.null(idc)) {
oeli::input_check_response(
check = checkmate::check_string(idc),
var_name = "idc"
)
if (!idc %in% colnames(choice_data)) {
stop(
paste0(
"Choice occasion identification column '", idc,
"' not found in 'choice_data'."
),
call. = FALSE
)
}
}
### transform 'id' of 'choice_data' to factor
choice_data[, id] <- as.factor(choice_data[, id])
### sort 'choice_data' by 'id'
choice_data <- choice_data[order(choice_data[, id]), ]
### create choice occasion 'idc' (if not specified)
if (is.null(idc)) {
idc <- "idc"
choice_data[, idc] <- unlist(
sapply(table(choice_data[, id]), seq_len, simplify = FALSE)
)
}
### transform 'idc' of 'choice_data' to factor
choice_data[, idc] <- as.factor(choice_data[, idc])
### sort 'choice_data' first by column 'id' and second by column 'idc'
choice_data <- choice_data[order(choice_data[, id], choice_data[, idc]), ]
### handle missing covariates
choice_data <- missing_covariates(
choice_data = choice_data, impute = impute,
col_ignore = c(id, idc, choice)
)
### check if 'choice_data' contains choices
choice_available <- (choice %in% colnames(choice_data))
if (!choice_available) choice <- NA
### check alternative set
if (ordered) {
if (is.null(alternatives)) {
stop(
"Please specify 'alternatives', ordered from worst to best.",
call. = FALSE
)
}
} else {
if (is.null(alternatives)) {
if (choice_available) {
alternatives <- as.character(unique(choice_data[[choice]]))
if (ranked) {
alternatives <- unique(unlist(strsplit(alternatives, ",")))
}
} else {
stop(
"Please specify 'alternatives' if choices are not available.",
call. = FALSE
)
}
} else {
if (!is.character(alternatives)) {
stop(
"'alternatives' must be a character vector.",
call. = FALSE
)
}
if (choice_available && !ranked) {
choice_data <- choice_data[choice_data[[choice]] %in% alternatives, ]
choice_data[, id] <- droplevels(choice_data[, id])
choice_data[, idc] <- droplevels(choice_data[, idc])
if (nrow(choice_data) == 0) {
stop(
paste(
"No choices for", paste(alternatives, collapse = ", "), "found."
),
call. = FALSE
)
}
}
}
alternatives <- sort(alternatives)
}
J <- length(alternatives)
if (J <= 1) {
stop(
"At least two choice alternatives are required, only one provided.",
call. = FALSE
)
}
if (ordered == TRUE && J <= 2) {
stop(
"Please specify 3 or more alternatives for the ordered case.",
call. = FALSE
)
}
if (ranked == TRUE && J <= 2) {
stop(
"Please specify 3 or more alternatives for the ranked case.",
call. = FALSE
)
}
### determine index of base alternative
if (ordered || (!ASC && length(vars[[1]]) == 0 && length(vars[[2]]) == 0)) {
base <- NULL
} else {
if (is.null(base)) {
base <- alternatives[J]
base_index <- J
} else if (any(alternatives == base)) {
base_index <- which(alternatives == base)
} else {
base <- alternatives[J]
warning(
paste0(
"'base' not contained in 'alternatives'. ",
"Set 'base = ", alternatives[J], "' instead."
),
immediate. = TRUE, call. = FALSE
)
base_index <- J
}
}
### check if all required covariates are present in 'choice_data' and numerics
for (var in vars[[2]]) {
if (!var %in% names(choice_data)) {
stop(paste0("Column '", var, "' not found in 'choice_data'."),
call. = FALSE
)
}
if (!is.numeric(choice_data[, var])) {
stop(paste0("Column '", var, "' in 'choice_data' is not numeric."),
call. = FALSE
)
}
}
for (var in c(vars[[1]], vars[[3]])) {
for (j in alternatives) {
if (!paste0(var, "_", j) %in% names(choice_data)) {
stop(
paste0(
"Column '", paste0(var, "_", j),
"' not found in 'choice_data'."
),
call. = FALSE
)
}
if (!is.numeric(choice_data[, paste0(var, "_", j)])) {
stop(
paste0(
"Column '", paste0(var, "_", j),
"' in 'choice_data' is not numeric."
),
call. = FALSE
)
}
}
}
### determine number and names of linear coefficients
effects <- overview_effects(form, re, alternatives, base, ordered)
P_f <- sum(effects$random == FALSE)
P_r <- sum(effects$random == TRUE)
### artificially add ASCs
if (ASC) choice_data[, "ASC"] <- 1
### standardize covariates
if (!is.null(standardize)) {
if (!is.character(standardize)) {
stop("'standardize' must be a character (vector).",
call. = FALSE
)
}
if (identical(standardize, "all")) {
standardize <- c(
apply(expand.grid(vars[[1]], alternatives), 1, paste, collapse = "_"),
vars[[2]],
apply(expand.grid(vars[[3]], alternatives), 1, paste, collapse = "_")
)
}
if ("ASC" %in% standardize) {
warning(
"Removed 'ASC' from 'standardize'.", call. = FALSE, immediate. = TRUE
)
standardize <- standardize[-which(standardize == "ASC")]
}
for (var in vars[[2]]) {
if (var %in% standardize) {
choice_data[, var] <- scale(choice_data[, var])
}
}
for (var in c(vars[[1]], vars[[3]])) {
for (j in alternatives) {
var_alt <- paste0(var, "_", j)
if (var_alt %in% standardize) {
choice_data[, var_alt] <- scale(choice_data[, var_alt])
}
}
}
}
### transform 'choice_data' in list format 'data'
ids <- unique(choice_data[, id])
N <- length(ids)
T <- as.numeric(table(choice_data[, id]))
data <- list()
pb <- RprobitB_pb(title = "Preparing data", total = N, tail = "deciders")
for (n in seq_len(N)) {
RprobitB_pb_tick(pb)
data[[n]] <- list()
data_n <- choice_data[choice_data[, id] == ids[n], ]
X_n <- list()
for (t in seq_len(T[n])) {
data_nt <- data_n[t, ]
if (ordered) {
X_nt <- matrix(data_nt[, vars[[2]]], nrow = 1)
colnames(X_nt) <- vars[[2]]
} else {
X_nt <- matrix(NA_real_, nrow = J, ncol = 0)
### type-1 covariates
for (var in vars[[1]]) {
old_names <- colnames(X_nt)
col <- numeric(J)
for (j in 1:J) {
col[j] <- data_nt[, paste0(var, "_", alternatives[j])]
}
X_nt <- cbind(X_nt, col)
colnames(X_nt) <- c(old_names, var)
}
### type-2 covariates
for (var in c(vars[[2]], if (ASC) "ASC")) {
old_names <- colnames(X_nt)
mat <- matrix(0, J, J)
for (j in (1:J)[-base_index]) {
mat[j, j] <- data_nt[, var]
}
mat <- mat[, -base_index, drop = FALSE]
X_nt <- cbind(X_nt, mat)
colnames(X_nt) <- c(
old_names,
paste0(
var, "_",
alternatives[(1:J)[-base_index]]
)
)
}
### type-3 covariates
for (var in vars[[3]]) {
old_names <- colnames(X_nt)
mat <- matrix(0, J, J)
for (j in 1:J) {
mat[j, j] <- data_nt[, paste0(var, "_", alternatives[j])]
}
X_nt <- cbind(X_nt, mat)
colnames(X_nt) <- c(old_names, paste0(var, "_", alternatives))
}
}
### sort covariates
X_nt <- X_nt[, effects$effect, drop = FALSE]
### save in list
X_n[[t]] <- X_nt
}
data[[n]][["X"]] <- X_n
data[[n]][["y"]] <- if (choice_available) data_n[[choice]] else NA
}
### delete "ASC" from 'choice_data'
if (ASC) choice_data$ASC <- NULL
### save cov names
alt_length <- length(alternatives)
cov_names <- c(
if (length(vars[[1]]) > 0) {
paste(rep(vars[[1]], each = alt_length), alternatives, sep = "_")
},
vars[[2]],
if (length(vars[[3]]) > 0) {
paste(rep(vars[[3]], each = alt_length), alternatives, sep = "_")
}
)
### create output
RprobitB_data(
data = data,
choice_data = choice_data,
N = N,
T = T,
J = J,
P_f = P_f,
P_r = P_r,
alternatives = alternatives,
ordered = ordered,
ranked = ranked,
base = base,
form = form,
re = re,
ASC = ASC,
effects = effects,
standardize = standardize,
simulated = FALSE,
choice_available = choice_available,
true_parameter = NULL,
res_var_names = list(
"choice" = choice, "cov" = cov_names, "id" = id, "idc" = idc
)
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.