R/prepare_data.R
In RprobitB: Bayesian Probit Choice Modeling

Documented in prepare_data

#' Prepare choice data for estimation
#'
#' @description
#' This function prepares choice data for estimation.
#'
#' @details
#' Requirements for the \code{data.frame} \code{choice_data}:
#' \itemize{
#'   \item It **must** contain a column named \code{id} which contains unique
#'         identifier for each decision maker.
#'   \item It **can** contain a column named \code{idc} which contains unique
#'         identifier for each choice situation of each decision maker.
#'         If this information is missing, these identifier are generated
#'         automatically by the appearance of the choices in the data set.
#'   \item It **can** contain a column named \code{choice} with the observed
#'         choices, where \code{choice} must match the name of the dependent
#'         variable in \code{form}.
#'         Such a column is required for model fitting but not for prediction.
#'   \item It **must** contain a numeric column named *p_j* for each alternative
#'         specific covariate *p* in \code{form} and each choice alternative *j*
#'         in \code{alternatives}.
#'   \item It **must** contain a numeric column named *q* for each covariate *q*
#'         in \code{form} that is constant across alternatives.
#' }
#'
#' In the ordered case (\code{ordered = TRUE}), the column \code{choice} must
#' contain the full ranking of the alternatives in each choice occasion as a
#' character, where the alternatives are separated by commas, see the examples.
#'
#' See [the vignette on choice data](https://loelschlaeger.de/RprobitB/articles/v02_choice_data.html)
#' for more details.
#'
#' @param choice_data \[`data.frame`\]\cr
#' Choice data in wide format, where each row represents one choice occasion.
#'
#' @param id \[`character(1)`\]\cr
#' The name of the column in \code{choice_data} that contains unique identifier
#' for each decision maker.
#'
#' @param idc \[`character(1)`\]\cr
#' The name of the column in \code{choice_data} that contains
#' unique identifier for each choice situation of each decision maker.
#' By default, these identifier are generated by the order of appearance.
#'
#' @inheritParams check_form
#' @inheritParams RprobitB_data
#' @inheritParams missing_covariates
#'
#' @return
#' An object of class \code{RprobitB_data}.
#'
#' @examples
#' data <- prepare_data(
#'   form = choice ~ price + time + comfort + change | 0,
#'   choice_data = train_choice,
#'   re = c("price", "time"),
#'   id = "deciderID",
#'   idc = "occasionID",
#'   standardize = c("price", "time")
#' )
#'
#' ### ranked case
#' choice_data <- data.frame(
#'   "id" = 1:3, "choice" = c("A,B,C", "A,C,B", "B,C,A"), "cov" = 1
#' )
#' data <- prepare_data(
#'   form = choice ~ 0 | cov + 0,
#'   choice_data = choice_data,
#'   ranked = TRUE
#' )
#'
#' @export
#'
#' @seealso
#' \itemize{
#'   \item [check_form()] for checking the model formula
#'   \item [overview_effects()] for an overview of the model effects
#'   \item [create_lagged_cov()] for creating lagged covariates
#'   \item [as_cov_names()] for re-labeling alternative-specific covariates
#'   \item [simulate_choices()] for simulating choice data
#'   \item [train_test()] for splitting choice data into a train and test subset
#' }

prepare_data <- function(
    form, choice_data, re = NULL, alternatives = NULL, ordered = FALSE,
    ranked = FALSE, base = NULL, id = "id", idc = NULL, standardize = NULL,
    impute = "complete_cases"
  ) {

  ### check 'form'
  oeli::input_check_response(
    check = oeli::check_missing(form),
    var_name = "form"
  )
  check_form_out <- check_form(form = form, re = re, ordered = ordered)
  form <- check_form_out$form
  choice <- check_form_out$choice
  re <- check_form_out$re
  vars <- check_form_out$vars
  ASC <- check_form_out$ASC

  ### check other inputs
  oeli::input_check_response(
    check = checkmate::check_flag(ordered),
    var_name = "ordered"
  )
  oeli::input_check_response(
    check = checkmate::check_flag(ranked),
    var_name = "ranked"
  )
  if (isTRUE(ordered) && isTRUE(ranked)) {
    stop("'ordered' and 'ranked' cannot both be TRUE.", call. = FALSE)
  }

  ### check 'choice_data'
  oeli::input_check_response(
    check = checkmate::check_data_frame(choice_data),
    var_name = "choice_data"
  )
  oeli::input_check_response(
    check = checkmate::check_string(id),
    var_name = "id"
  )
  if (!id %in% colnames(choice_data)) {
    stop(
      paste0(
        "Decider identification column '", id, "' not found in 'choice_data'."
      ),
      call. = FALSE
    )
  }
  if (!is.null(idc)) {
    oeli::input_check_response(
      check = checkmate::check_string(idc),
      var_name = "idc"
    )
    if (!idc %in% colnames(choice_data)) {
      stop(
        paste0(
          "Choice occasion identification column '", idc,
          "' not found in 'choice_data'."
        ),
        call. = FALSE
      )
    }
  }

  ### transform 'id' of 'choice_data' to factor
  choice_data[, id] <- as.factor(choice_data[, id])

  ### sort 'choice_data' by 'id'
  choice_data <- choice_data[order(choice_data[, id]), ]

  ### create choice occasion 'idc' (if not specified)
  if (is.null(idc)) {
    idc <- "idc"
    choice_data[, idc] <- unlist(
      sapply(table(choice_data[, id]), seq_len, simplify = FALSE)
    )
  }

  ### transform 'idc' of 'choice_data' to factor
  choice_data[, idc] <- as.factor(choice_data[, idc])

  ### sort 'choice_data' first by column 'id' and second by column 'idc'
  choice_data <- choice_data[order(choice_data[, id], choice_data[, idc]), ]

  ### handle missing covariates
  choice_data <- missing_covariates(
    choice_data = choice_data, impute = impute,
    col_ignore = c(id, idc, choice)
  )

  ### check if 'choice_data' contains choices
  choice_available <- (choice %in% colnames(choice_data))
  if (!choice_available) choice <- NA

  ### check alternative set
  if (ordered) {
    if (is.null(alternatives)) {
      stop(
        "Please specify 'alternatives', ordered from worst to best.",
        call. = FALSE
      )
    }
  } else {
    if (is.null(alternatives)) {
      if (choice_available) {
        alternatives <- as.character(unique(choice_data[[choice]]))
        if (ranked) {
          alternatives <- unique(unlist(strsplit(alternatives, ",")))
        }
      } else {
        stop(
          "Please specify 'alternatives' if choices are not available.",
          call. = FALSE
        )
      }
    } else {
      if (!is.character(alternatives)) {
        stop(
          "'alternatives' must be a character vector.",
          call. = FALSE
        )
      }
      if (choice_available && !ranked) {
        choice_data <- choice_data[choice_data[[choice]] %in% alternatives, ]
        choice_data[, id] <- droplevels(choice_data[, id])
        choice_data[, idc] <- droplevels(choice_data[, idc])
        if (nrow(choice_data) == 0) {
          stop(
            paste(
              "No choices for", paste(alternatives, collapse = ", "), "found."
            ),
            call. = FALSE
          )
        }
      }
    }
    alternatives <- sort(alternatives)
  }
  J <- length(alternatives)
  if (J <= 1) {
    stop(
      "At least two choice alternatives are required, only one provided.",
      call. = FALSE
    )
  }
  if (ordered == TRUE && J <= 2) {
    stop(
      "Please specify 3 or more alternatives for the ordered case.",
      call. = FALSE
    )
  }
  if (ranked == TRUE && J <= 2) {
    stop(
      "Please specify 3 or more alternatives for the ranked case.",
      call. = FALSE
    )
  }

  ### determine index of base alternative
  if (ordered || (!ASC && length(vars[[1]]) == 0 && length(vars[[2]]) == 0)) {
    base <- NULL
  } else {
    if (is.null(base)) {
      base <- alternatives[J]
      base_index <- J
    } else if (any(alternatives == base)) {
      base_index <- which(alternatives == base)
    } else {
      base <- alternatives[J]
      warning(
        paste0(
          "'base' not contained in 'alternatives'. ",
          "Set 'base = ", alternatives[J], "' instead."
        ),
        immediate. = TRUE, call. = FALSE
      )
      base_index <- J
    }
  }

  ### check if all required covariates are present in 'choice_data' and numerics
  for (var in vars[[2]]) {
    if (!var %in% names(choice_data)) {
      stop(paste0("Column '", var, "' not found in 'choice_data'."),
           call. = FALSE
      )
    }
    if (!is.numeric(choice_data[, var])) {
      stop(paste0("Column '", var, "' in 'choice_data' is not numeric."),
           call. = FALSE
      )
    }
  }
  for (var in c(vars[[1]], vars[[3]])) {
    for (j in alternatives) {
      if (!paste0(var, "_", j) %in% names(choice_data)) {
        stop(
          paste0(
            "Column '", paste0(var, "_", j),
            "' not found in 'choice_data'."
          ),
          call. = FALSE
        )
      }
      if (!is.numeric(choice_data[, paste0(var, "_", j)])) {
        stop(
          paste0(
            "Column '", paste0(var, "_", j),
            "' in 'choice_data' is not numeric."
          ),
          call. = FALSE
        )
      }
    }
  }

  ### determine number and names of linear coefficients
  effects <- overview_effects(form, re, alternatives, base, ordered)
  P_f <- sum(effects$random == FALSE)
  P_r <- sum(effects$random == TRUE)

  ### artificially add ASCs
  if (ASC) choice_data[, "ASC"] <- 1

  ### standardize covariates
  if (!is.null(standardize)) {
    if (!is.character(standardize)) {
      stop("'standardize' must be a character (vector).",
           call. = FALSE
      )
    }
    if (identical(standardize, "all")) {
      standardize <- c(
        apply(expand.grid(vars[[1]], alternatives), 1, paste, collapse = "_"),
        vars[[2]],
        apply(expand.grid(vars[[3]], alternatives), 1, paste, collapse = "_")
      )
    }
    if ("ASC" %in% standardize) {
      warning(
        "Removed 'ASC' from 'standardize'.", call. = FALSE, immediate. = TRUE
      )
      standardize <- standardize[-which(standardize == "ASC")]
    }
    for (var in vars[[2]]) {
      if (var %in% standardize) {
        choice_data[, var] <- scale(choice_data[, var])
      }
    }
    for (var in c(vars[[1]], vars[[3]])) {
      for (j in alternatives) {
        var_alt <- paste0(var, "_", j)
        if (var_alt %in% standardize) {
          choice_data[, var_alt] <- scale(choice_data[, var_alt])
        }
      }
    }
  }

  ### transform 'choice_data' in list format 'data'
  ids <- unique(choice_data[, id])
  N <- length(ids)
  T <- as.numeric(table(choice_data[, id]))
  data <- list()
  pb <- RprobitB_pb(title = "Preparing data", total = N, tail = "deciders")
  for (n in seq_len(N)) {
    RprobitB_pb_tick(pb)
    data[[n]] <- list()
    data_n <- choice_data[choice_data[, id] == ids[n], ]
    X_n <- list()

    for (t in seq_len(T[n])) {
      data_nt <- data_n[t, ]

      if (ordered) {
        X_nt <- matrix(data_nt[, vars[[2]]], nrow = 1)
        colnames(X_nt) <- vars[[2]]
      } else {
        X_nt <- matrix(NA_real_, nrow = J, ncol = 0)

        ### type-1 covariates
        for (var in vars[[1]]) {
          old_names <- colnames(X_nt)
          col <- numeric(J)
          for (j in 1:J) {
            col[j] <- data_nt[, paste0(var, "_", alternatives[j])]
          }
          X_nt <- cbind(X_nt, col)
          colnames(X_nt) <- c(old_names, var)
        }

        ### type-2 covariates
        for (var in c(vars[[2]], if (ASC) "ASC")) {
          old_names <- colnames(X_nt)
          mat <- matrix(0, J, J)
          for (j in (1:J)[-base_index]) {
            mat[j, j] <- data_nt[, var]
          }
          mat <- mat[, -base_index, drop = FALSE]
          X_nt <- cbind(X_nt, mat)
          colnames(X_nt) <- c(
            old_names,
            paste0(
              var, "_",
              alternatives[(1:J)[-base_index]]
            )
          )
        }

        ### type-3 covariates
        for (var in vars[[3]]) {
          old_names <- colnames(X_nt)
          mat <- matrix(0, J, J)
          for (j in 1:J) {
            mat[j, j] <- data_nt[, paste0(var, "_", alternatives[j])]
          }
          X_nt <- cbind(X_nt, mat)
          colnames(X_nt) <- c(old_names, paste0(var, "_", alternatives))
        }
      }

      ### sort covariates
      X_nt <- X_nt[, effects$effect, drop = FALSE]

      ### save in list
      X_n[[t]] <- X_nt
    }

    data[[n]][["X"]] <- X_n
    data[[n]][["y"]] <- if (choice_available) data_n[[choice]] else NA
  }

  ### delete "ASC" from 'choice_data'
  if (ASC) choice_data$ASC <- NULL

  ### save cov names
  alt_length <- length(alternatives)
  cov_names <- c(
    if (length(vars[[1]]) > 0) {
      paste(rep(vars[[1]], each = alt_length), alternatives, sep = "_")
    },
    vars[[2]],
    if (length(vars[[3]]) > 0) {
      paste(rep(vars[[3]], each = alt_length), alternatives, sep = "_")
    }
  )

  ### create output
  RprobitB_data(
    data = data,
    choice_data = choice_data,
    N = N,
    T = T,
    J = J,
    P_f = P_f,
    P_r = P_r,
    alternatives = alternatives,
    ordered = ordered,
    ranked = ranked,
    base = base,
    form = form,
    re = re,
    ASC = ASC,
    effects = effects,
    standardize = standardize,
    simulated = FALSE,
    choice_available = choice_available,
    true_parameter = NULL,
    res_var_names = list(
      "choice" = choice, "cov" = cov_names, "id" = id, "idc" = idc
    )
  )
}