R/approach_regression_separate.R

# Shapr functions ======================================================================================================
#' @rdname setup_approach
#'
#' @param regression.model A `tidymodels` object of class `model_specs`. Default is a linear regression model, i.e.,
#' [parsnip::linear_reg()]. See \href{https://www.tidymodels.org/find/parsnip/}{tidymodels} for all possible models,
#' and see the vignette for how to add new/own models. Note, to make it easier to call `explain()` from Python, the
#' `regression.model` parameter can also be a string specifying the model which will be parsed and evaluated. For
#' example, `"parsnip::rand_forest(mtry = hardhat::tune(), trees = 100, engine = "ranger", mode = "regression")"`
#' is also a valid input. It is essential to include the package prefix if the package is not loaded.
#' @param regression.tune_values Either `NULL` (default), a data.frame/data.table/tibble, or a function.
#' The data.frame must contain the possible hyperparameter value combinations to try.
#' The column names must match the names of the tuneable parameters specified in `regression.model`.
#' If `regression.tune_values` is a function, then it should take one argument `x`, which is the training data
#' for the current combination/coalition, and return a data.frame/data.table/tibble with the properties described
#' above. Using a function allows the hyperparameter values to change based on the size of the coalition. See the
#' regression vignette for several examples.
#' Note, to make it easier to call `explain()` from Python, the `regression.tune_values` can also be a string
#' containing an R function. For example,
#' `"function(x) return(dials::grid_regular(dials::mtry(c(1, ncol(x)))), levels = 3))"` is also a valid input.
#' It is essential to include the package prefix if the package is not loaded.
#' @param regression.vfold_cv_para Either `NULL` (default) or a named list containing
#' the parameters to be sent to [rsample::vfold_cv()]. See the regression vignette for
#' several examples.
#' @param regression.recipe_func Either `NULL` (default) or a function that takes in a [recipes::recipe()]
#' object and returns a modified [recipes::recipe()] with potentially additional recipe steps. See the regression
#' vignette for several examples.
#' Note, to make it easier to call `explain()` from Python, the `regression.recipe_func` can also be a string
#' containing an R function. For example,
#' `"function(recipe) return(recipes::step_ns(recipe, recipes::all_numeric_predictors(), deg_free = 2))"` is also
#' a valid input. It is essential to include the package prefix if the package is not loaded.
#' @inheritParams default_doc_explain
#'
#' @export
#' @author Lars Henry Berge Olsen
setup_approach.regression_separate <- function(internal,
                                               regression.model = parsnip::linear_reg(),
                                               regression.tune_values = NULL,
                                               regression.vfold_cv_para = NULL,
                                               regression.recipe_func = NULL,
                                               ...) {
  # Check that required libraries are installed
  regression.check_namespaces()

  # Small printout to the user
  if (internal$parameters$verbose == 2) message("Starting 'setup_approach.regression_separate'.")
  if (internal$parameters$verbose == 2) regression.separate_time_mess() # TODO: maybe remove

  # Add the default parameter values for the non-user specified parameters for the separate regression approach
  defaults <-
    mget(c("regression.model", "regression.tune_values", "regression.vfold_cv_para", "regression.recipe_func"))
  internal <- insert_defaults(internal, defaults)

  # Check the parameters to the regression approach
  internal <- regression.check_parameters(internal = internal)

  # Small printout to the user
  if (internal$parameters$verbose == 2) message("Done with 'setup_approach.regression_separate'.")

  return(internal) # Return the updated internal list
}
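
# Example usage (a minimal sketch, not run at package load; assumes a fitted `model`, data `x_train` and
# `x_explain`, and the expected response `p0` exist, in the spirit of the regression vignette):
if (FALSE) {
  explanation <- explain(
    model = model,
    x_explain = x_explain,
    x_train = x_train,
    approach = "regression_separate",
    prediction_zero = p0,
    # A decision tree where the tree depth is chosen by cross validation
    regression.model = parsnip::decision_tree(tree_depth = hardhat::tune(), engine = "rpart", mode = "regression"),
    regression.tune_values = data.frame(tree_depth = c(1, 2, 3)),
    regression.vfold_cv_para = list(v = 5)
  )
}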

#' @inheritParams default_doc
#' @rdname prepare_data
#' @export
#' @author Lars Henry Berge Olsen
prepare_data.regression_separate <- function(internal, index_features = NULL, ...) {
  # Load `workflows`, needed when parallelized as we call predict with a workflow object.
  # Its installation was checked in `setup_approach.regression_separate()` via `regression.check_namespaces()`.
  requireNamespace("workflows", quietly = TRUE)

  # Get the features in the batch
  features <- internal$objects$X$features[index_features]

  # Small printout to the user about which batch is currently being worked on
  if (internal$parameters$verbose == 2) regression.prep_message_batch(internal, index_features)

  # Initialize empty data table with specific column names and id_combination (transformed to integer later). The data
  # table will contain the contribution function values for the coalitions given by `index_features` and all explicands.
  dt_res_column_names <- c("id_combination", paste0("p_hat1_", seq_len(internal$parameters$n_explain)))
  dt_res <- data.table(matrix(ncol = length(dt_res_column_names), nrow = 0, dimnames = list(NULL, dt_res_column_names)))

  # Iterate over the coalitions provided by index_features.
  # Note that index_features will never be NULL and never contain the empty or grand coalitions.
  for (comb_idx in seq_along(features)) {
    # Get the column indices of the features in current coalition/combination
    current_comb <- features[[comb_idx]]

    # Extract the current training (and add y_hat as response) and explain data
    current_x_train <- internal$data$x_train[, ..current_comb][, "y_hat" := internal$data$x_train_y_hat]
    current_x_explain <- internal$data$x_explain[, ..current_comb]

    # Fit the current separate regression model to the current training data
    if (internal$parameters$verbose == 2) regression.prep_message_comb(internal, index_features, comb_idx)
    regression.current_fit <- regression.train_model(
      x = current_x_train,
      seed = internal$parameters$seed,
      verbose = internal$parameters$verbose,
      regression.model = internal$parameters$regression.model,
      regression.tune = internal$parameters$regression.tune,
      regression.tune_values = internal$parameters$regression.tune_values,
      regression.vfold_cv_para = internal$parameters$regression.vfold_cv_para,
      regression.recipe_func = internal$parameters$regression.recipe_func
    )

    # Compute the predicted response for the explicands, i.e., the v(S, x_i) for all explicands x_i.
    pred_explicand <- predict(regression.current_fit, new_data = current_x_explain)$.pred

    # Add the new contribution function values for the current coalitions S to the result data table as a new row
    dt_res <- rbind(dt_res, data.table(index_features[comb_idx], matrix(pred_explicand, nrow = 1)), use.names = FALSE)
  }

  # Set id_combination to be the key
  dt_res[, id_combination := as.integer(id_combination)]
  data.table::setkey(dt_res, id_combination)

  # Return the estimated contribution function values
  return(dt_res)
}
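
# The returned `dt_res` contains one row per coalition in `index_features` and one column per explicand,
# e.g. (illustrative values only):
#    id_combination p_hat1_1 p_hat1_2 p_hat1_3
# 1:              2     1.85     2.01     1.47
# 2:              3     2.12     1.96     1.58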

# Train functions ======================================================================================================
#' Train a tidymodels model via workflows
#'
#' Function that trains a `tidymodels` model via `workflows` based on the provided input parameters.
#' This function allows for cross validating the hyperparameters of the model.
#'
#' @inheritParams setup_approach.regression_separate
#' @inheritParams explain
#' @param x Data.table containing the training data, i.e., the features together with the response variable
#' given by `regression.response_var`.
#' @param regression.tune Logical (default is `FALSE`). If `TRUE`, then we are to tune the hyperparameters based on
#' the values provided in `regression.tune_values`. Note that no checks are conducted as this is checked earlier in
#' `setup_approach.regression_separate` and `setup_approach.regression_surrogate`.
#' @param regression.response_var String (default is `y_hat`) containing the name of the response variable.
#' @param regression.surrogate_n_comb Integer (default is `NULL`). The number of times each training observation
#' has been augmented. If `NULL`, then we assume that we are doing separate regression.
#'
#' @return A trained `tidymodels` model based on the provided input parameters.
#' @export
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.train_model <- function(x,
                                   seed = 1,
                                   verbose = 0,
                                   regression.model = parsnip::linear_reg(),
                                   regression.tune = FALSE,
                                   regression.tune_values = NULL,
                                   regression.vfold_cv_para = NULL,
                                   regression.recipe_func = NULL,
                                   regression.response_var = "y_hat",
                                   regression.surrogate_n_comb = NULL) {
  # Create a recipe for the (potentially augmented) training data
  regression.recipe <- recipes::recipe(as.formula(paste(regression.response_var, "~ .")), data = x)

  # Update the recipe if the user has provided a function for this. The user is responsible for ensuring it works.
  # This function can, e.g., add transformations, normalization, dummy encoding, interactions, and so on.
  if (!is.null(regression.recipe_func)) regression.recipe <- regression.recipe_func(regression.recipe)

  # Combine workflow, model specification, and recipe
  regression.workflow <-
    workflows::add_recipe(workflows::add_model(workflows::workflow(), regression.model), regression.recipe)
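  # The nested call above is equivalent to the piped form:
  # workflows::workflow() %>% workflows::add_model(regression.model) %>% workflows::add_recipe(regression.recipe)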

  # Check if we are to tune hyperparameters in the regression model, as we then need to update the workflow.
  # If we are not doing any hyperparameter tuning, then the workflow above is enough.
  if (regression.tune) {
    # Set up the V-fold cross validation using the user provided parameters in `regression.vfold_cv_para`.
    # Note if `regression.vfold_cv_para` is NULL, then we use the default parameters in `vfold_cv()`.
    regression.folds <- do.call(rsample::vfold_cv, c(list(data = x), regression.vfold_cv_para))

    # Check if we are doing surrogate regression, as we then need to update the fold indices. The augmented
    # training data is highly correlated due to the augmentations, which would invalidate the cross validation,
    # as it assumes that the training and evaluation data are independent. The following code ensures that
    # all augmentations of a single training observation are either in the training or the evaluation data.
    if (!is.null(regression.surrogate_n_comb)) {
      if (!is.null(regression.vfold_cv_para) && any(names(regression.vfold_cv_para) != "v")) {
        stop("The `regression.vfold_cv_para` parameter supports only the `v` parameter for surrogate regression.")
      }

      n <- nrow(x) / regression.surrogate_n_comb # Get the number of training observations (before augmentation)
      n_folds <- nrow(regression.folds) # Get the number of folds
      folds <- sample(rep(seq_len(n_folds), length.out = n)) # Sample in which fold the i'th obs is in the eval data
      indices <- lapply(split(seq_len(n), folds), function(x) setdiff(seq_len(n), x)) # Sample the training indices
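      # E.g. (illustrative values), with n = 4, n_folds = 2, and folds = c(2, 1, 2, 1), the evaluation sets are
      # {2, 4} (fold 1) and {1, 3} (fold 2), so `indices` becomes list(c(1, 3), c(2, 4)). With
      # regression.surrogate_n_comb = 3, training observation 2 is then expanded to rows 4:6 in the loop below.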

      # Loop over the folds, extend the indices to reflect the augmentation, and insert the updated training indices
      for (fold_idx in seq(n_folds)) {
        regression.folds$splits[[fold_idx]]$in_id <-
          unlist(lapply(
            indices[[fold_idx]],
            function(idx) seq(regression.surrogate_n_comb * (idx - 1) + 1, regression.surrogate_n_comb * idx)
          ))
      }
    }

    # Extract the grid of hyperparameter values. Note that regression.tune_values is either a data.frame or a function.
    if (is.data.frame(regression.tune_values)) {
      regression.grid <- regression.tune_values
    } else {
      regression.grid <- regression.tune_values(x[, -..regression.response_var])
    }

    # Add the hyperparameter tuning to the workflow
    regression.results <- tune::tune_grid(
      object = regression.workflow,
      resamples = regression.folds,
      grid = regression.grid,
      metrics = yardstick::metric_set(yardstick::rmse)
    )

    # Small printout to the user
    if (verbose == 2) regression.cv_message(regression.results = regression.results, regression.grid = regression.grid)

    # Set seed for reproducibility. Without this, we get different results depending on whether we run in
    # parallel or sequentially.
    set.seed(seed)

    # Update the workflow by finalizing it using the hyperparameters that attained the best rmse
    regression.workflow <-
      tune::finalize_workflow(regression.workflow, tune::select_best(regression.results, metric = "rmse"))
  }

  # Fit the model to the augmented training data and return the trained model
  return(parsnip::fit(regression.workflow, data = x))
}
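
# Example usage (a minimal sketch, not run at package load; the data are simulated for illustration):
if (FALSE) {
  library(data.table)
  x <- data.table(x1 = rnorm(100), x2 = rnorm(100))
  x[, y_hat := x1 + x2 + rnorm(100, sd = 0.1)] # The response column expected by the default recipe
  fit <- regression.train_model(x = x, regression.model = parsnip::linear_reg())
  predict(fit, new_data = x)$.pred # Fitted values for the training data
}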


# Get functions ========================================================================================================
#' Convert the string into an R object
#'
#' @param string A character vector/string containing the text to convert into R code.
#'
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.get_string_to_R <- function(string) {
  return(eval(parse(text = string)))
}
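
# Example (a minimal sketch, not run at package load): the two expressions below yield
# equivalent parsnip model specifications.
if (FALSE) {
  regression.get_string_to_R("parsnip::linear_reg()")
  parsnip::linear_reg()
}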

#' Get the predicted responses
#'
#' @inheritParams default_doc
#'
#' @return The same `internal` list, but added vectors `internal$data$x_train_y_hat` and
#' `internal$data$x_explain_y_hat` containing the predicted response of the training and explain data.
#'
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.get_y_hat <- function(internal, model, predict_model) {
  # Predict the response of the training and explain data. The former is the response the regression models are
  # fitted to.
  internal$data$x_train_y_hat <- predict_model(model, internal$data$x_train)
  internal$data$x_explain_y_hat <- predict_model(model, internal$data$x_explain)
  return(internal)
}

#' Get whether the model is to be tuned
#'
#' That is, whether the regression model contains hyperparameters we are to tune using cross validation.
#' See \href{https://www.tidymodels.org/find/parsnip/#model-args}{tidymodels} for default model hyperparameters.
#'
#' @inheritParams setup_approach.regression_separate
#' @inheritParams explain
#'
#' @return A boolean variable indicating if the regression model is to be tuned.
#'
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.get_tune <- function(regression.model, regression.tune_values, x_train) {
  # Check that the regression model is a tidymodels object
  if (is.null(regression.model) || !"model_spec" %in% class(regression.model)) {
    stop("`regression.model` must be a tidymodels object with class 'model_spec'. See documentation.")
  }

  # Check if we are to tune some model hyperparameters
  regression.para <- lapply(regression.model$args, function(para) rlang::quo_get_expr(para))
  regression.para_tune <- lapply(regression.para, function(para) !is.null(para) && grepl("tune()", para, fixed = TRUE))
  regression.para_tune_names <- names(regression.para_tune)[unlist(regression.para_tune)]
  regression.tune <- any(unlist(regression.para_tune))

  # Check that the user has provided tuning values when the model contains tunable hyperparameters
  if (isTRUE(regression.tune) && is.null(regression.tune_values)) {
    stop("`regression.tune_values` must be provided when `regression.model` contains hyperparameters to tune.")
  }

  # Check that `regression.tune_values` is either a data.frame or a function
  if (!is.null(regression.tune_values) &&
    !is.data.frame(regression.tune_values) &&
    !is.function(regression.tune_values)) {
    stop("`regression.tune_values` must be of either class `data.frame` or `function`. See documentation.")
  }

  # Get the grid values. If the user provided a function, check that its output is a data.frame.
  regression.tune_values_grid <- regression.tune_values
  if (is.function(regression.tune_values)) {
    regression.tune_values_grid <- regression.tune_values(x_train)
    if (!is.data.frame(regression.tune_values_grid)) {
      stop("The output of the user provided `regression.tune_values` function must be of class `data.frame`.")
    }
  }

  # Get the names of the hyperparameters the user provided values for
  regression.tune_values_names <- names(regression.tune_values_grid)

  # Check that the user has provided values for precisely the hyperparameters to be tuned
  if (!(all(regression.tune_values_names %in% regression.para_tune_names) &&
    all(regression.para_tune_names %in% regression.tune_values_names))) {
    stop(paste0(
      "The tunable parameters in `regression.model` ('",
      paste(regression.para_tune_names, collapse = "', '"), "') and `regression.tune_values` ('",
      paste(regression.tune_values_names, collapse = "', '"), "') must match."
    ))
  }

  # Return if we are to tune some model hyperparameters
  return(regression.tune)
}
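
# Example usage (a minimal sketch, not run; `x_train` is assumed to exist):
if (FALSE) {
  # TRUE: `tree_depth` is marked for tuning and matching values are provided
  regression.get_tune(
    regression.model = parsnip::decision_tree(tree_depth = hardhat::tune(), engine = "rpart", mode = "regression"),
    regression.tune_values = data.frame(tree_depth = c(1, 2, 3)),
    x_train = x_train
  )
  # FALSE: no hyperparameters are marked with tune()
  regression.get_tune(parsnip::linear_reg(), NULL, x_train)
}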

# Check functions ======================================================================================================
#' Check regression parameters
#'
#' @inheritParams default_doc
#'
#' @return The same `internal` list, but added logical indicator `internal$parameters$regression.tune`
#' if we are to tune the regression model/models.
#'
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.check_parameters <- function(internal) {
  # Convert the objects to R-objects if they are strings
  if (is.character(internal$parameters$regression.model)) {
    internal$parameters$regression.model <- regression.get_string_to_R(internal$parameters$regression.model)
  }
  if (is.character(internal$parameters$regression.tune_values)) {
    internal$parameters$regression.tune_values <- regression.get_string_to_R(internal$parameters$regression.tune_values)
  }
  if (is.character(internal$parameters$regression.recipe_func)) {
    internal$parameters$regression.recipe_func <- regression.get_string_to_R(internal$parameters$regression.recipe_func)
  }

  # Check that `regression.recipe_func` is either NULL or a function that returns an object of class `recipe`
  regression.check_recipe_func(
    regression.recipe_func = internal$parameters$regression.recipe_func,
    x_explain = internal$data$x_explain
  )

  # Check that `regression.vfold_cv_para` is either NULL or a named list that only contains recognized parameters
  regression.check_vfold_cv_para(regression.vfold_cv_para = internal$parameters$regression.vfold_cv_para)

  # Check that `regression.surrogate_n_comb` is a valid value (only applicable for surrogate regression)
  regression.check_sur_n_comb(
    regression.surrogate_n_comb = internal$parameters$regression.surrogate_n_comb,
    used_n_combinations = internal$parameters$used_n_combinations
  )

  # Check and get if we are to tune the hyperparameters of the regression model
  internal$parameters$regression.tune <- regression.get_tune(
    regression.model = internal$parameters$regression.model,
    regression.tune_values = internal$parameters$regression.tune_values,
    x_train = internal$data$x_train
  )

  return(internal)
}

#' Check `regression.recipe_func`
#'
#' Check that `regression.recipe_func` is either `NULL` or a function whose
#' output is an object of class `recipe`.
#'
#' @inheritParams explain
#' @inheritParams setup_approach.regression_separate
#'
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.check_recipe_func <- function(regression.recipe_func, x_explain) {
  if (!is.null(regression.recipe_func) && !is.function(regression.recipe_func)) {
    stop("`regression.recipe_func` must be a function. See documentation.")
  }

  if (!is.null(regression.recipe_func) && is.function(regression.recipe_func)) {
    x_temp <- copy(x_explain)[, "y_hat_temp" := 1]
    regression.recipe_func_output <- regression.recipe_func(recipes::recipe(y_hat_temp ~ ., data = x_temp))
    if (!"recipe" %in% class(regression.recipe_func_output)) {
      stop("The output of the `regression.recipe_func` must be of class `recipe`.")
    }
  }
}
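
# Example of a valid `regression.recipe_func` (a minimal sketch, not run at package load): add natural
# splines with two degrees of freedom to all numeric predictors, as in the `setup_approach()` documentation.
if (FALSE) {
  regression.recipe_func <- function(recipe) {
    return(recipes::step_ns(recipe, recipes::all_numeric_predictors(), deg_free = 2))
  }
}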

#' Check the parameters that are sent to [rsample::vfold_cv()]
#'
#' Check that `regression.vfold_cv_para` is either NULL or a named list that only contains recognized parameters.
#'
#' @inheritParams setup_approach.regression_separate
#'
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.check_vfold_cv_para <- function(regression.vfold_cv_para) {
  if (!is.null(regression.vfold_cv_para)) {
    # Check that regression.vfold_cv_para is a named list
    if (!is.list(regression.vfold_cv_para) || is.null(names(regression.vfold_cv_para))) {
      stop("`regression.vfold_cv_para` must be a named list. See documentation using '?shapr::explain()'.")
    }

    # Check that all entries are parameters in the rsample::vfold_cv() function
    unknown_para_names <-
      names(regression.vfold_cv_para)[!names(regression.vfold_cv_para) %in% methods::formalArgs(rsample::vfold_cv)[-1]]
    if (length(unknown_para_names) > 0) {
      stop(paste0(
        "The following parameters in `regression.vfold_cv_para` are not supported by `rsample::vfold_cv()`: '",
        paste0(unknown_para_names, collapse = "', '"), "'."
      ))
    }

    # Ensure that we have at least two folds in the cross validation procedure
    if ("v" %in% names(regression.vfold_cv_para) && regression.vfold_cv_para[["v"]] <= 1) {
      stop("The parameter `v` in `regression.vfold_cv_para` must be strictly larger than 1.")
    }
  }
}
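
# Example of a valid `regression.vfold_cv_para` (a minimal sketch): `list(v = 5)` gives 5-fold cross
# validation, and `list(v = 5, repeats = 2)` repeats it twice (separate regression only, as surrogate
# regression supports only the `v` parameter).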

#' Check that needed libraries are installed
#'
#' This function checks that the `parsnip`, `recipes`, `workflows`, `tune`, `dials`,
#' `yardstick`, `hardhat`, `rsample`, and `rlang` packages are available.
#'
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.check_namespaces <- function() {
  namespaces <- c("parsnip", "recipes", "workflows", "tune", "dials", "yardstick", "hardhat", "rsample", "rlang")
  for (namespace in namespaces) {
    if (!requireNamespace(namespace, quietly = TRUE)) {
      stop(paste0(
        "`", namespace, "` is not installed. Please run `install.packages('", namespace, "')` to install ",
        "it or run `install.packages('tidymodels')` to install all relevant packages."
      ))
    }
  }
}

# Message functions ====================================================================================================
#' Produce time message for separate regression
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.separate_time_mess <- function() {
  message(paste(
    "When using `approach = 'regression_separate'` the `explanation$timing$timing_secs` object \n",
    "can be missleading as `setup_computation` does not contain the training times of the \n",
    "regression models as they are trained on the fly in `compute_vS`. This is to reduce memory \n",
    "usage and to improve efficency.\n"
  )) # TODO: should we add the time somewhere else?
}

#' Produce message about which batch prepare_data is working on
#' @inheritParams default_doc
#' @inheritParams default_doc_explain
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.prep_message_batch <- function(internal, index_features) {
  message(paste0(
    "Working on batch ", internal$objects$X[id_combination == index_features[1]]$batch, " of ",
    internal$parameters$n_batches, " in `prepare_data.", internal$parameters$approach, "()`."
  ))
}

#' Produce message about which combination prepare_data is working on
#' @inheritParams default_doc
#' @inheritParams default_doc_explain
#' @param comb_idx Integer. The index of the combination in a specific batch.
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.prep_message_comb <- function(internal, index_features, comb_idx) {
  message(paste0(
    "Working on combination with id ", internal$objects$X$id_combination[index_features[comb_idx]],
    " of ", internal$parameters$used_n_combinations, "."
  ))
}

#' Produce message with the results of the cross validation procedure
#'
#' @param regression.results The results of the CV procedures.
#' @param regression.grid Object containing the hyperparameter values.
#' @param n_cv Integer (default is 10) specifying the number of CV hyperparameter configurations to print.
#'
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.cv_message <- function(regression.results, regression.grid, n_cv = 10) {
  # Get the hyperparameter names and add the evaluation metric rmse and its standard error
  feature_names <- names(regression.grid)
  feature_names_rmse <- c(feature_names, "rmse", "rmse_std_err")

  # Let n_cv be the minimum of the provided value and the number of possible printouts
  n_cv <- min(n_cv, nrow(regression.grid))

  # Extract the n_cv best results
  best_results <- tune::show_best(regression.results, n = n_cv, metric = "rmse")

  # Needed to make the printout tables prettier by ensuring the same column widths for all settings.
  regression.grid_best <- best_results[, feature_names]
  regression.grid_best$rmse <- round(best_results$mean, 2)
  regression.grid_best$rmse_std_err <- round(best_results$std_err, 2)
  width <- sapply(regression.grid_best, function(x) max(nchar(as.character(unique(x)))))

  # Message title of the results
  message(paste0("Results of the ", best_results$n[1], "-fold cross validation (top ", n_cv, " best configurations):"))

  # Iterate over the n_cv best results and print out the hyperparameter values along with the rmse and rmse_std_err
  for (row_idx in seq_len(nrow(best_results))) {
    best_result <- best_results[row_idx, ]
    feature_values <- best_result[feature_names]
    feature_values_rmse <- c(
      feature_values,
      format(round(best_result$mean, 2), nsmall = 2), format(round(best_result$std_err, 2), nsmall = 2)
    )
    values_fixed_len <- sapply(
      seq_along(feature_values_rmse),
      function(x) format(as.character(feature_values_rmse[x]), width = width[x], justify = "left")
    )
    message(paste0("#", row_idx, ": ", paste(paste(feature_names_rmse, "=", values_fixed_len), collapse = "  "), ""))
  }

  message("") # Empty message to get a blank line
}