# R/features.R In shapr: Prediction Explanation with Dependence-Aware Shapley Values

#### Documented in apply_dummiesfeature_combinationsmake_dummies

#' Define feature combinations, and fetch additional information about each unique combination
#'
#' @param m Positive integer. Total number of features.
#' @param exact Logical. If \code{TRUE} all \code{2^m} combinations are generated, otherwise a
#' subsample of the combinations is used.
#' @param n_combinations Positive integer. Note that if \code{exact = TRUE},
#' \code{n_combinations} is ignored. However, if \code{m > 12} you'll need to add a positive integer
#' value for \code{n_combinations}.
#' @param weight_zero_m Numeric. The value to use as a replacement for infinite combination
#' weights when doing numerical operations.
#'
#' @return A data.table that contains the following columns:
#' \describe{
#' \item{id_combination}{Positive integer. Represents a unique key for each combination. Note that the table
#' is sorted by \code{id_combination}, so that is always equal to \code{x[["id_combination"]] = 1:nrow(x)}.}
#' \item{features}{List. Each item of the list is an integer vector where \code{features[[i]]}
#' represents the indices of the features included in combination \code{i}. Note that all the items
#' are sorted such that \code{features[[i]] == sort(features[[i]])} is always true.}
#' \item{n_features}{Vector of positive integers. \code{n_features[i]} equals the number of features in combination
#' \code{i}, i.e. \code{n_features[i] = length(features[[i]])}.}.
#' \item{N}{Positive integer. The number of unique ways to sample \code{n_features[i]} features
#' from \code{m} different features, without replacement.}
#' }
#'
#' @export
#'
#' @author Nikolai Sellereite, Martin Jullum
#'
#' @examples
#' # All combinations
#' x <- feature_combinations(m = 3)
#' nrow(x) # Equals 2^3 = 8
#'
#' # Subsample of combinations
#' x <- feature_combinations(exact = FALSE, m = 10, n_combinations = 1e2)
feature_combinations <- function(m, exact = TRUE, n_combinations = 200, weight_zero_m = 10^6) {

# Force user to use a natural number for n_combinations if m > 13
if (m > 13 & is.null(n_combinations)) {
stop(
paste0(
"Due to computational complexity, we recommend setting n_combinations = 10 000\n",
"if the number of features is larger than 13. Note that you can force the use of the exact\n",
"method (i.e. n_combinations = NULL) by setting n_combinations equal to 2^m,\n",
"where m is the number of features."
)
)
}

# Not supported for m > 30
if (m > 30) {
stop("Currently we are not supporting cases where the number of features is greater than 30.")
}

if (!exact && n_combinations > (2^m - 2)) {
n_combinations <- 2^m - 2
exact <- TRUE
message(
paste0(
"\nn_combinations is larger than or equal to 2^m = ", 2^m, ". \n",
)
)
}

if (exact) {
dt <- feature_exact(m, weight_zero_m)
} else {
dt <- feature_not_exact(m, n_combinations, weight_zero_m)
stopifnot(
data.table::is.data.table(dt),
!is.null(dt[["p"]])
)
p <- NULL # due to NSE notes in R CMD check
dt[, p := NULL]
}

return(dt)
}

#' @keywords internal
feature_exact <- function(m, weight_zero_m = 10^6) {
features <- id_combination <- n_features <- shapley_weight <- N <- NULL # due to NSE notes in R CMD check

dt <- data.table::data.table(id_combination = seq(2^m))
combinations <- lapply(0:m, utils::combn, x = m, simplify = FALSE)
dt[, features := unlist(combinations, recursive = FALSE)]
dt[, n_features := length(features[]), id_combination]
dt[, N := .N, n_features]
dt[, shapley_weight := shapley_weights(m = m, N = N, n_features, weight_zero_m)]

return(dt)
}

#' @keywords internal
feature_not_exact <- function(m, n_combinations = 200, weight_zero_m = 10^6) {
features <- id_combination <- n_features <- shapley_weight <- N <- NULL # due to NSE notes in R CMD check

# Find weights for given number of features ----------
n_features <- seq(m - 1)
n <- sapply(n_features, choose, n = m)
w <- shapley_weights(m = m, N = n, n_features) * n
p <- w / sum(w)

# Sample number of chosen features ----------
X <- data.table::data.table(
n_features = c(
0,
sample(
x = n_features,
size = n_combinations,
replace = TRUE,
prob = p
),
m
)
)
X[, n_features := as.integer(n_features)]

# Sample specific set of features -------
data.table::setkeyv(X, "n_features")
feature_sample <- sample_features_cpp(m, X[["n_features"]])

# Get number of occurences and duplicated rows-------
is_duplicate <- NULL # due to NSE notes in R CMD check
r <- helper_feature(m, feature_sample)
X[, is_duplicate := r[["is_duplicate"]]]

# When we sample combinations the Shapley weight is equal
# to the frequency of the given combination
X[, shapley_weight := r[["sample_frequence"]]]

# Populate table and remove duplicated rows -------
X[, features := feature_sample]
if (any(X[["is_duplicate"]])) {
X <- X[is_duplicate == FALSE]
}
X[, is_duplicate := NULL]

# Add shapley weight and number of combinations
X[c(1, .N), shapley_weight := weight_zero_m]
X[, N := 1]
ind <- X[, .I[between(n_features, 1, m - 1)]]
X[ind, p := p[n_features]]
X[ind, N := n[n_features]]

# Set column order and key table
data.table::setkeyv(X, "n_features")
X[, id_combination := .I]
X[, N := as.integer(N)]
nms <- c("id_combination", "features", "n_features", "N", "shapley_weight", "p")
data.table::setcolorder(X, nms)

return(X)
}

#' @keywords internal
helper_feature <- function(m, feature_sample) {
sample_frequence <- is_duplicate <- NULL # due to NSE notes in R CMD check

x <- feature_matrix_cpp(feature_sample, m)
dt <- data.table::data.table(x)
cnms <- paste0("V", seq(m))
data.table::setnames(dt, cnms)
dt[, sample_frequence := as.integer(.N), by = cnms]
dt[, is_duplicate := duplicated(dt)]
dt[, (cnms) := NULL]

return(dt)
}

#' Initiate the making of dummy variables
#'
#' @param traindata data.table or data.frame.
#'
#' @param testdata data.table or data.frame. New data that has the same
#' feature names, types, and levels as \code{traindata}.
#'
#' @return A list that contains the following entries:
#' \describe{
#' \item{feature_list}{List. Output from \code{check_features}}
#' \item{train_dummies}{A data.frame containing all of the factors in \code{traindata} as
#' one-hot encoded variables.}
#' \item{test_dummies}{A data.frame containing all of the factors in \code{testdata} as
#' one-hot encoded variables.}
#' \item{traindata_new}{Original traindata with correct column ordering and factor levels. To be passed to
#' \item{testdata_new}{Original testdata with correct column ordering and factor levels. To be passed to
#' }
#'
#' @export
#'
#' @author Annabelle Redelmeier, Martin Jullum
#'
#' @examples
#' if (requireNamespace("MASS", quietly = TRUE)) {
#'   data("Boston", package = "MASS")
#'   x_var <- c("lstat", "rm", "dis", "indus")
#'   y_var <- "medv"
#'   x_train <- as.data.frame(Boston[401:411, x_var])
#'   y_train <- Boston[401:408, y_var]
#'   x_test <- as.data.frame(Boston[1:4, x_var])
#'
#'   # convert to factors for illustational purpose
#'   x_train$rm <- factor(round(x_train$rm))
#'   x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm)) #' #' dummylist <- make_dummies(traindata = x_train, testdata = x_test) #' } make_dummies <- function(traindata, testdata) { if (all(is.null(colnames(traindata)))) { stop(paste0("The traindata is missing column names")) } if (all(is.null(colnames(testdata)))) { stop(paste0("The testdata is missing column names")) } train_dt <- data.table::as.data.table(traindata) test_dt <- data.table::as.data.table(testdata) feature_list_train <- get_data_specs(train_dt) feature_list_test <- get_data_specs(test_dt) feature_list_train$specs_type <- "traindata"
feature_list_test$specs_type <- "testdata" updater <- check_features(feature_list_train, feature_list_test, F) # Reorderes factor levels so that they match each other update_data(train_dt, updater) update_data(test_dt, updater) feature_list <- updater # Extracts the components factor_features <- feature_list$labels[updater$classes == "factor"] if (length(factor_features) > 0) { factor_list <- feature_list$factor_levels[factor_features]
feature_list$contrasts_list <- lapply(train_dt[, factor_features, with = FALSE], contrasts, contrasts = FALSE) # get train dummies m <- model.frame( data = train_dt, xlev = factor_list ) train_dummies <- model.matrix( object = ~ . + 0, data = m, contrasts.arg = feature_list$contrasts_list
)

# get test dummies
m <- model.frame(
data = test_dt,
xlev = factor_list
)
test_dummies <- model.matrix(
object = ~ . + 0,
data = m,
contrasts.arg = feature_list$contrasts_list ) } else { train_dummies <- train_dt test_dummies <- test_dt } return(list( feature_list = feature_list, train_dummies = train_dummies, test_dummies = test_dummies, traindata_new = train_dt, testdata_new = test_dt )) } #' Apply dummy variables - this is an internal function intended only to be used in #' predict_model.xgb.Booster() #' #' @param feature_list List. The \code{feature_list} object in the output object after running #' \code{\link[shapr:make_dummies]{make_dummies}} #' #' @param testdata data.table or data.frame. New data that has the same #' feature names, types, and levels as \code{feature_list}. #' #' @return A data.table with all features but where the factors in \code{testdata} are #' one-hot encoded variables as specified in feature_list #' #' @author Annabelle Redelmeier, Martin Jullum #' #' @keywords internal #' apply_dummies <- function(feature_list, testdata) { if (all(is.null(colnames(testdata)))) { stop(paste0("The testdata is missing column names")) } test_dt <- data.table::as.data.table(testdata) feature_list_test <- get_data_specs(test_dt) feature_list_test$specs_type <- "testdata"

updater <- check_features(feature_list, feature_list_test, F)

# Reorderes factor levels so that they match
update_data(test_dt, updater)

factor_features <- feature_list$labels[updater$classes == "factor"] # check which features are factors

if (length(factor_features) > 0) {
factor_list <- feature_list$factor_levels[factor_features] m <- model.frame( data = test_dt, xlev = factor_list ) x <- model.matrix( object = ~ . + 0, data = m, contrasts.arg = feature_list$contrasts_list
)
} else {
x <- test_dt
}

return(x)
}


## Try the shapr package in your browser

Any scripts or data that you put into this service are public.

shapr documentation built on Jan. 28, 2021, 5:06 p.m.