#' feature_importance_permutation
#'
#' @description
#' returns which columns are the most important in the fitted model.
#' This is done by permuting the inputs and measuring the deterioration of the metric
#' This implementation is not suitbile for one-hot encoded categorical variables.
#' The permutation importance is defined to be the difference between the baseline metric and metric from permutating the feature column.
#'
#' @param data dataframe - data from which the model can give predictions.
#' For xgboost it must contain only features used in \code{model} in the correct order
#' Often this dataset is the validation data
#' @param model model object - tested examples are lm glm and xgboost
#' @param actual vector[Numeric] - target to be predicted. Must be normalised by exposure
#' @param weight vector[Numeric] - exposure for predictions
#' @param metric function - Of type in admr::metric_ - must have arguments actual, predicted, weight
#' @param nrounds integer - Number of times to permute each feature
#' @param seed integer - random seed for permuations
#' @param ... OPTIONAL: - Arguments included but not defined above will be carried through to the metric
#'
#' @return dataframe with columns
#' :col_index - position of feature in \code{data}
#' :feature - name of feature in \code{data}
#' :importance_mean - importance of feature
#' :importance_sd - standard deviation of importance, will be NA if nrounds = 1
#' this data can be used to find the most important features in the model
#' @export
#'
#' @examples
#'
#' input_data <- data.frame(x1=runif(100, 0, 25), x2=runif(100, 0, 25), x3=runif(100, 0, 25)) %>%
#' mutate(target=x1^2 * 0.01 + x2 + rnorm(n(),sd=5))
#'
#' #LM
#' model_lm <- glm(target ~ poly(x1, 2) + x2, data=input_data)
#'
#' feature_importance_permutation(data=input_data %>% select(-target), model=model_lm, actual=input_data[["target"]])
#'
#' #GLM
#' model_glm <- glm(target ~ poly(x1, 2) + x2 + x3, data=input_data)
#'
#' feature_importance_permutation(data=input_data %>% select(-target), model=model_glm, actual=input_data[["target"]])
#'
#' #GBM
#' model_gbm <- xgboost(data = as.matrix(input_data %>% select(-target)), label=input_data[["target"]], nrounds=20, verbose = 0)
#'
#'
#' feature_importance_permutation(model=model_gbm,
#' data=input_data %>% select(-target),
#' actual=input_data[["target"]])
#'
feature_importance_permutation <- function(data,
model,
actual,
weight=rep(1, nrow(data)),
metric=metric_rmse,
nrounds=10,
seed=666,
...){
# checks on inputs
checkmate::assert_data_frame(data)
checkmate::assert_numeric(actual, len = nrow(data))
checkmate::assert_numeric(weight, len = nrow(data), lower=0)
checkmate::assert_integerish(nrounds, len=1, lower=1)
checkmate::assert_integerish(seed, len=1, lower=1)
kwargs=list(...)
checkmate::assert_function(metric, args=c("actual", "predicted", "weight", names(kwargs)))
# extract feature names
model.features <- colnames(data)
# Define function to get the metric
get_metric <- function(data_){
# Get average prediction
if(any(class(model)=="xgb.Booster")){
# create dmatrix for standard data
d_mat <- xgboost::xgb.DMatrix(data=data_ %>% sapply(as.numeric) %>% as.matrix())
predicted <- predict(object=model, newdata=d_mat) %>% as.vector()
}else{
predicted <- predict(object=model, newdata=data_) %>% as.vector()
}
score <- metric %>% do.call(append(list(actual=actual, predicted=predicted, weight=weight),
kwargs))
}
score_standard <- get_metric(data_=data)
# Permute features and score -----------------
permuted_data <- data
for (ii in 1:length(model.features)){ # for all features
var_ii <- model.features[ii]
original_values_ii <- data[[var_ii]] # save original ordering for feature
scores_permuted_ii <- c() # empty vector to contain scores
for (kk in 1:nrounds){
set.seed((((seed+length(model.features)) * ii) + kk)) # set seed
permuted_data[[var_ii]] <- sample(original_values_ii) # permute feature
scores_permuted_ii[kk] <- get_metric(data_=permuted_data) # append to vector of scores
}
permuted_data[[var_ii]] <- original_values_ii # replace with original ordering
# create output for feature var_ii
out_df_ii <- data.frame(col_index=ii,
feature=var_ii,
importance_mean=score_standard - (scores_permuted_ii %>% mean()),
importance_sd=scores_permuted_ii %>% sd(),
stringsAsFactors = FALSE)
# append to existing output
if (ii==1){
out_df <- out_df_ii
}else{
out_df <- rbind(out_df, out_df_ii)
}
}
# If a low score is good switch the ordering
if (out_df[["importance_mean"]] %>% mean() < 0){
out_df[["importance_mean"]] = -out_df[["importance_mean"]]
}
return(out_df %>% dplyr::arrange(desc(importance_mean), feature))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.