# (stray line from page extraction — commented out so the script parses)
## ----include = FALSE----------------------------------------------------------
# Vignette-wide knitr chunk defaults: collapse code and output into one block,
# prefix printed output with "#>", and render figures at 8 x 6 inches.
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.width = 8,
fig.height = 6
)
# Fixed RNG seed so the stochastic results (ranger fits, permutations) below
# are reproducible across renders.
set.seed(123)
# Quiet down
# Raise the mlr3 logger threshold so only warnings and errors are printed.
lgr::get_logger("mlr3")$set_threshold("warn")
# Show xplainfi progress output only in interactive sessions, not when knitting.
options("xplain.progress" = interactive())
## ----setup--------------------------------------------------------------------
library(xplainfi)
library(mlr3)
library(mlr3learners)
library(data.table)
library(ggplot2)
## ----setup-problem------------------------------------------------------------
# Shared experiment setup used by all importance methods below:
# the Friedman1 simulated regression task (300 observations), a random-forest
# learner, mean squared error as the measure, and 3-fold cross-validation.
task <- tgen("friedman1")$generate(n = 300)
learner <- lrn("regr.ranger", num.trees = 100)
measure <- msr("regr.mse")
resampling <- rsmp("cv", folds = 3)
## ----pfi-basic----------------------------------------------------------------
# Permutation Feature Importance with 10 permutation repeats per resampling
# iteration.
pfi <- PFI$new(
task = task,
learner = learner,
measure = measure,
resampling = resampling,
n_repeats = 10
)
# Run the resampling + permutation procedure, then print aggregated
# per-feature importance scores.
pfi$compute()
pfi$importance()
## ----pfi-parameters-----------------------------------------------------------
# Same PFI setup but with 50 permutation repeats — more repeats average out
# permutation noise and stabilize the importance estimates.
pfi_stable <- PFI$new(
task = task,
learner = learner,
measure = measure,
resampling = resampling,
n_repeats = 50
)
pfi_stable$compute()
pfi_stable$importance()
## ----pfi-nrepeats-------------------------------------------------------------
# Distribution of per-repeat PFI scores for "important2", split by CV fold:
# the spread within each box shows permutation noise inside one fold.
imp2_scores <- pfi_stable$scores()[feature == "important2", ]
p_variability <- ggplot(imp2_scores, aes(y = importance, x = factor(iter_rsmp))) +
  geom_boxplot() +
  labs(
    title = "PFI variability within resampling iterations",
    subtitle = "Setting n_repeats higher improves PFI estimates",
    y = "PFI score (important2)",
    x = "Resampling iteration (3-fold CV)"
  ) +
  theme_minimal()
print(p_variability)
## ----pfi-scores-tmp, echo=FALSE-----------------------------------------------
# Rounded min/max of the per-repeat "important2" scores, for inline reporting.
# Fix: use `<-` for assignment (the rest of this script does), not `=`.
pfi_important2_range <- round(range(pfi_stable$scores()[feature == "important2", importance]), 2)
## ----pfi-ratio----------------------------------------------------------------
# Importance reported with the "ratio" relation instead of the default —
# presumably permuted loss / original loss; confirm against xplainfi docs.
pfi_stable$importance(relation = "ratio")
## ----loco-basic---------------------------------------------------------------
# Leave-One-Covariate-Out importance on the same task/learner/measure setup,
# again with 10 repeats per resampling iteration.
loco <- LOCO$new(
task = task,
learner = learner,
measure = measure,
resampling = resampling,
n_repeats = 10
)
loco$compute()
loco$importance()
## ----samplers-demo------------------------------------------------------------
# Conditional sampler backed by an adversarial random forest (ARF), fit on the
# task's data.
arf_sampler <- ConditionalARFSampler$new(task)
# Take the first 5 rows as a small demo set and peek at two features.
sample_data <- task$data(rows = 1:5)
sample_data[, .(important1, important2)]
## ----conditional-sampling-----------------------------------------------------
# Resample "important1" conditionally on "important2" and "important3":
# only the sampled feature should change; the conditioning columns stay fixed.
sampled_conditional <- arf_sampler$sample_newdata(
feature = "important1",
newdata = sample_data,
conditioning_set = c("important2", "important3")
)
# Before/after comparison of the three columns.
sample_data[, .(important1, important2, important3)]
sampled_conditional[, .(important1, important2, important3)]
## ----detailed-scores----------------------------------------------------------
# Raw per-fold, per-repeat scores behind the aggregated importance values;
# render only the first 10 rows as a table.
first_scores <- head(pfi$scores(), 10)
knitr::kable(first_scores, digits = 4, caption = "Detailed PFI scores (first 10 rows)")
## ----scoring-summary----------------------------------------------------------
# One-row summary of the scores table: number of distinct features, CV folds,
# permutation repeats, and the total row count (= product of the three).
pfi$scores()[, .(
features = uniqueN(feature),
resampling_folds = uniqueN(iter_rsmp),
permutation_iters = uniqueN(iter_repeat),
total_scores = .N
)]
## ----detailed-scores-ratio----------------------------------------------------
# The same detailed scores, but computed with the "ratio" relation.
ratio_scores <- pfi$scores(relation = "ratio")
knitr::kable(head(ratio_scores, 10), digits = 4, caption = "PFI scores using the ratio (first 10 rows)")
## ----pfi-obs-scores-----------------------------------------------------------
# Observation-level losses underlying the aggregated PFI scores.
pfi$obs_loss()
## ----pretrained-pfi-----------------------------------------------------------
# Using an already-trained learner: train once on the holdout split's train
# set, then let PFI score on the corresponding test set.
resampling_holdout <- rsmp("holdout")$instantiate(task)
learner_trained <- lrn("regr.ranger", num.trees = 100)
learner_trained$train(task, row_ids = resampling_holdout$train_set(1))
pfi_pretrained <- PFI$new(
task = task,
learner = learner_trained,
measure = measure,
resampling = resampling_holdout,
n_repeats = 10
)
pfi_pretrained$compute()
pfi_pretrained$importance()
## ----pretrained-custom--------------------------------------------------------
# Simulate: learner was trained elsewhere, we have new data to use
new_data <- tgen("friedman1")$generate(n = 100)
# Same as rsmp_all_test(task)
# Custom resampling with an empty train set and all rows as the test set,
# so the pre-trained learner is only evaluated, never refit.
resampling_custom <- rsmp("custom")$instantiate(
new_data,
train_sets = list(integer(0)),
test_sets = list(new_data$row_ids)
)
pfi_newdata <- PFI$new(
task = new_data,
learner = learner_trained,
measure = measure,
resampling = resampling_custom,
n_repeats = 10
)
pfi_newdata$compute()
pfi_newdata$importance()
## ----pretrained-error, error = TRUE-------------------------------------------
# Demonstrates the failure mode: combining an already-trained learner with a
# multi-fold resampling is expected to error. try() lets the script continue.
try({
PFI$new(
task = task,
learner = learner_trained,
measure = measure,
resampling = rsmp("cv", folds = 3)
)
})
## ----parallel-future, eval = FALSE--------------------------------------------
# # Not evaluated when knitting: requires a {future} parallel backend.
# library(future)
# plan("multisession", workers = 2)
#
# # PFI with parallelization across features
# pfi_parallel <- PFI$new(
# task,
# learner = lrn("regr.ranger"),
# measure = msr("regr.mse"),
# n_repeats = 10
# )
# pfi_parallel$compute()
# pfi_parallel$importance()
#
# # LOCO with parallelization (uses mlr3fselect internally)
# loco_parallel <- LOCO$new(
# task,
# learner = lrn("regr.ranger"),
# measure = msr("regr.mse")
# )
# loco_parallel$compute()
# loco_parallel$importance()
## ----parallel-mirai, eval = FALSE---------------------------------------------
# # Not evaluated when knitting: requires the {mirai} package and daemons.
# library(mirai)
# daemons(n = 2)
#
# # Same PFI/LOCO code works with mirai backend
# pfi_parallel <- PFI$new(
# task,
# learner = lrn("regr.ranger"),
# measure = msr("regr.mse"),
# n_repeats = 10
# )
# pfi_parallel$compute()
# pfi_parallel$importance()
#
# # Clean up daemons when done
# daemons(0)
# Trailing boilerplate from the hosting page, commented out so it is not
# evaluated as R code:
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.