# inst/doc/feature-samplers.R

## ----setup, include = FALSE---------------------------------------------------
# Document-wide chunk defaults: figure dimensions first, then how source and
# output are rendered (output lines prefixed with "#>", collapsed into the
# source block).
knitr::opts_chunk$set(
	fig.width = 7,
	fig.height = 5,
	comment = "#>",
	collapse = TRUE
)

## ----load-libraries-----------------------------------------------------------
library(xplainfi)
library(mlr3)
library(mlr3learners)
library(data.table)

# Fix the RNG seed before any random work happens. The simulated task and the
# sampler calls in the chunks below draw random values; seeding here is
# intended to make the printed output repeatable across runs of this script.
set.seed(123)

# Create the two tasks used for demonstration throughout:
# - task_mixed: the "penguins" task, used below with both categorical and
#   numeric features
# - task_numeric: data simulated by sim_dgp_correlated() with n = 200,
#   used below via its features x1 to x4
task_mixed = tsk("penguins")
task_numeric = sim_dgp_correlated(n = 200)

## ----feature-types------------------------------------------------------------
# Show the task's feature types, then the feature types reported by a sampler
task_mixed$feature_types
permutation = MarginalPermutationSampler$new(task_mixed)
permutation$feature_types

## ----sample-methods-----------------------------------------------------------
# The same rows are used for both sampling variants below
demo_rows = 40:45

# Variant 1: sample from the task stored in the sampler, selecting rows by id
sampled_task = permutation$sample("bill_length", row_ids = demo_rows)
sampled_task

# Variant 2: sample from "external" data handed to the sampler directly
test_data = task_mixed$data(rows = demo_rows)
sampled_external = permutation$sample_newdata(feature = "bill_length", newdata = test_data)
sampled_external

## ----permutation-example------------------------------------------------------
# Permutation sampler on the mixed-type task
permutation = MarginalPermutationSampler$new(task_mixed)

# Sample a continuous feature for the first ten rows
first_rows = seq_len(10)
original = task_mixed$data(rows = first_rows)
sampled = permutation$sample(feature = "bill_length", row_ids = first_rows)

# Side-by-side comparison of original and sampled bill_length;
# sex is taken from the original rows (it was not sampled)
data.table(
	original_bill = original[["bill_length"]],
	sampled_bill = sampled[["bill_length"]],
	sex = original[["sex"]]
)

## ----marginal-ref-example-----------------------------------------------------
# Marginal reference sampler; n_samples sets the size of the reference pool
marginal_ref = MarginalReferenceSampler$new(task_mixed, n_samples = 30L)

# Sample bill_length for the first five rows - each row gets values from a
# randomly sampled reference row
first_rows = 1:5
original = task_mixed$data(rows = first_rows)
sampled = marginal_ref$sample("bill_length", row_ids = first_rows)

# Original vs. sampled values; sex is taken from the original rows
data.table(
	original_bill = original[["bill_length"]],
	sampled_bill = sampled[["bill_length"]],
	sex = original[["sex"]]
)

## ----correlation-preservation-------------------------------------------------
# Both samplers are asked for x1 and x2 jointly on the same rows
rows = 1:10

# MarginalPermutationSampler (breaks correlations)
perm = MarginalPermutationSampler$new(task_numeric)
sampled_perm = perm$sample(c("x1", "x2"), row_ids = rows)

# MarginalReferenceSampler (preserves within-row correlations)
ref = MarginalReferenceSampler$new(task_numeric, n_samples = 50L)
sampled_ref = ref$sample(c("x1", "x2"), row_ids = rows)

# Correlation between x1 and x2 per method.
# NOTE(review): the "Original" value is computed from task_numeric$data()
# without a row filter, while the sampled values come from the rows requested
# above - confirm this comparison is intended.
task_data = task_numeric$data()
data.table(
	method = c("Original", "Permutation", "Reference"),
	correlation = c(
		cor(task_data[["x1"]], task_data[["x2"]]),
		cor(sampled_perm[["x1"]], sampled_perm[["x2"]]),
		cor(sampled_ref[["x1"]], sampled_ref[["x2"]])
	)
)

## ----gaussian-sampler---------------------------------------------------------
# Gaussian conditional sampler on the numeric task
gaussian = ConditionalGaussianSampler$new(task_numeric)

# Sample x1 for rows 1 to 10, conditioned on the other features
rows = 1:10
conditioning = c("x2", "x3", "x4")
sampled = gaussian$sample(feature = "x1", row_ids = rows, conditioning_set = conditioning)

# Original vs. conditionally sampled x1; x2 is one of the conditioning
# features and is taken from the original rows
original = task_numeric$data(rows = rows)
data.table(
	original = original[["x1"]],
	sampled = sampled[["x1"]],
	x2 = original[["x2"]]
)

## ----arf-sampler--------------------------------------------------------------
# ARF sampler (works with full task including categorical features)
arf = ConditionalARFSampler$new(task_mixed, num_trees = 20, verbose = FALSE)

# Sample island for rows 1 to 10, conditioned on body measurements
rows = 1:10
body_measures = c("bill_length", "body_mass")
sampled = arf$sample(feature = "island", row_ids = rows, conditioning_set = body_measures)

# Original vs. sampled island, next to the two conditioning features
# (taken from the original rows)
original = task_mixed$data(rows = rows)
data.table(
	original_island = original[["island"]],
	sampled_island = sampled[["island"]],
	bill_length = original[["bill_length"]],
	body_mass = original[["body_mass"]]
)

## ----ctree-sampler------------------------------------------------------------
# ctree sampler, constructed with its default parameters
ctree = ConditionalCtreeSampler$new(task_mixed)

# Sample bill_length for rows 1 to 10, conditioned on island
rows = 1:10
sampled = ctree$sample(feature = "bill_length", row_ids = rows, conditioning_set = "island")

# Conditioning feature first, then original vs. sampled bill_length
original = task_mixed$data(rows = rows)
data.table(
	island = original[["island"]],
	original = original[["bill_length"]],
	sampled = sampled[["bill_length"]]
)

## ----knn-sampler-numeric------------------------------------------------------
# kNN sampler with k = 5 neighbors on the numeric task
knn_numeric = ConditionalKNNSampler$new(task_numeric, k = 5)

# Sample x1 for rows 1 to 5 based on nearest neighbors in (x2, x3) space
rows = 1:5
neighbor_space = c("x2", "x3")
sampled_numeric = knn_numeric$sample(feature = "x1", row_ids = rows, conditioning_set = neighbor_space)

# Conditioning features, then original vs. sampled x1
original_numeric = task_numeric$data(rows = rows)
data.table(
	x2 = original_numeric[["x2"]],
	x3 = original_numeric[["x3"]],
	original_x1 = original_numeric[["x1"]],
	sampled_x1 = sampled_numeric[["x1"]]
)

## ----knn-sampler-mixed--------------------------------------------------------
# Same kNN sampler, now on the task with categorical features
knn_mixed = ConditionalKNNSampler$new(task_mixed, k = 5)

# Sample bill_length for rows 1 to 5, conditioning on island (categorical)
# and body_mass (numeric)
rows = 1:5
neighbor_space = c("island", "body_mass")
sampled_mixed = knn_mixed$sample(feature = "bill_length", row_ids = rows, conditioning_set = neighbor_space)

# Conditioning features, then original vs. sampled bill_length
original_mixed = task_mixed$data(rows = rows)
data.table(
	island = original_mixed[["island"]],
	body_mass = original_mixed[["body_mass"]],
	original_bill = original_mixed[["bill_length"]],
	sampled_bill = sampled_mixed[["bill_length"]]
)

## ----knockoff-sampler---------------------------------------------------------
# Gaussian knockoff sampler (using task_numeric from earlier)
knockoff = KnockoffGaussianSampler$new(task_numeric)

# Generate knockoffs for every feature of the task on rows 1 to 5
rows = 1:5
original = task_numeric$data(rows = rows)
knockoffs = knockoff$sample(feature = task_numeric$feature_names, row_ids = rows)

# Original vs. knockoff values for x1 and x2
data.table(
	x1_original = original[["x1"]],
	x1_knockoff = knockoffs[["x1"]],
	x2_original = original[["x2"]],
	x2_knockoff = knockoffs[["x2"]]
)

## ----cfi-knockoff, eval = FALSE-----------------------------------------------
# NOTE: this chunk is marked eval = FALSE, so the code below is kept commented
# out and shown as an example only. It refers to `task_numeric` and `knockoff`
# created in earlier chunks.
# # CFI with knockoff sampler for conditional independence testing
# cfi_knockoff = CFI$new(
# 	task = task_numeric,
# 	learner = lrn("regr.ranger"),
# 	measure = msr("regr.mse"),
# 	sampler = knockoff
# )
# 
# # Compute importance with CPI-based inference
# cfi_knockoff$compute()
# cfi_knockoff$importance(ci_method = "cpi")

# Try the xplainfi package in your browser
#
# Any scripts or data that you put into this service are public.
#
# xplainfi documentation built on Feb. 27, 2026, 1:08 a.m.